## Web scraping

- Retrieving HTML data from a domain name
- Parsing the data for specific info
- Storing the info
- Moving to another page and repeating the process

## BeautifulSoup

Documentation: https://www.crummy.com/software/BeautifulSoup/bs4/doc


In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Fetch the sample page and parse it. The response is used as a context
# manager so the HTTP connection is closed as soon as BeautifulSoup has read
# the markup, instead of staying open until the object is garbage-collected.
with urlopen('http://www.pythonscraping.com/pages/warandpeace.html') as html:
    bs = BeautifulSoup(html, 'html.parser')

In [None]:
# Echo the parsed document (a bare expression at the end of a notebook cell
# is displayed as the cell's output).
bs

In [None]:
# Attribute access by tag name returns the first <h1> tag in the document,
# or None when the document has no such tag.
bs.h1

In [None]:
# First <span> whose CSS class is "red" (None if there is no match).
bs.find('span', class_='red')

In [None]:
# Collect every <span class="green"> element, then print the text of each
# one on its own line.
nameList = bs.find_all('span', class_='green')
for green_span in nameList:
    print(green_span.get_text())

## BeautifulSoup with RegEx

In [None]:
import re

# Fetch and parse the sample page; the context manager closes the HTTP
# response once the markup has been read.
with urlopen('http://www.pythonscraping.com/pages/page3.html') as html:
    bs = BeautifulSoup(html, 'html.parser')

# The pattern is a raw string so that \d reaches the regex engine untouched
# (in a plain string it is an invalid escape sequence and triggers a warning).
# The dots are escaped so they match literal "." characters only, rather than
# any character.
images = bs.find_all(
    'img',
    {'src': re.compile(r'\.\./img/gifts/img\d*\.jpg')})
for image in images:
    print(image['src'])

## A live example: Amazon Bestsellers

https://www.amazon.com/Best-Sellers-Books/zgbs/books

In [None]:
from bs4 import BeautifulSoup
import requests

url = 'https://www.amazon.com/Best-Sellers-Books/zgbs/books'
# Send a browser-like User-Agent instead of the client library's default.
# NOTE(review): presumably needed for the site to serve the regular page.
headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.2 Safari/605.1.15'}

# requests has no default timeout, so without one a stalled connection would
# block the cell indefinitely.
seller_page = requests.get(url, headers=headers, timeout=30)
# Raise on an HTTP error status rather than silently parsing an error page,
# which would only show up later as missing elements.
seller_page.raise_for_status()
seller_soup = BeautifulSoup(seller_page.content, 'html.parser')

In [None]:
# Every element whose id attribute is "gridItemRoot"; each one is treated
# as a single book entry by the cells that follow.
books = seller_soup.find_all(attrs={'id': 'gridItemRoot'})

In [None]:
# Take the first matched entry as a sample for exploring the markup.
# Raises IndexError if the search above matched nothing.
book = books[0]

In [None]:
# Look up the <span class="zg-bdg-text"> badge of the sample book and echo
# the tag itself.
rank = book.find('span', class_='zg-bdg-text')
rank

In [None]:
# Print only the text inside the badge tag, without the surrounding markup.
print(rank.get_text())

In [None]:
# Container <div class="zg-grid-general-faceout"> of the sample book; the
# following cells read the book's fields from inside it.
children = book.find('div', class_='zg-grid-general-faceout')

In [None]:
# Title is read positionally: the second child node of the first <div>
# inside the faceout container.
# NOTE(review): depends on the page's current markup order — confirm.
title = children.div.contents[1].text
title

In [None]:
# Author is read positionally: the third child node of the first <div>
# inside the faceout container.
# NOTE(review): depends on the page's current markup order — confirm.
author = children.div.contents[2].text
author

In [None]:
# Price is read positionally: the last child node of the first <div>
# inside the faceout container.
# NOTE(review): depends on the page's current markup order — confirm.
price = children.div.contents[-1].text
price

In [None]:
# See the info in a DataFrame
import pandas as pd

# Build one record (a dict) per book entry so the results can be tabulated.
data = []

for book in books:
    # Slicing drops the first character of the badge text
    # (presumably a leading "#" — verify against the page).
    rank = book.find('span', class_='zg-bdg-text').text[1:]

    # The remaining fields are read positionally from the child nodes of the
    # first <div> inside the faceout container.
    children = book.find('div', class_='zg-grid-general-faceout')
    fields = children.div.contents
    title = fields[1].text
    author = fields[2].text
    price = fields[-1].text

    book_data = {
        'rank': rank,
        'title': title,
        'author': author,
        'price': price,
    }
    data.append(book_data)

In [None]:
# Tabulate the scraped records with an explicit column order, then echo the
# resulting frame.
column_order = ['rank', 'title', 'author', 'price']
df = pd.DataFrame(data, columns=column_order)
df

In [None]:
# Store the info in csv format 
import csv 

csv_headers = ['rank', 'title', 'author', 'price']

# Mode 'w' creates the file (or truncates an existing one), so the header is
# always the first row. newline='' lets the csv module control line endings.
with open('amazon_books.csv', mode='w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(csv_headers)

In [None]:
# Append one CSV row per book. The file is opened once for the whole loop
# instead of being reopened for every book; the rows written are the same.
# Because the mode is append, re-running this cell adds the rows again.
with open('amazon_books.csv', 'a', encoding='utf-8', newline='') as f:
    # param 'a' means 'append'
    writer = csv.writer(f)
    for book in books:
        # Slicing drops the first character of the badge text.
        rank = book.find('span', {'class': 'zg-bdg-text'}).text[1:]
        # Remaining fields are read positionally from the faceout container.
        children = book.find('div', {'class': 'zg-grid-general-faceout'})
        title = children.div.contents[1].text
        author = children.div.contents[2].text
        price = children.div.contents[-1].text

        writer.writerow([rank, title, author, price])

## Further steps 

- To automatically visit multiple websites and extract data from them (aka web crawling), see [Scrapy](https://scrapy.org)
- Be mindful of legal issues when scraping websites. Use an API whenever possible. 