In [1]:
# $DELETE_BEGIN

# Scraping Books

In [2]:
import sys

import requests
from bs4 import BeautifulSoup

import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib

## Defining the Functions

In [3]:
def fetch_page(page, timeout=10):
    """Download one catalogue page and return it as parsed HTML.

    Args:
        page: Zero-based page index; ``page`` 0 requests ``page-1.html``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, `requests` may block indefinitely on a stalled connection.

    Returns:
        BeautifulSoup: The response body parsed with the ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: On a connection failure, a
            timeout, or an HTTP error status (e.g. a page that does not exist).
    """
    # Informational print
    print(f"Scraping page {page + 1}")

    # Create the response using a header for the language
    response = requests.get(
        f'http://books.toscrape.com/catalogue/page-{page + 1}.html',
        headers = {"Accept-Language":"en-US"},
        timeout = timeout
    )

    # Fail loudly on 4xx/5xx: parsing an error page would silently yield zero books
    response.raise_for_status()

    # Create the soup
    soup = BeautifulSoup(response.content, "html.parser")

    return soup

In [4]:
# Lookup table: star-rating word -> numeric score
ratings = {
    'One': 1,
    'Two': 2,
    'Three': 3,
    'Four': 4,
    'Five': 5,
}

In [5]:
def add_books_to_dict(soup, dict):
    """Append title, price and rating of every book found in `soup` to `dict`.

    Args:
        soup: Parsed catalogue page; each element carrying the CSS class
            ``product_pod`` is treated as one book.
        dict: Mapping with the keys ``'Title'``, ``'Price'`` and ``'Rating'``,
            each holding a list that is extended in place. The name shadows
            the builtin; it is kept so existing keyword callers keep working.

    Returns:
        None. `dict` is mutated in place.
    """
    for book_html in soup.find_all(class_ = "product_pod"):
        # Find the title. NOTE(review): the visible link text appears to be
        # shortened with "..." for long titles on this site, while the link's
        # `title` attribute carries the full one, so prefer the attribute and
        # fall back to the link text when it is absent.
        link = book_html.find("h3").find("a")
        dict['Title'].append(link.get("title") or link.string)

        # Find the price (the pound sign is stripped before converting)
        dict['Price'].append(float(book_html.find(class_ = "price_color").string.strip('£')))

        # Find the star rating and use the mapping to convert it to a number.
        # Scan every CSS class instead of assuming the rating word sits at
        # index 1, so an unexpected class list yields 0 rather than raising
        # after Title/Price were already appended (which would leave the
        # columns with different lengths).
        stars_html = book_html.find(class_ = "star-rating")
        star_classes = stars_html.get("class", []) if stars_html is not None else []
        dict['Rating'].append(next((ratings[name] for name in star_classes if name in ratings), 0))

In [6]:
# Column-oriented accumulator: one independent, initially empty list per column
books_dict = {column: [] for column in ('Title', 'Price', 'Rating')}

In [7]:
def create_books_df(max_page):
    """Scrape catalogue pages ``0 .. max_page - 1`` into a DataFrame.

    Args:
        max_page: Number of pages to fetch, starting from the first one.

    Returns:
        pandas.DataFrame: One row per scraped book, built from the
        module-level ``books_dict`` (one column per key).

    Side effects:
        ``books_dict`` is emptied and then refilled in place, so after the
        call it holds exactly the books of this run.
    """
    # Reset the shared accumulator first: without this, re-running the cell
    # appends the same books again and the resulting frame contains duplicates.
    for column in books_dict.values():
        column.clear()

    for page in range(max_page):
        # Fetch the page as soup and add its books to the books_dict
        soup = fetch_page(page)
        add_books_to_dict(soup, books_dict)

    return pd.DataFrame.from_dict(books_dict)

## Creating the `books_df`

In [8]:
# Scrape 50 catalogue pages, then display the (rows, columns) of the result
books_df = create_books_df(max_page=50)
books_df.shape

### Visualizing

In [9]:
# Preview the first five scraped rows
books_df.head(n=5)

In [10]:
# Number of books per star rating, drawn as a labelled bar chart
rating_counts = books_df.groupby("Rating")["Title"].count()
rating_counts.plot(
    kind="bar",
    title="Books per star rating",
    xlabel="Rating",
    ylabel="Number of books",
)

In [11]:
# Distribution of book prices, with a title and axis labels so the figure stands alone
price_ax = books_df['Price'].hist()
price_ax.set_title("Distribution of book prices")
price_ax.set_xlabel("Price (£)")
price_ax.set_ylabel("Number of books")
price_ax

In [12]:
# Write the scraped table to books.csv in the current working directory
books_df.to_csv(path_or_buf="books.csv")
# Optional Excel export, left disabled:
#books_df.to_excel('books.xlsx', sheet_name='Books')

In [13]:
from nbresult import ChallengeResult

# Values of the row labelled 0, handed to the challenge result below
first_title = str(books_df.loc[0, 'Title'])
first_price = books_df.loc[0, 'Price']
first_rating = books_df.loc[0, 'Rating']

result = ChallengeResult(
    'books',
    columns=books_df.columns,
    title=first_title,
    price=first_price,
    rating=first_rating,
)
result.write()
print(result.check())

In [14]:
# $DELETE_END