# Scraping Books

In [None]:
import sys

import requests
from bs4 import BeautifulSoup

import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib

## Defining the Functions

In [None]:
def fetch_page(page):
    """Download one catalogue page and parse it into a soup.

    Args:
        page: Zero-based page index. The URL uses ``page + 1``, so index
            ``0`` requests ``page-1.html``.

    Returns:
        A ``BeautifulSoup`` tree built from the response body with the
        ``html.parser`` backend.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    page_number = page + 1

    # Informational print
    print(f"Scraping page {page_number}")

    # Request the page, asking for English content. Without a timeout
    # `requests` waits indefinitely on a stalled connection.
    response = requests.get(
        f'http://books.toscrape.com/catalogue/page-{page_number}.html',
        headers={"Accept-Language": "en-US"},
        timeout=10,
    )

    # Fail loudly on an error response instead of parsing an error page,
    # which would silently contribute zero books.
    response.raise_for_status()

    # Create the soup
    return BeautifulSoup(response.content, "html.parser")

In [None]:
# Map each rating word ('One' ... 'Five') to its numeric value (1 ... 5)
ratings = {
    word: stars
    for stars, word in enumerate(('One', 'Two', 'Three', 'Four', 'Five'), start=1)
}

In [None]:
def add_books_to_dict(soup, dict):
    """Append the title, price and rating of every book in ``soup`` to ``dict``.

    Args:
        soup: Parsed page; every element with class ``product_pod`` is
            treated as one book.
        dict: Accumulator with ``'Title'``, ``'Price'`` and ``'Rating'`` keys
            mapping to lists. It is mutated in place. The name shadows the
            builtin but is kept so existing keyword callers keep working.

    Returns:
        None. The results are appended to ``dict``.
    """
    for book_html in soup.find_all(class_="product_pod"):
        link = book_html.find("h3").find("a")

        # Prefer the link's `title` attribute: it is a plain `str`, whereas
        # `link.string` is a NavigableString that keeps a reference to the
        # whole parse tree (costly to keep around and to pickle). The visible
        # link text may also be an abbreviated form of the title.
        raw_title = link.get("title")
        if raw_title is None:
            raw_title = link.string
        dict['Title'].append(raw_title if raw_title is None else str(raw_title))

        # Find the price and drop the currency sign before converting
        price_text = book_html.find(class_="price_color").string
        dict['Price'].append(float(price_text.strip('£')))

        # Find the star rating and use the mapping to convert it to a number;
        # the second CSS class holds the rating word, unknown words map to 0
        stars_html = book_html.find(class_="star-rating")
        dict['Rating'].append(ratings.get(stars_html.attrs['class'][1], 0))

In [None]:
# Column-oriented accumulator: one independent, initially empty list per column
books_dict = {column: [] for column in ('Title', 'Price', 'Rating')}

In [None]:
def create_books_df(max_page):
    """Scrape the first ``max_page`` pages and return them as a DataFrame.

    The scraped values are collected in the module-level ``books_dict``,
    which is emptied first so that calling this function again (for example
    by re-running the notebook cell) does not append duplicate rows.

    Args:
        max_page: Number of pages to scrape, starting at page index 0.

    Returns:
        A ``pandas.DataFrame`` built from ``books_dict``.
    """
    # Reset the accumulator in place: the dictionary object stays the same,
    # only its lists are emptied.
    for column in books_dict.values():
        column.clear()

    for page in range(max_page):
        # Fetch the soup for this page and add its books to books_dict
        soup = fetch_page(page)
        add_books_to_dict(soup, books_dict)

    return pd.DataFrame.from_dict(books_dict)

## Creating the `books_df`

In [None]:
# Scrape the catalogue into a DataFrame; 50 is presumably the number of
# catalogue pages on the site - TODO confirm
books_df = create_books_df(50)
# Evaluate the (rows, columns) shape as a quick sanity check
books_df.shape

### Visualizing

In [None]:
# Preview the first rows of the scraped data
books_df.head()

In [None]:
# Bar chart of the number of titles per rating
books_df.groupby("Rating")["Title"].count().plot.bar()

In [None]:
# Histogram of the book prices
books_df['Price'].hist()

In [None]:
# Save the scraped data to books.csv in the current working directory
books_df.to_csv("books.csv")
# Optional Excel export, left disabled. The earlier draft of this line used
# `all_books_df`, a name that is not defined in the visible notebook.
#books_df.to_excel('books.xlsx', sheet_name='Books')

### Testing

Before we test our code, there is one special note about this particular exercise.

If you check the test below, you will see that we're passing in the `books_dict` variable. Since `books_dict` is quite large, the test runs into an issue when pickling: it reaches the "maximum recursion depth" while trying to pickle the dictionary (more information in [this Stack Overflow post](https://stackoverflow.com/questions/2134706/hitting-maximum-recursion-depth-using-pickle-cpickle)). To avoid this, we need to increase the recursion limit before testing.

In [None]:
# Check the current recursion limit before changing it
sys.getrecursionlimit()

In [None]:
# Raise the recursion limit to at least 4500 (never lowering a limit that is
# already higher), then double-check the value now in effect
sys.setrecursionlimit(max(sys.getrecursionlimit(), 4500))
sys.getrecursionlimit()

In [None]:
from nbresult import ChallengeResult

# Collect the values handed to the 'books' challenge check: the raw
# dictionary, the DataFrame columns, and the first row's title, price and rating
result = ChallengeResult('books',
    books_dict=books_dict,
    columns=books_df.columns,
    title=str(books_df.loc[0,'Title']),  # converted to a plain str
    price=books_df.loc[0,'Price'],
    rating=books_df.loc[0,'Rating']
)
# Presumably stores the result for the checker (the notes above mention
# pickling) - see the nbresult documentation to confirm
result.write()
# Run the check and print whatever it returns
print(result.check())