In [1]:
# Import relevant Python packages.
import os
import pandas as pd
import time
import random
import requests
import random
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from mlxtend.frequent_patterns import apriori, association_rules

#### Goodreads Dataset Procurement and Processing

In [2]:
class GoodreadsDataset:
    """Scrape the Goodreads "Best Books Ever" list and cache it as a CSV file.

    For every book row found on a list page, the title, author and rating are
    read from the list row itself, while the genres are fetched with one extra
    request to the book's own page.

    Attributes are set in ``__init__``:
        save_path:  directory the CSV is written to (created if missing).
        num_pages:  how many list pages to scrape.
        headers:    HTTP headers sent with every request.
        home_url:   site root, prefixed to the relative book links.
        base_url:   list URL ending in ``page=``; the page number is appended.
        books_data: list of per-book dictionaries collected so far.
    """

    # Seconds to wait on a single HTTP request before giving up, so a stalled
    # connection cannot hang the notebook indefinitely.
    REQUEST_TIMEOUT = 30
    # Seconds to pause after each scraped book to avoid hammering the site.
    REQUEST_DELAY = 2

    def __init__(self, save_path: Path, num_pages: int):
        """Store the output directory and the number of list pages to scrape.

        Args:
            save_path: Directory for the CSV. It is only ever handed to
                ``os.path`` / ``os.makedirs``, so a plain string works too.
            num_pages: Number of list pages to fetch.
        """
        self.save_path = save_path
        self.num_pages = num_pages
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"
        }
        self.home_url = "https://www.goodreads.com"
        self.base_url = "https://www.goodreads.com/list/show/1.Best_Books_Ever?page="
        self.books_data = []

    @staticmethod
    def retrieve_genres(headers: dict, book_url: str):
        """Fetch a book page and return its genre link elements.

        Args:
            headers: HTTP headers to send with the request.
            book_url: Absolute URL of the book page.

        Returns:
            The elements matched by the CSS selector ``a[href*="/genres/"]``.
            Callers read ``.text`` from each element to get the genre name.
        """
        response = requests.get(
            url=book_url,
            headers=headers,
            timeout=GoodreadsDataset.REQUEST_TIMEOUT,
        )
        soup = BeautifulSoup(response.content, 'html.parser')
        # Genres are identified purely by links whose href contains "/genres/".
        return soup.select('a[href*="/genres/"]')

    @staticmethod
    def to_csv(books_data: list, file_name: str):
        """Write the collected book dictionaries to ``file_name`` as CSV (no index column)."""
        df = pd.DataFrame(books_data)
        df.to_csv(file_name, index=False)

    @staticmethod
    def scrape_bookpage(books_data: list, headers: dict, base_url: str, home_url: str, page: int):
        """Scrape one list page and append one dictionary per book to ``books_data``.

        Args:
            books_data: List that is extended IN PLACE with the scraped books.
            headers: HTTP headers to send with every request.
            base_url: List URL to which ``page`` is appended.
            home_url: Site root, prefixed to each book's relative link.
            page: Page number to request.

        Returns:
            The same ``books_data`` list object that was passed in. Callers
            must not extend their list with the return value again.

        Raises:
            requests.HTTPError: If the list page responds with an error status.
                Failing loudly here prevents an empty CSV from being written
                and then mistaken for a valid cached dataset.
        """
        # Formulate the list page url.
        url = base_url + str(page)
        # Make a request to fetch the page content.
        response = requests.get(url, headers=headers, timeout=GoodreadsDataset.REQUEST_TIMEOUT)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # Find all books within the page content.
        books = soup.find_all('tr', itemtype="http://schema.org/Book")
        for book in books:
            title_tag = book.find('a', class_="bookTitle")
            author_tag = book.find('a', class_="authorName")
            rating_tag = book.find('span', class_="minirating")
            # Skip malformed rows up front. Subscripting a missing title would
            # raise TypeError, and checking first also avoids spending a
            # request on a row that cannot be recorded anyway.
            if title_tag is None or author_tag is None or rating_tag is None:
                continue
            href = title_tag.get('href')
            if not href:
                continue
            try:
                genres = GoodreadsDataset.retrieve_genres(headers, home_url + href)
            except (AttributeError, ValueError):
                # Best effort: a book whose page cannot be processed is skipped.
                continue
            # Construct dictionary to store fetched book data, with leading and
            # trailing whitespace removed from every text field.
            books_data.append({
                'Title': title_tag.text.strip(),
                'Author': author_tag.text.strip(),
                # Only the first four characters of the rating text are kept.
                'Rating': rating_tag.text.strip()[:4],
                'Genres': [genre.text.strip() for genre in genres],
            })
            # Implement a request delay, to ensure continuous access to the site.
            time.sleep(GoodreadsDataset.REQUEST_DELAY)

        return books_data

    def main(self):
        """Scrape ``num_pages`` list pages into ``<save_path>/goodreads_dataset.csv``.

        Nothing is scraped when the CSV already exists. Otherwise the CSV is
        rewritten after every page, so progress survives an interruption.
        """
        filename = "goodreads_dataset.csv"
        os.makedirs(self.save_path, exist_ok=True)
        filepath = os.path.join(self.save_path, filename)

        if os.path.exists(filepath):
            print("Dataset CSV file already exists.")
            return

        # List pages are numbered from 1; starting at 0 would presumably
        # re-serve the first page and never reach page `num_pages`.
        for page in range(1, self.num_pages + 1):
            print(f"Fetching books metadata from page {page}")
            # scrape_bookpage appends to self.books_data in place. Extending
            # the list with the returned (identical) list would duplicate
            # every record collected so far.
            self.scrape_bookpage(self.books_data, self.headers, self.base_url, self.home_url, page)
            self.to_csv(self.books_data, filepath)

In [3]:
# Location of the cached dataset CSV.
filepath = os.path.join("./data", "goodreads_dataset.csv")

# Only scrape when no cached CSV is present; scraping is slow because the
# scraper pauses after every book it fetches.
cache_missing = not os.path.exists(filepath)
if cache_missing:
    dataset = GoodreadsDataset("./data", 10)
    dataset.main()

# Either way, the analysis works from the CSV on disk.
dataset_df = pd.read_csv(filepath)

In [4]:
# Preview the first ten rows of the loaded dataset.
dataset_df.head(n=10)

Unnamed: 0,Title,Author,Rating,Genres
0,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins,4.34,"['Young Adult', 'Fiction', 'Fantasy', 'Science..."
1,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling,4.5,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',..."
2,Pride and Prejudice,Jane Austen,4.29,"['Fiction', 'Historical Fiction', 'Historical'..."
3,To Kill a Mockingbird,Harper Lee,4.26,"['Fiction', 'Historical Fiction', 'School', 'L..."
4,The Book Thief,Markus Zusak,4.39,"['Historical Fiction', 'Fiction', 'Young Adult..."
5,"Twilight (The Twilight Saga, #1)",Stephenie Meyer,3.66,"['Fantasy', 'Young Adult', 'Romance', 'Fiction..."
6,Animal Farm,George Orwell,3.99,"['Fiction', 'Dystopia', 'Fantasy', 'School', '..."
7,J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...,J.R.R. Tolkien,4.61,"['Fantasy', 'Fiction', 'Classics', 'Adventure'..."
8,The Chronicles of Narnia (The Chronicles of Na...,C.S. Lewis,4.28,"['Fantasy', 'Classics', 'Fiction', 'Young Adul..."
9,The Fault in Our Stars,John Green,4.13,"['Young Adult', 'Fiction', 'Contemporary', 'Re..."
