### Web Scraping and Data Cleanup
The first part of the web scraping, which obtains the Goodreads book IDs, uses the Selenium driver and was run on a Windows system because of an issue with the Firefox profile on Ubuntu. The remaining steps were executed on Ubuntu, where the rest of the project was done.

#### Installing modules needed for web scraping and importing libraries

In [None]:
!pip install selenium

In [None]:
!pip install chromedriver-py

In [184]:
import selenium
from selenium import webdriver
from selenium.webdriver import chrome
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import re
from bs4 import BeautifulSoup
from urllib.request import urlopen
import time
import random
from urllib.parse import urlencode
import requests
from urllib.error import HTTPError, URLError
from selenium.common.exceptions import NoSuchElementException, WebDriverException

In [7]:
import pandas as pd
import numpy as np
import pickle

#### The BookSummaries data originally comes from Carnegie Mellon University and was obtained from Kaggle
https://www.kaggle.com/datasets/ymaricar/cmu-book-summary-dataset

In [3]:
# Load the raw CMU book-summary dump, one tab-separated record per line.
filename = "data/booksummaries.txt"
# The context manager closes the file handle as soon as the content is read
# (the bare open(...).read() form leaves the handle open until garbage collection).
with open(filename, encoding="utf8") as summaries_file:
    lines = summaries_file.read().splitlines()
len(lines)

16559

In [4]:
# Parse every record of `lines` into parallel column lists.
# After dropping everything up to and including the '/m/...' identifier field,
# the remaining tab-separated fields are read as:
#   fields[0] title, fields[1] author, fields[2] publication date,
#   fields[3] genre mapping, fields[-1] plot summary.
import json

titles = []
authors = []
pubDates = []
genres = []
summaries = []
for record in lines:
    # Skip the leading id columns: cut at the first '/m/' and then at the next tab.
    content = record.split('/m/', 1)[1].split('\t', 1)[1]
    fields = content.split('\t')

    titles.append(fields[0])
    authors.append(fields[1])

    # An empty publication date is stored as NaN so pandas treats it as missing.
    pubDate = fields[2]
    pubDates.append(np.nan if pubDate == '' else pubDate)

    # The genre field is a JSON object; parse it with json.loads rather than
    # eval so that file content is never executed as Python code.
    # Each genre name is followed by a single space (trailing space included),
    # which is the format the rest of the notebook was built on.
    try:
        genreDict = json.loads(fields[3])
        genre = ''.join(val + ' ' for val in genreDict.values())
    except (ValueError, AttributeError, TypeError):
        # Empty / malformed field, or a value that is not a mapping of strings.
        genre = np.nan
    genres.append(genre)

    # Apostrophes are stripped from the summary text.
    summaries.append(fields[-1].replace('\'', ''))

In [5]:
# Assemble the parsed column lists into a single dataframe; the dict order
# fixes the column order.
book_columns = {
    'Title': titles,
    'Author': authors,
    'PublishedDate': pubDates,
    'Genre': genres,
    'Summary': summaries,
}
books_df = pd.DataFrame(book_columns)
books_df

Unnamed: 0,Title,Author,PublishedDate,Genre,Summary
0,Animal Farm,George Orwell,1945-08-17,Roman à clef Satire Children's literature Spec...,"Old Major, the old boar on the Manor Farm, ca..."
1,A Clockwork Orange,Anthony Burgess,1962,Science Fiction Novella Speculative fiction Ut...,"Alex, a teenager living in near-future Englan..."
2,The Plague,Albert Camus,1947,Existentialism Fiction Absurdist fiction Novel,The text of The Plague is divided into five p...
3,An Enquiry Concerning Human Understanding,David Hume,,,The argument of the Enquiry proceeds by a ser...
4,A Fire Upon the Deep,Vernor Vinge,,Hard science fiction Science Fiction Speculati...,The novel posits that space around the Milky ...
...,...,...,...,...,...
16554,Under Wildwood,Colin Meloy,2012-09-25,,"Prue McKeel, having rescued her brother from ..."
16555,Transfer of Power,Vince Flynn,2000-06-01,Thriller Fiction,The reader first meets Rapp while he is doing...
16556,Decoded,Jay-Z,2010-11-16,Autobiography,The book follows very rough chronological ord...
16557,America Again: Re-becoming The Greatness We Ne...,Stephen Colbert,2012-10-02,,Colbert addresses topics including Wall Stree...


In [108]:
# Plain Python list of all book titles, in dataframe row order.
book_names = list(books_df['Title'])

In [109]:
# Plain Python list of all author names, in dataframe row order.
author_names = list(books_df['Author'])

In [224]:
# Total number of titles collected in book_names.
num_books = len(book_names)
num_books

16559

#### Login to goodreads

In [115]:
# Start a headless Chrome session through the local ChromeDriver binary and
# open the Goodreads sign-in page.
s = Service("chromedriver-win64/chromedriver.exe")
opts = Options()
# `Options.headless` was deprecated and later removed in Selenium 4; on newer
# releases assigning it only creates an unused attribute and Chrome opens with
# a visible window. Passing the Chrome flag directly works on every release.
opts.add_argument("--headless=new")
browser = webdriver.Chrome(service=s, options=opts)
browser.get("https://www.goodreads.com/ap/signin?language=en_US&openid.assoc_handle=amzn_goodreads_web_na&openid.claimed_id=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.identity=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.mode=checkid_setup&openid.ns=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0&openid.pape.max_auth_age=0&openid.return_to=https%3A%2F%2Fwww.goodreads.com%2Fap-handler%2Fsign-in&siteState=eyJyZXR1cm5fdXJsIjoiaHR0cHM6Ly93d3cuZ29vZHJlYWRzLmNvbS8ifQ%3D%3D")

In [116]:
# Fill in and submit the Goodreads sign-in form.
# Credentials are supplied at run time (environment variables, or an
# interactive prompt as a fallback) so that no secret is stored in the notebook.
# NOTE: any password that was previously committed in this cell should be
# treated as compromised and rotated.
import os
from getpass import getpass

goodreads_email = os.environ.get("GOODREADS_EMAIL") or input("Goodreads e-mail: ")
goodreads_password = os.environ.get("GOODREADS_PASSWORD") or getpass("Goodreads password: ")

log_email = browser.find_element(By.ID, value='ap_email')
log_pwd = browser.find_element(By.ID, value='ap_password')
log_email.send_keys(goodreads_email)
log_pwd.send_keys(goodreads_password)
# Submitting from the password field posts the enclosing sign-in form.
log_pwd.submit()

#### Obtain the Goodreads book ID for each book in the dataset. The ID is needed later to look up the book's genres.

In [None]:
# Look up every (title, author) pair on the Goodreads search page and record
# the numeric book id of the first result row whose title contains the dataset
# title and whose author cell contains the dataset author.
#
# book_id_list receives exactly one entry per book, in dataset order:
#   (title, book_id) when a matching row was found, np.nan otherwise
# (no author in the dataset, no matching row, or all attempts failed).
MAX_SEARCH_ATTEMPTS = 5  # bounded retries: a persistent error must not stall the run forever
BOOK_ID_PATTERN = re.compile(r'show\/(\d*).')

book_id_list = []
for book_name, author_full in zip(book_names, author_names):
    # Without an author there is nothing to match against; skip the web request.
    if author_full == '':
        book_id_list.append(np.nan)
        continue

    entry = np.nan
    for _attempt in range(MAX_SEARCH_ATTEMPTS):
        try:
            browser.get("https://www.goodreads.com/search")
            search_field = browser.find_element(By.ID, value='search_query_main')
            search_button = browser.find_element(By.CLASS_NAME, value='searchBox__button')
            search_field.send_keys(book_name)
            search_button.click()
            # The result page is fetched again with urllib (a separate request
            # from the browser session) and parsed from that response.
            with urlopen(browser.current_url) as page:
                html = page.read().decode("utf-8")
        except (HTTPError, URLError, NoSuchElementException):
            # NoSuchElementException must be handled before its parent
            # WebDriverException to keep the shorter back-off.
            time.sleep(3)
            continue
        except WebDriverException:
            time.sleep(5)
            continue

        soup = BeautifulSoup(html, "html.parser")
        for book in soup.find_all('tr'):
            props = book.find_all('a')
            # Rows that lack the expected title link / author link are skipped
            # instead of raising and aborting the whole run.
            if len(props) < 3:
                continue
            attributes_book = props[0].attrs
            bookname = attributes_book.get('title')
            link = attributes_book.get('href')
            span = props[2].find('span')
            author = span.string if span is not None else None
            if not (bookname and link and author):
                continue
            if book_name in bookname and author_full in author:
                book_ids = BOOK_ID_PATTERN.findall(link)
                if book_ids and book_ids[0]:
                    entry = (book_name, book_ids[0])
                    break

        # Random pause between successful searches to avoid hammering the site.
        time.sleep(random.randint(4, 9))
        break

    book_id_list.append(entry)

#### Save the Book ID list as pickle file for reference

In [None]:
# Disabled on purpose: the statement below is only a string literal and is
# never executed. Remove the surrounding triple quotes to write the current
# book_id_list to data/book_ids.pkl.
'''with open('data/book_ids.pkl', 'wb') as f:
    pickle.dump(book_id_list, f)'''

#### Load the pickle file and create dataframe with book id

In [14]:
# Start with an all-missing BookId column, then restore the scraped id list
# from disk.
books_df['BookId'] = np.nan
# pickle executes arbitrary code while loading: only open files you created yourself.
with open('data/book_ids.pkl', mode='rb') as f:
    book_id_list = pickle.load(f)
len(book_id_list)

12882

In [9]:
# Copy the scraped ids into books_df['BookId'], matching on the book title.
# Only (title, book_id) tuples carry an id; every other list entry is skipped.
# Building the lookup once and mapping the Title column replaces one
# full-column scan per scraped book with a single pass over the dataframe.
# When a title occurs in several tuples the last one wins, and every row with
# that title receives the same id.
title_to_id = {
    val_tuple[0]: val_tuple[1]
    for val_tuple in book_id_list
    if isinstance(val_tuple, tuple)
}
matched_ids = books_df['Title'].map(title_to_id)
# Rows whose title was never matched keep whatever BookId they already had.
books_df['BookId'] = matched_ids.where(matched_ids.notna(), books_df['BookId'])

In [None]:
# Reorder the columns, turn the textual placeholders for "missing" into real
# NaN values, and keep only the rows that have both a book id and an author.
column_order = ['Title', 'BookId', 'Author', 'PublishedDate', 'Genre', 'Summary']
books_df = books_df[column_order]

# After the string cast a missing id reads 'nan'; map it back to NaN.
book_id_text = books_df['BookId'].astype(str)
books_df['BookId'] = book_id_text
books_df['BookId'] = np.where(book_id_text == 'nan', np.nan, book_id_text)

# An empty author string means the author is unknown.
author_text = books_df['Author'].astype(str)
books_df['Author'] = author_text
books_df['Author'] = np.where(author_text == '', np.nan, author_text)

# Drop incomplete rows and renumber the remaining ones from 0.
books_df_clean = books_df.dropna(subset=['BookId', 'Author']).reset_index(drop=True)
books_df_clean.info()

#### Main genres to classify books into

In [11]:
# Genre keywords the scraped Goodreads genres are compared against.
main_genres = [
    'humor',
    'comedy',
    'horror',
    'fantasy',
    'historical',
    'romance',
    'thriller',
    'mystery',
    'crime',
    'science',
    'philosophy',
]
main_genres

['humor',
 'comedy',
 'horror',
 'fantasy',
 'historical',
 'romance',
 'thriller',
 'mystery',
 'crime',
 'science',
 'philosophy']

#### Obtain the top genre for each book, along with whether it is fiction or nonfiction

In [None]:
# Fetch every book's Goodreads page and fill two columns of books_df_clean:
#   NewGenre - the first listed genre that matches an entry of main_genres
#              ('' when none matches or the page lists no genres)
#   Fiction  - 0 when 'nonfiction' is listed before 'fiction', otherwise 1
#              (1 is also the default when neither label is present)
MAX_FETCH_ATTEMPTS = 5  # bounded retries so a permanently failing page cannot stall the run
REQUEST_TIMEOUT = 30    # seconds; without a timeout requests.get can block indefinitely

books_df_clean['NewGenre'] = ''
books_df_clean['Fiction'] = 1
book_base_url = 'https://www.goodreads.com/book/show/'
for row in range(books_df_clean.shape[0]):
    book_id = books_df_clean.loc[row, 'BookId']
    book_url = book_base_url + str(book_id)

    # Retry until the page is served with HTTP 200. (Previously the success flag
    # was set unconditionally after the status check, so no retry ever happened.)
    response = None
    for _attempt in range(MAX_FETCH_ATTEMPTS):
        try:
            response = requests.get(book_url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException:
            response = None
        if response is not None and response.status_code == 200:
            break
        time.sleep(3)
    else:
        print("giving up on " + str(book_id))
        continue

    doc = BeautifulSoup(response.text, 'html.parser')
    book_tags = doc.find_all('div', class_="BookPageMetadataSection__genres")
    if not book_tags:
        print("no genre for " + str(book_id))
    else:
        genre_set = False
        fiction_set = False
        for span in book_tags[0].find_all('span', class_='Button__labelItem'):
            genre = str(span.string).lower()
            if not fiction_set:
                if genre == 'nonfiction':
                    books_df_clean.loc[row, 'Fiction'] = 0
                    fiction_set = True
                elif genre == 'fiction':
                    books_df_clean.loc[row, 'Fiction'] = 1
                    fiction_set = True
            # NOTE(review): this tests whether the scraped genre is a substring
            # of a main genre, so e.g. 'science fiction' does not match
            # 'science'. Confirm whether `x in genre` was intended.
            if not genre_set and any(genre in x for x in main_genres):
                books_df_clean.loc[row, 'NewGenre'] = genre
                genre_set = True
            if fiction_set and genre_set:
                # Both columns are decided; remaining labels cannot change them.
                break

    # Random pause between books to avoid hammering the site.
    time.sleep(random.randint(1, 4))

#### Save the final dataframe as pickle file

In [None]:
# Disabled on purpose: the statement below is only a string literal and is
# never executed. Remove the surrounding triple quotes to write
# books_df_clean to data/books_df_clean.pkl.
'''with open('data/books_df_clean.pkl', 'wb') as f:
    pickle.dump(books_df_clean,f)'''