# Most Read This Week on Goodreads

### Text Mining Models and Algorithms: PSET 3

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import time
import requests
import re
import os
import random
from bs4 import BeautifulSoup

In [2]:
# Define the geckodriver and download paths along with the base URL.
# NOTE(review): geko_path and download_path are absolute, machine-specific Windows
# paths and are not referenced by any other cell visible in this notebook —
# confirm whether they are still needed.
geko_path = "D:\\BSE\\Cursuri\\2 Text Mining Models and Algorithms\\week 2\\geckodriver.exe"
download_path = "C:\\Users\\X1\\Desktop"
# Base address of the "most read" pages: requested as-is to list the genres,
# and with a genre name appended to fetch that genre's page.
url = "https://www.goodreads.com/genres/most_read/"

In [3]:
def get_all_genres():
    '''
    Retrieve the genre names listed on the Goodreads "most read" page.

    Fetches the module-level ``url``, reads the text of every
    ``<a class="genreList__genreLink">`` element and removes duplicates
    while keeping the order in which the names appear on the page.

    Returns:
        list[str]: Unique, non-empty genre names in page order.

    Raises:
        requests.exceptions.RequestException: If the request fails, times out,
            or the server answers with an HTTP error status.
    '''
    # A timeout keeps the cell from hanging forever on a stalled connection.
    page = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of silently returning an empty genre list.
    page.raise_for_status()
    content = BeautifulSoup(page.text, "lxml")

    # get_text() returns plain strings; iterating the tag itself would yield
    # its child nodes (parse-tree objects) rather than clean text.
    names = (
        link.get_text(strip=True)
        for link in content.find_all("a", class_="genreList__genreLink")
    )
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(name for name in names if name))

def get_genre_top_class(genre):
    '''
    Download the "most read" page of one genre and return its book boxes.

    Args:
        genre: Genre name; it is appended to the module-level ``url``.

    Returns:
        The ``find_all`` result for every
        ``<div class="leftAlignedImage bookBox">`` element on the page
        (empty when nothing matches).

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    '''
    genre_url = url + genre
    # A timeout keeps the notebook from blocking forever on a stalled connection.
    page = requests.get(genre_url, timeout=30)
    soup = BeautifulSoup(page.content, "html.parser")
    return soup.find_all("div", class_="leftAlignedImage bookBox")

def get_book_details(genre, first_match_only=False):
    '''
    Build a DataFrame with the details of the most read books of one genre.

    Every book box returned by ``get_genre_top_class(genre)`` is converted to
    its HTML text and searched with one regular expression per field.

    Args:
        genre: Genre name, forwarded to ``get_genre_top_class`` and stored in
            the "Genre" column of every row.
        first_match_only: When False (default) each field cell holds the full
            list returned by ``re.findall`` (possibly empty). When True each
            cell holds only the first match, or None when nothing matched.

    Returns:
        pandas.DataFrame: One row per book box with the columns
        "Genre", "Url", "Title", "Author", "Rate", "Ratings", "Year", "Abstract".
        The frame has these columns and no rows when no book box is found.
    '''
    # (column name, pattern, search the backslash-stripped markup?)
    # NOTE(review): the trailing "n" in the last two patterns is presumably what
    # remains of an escaped "\n" once backslashes are removed — confirm against
    # the live page markup before editing these patterns.
    field_specs = (
        ("Url", r'href\=\"([\/a-zA-Z0-9\-]+)', False),
        ("Title", r'alt\=\"(.+)\" class', False),
        ("Author", r'\/author\/show\/\d+\.([\_\d\w]+)\"\>', True),
        ("Rate", r'(\d\.\d{2}) avg rating', True),
        ("Ratings", r'avg rating &mdash; ([\d\,]+) ratings', True),
        ("Year", r'&mdash; published (\d{4})n', True),
        ("Abstract", r'<span id="freeText\d+" style="display:none">(.+)\.</span>n', True),
    )

    rows = []
    for box in get_genre_top_class(genre):
        # Convert each box to text once instead of once per extracted field.
        raw_html = str(box)
        stripped_html = raw_html.replace("\\", "")

        row = [genre]
        for _column, pattern, use_stripped in field_specs:
            matches = re.findall(pattern, stripped_html if use_stripped else raw_html)
            if first_match_only:
                row.append(matches[0] if matches else None)
            else:
                row.append(matches)
        rows.append(tuple(row))

    return pd.DataFrame(rows, columns=["Genre"] + [spec[0] for spec in field_specs])

In [4]:
# Retrieve all the genres with a single request and preview them
# (the page is fetched once; an extra discarded call would only repeat the download).
all_genres = pd.DataFrame(get_all_genres())
all_genres.head(50)

Unnamed: 0,0
0,Art
1,Biography
2,Business
3,Children's
4,Christian
5,Classics
6,Comics
7,Cookbooks
8,Ebooks
9,Fantasy


In [5]:
# Choose a genre, or "All" to collect the books of every listed genre
genres = get_all_genres()
# "All" is not one of the scraped genre names, so it is offered explicitly.
for option in genres + ["All"]:
    print(option + " /", end=" ")
print("\n")

# Keep asking until the answer is one of the offered options.
while True:
    selected = input("Please choose a genre from below: ").strip()
    if selected == "All" or selected in genres:
        break
    print("Try again!")

if selected == "All":
    # Scrape every genre first and concatenate once at the end, instead of
    # rebuilding the combined frame on every loop iteration.
    genre_frames = [get_book_details(genre) for genre in genres]
    non_empty_frames = [frame for frame in genre_frames if not frame.empty]
    if non_empty_frames:
        df_selected_genre = pd.concat(non_empty_frames, ignore_index=True)
    else:
        # Nothing was scraped: fall back to an empty frame (keeping the
        # expected columns when at least one genre was queried).
        df_selected_genre = genre_frames[0] if genre_frames else pd.DataFrame()
else:
    df_selected_genre = get_book_details(selected)

Art / Biography / Business / Children's / Christian / Classics / Comics / Cookbooks / Ebooks / Fantasy / Fiction / Graphic Novels / Historical Fiction / History / Horror / Memoir / Music / Mystery / Nonfiction / Poetry / Psychology / Romance / Science / Science Fiction / Self Help / Sports / Thriller / Travel / Young Adult / More Genres / /n
Please choose a genre from below: Art


In [6]:
# Size of the scraped table as (rows, columns)
df_selected_genre.shape

(100, 8)

In [7]:
# DataFrame preview: display the first 10 scraped rows
df_selected_genre.head(10)

Unnamed: 0,Genre,Url,Title,Author,Rate,Ratings,Year,Abstract
0,Art,[/book/show/56416637-write-my-name-across-the-...,[Write My Name Across the Sky],[Barbara_O_Neal],[4.42],"[12,122]",[2021],[The USA Today bestselling author of When We B...
1,Art,[/book/show/59009957-there-is-no-devil],"[There Is No Devil (Sinners Duet, #2)]",[Sophie_Lark],[4.07],"[7,306]",[2021],[I Couldn’t Kill Mara…nnBut that doesn’t mean ...
2,Art,[/book/show/55073379-the-venice-sketchbook],[The Venice Sketchbook],[Rhys_Bowen],[4.27],"[39,658]",[2021],[Love and secrets collide in Venice during WWI...
3,Art,[/book/show/58437699-ain-t-burned-all-the-bright],[Ain't Burned All the Bright],[Jason_Reynolds],[4.64],"[1,425]",[2022],[Prepare yourself for something unlike anythin...
4,Art,[/book/show/57001545-still-life],[Still Life],[Sarah_Winman],[4.15],"[10,227]",[2021],"[Tuscany, 1944: As Allied troops advance and b..."
5,Art,[/book/show/45046808-big-lies-in-a-small-town],[Big Lies in a Small Town],[Diane_Chamberlain],[4.11],"[69,942]",[2020],[]
6,Art,[/book/show/50898151-oddball],"[Oddball (Sarah's Scribbles, #4)]",[Sarah_Andersen],[4.33],"[2,691]",[2021],[The newest Sarah's Scribbles collection from ...
7,Art,[/book/show/57094697-hooked],[Hooked: How Crafting Saved My Life],[Sutton_Foster],[4.12],"[3,024]",[2021],[From the 2-time Tony Award-winner and the sta...
8,Art,[/book/show/43263542-nothing-ventured],"[Nothing Ventured (Detective William Warwick, ...",[Jeffrey_Archer],[3.91],"[29,657]",[2019],[Nothing Ventured heralds the start of a brand...
9,Art,[/book/show/57386527-cat-kid-comic-club],[Cat Kid Comic Club: Perspectives],[Dav_Pilkey],[4.53],[709],[2021],[Cat Kid Comic Club is back in session in this...


In [8]:
# Persist the scraped table as a CSV file (relative path, so it is written
# to the current working directory)
df_selected_genre.to_csv(path_or_buf='Goodreads_MostReadThisWeek.csv')
print("Saved as .csv")

Saved as .csv


In [9]:
# Book with the highest average rating (TODO: not implemented yet)




In [10]:
# Book with the most ratings (TODO: not implemented yet)




In [12]:
# Plotting (draft, currently disabled): scatter plot of "Rate" against "Year"

# x = np.array(df_selected_genre["Rate"])
# y = np.array(df_selected_genre["Year"])

# plt.scatter(x, y, alpha=0.5)
# plt.show()