# Comparative Analysis between the List of "Best Books Ever" and the New York Times' 100 Notable Books of 2023

## The List of "Best Books Ever"

In [8]:
import requests
from bs4 import BeautifulSoup
import spacy

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

# Get HTML content of the book list page
url = "https://www.goodreads.com/list/show/1.Best_Books_Ever"
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')

    # Book rows are the <tr> elements carrying the schema.org Book itemtype;
    # only the first 20 of them are processed.
    book_items = soup.find_all('tr', itemtype='http://schema.org/Book')[:20]

    # Iterate through each book item
    for index, book in enumerate(book_items, start=1):
        # The title link supplies both the title text and the path of the
        # details page. Without it there is nothing to follow, so the row is
        # skipped instead of failing on title_elem['href'] with a None element.
        title_elem = book.find('a', class_='bookTitle')
        if title_elem is None or not title_elem.get('href'):
            print(f"{index}. Skipped: no title link found in this list row")
            continue
        title = title_elem.text.strip()

        # Extract author information
        author_elem = book.find('div', class_='authorName__container')
        author = author_elem.text.strip() if author_elem else "Author not found"

        # Extract rating information (the raw "minirating" text, not a number)
        rating_elem = book.find('span', class_='minirating')
        rating = rating_elem.text.strip() if rating_elem else "Avg Rating not found"

        # Get the URL of the book's details page
        book_url = "https://www.goodreads.com" + title_elem['href']

        # Get the HTML content of the book's details page
        response_book = requests.get(book_url)
        if response_book.status_code == 200:
            soup_book = BeautifulSoup(response_book.content, 'html.parser')

            # Primary lookup: genre links inside the dedicated genre container
            genres_parent_elem = soup_book.find('span', class_='BookPageMetadataSection__genrePlainText')
            if genres_parent_elem:
                genres = [
                    genre_elem.text.strip()
                    for genre_elem in genres_parent_elem.find_all('a', class_='Button__labelItem')
                ]
            else:
                genres = ["Genre not found"]

            # Fallback: when the primary lookup gave nothing, take the first
            # five inline tag buttons found anywhere on the page.
            if not genres or genres == ["Genre not found"]:
                genres_alternative = [
                    genre_elem.text.strip()
                    for genre_elem in soup_book.find_all('a', class_='Button--tag-inline')
                ]
                genres = genres_alternative[:5] if genres_alternative else ["Genre not found"]

            # Use spaCy for tokenization, lemmatization, and part-of-speech tagging
            # of the title text only.
            doc = nlp(title)
            tokens = [token.text for token in doc]
            lemmas = [token.lemma_ for token in doc]
            parts_of_speech = [token.pos_ for token in doc]

            # Print book information with spaCy analysis
            print(f"{index}. {title}\nAuthor: {author}\nAvg Rating: {rating}\nGenres: {genres}")
            print("Tokens:", tokens)
            print("Lemmas:", lemmas)
            print("Parts of Speech:", parts_of_speech)
            print("---")
        else:
            print(f"Failed to retrieve information for {book_url}. Status code: {response_book.status_code}")
else:
    print(f"Failed to retrieve information for {url}. Status code: {response.status_code}")


1. The Hunger Games (The Hunger Games, #1)
Author: Suzanne Collins
Avg Rating: 4.33 avg rating — 8,322,425 ratings
Genres: ['Young Adult', 'Fiction', 'Fantasy', 'Dystopia', 'Science Fiction']
Tokens: ['The', 'Hunger', 'Games', '(', 'The', 'Hunger', 'Games', ',', '#', '1', ')']
Lemmas: ['the', 'Hunger', 'Games', '(', 'the', 'Hunger', 'Games', ',', '#', '1', ')']
Parts of Speech: ['DET', 'PROPN', 'PROPN', 'PUNCT', 'DET', 'PROPN', 'PROPN', 'PUNCT', 'SYM', 'NUM', 'PUNCT']
---
2. Harry Potter and the Order of the Phoenix (Harry Potter, #5)
Author: J.K. Rowling
Avg Rating: 4.50 avg rating — 3,280,769 ratings
Genres: ['Young Adult', 'Fiction', 'Magic', 'Childrens', 'Audiobook']
Tokens: ['Harry', 'Potter', 'and', 'the', 'Order', 'of', 'the', 'Phoenix', '(', 'Harry', 'Potter', ',', '#', '5', ')']
Lemmas: ['Harry', 'Potter', 'and', 'the', 'order', 'of', 'the', 'Phoenix', '(', 'Harry', 'Potter', ',', '#', '5', ')']
Parts of Speech: ['PROPN', 'PROPN', 'CCONJ', 'DET', 'NOUN', 'ADP', 'DET', 'PROPN',

## The New York Times' 100 Notable Books of 2023

In [7]:
import requests
from bs4 import BeautifulSoup
import spacy

# Load the spaCy English pipeline used by process_text
nlp = spacy.load("en_core_web_sm")

# Function to process the text with spaCy
def process_text(text):
    """Analyse `text` with the module-level spaCy pipeline `nlp`.

    Returns a 3-tuple of parallel lists, one entry per spaCy token:
    (token texts, lemmas, coarse part-of-speech tags).
    """
    tokens, lemmas, pos_tags = [], [], []
    for token in nlp(text):
        tokens.append(token.text)
        lemmas.append(token.lemma_)
        pos_tags.append(token.pos_)
    return tokens, lemmas, pos_tags

# Get HTML content of the book list page
url = "https://www.goodreads.com/list/show/194660.New_York_Times_100_Notable_Books_of_2023"
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')

    # Book rows are the <tr> elements carrying the schema.org Book itemtype;
    # only the first 20 of them are processed.
    book_items = soup.find_all('tr', itemtype='http://schema.org/Book')[:20]

    # Iterate through each book item
    for index, book in enumerate(book_items, start=1):
        # The title link supplies both the title text and the path of the
        # details page. Without it there is nothing to follow, so the row is
        # skipped instead of failing on title_elem['href'] with a None element.
        title_elem = book.find('a', class_='bookTitle')
        if title_elem is None or not title_elem.get('href'):
            print(f"{index}. Skipped: no title link found in this list row")
            continue
        title = title_elem.text.strip()

        # Extract author information
        author_elem = book.find('div', class_='authorName__container')
        author = author_elem.text.strip() if author_elem else "Author not found"

        # Extract rating information (the raw "minirating" text, not a number)
        rating_elem = book.find('span', class_='minirating')
        rating = rating_elem.text.strip() if rating_elem else "Avg Rating not found"

        # Get the URL of the book's details page
        book_url = "https://www.goodreads.com" + title_elem['href']

        # Get the HTML content of the book's details page
        response_book = requests.get(book_url)
        if response_book.status_code == 200:
            soup_book = BeautifulSoup(response_book.content, 'html.parser')

            # Primary lookup: genre links inside the dedicated genre container
            genres_parent_elem = soup_book.find('span', class_='BookPageMetadataSection__genrePlainText')
            if genres_parent_elem:
                genres = [
                    genre_elem.text.strip()
                    for genre_elem in genres_parent_elem.find_all('a', class_='Button__labelItem')
                ]
            else:
                genres = ["Genre not found"]

            # Fallback: when the primary lookup gave nothing, take the first
            # five inline tag buttons found anywhere on the page.
            if not genres or genres == ["Genre not found"]:
                genres_alternative = [
                    genre_elem.text.strip()
                    for genre_elem in soup_book.find_all('a', class_='Button--tag-inline')
                ]
                genres = genres_alternative[:5] if genres_alternative else ["Genre not found"]

            # Process the title text with spaCy
            tokens, lemmas, pos_tags = process_text(title)

            # Print book information
            print(f"{index}. {title}\nAuthor: {author}\nAvg Rating: {rating}\nGenres: {genres}")
            print(f"Tokens: {tokens}\nLemmas: {lemmas}\nParts of Speech: {pos_tags}")
            print("---")
        else:
            print(f"Failed to retrieve information for {book_url}. Status code: {response_book.status_code}")
else:
    print(f"Failed to retrieve information for {url}. Status code: {response.status_code}")


1. Yellowface
Author: R.F. Kuang (Goodreads Author)
Avg Rating: 3.87 avg rating — 241,978 ratings
Genres: ['Fiction', 'Contemporary', 'Audiobook', 'Literary Fiction', 'Thriller']
Tokens: ['Yellowface']
Lemmas: ['yellowface']
Parts of Speech: ['NOUN']
---
2. Cobalt Red: How the Blood of the Congo Powers Our Lives
Author: Siddharth Kara
Avg Rating: 4.38 avg rating — 3,920 ratings
Genres: ['Nonfiction', 'History', 'Africa', 'Politics', 'Economics']
Tokens: ['Cobalt', 'Red', ':', 'How', 'the', 'Blood', 'of', 'the', 'Congo', 'Powers', 'Our', 'Lives']
Lemmas: ['Cobalt', 'Red', ':', 'how', 'the', 'blood', 'of', 'the', 'Congo', 'Powers', 'our', 'life']
Parts of Speech: ['PROPN', 'PROPN', 'PUNCT', 'SCONJ', 'DET', 'NOUN', 'ADP', 'DET', 'PROPN', 'PROPN', 'PRON', 'NOUN']
---
3. King: A Life
Author: Jonathan Eig (Goodreads Author)
Avg Rating: 4.68 avg rating — 2,990 ratings
Genres: ['Biography', 'Nonfiction', 'History', 'Politics', 'Race']
Tokens: ['King', ':', 'A', 'Life']
Lemmas: ['king', ':', 'a

# Create the Data Directories Containing Corpus Files in Text Format
a. Corpus data for the "Best Books Ever" list will be saved in the directory named data1.

b. Corpus data for the New York Times' 100 Notable Books of 2023 will be saved in the directory named data2.

In [16]:
import os
import requests
from bs4 import BeautifulSoup
import spacy

# Load the spaCy English pipeline that process_text relies on
nlp = spacy.load("en_core_web_sm")

# Function to process the text with spaCy
def process_text(text):
    """Tokenize `text` with spaCy and return its tokens, lemmas and POS tags.

    The three returned lists are aligned: position i in each list describes
    the same spaCy token.
    """
    analysed = [(tok.text, tok.lemma_, tok.pos_) for tok in nlp(text)]
    tokens = [surface for surface, _, _ in analysed]
    lemmas = [lemma for _, lemma, _ in analysed]
    pos_tags = [pos for _, _, pos in analysed]
    return tokens, lemmas, pos_tags

# Function to scrape book information from a Goodreads URL and save it to a text file
def scrape_and_save_book_info(url, output_directory):
    """Scrape the first 20 books of a Goodreads list and write one text file per book.

    For each book row on the list page the title, author and rating text are
    read from the row, the genres are read from the book's details page, and
    the title is analysed with spaCy. The result is written to
    ``<output_directory>/Book_<index>.txt`` via ``save_book_info_to_file``.

    Args:
        url: URL of the Goodreads list page.
        output_directory: Directory that receives the ``Book_<index>.txt``
            files. It is not created here.

    Returns:
        None. Failed requests and rows without a title link are reported with
        ``print`` and skipped, so their index has no output file.
    """
    response = requests.get(url)

    if response.status_code != 200:
        print(f"Failed to retrieve information for {url}. Status code: {response.status_code}")
        return

    soup = BeautifulSoup(response.content, 'html.parser')
    book_items = soup.find_all('tr', itemtype='http://schema.org/Book')[:20]

    for index, book in enumerate(book_items, start=1):
        # The title link supplies both the title text and the path of the
        # details page. Without it there is nothing to follow, so the row is
        # skipped instead of failing on title_elem['href'] with a None element.
        title_elem = book.find('a', class_='bookTitle')
        if title_elem is None or not title_elem.get('href'):
            print(f"Skipped book {index} from {url}: no title link found in this list row")
            continue
        title = title_elem.text.strip()

        author_elem = book.find('div', class_='authorName__container')
        author = author_elem.text.strip() if author_elem else "Author not found"

        # Raw "minirating" text, not a parsed number
        rating_elem = book.find('span', class_='minirating')
        rating = rating_elem.text.strip() if rating_elem else "Avg Rating not found"

        # Get the URL of the book's details page
        book_url = "https://www.goodreads.com" + title_elem['href']

        # Get the HTML content of the book's details page
        response_book = requests.get(book_url)
        if response_book.status_code != 200:
            print(f"Failed to retrieve information for {book_url}. Status code: {response_book.status_code}")
            continue
        soup_book = BeautifulSoup(response_book.content, 'html.parser')

        # Primary lookup: genre links inside the dedicated genre container
        genres_parent_elem = soup_book.find('span', class_='BookPageMetadataSection__genrePlainText')
        if genres_parent_elem:
            genres = [
                genre_elem.text.strip()
                for genre_elem in genres_parent_elem.find_all('a', class_='Button__labelItem')
            ]
        else:
            genres = ["Genre not found"]

        # Fallback: when the primary lookup gave nothing, take the first five
        # inline tag buttons found anywhere on the page.
        if not genres or genres == ["Genre not found"]:
            genres_alternative = [
                genre_elem.text.strip()
                for genre_elem in soup_book.find_all('a', class_='Button--tag-inline')
            ]
            genres = genres_alternative[:5] if genres_alternative else ["Genre not found"]

        # Process the title text with spaCy
        tokens, lemmas, pos_tags = process_text(title)

        # Save book information into a text file
        save_book_info_to_file(
            title, author, rating, genres,
            tokens, lemmas, pos_tags,
            os.path.join(output_directory, f"Book_{index}.txt")
        )

# Function to save book information into a text file
def save_book_info_to_file(title, author, rating, genres, tokens, lemmas, pos_tags, filename):
    """Write one book's metadata and spaCy analysis to `filename`, one labelled line each.

    `genres` is joined with ", "; `tokens`, `lemmas` and `pos_tags` are written
    using their default string formatting. The file is opened in write mode
    as UTF-8, so an existing file is overwritten. A confirmation line is
    printed after the content has been written.
    """
    with open(filename, mode='w', encoding='utf-8') as file:
        lines = [
            f"Title: {title}\n",
            f"Author: {author}\n",
            f"Avg Rating: {rating}\n",
            f"Genres: {', '.join(genres)}\n",
            f"Tokens: {tokens}\n",
            f"Lemmas: {lemmas}\n",
            f"Parts of Speech: {pos_tags}\n",
        ]
        file.writelines(lines)
        print(f"Saved book information to {filename}")

# The two Goodreads lists to scrape
url1 = "https://www.goodreads.com/list/show/1.Best_Books_Ever"
url2 = "https://www.goodreads.com/list/show/194660.New_York_Times_100_Notable_Books_of_2023"

# One output directory per list
output_directory1 = "data1"
output_directory2 = "data2"

# Make sure both directories exist before any scraping starts
for directory in (output_directory1, output_directory2):
    os.makedirs(directory, exist_ok=True)

# Scrape each list into its own directory, in the order url1 then url2
for list_url, directory in ((url1, output_directory1), (url2, output_directory2)):
    scrape_and_save_book_info(list_url, directory)


Saved book information to data1\Book_1.txt
Saved book information to data1\Book_2.txt
Saved book information to data1\Book_3.txt
Saved book information to data1\Book_4.txt
Saved book information to data1\Book_5.txt
Saved book information to data1\Book_6.txt
Saved book information to data1\Book_7.txt
Saved book information to data1\Book_8.txt
Saved book information to data1\Book_9.txt
Saved book information to data1\Book_10.txt
Saved book information to data1\Book_11.txt
Saved book information to data1\Book_12.txt
Saved book information to data1\Book_13.txt
Saved book information to data1\Book_14.txt
Saved book information to data1\Book_15.txt
Saved book information to data1\Book_16.txt
Saved book information to data1\Book_17.txt
Saved book information to data1\Book_18.txt
Saved book information to data1\Book_19.txt
Saved book information to data1\Book_20.txt
Saved book information to data2\Book_1.txt
Saved book information to data2\Book_2.txt
Saved book information to data2\Book_3.txt


## Create a CSV File with All Corpus Data
The CSV file has the following columns: Title, Author, Avg Rating, Genres, Tokens, Lemmas, Parts of Speech, Source, and Rank.

In [20]:
import os
import requests
from bs4 import BeautifulSoup
import spacy
import csv

# Load the spaCy English pipeline shared by the functions below
nlp = spacy.load("en_core_web_sm")

# Function to process the text with spaCy
def process_text(text):
    """Return (tokens, lemmas, pos_tags) for `text`, as produced by spaCy.

    Each element of the tuple is a list with one entry per spaCy token, in
    document order.
    """
    doc = nlp(text)
    tokens = list(map(lambda tok: tok.text, doc))
    lemmas = list(map(lambda tok: tok.lemma_, doc))
    pos_tags = list(map(lambda tok: tok.pos_, doc))
    return tokens, lemmas, pos_tags

# Function to scrape book information from a Goodreads URL and return it as CSV rows
def scrape_and_save_book_info(url, output_directory, source, rank_start=1):
    """Scrape the first 20 books of a Goodreads list and return them as row dicts.

    Despite its name, this function does not write anything to disk. It only
    builds and returns the rows.

    Args:
        url: URL of the Goodreads list page.
        output_directory: Not used by this function; kept so existing calls
            keep working.
        source: Label stored in each row's "Source" column.
        rank_start: Number assigned to the first book row when enumerating.

    Returns:
        A list of dicts with the keys "Title", "Author", "Avg Rating",
        "Genres", "Tokens", "Lemmas", "Parts of Speech", "Source" and "Rank".
        The list is empty when the list page cannot be retrieved. Books whose
        details page cannot be retrieved, and rows without a title link, are
        reported with ``print`` and left out.
    """
    response = requests.get(url)

    if response.status_code != 200:
        print(f"Failed to retrieve information for {url}. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    book_items = soup.find_all('tr', itemtype='http://schema.org/Book')[:20]

    rows = []
    for index, book in enumerate(book_items, start=rank_start):
        # The title link supplies both the title text and the path of the
        # details page. Without it there is nothing to follow, so the row is
        # skipped instead of failing on title_elem['href'] with a None element.
        title_elem = book.find('a', class_='bookTitle')
        if title_elem is None or not title_elem.get('href'):
            print(f"Skipped book {index} from {url}: no title link found in this list row")
            continue
        title = title_elem.text.strip()

        author_elem = book.find('div', class_='authorName__container')
        author = author_elem.text.strip() if author_elem else "Author not found"

        # Raw "minirating" text, not a parsed number
        rating_elem = book.find('span', class_='minirating')
        rating = rating_elem.text.strip() if rating_elem else "Avg Rating not found"

        # Get the URL of the book's details page
        book_url = "https://www.goodreads.com" + title_elem['href']

        # Get the HTML content of the book's details page
        response_book = requests.get(book_url)
        if response_book.status_code != 200:
            print(f"Failed to retrieve information for {book_url}. Status code: {response_book.status_code}")
            continue
        soup_book = BeautifulSoup(response_book.content, 'html.parser')

        # Primary lookup: genre links inside the dedicated genre container
        genres_parent_elem = soup_book.find('span', class_='BookPageMetadataSection__genrePlainText')
        if genres_parent_elem:
            genres = [
                genre_elem.text.strip()
                for genre_elem in genres_parent_elem.find_all('a', class_='Button__labelItem')
            ]
        else:
            genres = ["Genre not found"]

        # Fallback: when the primary lookup gave nothing, take the first five
        # inline tag buttons found anywhere on the page.
        if not genres or genres == ["Genre not found"]:
            genres_alternative = [
                genre_elem.text.strip()
                for genre_elem in soup_book.find_all('a', class_='Button--tag-inline')
            ]
            genres = genres_alternative[:5] if genres_alternative else ["Genre not found"]

        # Process the title text with spaCy
        tokens, lemmas, pos_tags = process_text(title)

        # Create a row for the CSV
        rows.append({
            "Title": title,
            "Author": author,
            "Avg Rating": rating,
            "Genres": ', '.join(genres),
            "Tokens": tokens,
            "Lemmas": lemmas,
            "Parts of Speech": pos_tags,
            "Source": source,
            # NYT rows are always ranked from 1, whatever rank_start is; every
            # other source keeps the enumerated index. With the default
            # rank_start=1 both branches give the same value.
            "Rank": index if source != "NYT_Notable_Books" else (index - rank_start + 1)
        })

    return rows

# The two Goodreads lists whose books go into the combined CSV
url1 = "https://www.goodreads.com/list/show/1.Best_Books_Ever"
url2 = "https://www.goodreads.com/list/show/194660.New_York_Times_100_Notable_Books_of_2023"

# Directory that will hold the combined CSV file
output_directory_combined = "combined_data"
os.makedirs(output_directory_combined, exist_ok=True)

# Collect the rows of each list, labelled with the list they came from
rows_url1 = scrape_and_save_book_info(url1, output_directory_combined, source="Best_Books_Ever")
rows_url2 = scrape_and_save_book_info(url2, output_directory_combined, source="NYT_Notable_Books", rank_start=1)

# "Best Books Ever" rows first, then the NYT rows
combined_rows = [*rows_url1, *rows_url2]

# Column order of the CSV; Source and Rank come last
csv_file_path = os.path.join(output_directory_combined, "combined_books.csv")
fieldnames = [
    "Title", "Author", "Avg Rating", "Genres", "Tokens", "Lemmas", "Parts of Speech",
    "Source", "Rank",
]

# newline='' leaves line-ending handling to the csv module
with open(csv_file_path, mode='w', encoding='utf-8', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(combined_rows)

print(f"Saved combined book information to {csv_file_path}")


Saved combined book information to combined_data\combined_books.csv
