In [1]:
from selenium import webdriver
import pandas as pd
import numpy as np
import time
import random

In [2]:
driver = webdriver.Chrome()

In [5]:
if driver.session_id is None:
  print("The Webdriver session is not active.")

In [6]:
driver.get('https://www.goodreads.com/list/best_of_year/2023?id=183940.Best_Books_of_2023')

In [3]:
from pathlib import Path
from bs4 import BeautifulSoup
import os
import json
import sys

def html_save(url):
    """
    Saves the HTML content of the given URL to a file and returns the file path.
    """
    # Initialize the browser
    browser = webdriver.Firefox()  # Ensure you have the Firefox WebDriver installed
    browser.get(url)
    
    # Get the HTML content from the webpage
    html_content = browser.page_source
    browser.close()

    # Create a BeautifulSoup object
    soup = BeautifulSoup(html_content, 'html.parser')

    # Use the current working directory to save the file
    html_save_path = Path(os.getcwd()) / "content.txt"

    # Write the prettified HTML content to the file
    with open(html_save_path, 'wt', encoding='utf-8') as html_file:
        html_file.write(soup.prettify())

    print(f"HTML content saved to: {html_save_path}")
    return html_save_path  # Return the path to the saved file


def getLanguage(file_path):
    """
    Extracts the language from the JSON data within the HTML content of the file.
    Deletes the file after extraction.
    """
    try:
        # Load the HTML content from the file
        with open(file_path, "r", encoding="utf-8") as file:
            html_content = file.read()

        # Locate the <script> tag with type="application/ld+json" that contains the language information
        start = html_content.find('<script type="application/ld+json">') + len('<script type="application/ld+json">')
        end = html_content.find('</script>', start)
        json_data = html_content[start:end].strip()

        # Parse the JSON data
        parsed_data = json.loads(json_data)

        # Extract the "inLanguage" field
        language = parsed_data.get("inLanguage", "N/A")
    except Exception as e:
        print(f"Error extracting language: {e}")
        language = "N/A"

    # Delete the file
    os.remove(file_path)
    return language

## Srape for Task 1

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import sys

from selenium import webdriver
from selenium.webdriver.common.by import By
import time

def get_ranks_for_page(driver, page_url):
    """
    Scrape the rankings from a specific Goodreads page.
    
    Args:
    - driver: WebDriver instance
    - page_url: URL of the Goodreads page to scrape
    
    Returns:
    - ranks: List of ranks extracted from the page
    """
    ranks = []
    try:
        print(f'Processing page: {page_url}')
        driver.get(page_url)
        time.sleep(5)  # Allow the page to load

        rows = driver.find_elements(By.XPATH, '//*[@id="all_votes"]/table/tbody/tr')
        print(f'Number of rows found: {len(rows)}')

        for row_index, row in enumerate(rows, start=1):
            try:
                rank = row.find_element(By.XPATH, './td[@class="number"]').text.strip()
                ranks.append(rank)

                # Print details
                print(f"Row {row_index}: Rank: {rank}")
            except Exception as e:
                print(f"Error extracting rank on row {row_index}: {e}")
                ranks.append("N/A")
    except Exception as e:
        print(f"Error processing page {page_url}: {e}")
    
    return ranks



In [None]:
def data_scraping(baseURL, log_name, num_web_page):

    # Create and open the log file for appending
    log_file = open(log_name, "a", encoding="utf-8")

    # Redirect print and errors to the log file
    sys.stdout = log_file
    sys.stderr = log_file

    # Initialize WebDriver
    driver = webdriver.Chrome()

    # Initialize lists for storing data
    titles, authors, genres = [], [], []
    languages, average_ratings, num_ratings = [], [], []
    publication_dates, pages, currently_reading, want_to_read = [], [], [], []
    all_ranks, book_links = [], []

    # Loop through pages
    for i in range(1, num_web_page + 1):  
        print(f'Processing page {i}')
        sys.stdout.flush()
        targetURL = f"{baseURL}&page={i}"
        driver.get(targetURL)
        time.sleep(5)  # Allow the page to load
        
        ranks = get_ranks_for_page(driver, targetURL)
        reattempt = 0
        while (ranks[0] == '' and reattempt < 3):
            print(f"Reattempting ranks collection for page {i}, attempt {reattempt + 1}")
            sys.stdout.flush()
            ranks = get_ranks_for_page(driver, targetURL)
            reattempt += 1

        if ranks[0] == '':
            print(f"Failed to collect ranks for page {i} after {reattempt} attempts")
            sys.stdout.flush()
            continue

        print(f'For page {i}, collected ranks: {ranks}')
        sys.stdout.flush()
        all_ranks.extend(ranks)

        rows = driver.find_elements(By.XPATH, '//*[@id="all_votes"]/table/tbody/tr')

        # Extract book links from each row
        for row in rows:
            try:
                href = row.find_element(By.XPATH, './td[3]/a').get_attribute('href')
                book_links.append(href)
            except Exception as e:
                print(f"Error extracting rank or book link: {e}")
                sys.stdout.flush()
                book_links.append("N/A")

    # Visit each book link to collect details
    for rank, href in zip(all_ranks, book_links):
        try:
            driver.get(href)
            time.sleep(2)  # Allow the page to load

            # Extract book details
            title = driver.find_element(By.XPATH, '//h1[@class="Text Text__title1"]').text
            author = driver.find_element(By.XPATH, '//span[@class="ContributorLink__name"]').text
            genre = driver.find_element(By.XPATH, '(//div[@class="BookPageMetadataSection__genres"]//span[@class="Button__labelItem"])[1]').text
            avg_rating = driver.find_element(By.XPATH, '//div[@class="RatingStatistics__rating"]').text
            num_rating = driver.find_element(By.XPATH, '//div[@class="RatingStatistics__meta"]').text.split(" ratings")[0].strip()
            page = driver.find_element(By.XPATH, '//p[@data-testid="pagesFormat"]').text.split(" pages")[0]
            pub_date = driver.find_element(By.XPATH, '//p[@data-testid="publicationInfo"]').text.replace("First published", "").strip()
            reading_now = driver.find_element(By.XPATH, '//div[@data-testid="currentlyReadingSignal"]').text.split(" people")[0]
            want_read = driver.find_element(By.XPATH, '//div[@data-testid="toReadSignal"]').text.split(" people")[0]

            # Use html_save and getLanguage to extract language
            file_path = html_save(href)
            language = getLanguage(file_path)

            # Append details to lists
            titles.append(title)
            authors.append(author)
            genres.append(genre)
            average_ratings.append(avg_rating)
            num_ratings.append(num_rating)
            pages.append(page)
            publication_dates.append(pub_date)
            currently_reading.append(reading_now)
            want_to_read.append(want_read)
            languages.append(language)

            # Log book details immediately
            print(f"Processed Book:")
            print(f"  Rank: {rank}")
            print(f"  Book Link: {href}")
            print(f"  Title: {title}")
            print(f"  Author: {author}")
            print(f"  Genre: {genre}")
            print(f"  Average Rating: {avg_rating}")
            print(f"  Number of Ratings: {num_rating}")
            print(f"  Number of Pages: {page}")
            print(f"  Publication Date: {pub_date}")
            print(f"  Currently Reading: {reading_now}")
            print(f"  Want to Read: {want_read}")
            print(f"  Language: {language}")
            print("-" * 40)
            sys.stdout.flush()
        except Exception as e:
            print(f"Error processing book at {href}: {e}")
            sys.stdout.flush()
            titles.append("N/A")
            authors.append("N/A")
            genres.append("N/A")
            average_ratings.append("N/A")
            num_ratings.append("N/A")
            pages.append("N/A")
            publication_dates.append("N/A")
            currently_reading.append("N/A")
            want_to_read.append("N/A")
            languages.append("N/A")

    driver.quit()

    # Restore stdout and stderr
    sys.stdout = sys.__stdout__
    sys.stderr = sys.__stderr__
    log_file.close()

    # Create the DataFrame
    return pd.DataFrame({
        "Rank": all_ranks,
        "Book Link": book_links,
        "Title": titles,
        "Author": authors,
        "Genre": genres,
        "Language": languages,
        "Average Rating": average_ratings,
        "Number of Ratings": num_ratings,
        "Publication Date": publication_dates,
        "Number of Pages": pages,
        "Currently Reading": currently_reading,
        "Want to Read": want_to_read,
    })


In [None]:
books_df_1 = data_scraping("https://www.goodreads.com/list/best_of_year/2023?id=183940.Best_Books_of_2023", "scraping_log.txt", 12)

In [None]:
books_df_1.to_csv("data/books_data_1.csv", index=True, encoding='utf-8')

## Scrape for Task 2

In [6]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
import sys

def get_book_links(baseURL, num_web_page, log_name):
    """
    Function to extract book links from the given pages and log the process.
    """
    # Open the log file for appending
    log_file = open(log_name, "a", encoding="utf-8")

    # Redirect print and errors to the log file
    sys.stdout = log_file
    sys.stderr = log_file

    driver = webdriver.Chrome()
    wait = WebDriverWait(driver, 10)
    book_links = []

    # Loop through pages to get all book links
    # Loop through pages
    for i in range(1, num_web_page + 1):  
        print(f'Processing page {i}')
        sys.stdout.flush()
        targetURL = f"{baseURL}?page={i}"
        driver.get(targetURL)
        time.sleep(5)  # Allow the page to load
        rows = driver.find_elements(By.XPATH, '//table[@class="tableList"]/tbody/tr')
        print(f'Number of rows found: {len(rows)}')

        # Extract book links from each row
        for row in rows:
            try:
                href = row.find_element(By.XPATH, './td[2]/a').get_attribute('href')
                book_links.append(href)
                print(f'Current href: {href}')
            except Exception as e:
                print(f"Error extracting rank or book link: {e}")
                sys.stdout.flush()
                book_links.append("N/A")

    driver.quit()

    # Restore stdout and stderr
    sys.stdout = sys.__stdout__
    sys.stderr = sys.__stderr__
    log_file.close()

    return book_links


def data_scraping_2(book_links, log_name, start, end):
    """
    Function to scrape book details from each book link, including language using html_save and getLanguage.
    Retries up to three times with a wait time before retrying if data collection fails before moving to the next book.
    Sets 'N/A' for fields that cannot be found.
    """
    # Create and open the log file for appending
    log_file = open(log_name, "a", encoding="utf-8")

    # Redirect print and errors to the log file
    sys.stdout = log_file
    sys.stderr = log_file

    # Initialize WebDriver
    driver = webdriver.Chrome()
    wait = WebDriverWait(driver, 10)

    # Initialize lists for storing data
    titles, authors, genres = [], [], []
    languages, average_ratings, num_ratings = [], [], []
    publication_dates, pages, currently_reading, want_to_read = [], [], [], []

    # Start indexing from the start index
    i = start  
    for href in book_links[start:end]:
        retries = 0
        success = False
        while retries < 3 and not success:
            try:
                driver.get(href)
                time.sleep(2)  # Allow initial page load

                # Extract book details
                try:
                    title = wait.until(EC.presence_of_element_located((By.XPATH, '//h1[@class="Text Text__title1"]'))).text
                except Exception:
                    title = "N/A"

                try:
                    author = wait.until(EC.presence_of_element_located((By.XPATH, '//span[@class="ContributorLink__name"]'))).text
                except Exception:
                    author = "N/A"

                try:
                    genre = wait.until(EC.presence_of_element_located((By.XPATH, '(//div[@class="BookPageMetadataSection__genres"]//span[@class="Button__labelItem"])[1]'))).text
                except Exception:
                    genre = "N/A"

                try:
                    avg_rating = wait.until(EC.presence_of_element_located((By.XPATH, '//div[@class="RatingStatistics__rating"]'))).text
                except Exception:
                    avg_rating = "N/A"

                try:
                    num_rating = wait.until(EC.presence_of_element_located((By.XPATH, '//div[@class="RatingStatistics__meta"]'))).text.split(" ratings")[0].strip()
                except Exception:
                    num_rating = "N/A"

                try:
                    page = wait.until(EC.presence_of_element_located((By.XPATH, '//p[@data-testid="pagesFormat"]'))).text.split(" pages")[0]
                except Exception:
                    page = "N/A"

                try:
                    pub_date = wait.until(EC.presence_of_element_located((By.XPATH, '//p[@data-testid="publicationInfo"]'))).text.replace("First published", "").strip()
                except Exception:
                    pub_date = "N/A"

                try:
                    reading_now = wait.until(EC.presence_of_element_located((By.XPATH, '//div[@data-testid="currentlyReadingSignal"]'))).text.split(" people")[0]
                except Exception:
                    reading_now = "N/A"

                try:
                    want_read = wait.until(EC.presence_of_element_located((By.XPATH, '//div[@data-testid="toReadSignal"]'))).text.split(" people")[0]
                except Exception:
                    want_read = "N/A"

                try:
                    # Use html_save and getLanguage to extract language
                    file_path = html_save(href)
                    language = getLanguage(file_path)
                except Exception:
                    language = "N/A"

                # Append details to lists
                titles.append(title)
                authors.append(author)
                genres.append(genre)
                average_ratings.append(avg_rating)
                num_ratings.append(num_rating)
                pages.append(page)
                publication_dates.append(pub_date)
                currently_reading.append(reading_now)
                want_to_read.append(want_read)
                languages.append(language)

                # Log book details immediately
                print(f"Processed Book {i + 1}:")
                print(f"  Book Link: {href}")
                print(f"  Title: {title}")
                print(f"  Author: {author}")
                print(f"  Genre: {genre}")
                print(f"  Average Rating: {avg_rating}")
                print(f"  Number of Ratings: {num_rating}")
                print(f"  Number of Pages: {page}")
                print(f"  Publication Date: {pub_date}")
                print(f"  Currently Reading: {reading_now}")
                print(f"  Want to Read: {want_read}")
                print(f"  Language: {language}")
                print("-" * 40)
                sys.stdout.flush()
                success = True  # Mark success if no exception occurred
            except Exception as e:
                retries += 1
                print(f"Error processing book at {href}, attempt {retries}: {e}")
                sys.stdout.flush()
                if retries < 3:
                    wait_time = retries * 5  # Increasing wait time with each retry (e.g., 5s, 10s, 15s)
                    print(f"Waiting for {wait_time} seconds before retrying...")
                    time.sleep(wait_time)
                if retries == 3:
                    # If all retries fail, append "N/A" values for all fields
                    print(f"Failed to process book at {href} after 3 attempts. Moving on.")
                    titles.append("N/A")
                    authors.append("N/A")
                    genres.append("N/A")
                    average_ratings.append("N/A")
                    num_ratings.append("N/A")
                    pages.append("N/A")
                    publication_dates.append("N/A")
                    currently_reading.append("N/A")
                    want_to_read.append("N/A")
                    languages.append("N/A")

        # Increment counter after all retries, regardless of success
        i += 1

    driver.quit()

    # Restore stdout and stderr
    sys.stdout = sys.__stdout__
    sys.stderr = sys.__stderr__
    log_file.close()

    # Create the DataFrame
    return pd.DataFrame({
        "Book Link": book_links[start:end],
        "Title": titles,
        "Author": authors,
        "Genre": genres,
        "Language": languages,
        "Average Rating": average_ratings,
        "Number of Ratings": num_ratings,
        "Publication Date": publication_dates,
        "Number of Pages": pages,
        "Currently Reading": currently_reading,
        "Want to Read": want_to_read,
    })


In [None]:
# Example Usage
baseURL = "https://www.goodreads.com/author/list/3389.Stephen_King"
log_name = "scraping_log_2.txt"
num_web_page = 88

# Step 1: Get all book links
book_links = get_book_links(baseURL, num_web_page, log_name)



In [8]:
print(len(book_links))
# 2616

In [9]:
# Step 2: Scrape book details
books_df_2_p1 = data_scraping_2(book_links, log_name, 0, 100)

2616
Data saved to ../data/book_scrape_2/books_df2_p1.csv
Data saved to ../data/book_scrape_2/books_df2_p2.csv
Data saved to ../data/book_scrape_2/books_df2_p3.csv
Data saved to ../data/book_scrape_2/books_df2_p5.csv
Data saved to ../data/book_scrape_2/books_df2_p1.csv
Data saved to ../data/book_scrape_2/books_df2_p2.csv
Data saved to ../data/book_scrape_2/books_df2_p3.csv
Data saved to ../data/book_scrape_2/books_df2_p5.csv
Data saved to ./data/book_scrape_2/books_df2_p1.csv
Data saved to ./data/book_scrape_2/books_df2_p2.csv
Data saved to ./data/book_scrape_2/books_df2_p3.csv
Data saved to ./data/book_scrape_2/books_df2_p5.csv


In [10]:
books_df_2_p2 = data_scraping_2(book_links, log_name, 100, 200)

In [11]:
books_df_2_p3 = data_scraping_2(book_links, log_name, 200, 300)

In [12]:
books_df_2_p5 = data_scraping_2(book_links, log_name, 400, 500)

In [None]:
books_df_2_p6 = data_scraping_2(book_links, log_name, 500, 600)

In [None]:
books_df_2_p7 = data_scraping_2(book_links, log_name, 600, 700)

In [None]:
books_df_2_p8 = data_scraping_2(book_links, log_name, 700, 800)

In [None]:
books_df_2_p9 = data_scraping_2(book_links, log_name, 800, 900)

In [None]:
books_df_2_p10 = data_scraping_2(book_links, log_name, 900, 1000)

In [None]:
books_df_2_p11 = data_scraping_2(book_links, log_name, 1000, 1100)

In [None]:
books_df_2_p12 = data_scraping_2(book_links, log_name, 1100, 1200)

In [None]:
books_df_2_p17 = data_scraping_2(book_links, log_name, 1600, 1700)

In [None]:
books_df_2_p26 = data_scraping_2(book_links, log_name, 2500, 2600)

In [None]:
books_df_2_p27 = data_scraping_2(book_links, log_name, 2600, 2627)

In [None]:
books_df_2_p13 = data_scraping_2(book_links, log_name, 1200, 1300)

In [None]:
books_df_2_p4 = data_scraping_2(book_links, log_name, 300, 400)

In [None]:
books_df_2_p21 = data_scraping_2(book_links, log_name, 2000, 2100)

In [None]:
books_df_2_p22 = data_scraping_2(book_links, log_name, 2100, 2200)

In [None]:
books_df_2_p23 = data_scraping_2(book_links, log_name, 2200, 2300)

In [None]:
books_df_2_p24 = data_scraping_2(book_links, log_name, 2300, 2400)

In [None]:
books_df_2_p25 = data_scraping_2(book_links, log_name, 2400, 2500)

In [None]:
books_df_2_p18 = data_scraping_2(book_links, log_name, 1700, 1800)

In [None]:
books_df_2_p19 = data_scraping_2(book_links, log_name, 1800, 1900)

In [None]:
books_df_2_p20 = data_scraping_2(book_links, log_name, 1900, 2000)

In [None]:
books_df_2_p14 = data_scraping_2(book_links, log_name, 1300, 1400)

In [None]:
books_df_2_p15 = data_scraping_2(book_links, log_name, 1400, 1500)

In [None]:
books_df_2_p16 = data_scraping_2(book_links, log_name, 1500, 1600)

In [13]:
import pandas as pd
import os

def save_to_csv(data, output_dir, file_name):
    """
    Saves a DataFrame to a CSV file in the specified directory.
    
    Parameters:
        data (pd.DataFrame or dict): The data to be saved. If it's a dictionary, it will be converted to a DataFrame.
        output_dir (str): The directory where the CSV file should be saved.
        file_name (str): The name of the CSV file (without extension).
    
    Returns:
        str: The full path to the saved CSV file.
    """
    # Ensure the data is a DataFrame
    if not isinstance(data, pd.DataFrame):
        data = pd.DataFrame(data)
    
    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)
    
    # Create the full file path
    file_path = os.path.join(output_dir, f"{file_name}.csv")
    
    # Save the DataFrame to CSV
    data.to_csv(file_path, index=False, encoding='utf-8')
    
    print(f"Data saved to {file_path}")
    return file_path


In [None]:
save_to_csv(books_df_2_p1, './data/book_scrape_2', 'books_df2_p1')
save_to_csv(books_df_2_p2, './data/book_scrape_2', 'books_df2_p2')
save_to_csv(books_df_2_p3, './data/book_scrape_2', 'books_df2_p3')
save_to_csv(books_df_2_p5, './data/book_scrape_2', 'books_df2_p5')
save_to_csv(books_df_2_p6, '../data/book_scrape_2', 'books_df2_p6')
save_to_csv(books_df_2_p7, '../data/book_scrape_2', 'books_df2_p7')
save_to_csv(books_df_2_p8, '../data/book_scrape_2', 'books_df2_p8')
save_to_csv(books_df_2_p9, '../data/book_scrape_2', 'books_df2_p9')
save_to_csv(books_df_2_p10, '../data/book_scrape_2', 'books_df2_p10')
save_to_csv(books_df_2_p11, '../data/book_scrape_2', 'books_df2_p11')
save_to_csv(books_df_2_p12, '../data/book_scrape_2', 'books_df2_p12')
save_to_csv(books_df_2_p17, '../data/book_scrape_2', 'books_df2_p17')
save_to_csv(books_df_2_p26, '../data/book_scrape_2', 'books_df2_p26')
save_to_csv(books_df_2_p27, '../data/book_scrape_2', 'books_df2_p27')
save_to_csv(books_df_2_p13, '../data/book_scrape_2', 'books_df2_p13')
save_to_csv(books_df_2_p4, '../data/book_scrape_2', 'books_df2_p4')
save_to_csv(books_df_2_p21, '../data/book_scrape_2', 'books_df2_p21')
save_to_csv(books_df_2_p22, '../data/book_scrape_2', 'books_df2_p22')
save_to_csv(books_df_2_p23, '../data/book_scrape_2', 'books_df2_p23')
save_to_csv(books_df_2_p24, '../data/book_scrape_2', 'books_df2_p24')
save_to_csv(books_df_2_p25, '../data/book_scrape_2', 'books_df2_p25')
save_to_csv(books_df_2_p18, '../data/book_scrape_2', 'books_df2_p18')
save_to_csv(books_df_2_p19, '../data/book_scrape_2', 'books_df2_p19')
save_to_csv(books_df_2_p20, '../data/book_scrape_2', 'books_df2_p20')
save_to_csv(books_df_2_p14, '../data/book_scrape_2', 'books_df2_p14')
save_to_csv(books_df_2_p15, '../data/book_scrape_2', 'books_df2_p15')
save_to_csv(books_df_2_p16, '../data/book_scrape_2', 'books_df2_p16')



'./data/book_scrape_2/books_df2_p5.csv'

In [4]:
# Rewriting the function after reset

import os
import pandas as pd

def combine_csv_files(input_folder, output_file, prefix, num_files):
    """
    Combines multiple CSV files in a specified order into a single CSV file.

    Parameters:
        input_folder (str): Path to the folder containing the CSV files.
        output_file (str): Path to the resulting combined CSV file.
        prefix (str): Prefix of the CSV files (e.g., "book_data_p").
        num_files (int): Total number of files to combine in sequential order.
    """
    combined_df = pd.DataFrame()

    for i in range(1, num_files + 1):
        file_path = os.path.join(input_folder, f"{prefix}{i}.csv")
        if os.path.exists(file_path):
            temp_df = pd.read_csv(file_path)
            combined_df = pd.concat([combined_df, temp_df], ignore_index=True)
        else:
            print(f"File not found: {file_path}")

    combined_df.to_csv(output_file, index=False)
    print(f"Combined CSV file saved to: {output_file}")

# Define the input folder and output file paths
input_folder = "/Users/quynhanh2004/Documents/GitHub/take-home-final-annepham1512/exams/final/data/book_scrape_2"  # Adjusted path for execution environment
output_file = "/Users/quynhanh2004/Documents/GitHub/take-home-final-annepham1512/exams/final/data/books_data_2.csv"  # Save the combined file outside the folder

# Combine the CSV files
combine_csv_files(input_folder, output_file, prefix="books_df2_p", num_files=27)

output_file


Combined CSV file saved to: /Users/quynhanh2004/Documents/GitHub/take-home-final-annepham1512/exams/final/data/books_data_2.csv


'/Users/quynhanh2004/Documents/GitHub/take-home-final-annepham1512/exams/final/data/books_data_2.csv'