In [None]:
# @title Load the Drive helper and mount

from google.colab import drive

# Directory inside the Colab VM under which the Drive contents are exposed
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
# @title Installs and imports

!pip install google-colab-selenium

from bs4 import BeautifulSoup
import google_colab_selenium as gs
import pandas as pd
import requests
from selenium.webdriver.common.by import By
import time

In [None]:
# @title Scrape reviews

def close_pop_up(driver):
    """Dismiss the pop-up overlay on the current page, if it can be found.

    This is best-effort: any exception raised while locating or clicking the
    close button (e.g. when no pop-up is shown) is printed and swallowed.

    Args:
        driver: Selenium WebDriver whose current page may show the pop-up.

    Returns:
        None.
    """
    close_button_xpath = "/html/body/div[3]/div/div[1]/div/div/button"
    try:
        # Locate the "x" button of the pop-up and press it in one go
        driver.find_element(By.XPATH, close_button_xpath).click()
        print("Pop-up has been closed.")

        # Give the overlay a moment to disappear
        time.sleep(1)
    except Exception as e:
        print(f"Exception occured during pop-up closing: {e}")
        return

def get_goodreads_book_reviews(url, max_reviews=100):
    """Scrape the Romanian-language reviews of a Goodreads book page.

    The main genre is read from the static HTML fetched with ``requests``.
    The reviews are then collected with a Selenium-driven Chrome session:
    the "More reviews and ratings" page is opened, the language filter is set
    to Romanian, and "Show more reviews" is clicked until it can no longer be
    found or clicked.

    Args:
        url: Address of the Goodreads book page.
        max_reviews: Maximum number of reviews to return.

    Returns:
        A list of ``(main_genre, rating, text)`` tuples, at most
        ``max_reviews`` long. ``rating`` is the second whitespace-separated
        token of the rating element's ``aria-label``, kept as a string.
        Review cards without a rating element are skipped. An empty list is
        returned on every failure path, so callers can always use ``len()``
        and iterate over the result.
    """
    # We first get the main genre of the book
    try:
        # A timeout prevents the cell from hanging forever on a stalled connection
        response = requests.get(url, timeout=30)
    except requests.RequestException as e:
        print(f"Failed to retrieve web page: {e}.")
        return []
    if response.status_code != 200:
        print(f"Failed to retrieve web page: {response.status_code}.")
        return []
    soup = BeautifulSoup(response.text, 'html.parser')
    genre_buttons = soup.find_all("span", class_="BookPageMetadataSection__genreButton")
    genre_label = genre_buttons[0].find("span", class_="Button__labelItem") if genre_buttons else None
    if genre_label is None:
        # The genre markup is missing: report it instead of failing with IndexError/AttributeError
        print("Failed to find the main genre of the book.")
        return []
    main_genre = genre_label.get_text()

    # Both names are pre-initialised so that the "finally" clause and the parsing
    # step below stay valid even when the browser fails to start or an error
    # interrupts the navigation.
    driver = None
    html_source = None
    try:
        # Open the browser
        driver = gs.Chrome()

        # Open the Goodreads page
        driver.get(url)

        # Wait for the page to load
        time.sleep(3)

        # When loading the page, a pop-up may appear. We try to close it if that happens
        close_pop_up(driver)

        # We click the "More reviews and ratings" button at the bottom of the page
        try:
            # We try to find the "More reviews and ratings" button
            more_reviews_button = driver.find_element(By.XPATH, "/html/body/div[1]/div[2]/main/div[1]/div[2]/div[4]/div/div[6]/div[4]/a")

            # Scroll to the button
            driver.execute_script("arguments[0].scrollIntoView();", more_reviews_button)
            time.sleep(1)
            driver.execute_script("window.scrollBy(0, -100);") # Otherwise, the page header would overlap the button

            # Wait for the button to come into view
            time.sleep(1)

            # Click the button
            more_reviews_button.click()

            # Wait for the reviews to load
            time.sleep(3)
        except Exception as e:
            print(f"Exception occured while trying to access \"More reviews and ratings\" button: {e}")
            # Return an empty list (not None) so the result type is consistent
            return []

        # When loading the page, a pop-up may appear. We try to close it if that happens
        close_pop_up(driver)

        # We filter the comments by Romanian
        try:
            # We try to find the button for setting filters
            filters_button = driver.find_element(By.XPATH, "/html/body/div[1]/div[2]/main/div[1]/div[2]/div[4]/div[1]/div[2]/div/button")
            # Scroll to the button
            driver.execute_script("arguments[0].scrollIntoView();", filters_button)
            # Wait for the button to come into view
            time.sleep(1)
            # Click the button
            filters_button.click()
            # Wait for the filter options to be displayed
            time.sleep(1)

            # We try to find the radio input for Romanian language
            language_filter_radio = driver.find_element(By.XPATH, '//label[@for="ro"]')
            # We check the radio
            language_filter_radio.click()

            # We try to find the button that applies the selected filters
            apply_filters_button = driver.find_element(By.XPATH, "//span[text()='Apply']")
            # We click the button
            apply_filters_button.click()

            # Wait for the filters to apply
            time.sleep(3)
        except Exception as e:
            print(f"Exception occured during comments filtering: {e}")
            # Return an empty list (not None) so the result type is consistent
            return []

        # We repeatedly click the "Show more reviews" button in order to load all of the reviews.
        # The loop ends when the button can no longer be found or clicked.
        while True:
            try:
                # We try to find the "Show more reviews" button
                show_more_reviews_button = driver.find_element(By.XPATH, "/html/body/div[1]/div[2]/main/div[1]/div[2]/div[5]/div[4]/div/button")

                # Scroll to the button
                driver.execute_script("arguments[0].scrollIntoView();", show_more_reviews_button)

                # Wait for the button to come into view
                time.sleep(1)

                # Click the button
                show_more_reviews_button.click()

                # Wait for the reviews to load
                time.sleep(3)
            except Exception as e:
                print(f"Exception occured while trying to access \"Show more reviews\" button: {e}")
                break

        html_source = driver.page_source
    except Exception as e:
        print(f"Exception occured during retrieval of book reviews: {e}")
    finally:
        # Close the browser (only if it was actually started)
        if driver is not None:
            driver.quit()

    if html_source is None:
        # The page source was never captured, so there is nothing to parse
        return []

    soup = BeautifulSoup(html_source, 'html.parser')
    reviews = soup.find_all("article", class_="ReviewCard")

    reviews_list = []
    for review in reviews:
        try:
            rating = review.find("span", class_="RatingStars RatingStars__small")
            rating = rating['aria-label'] if rating else None
            if rating is None:
                # Cards without a rating element are not usable as labelled samples
                continue
            # Keep only the second token of the aria-label as the rating value
            rating = rating.split()[1]

            # Replace <br> tags with spaces
            for br in review.find_all("br"):
                br.replace_with(" ")
            text = review.find("span", class_="Formatted").get_text().replace("\n", " ")

            reviews_list.append((main_genre, rating, text))
        except Exception as e:
            print(f"Error parsing review: {e}")
            continue

    return reviews_list[:max_reviews]

# Identifier of the book as it appears at the end of its Goodreads URL
book_id = "61431922-fourth-wing" # @param {type:"string"}
# Full address of the book page, built from the identifier above
url = f"https://www.goodreads.com/book/show/{book_id}"
# Upper bound on the number of reviews kept from the scrape
max_reviews = 10000 # @param {type:"integer"}
# Run the scraper; on success the result is a list of (main_genre, rating, text) tuples
reviews = get_goodreads_book_reviews(url, max_reviews)

# @markdown The reviews will be stored in a variable called "reviews".

In [None]:
# @title We visualize the scraped reviews

# Per-class counters: label 0 = 1-2 stars, label 1 = 3 stars, label 2 = 4-5 stars
cnt_label0, cnt_label1, cnt_label2 = 0, 0, 0

if not reviews:
    # Nothing was scraped (empty or missing result): report it instead of
    # failing on reviews[0] below
    print("No reviews were scraped.")
else:
    # Every tuple carries the same main genre, so the first one is representative
    print(reviews[0][0])
    print(f"No. of reviews selected: {len(reviews)}\n")

    for r in reviews:
        stars = int(r[1])
        if stars < 3:
            cnt_label0 += 1
        elif stars == 3:
            cnt_label1 += 1
        else:
            cnt_label2 += 1
    print("Label 0 (1-2 stars): ", cnt_label0)
    print("Label 1 (3 stars):   ", cnt_label1)
    print("Label 2 (4-5 stars): ", cnt_label2)
    print()

    # Rating followed by the review text, one review per paragraph
    for review in reviews:
        print(review[1])
        print(review[2] + "\n")

In [3]:
# @title We load the dataset

# Location of the dataset CSV under the mounted Drive directory
dataset_path = "/content/drive/MyDrive/Colab/Facultate/Master/Anul II/NLP2/dataset.csv"
df = pd.read_csv(dataset_path)

In [None]:
# @title We add the scraped reviews to the dataset

print(f"Adding book with ID \"{book_id}\" to dataset...")

# Collect all new rows first and append them in a single concat. Assigning to
# df.loc[len(df)] row by row is quadratic, and it silently overwrites an
# existing row whenever the index has gaps (e.g. after rows were dropped),
# because len(df) is then already an existing index label.
first_new_id = len(df)
new_rows = []
for offset, r in enumerate(reviews or []):
    stars = int(r[1])
    # Map the star rating to a class: 1-2 stars -> 0, 3 stars -> 1, 4-5 stars -> 2
    if stars < 3:
        label = 0
    elif stars == 3:
        label = 1
    else:
        label = 2
    # Values are positional, in the dataset's column order: id, genre, text, label
    new_rows.append([first_new_id + offset, r[0], r[2], label])

if new_rows:
    # ignore_index=True gives the combined frame a clean 0..n-1 index
    df = pd.concat([df, pd.DataFrame(new_rows, columns=df.columns)], ignore_index=True)

print("Book added.")

In [None]:
# @title Here we remove any unwanted rows from the dataset

# df = df.drop([i for i in range(1736, 1957)])

In [None]:
# @title We visualize details about the dataset

# Number of samples per sentiment label
label_counts = df['label'].value_counts()
print(label_counts)
print()
# Number of samples per book genre
genre_counts = df['genre'].value_counts()
print(genre_counts)
print()
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would add a stray "None" line to the output
df.info()
print()
print(df.head())

In [None]:
# @title We save the dataset

# We want the entries to be in descending order of the "label" column
df = df.sort_values(by='label', ascending=False).reset_index(drop=True)
# Renumber the ids 0..n-1 in the new order. Assigning the whole column at once
# replaces a slow per-row loop and stays correct even if the old index
# contained duplicate labels.
df['id'] = range(len(df))

# df.to_csv("/content/drive/MyDrive/Colab/Facultate/Master/Anul II/NLP2/dataset.csv", index=False)