Vote counts above 999 are displayed rounded (e.g. "1.2K"), so the review page has to be loaded to get the exact values.

Is setting the language to English unnecessary when running from the DataLab?

Check that the number of scraped reviews matches the number of reviews displayed on the main page.

In [None]:
from bs4 import BeautifulSoup
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from src.utils.db import PostgreSQLDatabase

import pandas as pd
import re
import time
import tqdm

In [None]:
# IMDb title identifier, stored without the "tt" prefix (the prefix is added
# when the page URLs are built).
movie_id = '0089885'

In [None]:
# Start a headless Chrome session driven by Selenium.
# The flags are applied in the order listed; the last one overrides the
# user-agent string sent by the browser.
chrome_options = Options()
for chrome_flag in (
    "--headless",
    "--disable-blink-features=AutomationControlled",
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36",
):
    chrome_options.add_argument(chrome_flag)

driver = webdriver.Chrome(options=chrome_options)
print("[INFO] Launching browser")

In [None]:
# Load main page
driver.get(f"https://www.imdb.com/title/tt{movie_id}")
time.sleep(3)  # Allow page to load
print(f"[INFO] IMDb main page for movie #{movie_id} loaded")

# Consent to the collect of personal information.
# The consent banner may not be shown on every page load, so it is looked up
# with find_elements(): that call returns an empty list when nothing matches,
# whereas find_element() would raise and abort the whole cell.
accept_buttons = driver.find_elements(By.XPATH, '//button[@data-testid="accept-button"]')
accept_button = accept_buttons[0] if accept_buttons else None

if accept_button is not None:
    ActionChains(driver).move_to_element(accept_button).click().perform()
    time.sleep(2)  # Give the banner time to close
    print(f"[INFO] Consenting to the collect of personal information")
else:
    print("[INFO] No consent banner found, nothing to accept")

In [None]:
# Make sure the site is displayed in English.
# First read the label currently shown by the language selector.
language_selector = WebDriverWait(driver, 5).until(
    EC.presence_of_element_located((By.XPATH, '//label[@for="nav-language-selector"]/span'))
)
current_language = language_selector.text.strip()
print(f"[INFO] Current language: {current_language}")

target_language = "English (United States)"
if current_language != target_language:
    try:
        # Open the selector through JavaScript, then wait for its entries
        driver.execute_script("arguments[0].click();", language_selector)
        language_options = WebDriverWait(driver, 5).until(
            EC.presence_of_all_elements_located((By.XPATH, '//ul[contains(@class, "ipc-list")]//span'))
        )

        # Pick the first entry whose text mentions the target language
        english_option = next(
            (candidate for candidate in language_options if target_language in candidate.text),
            None,
        )
        if english_option is not None:
            driver.execute_script("arguments[0].click();", english_option)
            print("[INFO] Language switched to English")

    except Exception as e:
        # Best effort: a failure here is reported but does not stop the run
        print(f"[ERROR] Failed to switch language to English: {e}")

In [None]:
# Movie title: text of the element tagged as the hero primary text
title_xpath = '//span[@data-testid="hero__primary-text"]'
title_element = driver.find_element(By.XPATH, title_xpath)
movie_title = title_element.text.strip()
print(f"[INFO] Extracting movie title: {movie_title}")

# Release date: text of the link inside the release-date list item, with
# everything from the first " (" onwards removed
release_date_xpath = (
    '//li[@data-testid="title-details-releasedate"]'
    '//a[@class="ipc-metadata-list-item__list-content-item ipc-metadata-list-item__list-content-item--link"]'
)
release_date_link = driver.find_element(By.XPATH, release_date_xpath)
release_date = release_date_link.text.partition(" (")[0].strip()
print(f"[INFO] Extracting release date: {release_date}")

In [None]:
# Load review page
driver.get(f"https://www.imdb.com/title/tt{movie_id}/reviews")
time.sleep(3)  # Allow page to load
print(f"[INFO] IMDb reviews page for movie #{movie_id} loaded")

# Get page source and parse with BeautifulSoup
soup = BeautifulSoup(driver.page_source, "html.parser")

# Extract the total number of reviews.
# total_reviews stays None when the counter is missing or cannot be parsed.
total_reviews = None
total_reviews_tag = soup.find("div", attrs={"data-testid": "tturv-total-reviews"})
if total_reviews_tag:
    # Take the leading number (digits with optional thousands separators).
    # Matching the number itself, rather than splitting on " reviews", also
    # handles the singular "1 review" and avoids a ValueError on any other
    # unexpected label.
    count_match = re.match(r"\d[\d,]*", total_reviews_tag.get_text(strip=True))
    if count_match:
        total_reviews_text = count_match.group()
        total_reviews = int(total_reviews_text.replace(",", ""))
    else:
        print("[WARNING] Could not parse the total number of reviews")

print(f"[INFO] Found {total_reviews} reviews to scrap")

In [None]:
# Click the button to display all reviews.
# total_reviews is None when the counter could not be read; comparing None
# with an int raises TypeError, so that case is handled explicitly. The click
# is still attempted then, since a missing button only produces a warning.
# NOTE(review): 25 is presumably the number of reviews shown before expanding — confirm.
if total_reviews is None or total_reviews > 25:
    try:
        # Wait for the button that specifically contains "All"
        all_button = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.XPATH, '//span[contains(@class, "ipc-see-more")]//button[.//span[contains(text(), "All")]]'))
        )

        # Click the button using JavaScript to avoid interception issues
        driver.execute_script("arguments[0].click();", all_button)
        print(f"[INFO] Clicking the button to display all reviews")

    except Exception as e:
        # Best effort: keep going with whatever reviews are already displayed
        print(f"[WARNING] Button for displaying the reviews not found or not clickable: {e}")

In [None]:
# Expand every review hidden behind a spoiler button so that its full text
# is present in the page source parsed afterwards
spoiler_buttons = driver.find_elements(By.CLASS_NAME, "review-spoiler-button")
print(f"[INFO] Found {len(spoiler_buttons)} spoiler buttons to click")

progress = tqdm.tqdm(spoiler_buttons, desc="Clicking Spoilers", unit="button")
for position, spoiler_button in enumerate(progress, start=1):
    try:
        ActionChains(driver).move_to_element(spoiler_button).click().perform()
        time.sleep(1)  # Short pause so that each click registers
    except Exception as e:
        # A button that cannot be clicked is reported and skipped
        print(f"[ERROR] Could not click spoiler button {position}: {e}")

In [None]:
# Extract reviews from the fully expanded page.
# Every field is optional: a missing element yields None (or "0" for the vote
# counts) instead of raising, so one malformed review cannot abort the run.
soup = BeautifulSoup(driver.page_source, "html.parser")
data = []

# Find all review articles
reviews = soup.find_all("article", class_="user-review-item")

# Loop through each review and extract information
for review in reviews:
    # 1. Extract the review identifier (integer following "/rw" in the permalink)
    permalink_tag = review.find("a", class_="ipc-link ipc-link--base", attrs={"data-testid": "permalink-link"})
    identifier_match = re.search(r"/rw(\d+)", permalink_tag.get("href", "")) if permalink_tag else None
    review_id = identifier_match.group(1) if identifier_match else None

    # 2. Extract the review date (from <li> tag with class 'review-date')
    date_tag = review.find("li", class_="ipc-inline-list__item review-date")
    review_date = date_tag.get_text(strip=True) if date_tag else None

    # 3. Extract the review author (from <a> tag with data-testid 'author-link')
    author_tag = review.find("a", class_="ipc-link ipc-link--base", attrs={"data-testid": "author-link"})
    author_name = author_tag.get_text(strip=True) if author_tag else None

    # 4. Extract the upvotes and downvotes (from ipc-voting__label__count classes).
    #    The fallback is the string "0" so the columns only ever hold text;
    #    an int fallback breaks string-based parsing of the counts later on.
    upvotes_tag = review.find("span", class_="ipc-voting__label__count--up")
    downvotes_tag = review.find("span", class_="ipc-voting__label__count--down")
    upvotes = upvotes_tag.get_text(strip=True) if upvotes_tag else "0"
    downvotes = downvotes_tag.get_text(strip=True) if downvotes_tag else "0"

    # 5. Extract the review text
    spoiler_content_tag = review.find("div", {"data-testid": "review-spoiler-content"})

    if spoiler_content_tag:
        # If the spoiler content exists, extract the inner HTML of the review
        review_text_tag = spoiler_content_tag.find("div", class_="ipc-html-content-inner-div")
    else:
        # If no spoiler content, extract the regular review text
        review_text_tag = review.find("div", class_="ipc-overflowText--children")
    review_text = review_text_tag.get_text(separator="\n", strip=True) if review_text_tag else None

    # 6. Extract the review title (from <h3> inside a <div> with class 'ipc-title').
    #    The container is checked first: chaining .find() on a missing <div>
    #    would raise AttributeError.
    title_container = review.find("div", class_="ipc-title")
    title_tag = title_container.find("h3", class_="ipc-title__text") if title_container else None
    review_title = title_tag.get_text(strip=True) if title_tag else None

    # 7. Extract the rating: the node right before the <span> with class
    #    'ipc-rating-star--maxRating'. previous_sibling is None when that
    #    span is the first child, hence the explicit None check.
    rating_tag = review.find("span", class_="ipc-rating-star--maxRating")
    rating_value_node = rating_tag.previous_sibling if rating_tag else None
    rating = rating_value_node.get_text(strip=True) if rating_value_node is not None else None

    # 8. Append data to the list
    data.append({
        "movie_id": movie_id,
        "review_id": review_id,
        "author": author_name,
        "title": review_title,
        "text": review_text,
        "rating": rating,
        "date": review_date,
        "upvotes": upvotes,
        "downvotes": downvotes,
        "scrapping_timestamp": datetime.now().strftime("%Y%m%d_%H%M%S")
    })

# Create a dataframe from the collected data
reviews_df = pd.DataFrame(data)

In [None]:
# Shut down the WebDriver session; the page content needed afterwards has
# already been copied into reviews_df.
driver.quit()

In [None]:
def convert_to_int(value):
    """Convert a scraped vote count into an integer.

    Args:
        value: The raw count. May be ``None``, a number, or a string such as
            ``"12"``, ``"1,234"``, ``"1.2K"`` or ``"3M"`` (suffixes are
            case-insensitive).

    Returns:
        The count as an ``int``, or ``0`` when the value is missing or
        cannot be parsed.
    """
    if value is None:
        return 0

    if isinstance(value, (int, float)):
        # Numeric input (e.g. a numeric default for a missing tag) is passed
        # through. NaN is the only value unequal to itself; NaN and the
        # infinities cannot be converted to int.
        if value != value or value in (float("inf"), float("-inf")):
            return 0
        return int(value)

    text = str(value).strip().replace(",", "").upper()
    multiplier = {"K": 1_000, "M": 1_000_000}.get(text[-1:], 1)

    try:
        if multiplier == 1:
            return int(text)
        # round() instead of int(): the float product can land just below
        # the intended integer and int() would truncate it.
        return round(float(text[:-1]) * multiplier)
    except (ValueError, OverflowError):
        return 0

# Replace the scraped vote counts with integers, one column at a time
for vote_column in ("upvotes", "downvotes"):
    reviews_df[vote_column] = reviews_df[vote_column].apply(convert_to_int)

In [None]:
def _row_to_record(row):
    """Flatten one dataframe row into a tuple of column values.

    The author, title, text, date and timestamp fields are converted with
    ``str()``; the remaining fields are passed through unchanged.
    """
    # NOTE(review): str() turns a missing value (None) into the text "None" —
    # confirm that this is what the database columns expect.
    return (
        row['movie_id'],
        row['review_id'],
        str(row['author']),
        str(row['title']),
        str(row['text']),
        row['rating'],
        str(row['date']),
        row['upvotes'],
        row['downvotes'],
        str(row['scrapping_timestamp']),
    )


# One tuple per review, in dataframe row order
reviews_list = reviews_df.apply(_row_to_record, axis=1).tolist()

In [None]:
# Store the movie and its reviews in the database.
# The timestamp records when the movie row was written.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

db = PostgreSQLDatabase()
db.connect()
try:
    db.insert_data('movies', [(movie_id, movie_title, release_date, timestamp),])
    db.insert_data('reviews_raw', reviews_list)
finally:
    # Always release the connection, even when one of the inserts raises
    db.close_connection()