In [None]:
pip install chromedriver-py

Procedure:
- Click the button to display all reviews
- Scrape all reviews
  - Votes are rounded above 999 => need to load the review page
  - Reviews may be hidden behind spoiler markup => click the spoiler button or load the review page

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
import time
from bs4 import BeautifulSoup
import re
import pandas as pd

In [None]:
# Initialize Selenium WebDriver: start a Chrome session and open the IMDb
# user-reviews page of title tt6208148. `driver` is reused by the later cells.
print("[INFO] Launching browser...")
driver = webdriver.Chrome()
driver.get("https://www.imdb.com/title/tt6208148/reviews")
# Fixed pause to allow the initial page to load; nothing here verifies that
# loading has actually finished before the message below is printed.
time.sleep(3)  # Allow initial page to load
print("[INFO] IMDb reviews page loaded.")

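The button that displays all reviews is labelled in French ("Tout") on the page served to this browser, so the code below searches for that label.
In the datalab the label may appear in English instead; in that case the XPath text must be adapted.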

In [None]:
# Expand the page so that every review is present in the DOM.
# The number of <button> elements is only logged as a quick sanity check.
buttons = driver.find_elements(By.TAG_NAME, "button")
print(f"[INFO] Found {len(buttons)} buttons.")

try:
    # The button is located through the text of its inner <span>
    all_button = driver.find_element(
        By.XPATH,
        "//button[.//span[contains(text(), 'Tout')]]",
    )
    print("[INFO] Clicking 'Tout' button to load all reviews...")

    # Bring the button into the viewport, then pause so scrolling can settle
    driver.execute_script("arguments[0].scrollIntoView();", all_button)
    time.sleep(1)

    # Hover over the button and click it as a single chained action
    ActionChains(driver).move_to_element(all_button).click().perform()

    # Fixed pause while the additional reviews are loaded
    time.sleep(5)
    print("[INFO] All reviews should now be visible!")
except Exception as e:
    # Best effort: any failure (button missing, not clickable, ...) is only reported
    print(f"[WARNING] 'Tout' button not found or not clickable: {e}")

On the main page, vote counts of 1,000 or more are abbreviated (e.g., 5K).
The individual review page must be opened to get the exact values.
The code for the main page therefore treats votes as strings to avoid issues.

In [None]:
# Reveal the text of every review hidden behind a spoiler button.
#
# Each pass clicks all spoiler buttons currently in the DOM, then looks again
# in case new ones appeared. Click errors are reported and skipped, so a
# button that stays in the DOM or can never be clicked would otherwise be
# found again on every pass; the pass cap and the "no successful click" check
# below guarantee that the loop terminates.
MAX_SPOILER_PASSES = 20  # safety cap on the number of passes over the page

for _ in range(MAX_SPOILER_PASSES):
    # Find all the "Spoiler" buttons on the page
    spoiler_buttons = driver.find_elements(By.CLASS_NAME, "review-spoiler-button")

    if not spoiler_buttons:
        print("[INFO] No more spoiler buttons to click.")
        break  # Nothing left to reveal

    clicked_count = 0
    for i, spoiler_button in enumerate(spoiler_buttons):
        try:
            # Click the spoiler button to reveal the spoiler content
            print(f"[INFO] Clicking spoiler button {i+1}...")
            ActionChains(driver).move_to_element(spoiler_button).click().perform()
            clicked_count += 1

            # Wait a moment to ensure the page updates (adjust as needed)
            time.sleep(1)

        except Exception as e:
            # Best effort: report the failure and move on to the next button
            print(f"[ERROR] Could not click spoiler button {i+1}: {e}")

    if clicked_count == 0:
        # Every remaining button failed: another pass would fail the same way
        print(
            f"[WARNING] None of the {len(spoiler_buttons)} remaining spoiler "
            "buttons could be clicked; giving up."
        )
        break

    # Wait before trying again in case new buttons appeared
    print("[INFO] Waiting for the page to load new content...")
    time.sleep(3)  # You can adjust this time based on how quickly new buttons load
else:
    # The cap was reached without the page ever being free of spoiler buttons
    print(
        f"[WARNING] Stopped after {MAX_SPOILER_PASSES} passes; "
        "some spoiler buttons may remain."
    )


In [None]:
# Get page source and parse with BeautifulSoup
soup = BeautifulSoup(driver.page_source, "html.parser")

# Column order of the resulting dataframe. Passing it to the DataFrame
# constructor also guarantees that the columns exist when no review is found.
REVIEW_COLUMNS = [
    "review_id",
    "review_date",
    "author",
    "upvotes",
    "downvotes",
    "review_text",
    "review_title",
    "rating",
]


def _parse_total_reviews(text):
    """Return the leading number of `text` as an int, or None if there is none.

    The number may use spaces (regular or non-breaking), '.' or ',' as
    thousands separators and may be followed by a label, e.g. "5 234 avis".
    NOTE(review): assumes the total is not abbreviated (no "5.2K") - confirm.
    """
    # `\s` matches Unicode whitespace, including non-breaking spaces
    match = re.match(r"\s*(\d[\d\s.,]*)", text)
    if match is None:
        return None
    return int(re.sub(r"\D", "", match.group(1)))


def _text_or_none(tag, separator=""):
    """Return the stripped text of `tag`, or None when `tag` is None."""
    if tag is None:
        return None
    return tag.get_text(separator=separator, strip=True)


def _parse_review(review):
    """Extract the fields of one review <article> into a dict.

    Every field whose element is missing is None, except the vote counts,
    which default to the string "0" so that the column only holds strings.
    """
    # 1. Review identifier: the digits following "/rw" in the permalink
    review_id = None
    permalink_tag = review.find(
        "a", class_="ipc-link ipc-link--base", attrs={"data-testid": "permalink-link"}
    )
    if permalink_tag is not None:
        # .get() avoids a KeyError on an <a> without href
        identifier_match = re.search(r"/rw(\d+)", permalink_tag.get("href", ""))
        if identifier_match:
            review_id = identifier_match.group(1)

    # 2. Review date (<li> with class 'review-date')
    review_date = _text_or_none(
        review.find("li", class_="ipc-inline-list__item review-date")
    )

    # 3. Review author (<a> with data-testid 'author-link')
    author_name = _text_or_none(
        review.find(
            "a", class_="ipc-link ipc-link--base", attrs={"data-testid": "author-link"}
        )
    )

    # 4. Upvotes and downvotes, kept as strings because they may be
    #    abbreviated on the page (e.g. "5K")
    upvotes = _text_or_none(review.find("span", class_="ipc-voting__label__count--up"))
    if upvotes is None:
        upvotes = "0"
    downvotes = _text_or_none(
        review.find("span", class_="ipc-voting__label__count--down")
    )
    if downvotes is None:
        downvotes = "0"

    # 5. Review text: taken from the spoiler container when there is one,
    #    otherwise from the regular overflow-text container
    spoiler_content_tag = review.find("div", {"data-testid": "review-spoiler-content"})
    if spoiler_content_tag is not None:
        text_tag = spoiler_content_tag.find("div", class_="ipc-html-content-inner-div")
    else:
        text_tag = review.find("div", class_="ipc-overflowText--children")
    review_text = _text_or_none(text_tag, separator="\n")

    # 6. Review title (<h3> inside the <div> with class 'ipc-title');
    #    the container itself may be missing
    title_container = review.find("div", class_="ipc-title")
    title_tag = (
        title_container.find("h3", class_="ipc-title__text")
        if title_container is not None
        else None
    )
    review_title = _text_or_none(title_tag)

    # 7. Rating: the node just before the 'maxRating' span; a review without
    #    rating, or a span without previous sibling, gives None
    rating_tag = review.find("span", class_="ipc-rating-star--maxRating")
    rating_value_node = rating_tag.previous_sibling if rating_tag is not None else None
    rating = _text_or_none(rating_value_node)

    return {
        "review_id": review_id,
        "review_date": review_date,
        "author": author_name,
        "upvotes": upvotes,
        "downvotes": downvotes,
        "review_text": review_text,
        "review_title": review_title,
        "rating": rating,
    }


# Extract the total number of reviews (None when the counter is absent or
# does not start with a number)
total_reviews_tag = soup.find("div", attrs={"data-testid": "tturv-total-reviews"})
if total_reviews_tag is not None:
    total_reviews = _parse_total_reviews(total_reviews_tag.get_text(strip=True))
else:
    total_reviews = None

# Find all review articles (identified by 'user-review-item' class)
reviews = soup.find_all("article", class_="user-review-item")

# One dict per review
data = [_parse_review(review) for review in reviews]

# Create a dataframe from the collected data
df_reviews = pd.DataFrame(data, columns=REVIEW_COLUMNS)

In [None]:
def convert_to_int(value):
    """Convert a scraped vote count into an int.

    Args:
        value: The vote count. Usually a string such as "12" or an
            abbreviated one such as "5K" / "1.5K"; None and values that are
            already numeric are accepted too.

    Returns:
        The count as an int. None and anything that cannot be parsed give 0.
    """
    if value is None:
        return 0
    if not isinstance(value, str):
        # Already numeric (missing-vote default, or the conversion cell was
        # run a second time): `'K' in value` would raise TypeError here
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            return 0
    if 'K' in value:
        try:
            # round() rather than int(): the product of a float and 1000 may
            # fall just below the intended integer
            return round(float(value.replace('K', '')) * 1000)
        except ValueError:
            return 0
    try:
        return int(value)
    except ValueError:
        return 0

# Replace the scraped vote values of both columns with integers
for vote_column in ("upvotes", "downvotes"):
    df_reviews[vote_column] = df_reviews[vote_column].apply(convert_to_int)

In [None]:
# Show the review count read from the 'tturv-total-reviews' element
# (None when that element was not found in the page)
print(total_reviews)

In [None]:
# Display the scraped reviews (one row per review article found in the page)
df_reviews

In [None]:
# End the WebDriver session started at the top of the notebook
driver.quit()