In [3]:
import os
import json
import time
import logging
import re
import asyncio
import aiohttp
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Configure the root logger so every logging.info()/warning()/error() call
# in this notebook is emitted at INFO level and above.
logging.basicConfig(level=logging.INFO)

# Path of the JSON file used to persist the scraped dataset between runs.
JSON_FILE = "letterboxd_movies.json"

# --- Original Driver Functions ---

def initialize_driver():
    """
    Create and return a headless Chrome WebDriver.

    The browser is started with the --headless, --disable-gpu and
    --no-sandbox flags and a page-load timeout of 180 seconds.
    """
    opts = Options()
    for flag in ("--headless", "--disable-gpu", "--no-sandbox"):
        opts.add_argument(flag)

    browser = webdriver.Chrome(options=opts)
    browser.set_page_load_timeout(180)
    return browser

def safe_get(driver, url, max_retries=3, delay=3):
    """
    Load ``url`` with ``driver.get()``, retrying on WebDriverException.

    Up to ``max_retries`` attempts are made, sleeping ``delay`` seconds after
    each failure. If every attempt fails, the driver is quit, a fresh one is
    created with ``initialize_driver()``, and the URL is attempted one final
    time in the fresh driver so the caller does not receive a blank browser.

    Returns the driver that should be used from now on. This may be a NEW
    instance, so callers must rebind their reference to the return value;
    the driver passed in is unusable once it has been replaced.

    A failure of the final attempt is logged, not raised, so the returned
    driver may not have ``url`` loaded. With ``max_retries`` below 1 no load
    is attempted and ``driver`` is returned unchanged.
    """
    for attempt in range(1, max_retries + 1):
        try:
            driver.get(url)
        except WebDriverException as e:
            logging.error(f"Error loading {url} on attempt {attempt}: {e}")
            # Back off before the next attempt (or before the driver restart).
            time.sleep(delay)
        else:
            return driver  # Successful load

    if max_retries < 1:
        # No attempt was made, so there is nothing to recover from.
        return driver

    logging.info("Max retries reached. Reinitializing driver.")
    try:
        driver.quit()
    except Exception as quit_error:
        # Best effort: the old browser may already be dead. Record it and move on.
        logging.debug(f"Ignoring error while quitting old driver: {quit_error}")
    driver = initialize_driver()

    # Previously the fresh driver was returned without ever loading the URL.
    try:
        driver.get(url)
    except WebDriverException as e:
        logging.error(f"Error loading {url} with reinitialized driver: {e}")
    return driver

# Global driver instance, created as soon as this cell runs (this starts a
# headless Chrome process). The scraping functions below read this
# module-level name rather than taking a driver argument.
driver = initialize_driver()

# --- Data Loading Function ---

def load_existing_movies(filename=JSON_FILE):
    """
    Load already scraped movies from a JSON file.

    Returns the list stored in ``filename``. An empty list is returned when
    the file does not exist, does not contain valid JSON, or contains valid
    JSON whose top-level value is not a list. The last two cases are logged
    as warnings so that a damaged dataset does not go unnoticed.
    """
    try:
        # Open directly instead of checking os.path.exists() first: this
        # avoids the race between the existence check and the open.
        with open(filename, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        return []
    except json.JSONDecodeError as e:
        logging.warning(f"Could not parse {filename} as JSON: {e}")
        return []

    if not isinstance(data, list):
        # Callers iterate the result as a list of movie dicts.
        logging.warning(
            f"Expected a JSON list in {filename}, found {type(data).__name__}; ignoring it."
        )
        return []
    return data

# --- Parsing and Scraping Functions ---

def scrape_letterboxd_popular_page(page_num):
    """
    Loads the Letterboxd popular films page for a given page number using Selenium.
    Returns a list of movies with basic info (title, URL).
    NOTE: Do not change this function as the HTML structure has been updated.

    Each returned item is a dict with the keys "title" (the poster image's
    alt text, or "Unknown") and "url" (absolute film URL). List items that
    have no <img> or no <a class="frame"> are skipped.

    The HTML selectors are intentionally left exactly as they were. The only
    change is that the module-level ``driver`` is rebound to whatever
    ``safe_get`` returns, because ``safe_get`` may quit the old driver and
    hand back a new one.
    """
    global driver
    url = f"https://letterboxd.com/films/popular/page/{page_num}/"
    logging.info(f"Loading popular page: {url}")
    # Keep the global pointing at a live driver; ignoring the return value
    # would leave it referencing a browser that safe_get has already quit.
    driver = safe_get(driver, url)
    time.sleep(2)  # Short delay to let JS load content
    soup = BeautifulSoup(driver.page_source, "html.parser")
    movies = []
    for film in soup.select("li.listitem"):
        img_tag = film.find("img")
        if not img_tag:
            continue
        title = img_tag.get("alt", "Unknown")
        a_tag = film.find("a", class_="frame")
        if not a_tag:
            continue
        film_url = "https://letterboxd.com" + a_tag.get("href", "")
        movies.append({"title": title, "url": film_url})
    return movies

def get_watchers_count_selenium(movie_url, wait_time=10):
    """
    Uses Selenium (with explicit wait) to load the movie page and extract the watchers count.
    Expected tooltip format: "Watched by 316,343 members"

    Waits up to ``wait_time`` seconds for an ``a.icon-watched`` element, then
    reads its "data-original-title" attribute (falling back to "title").

    Returns the count as an int, or 0 when the page cannot be loaded, the
    element does not appear, or the tooltip does not match the expected
    format. Errors are logged, never raised.
    """
    global driver
    try:
        # safe_get may replace a broken driver with a new instance. Rebind the
        # module-level name so this and later calls use the live browser
        # instead of one that has already been quit.
        driver = safe_get(driver, movie_url)
    except Exception as e:
        logging.error(f"Selenium error loading movie page {movie_url}: {e}")
        return 0
    try:
        a_elem = WebDriverWait(driver, wait_time).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "a.icon-watched"))
        )
        tooltip = a_elem.get_attribute("data-original-title") or a_elem.get_attribute("title")
        logging.info(f"Tooltip for {movie_url}: {tooltip}")
        if tooltip:
            match = re.search(r"Watched by ([\d,]+)\s*members", tooltip)
            if match:
                # Strip thousands separators before converting.
                return int(match.group(1).replace(",", ""))
            logging.warning(f"Regex did not match tooltip for {movie_url}")
    except Exception as e:
        # Deliberately broad: any Selenium failure degrades to "0 watchers".
        logging.error(f"Selenium error extracting watchers for {movie_url}: {e}")
    return 0

# Smoke test: scrape popular page 1 and log the first five results.
# This runs as soon as the cell executes and performs a real page load.
test_movies = scrape_letterboxd_popular_page(1)
logging.info(f"Sample movies from page 1: {test_movies[:5]}")

# Smoke test: fetch the watchers count for the first scraped movie.
# Skipped when the page scrape returned nothing.
if test_movies:
    sample_url = test_movies[0]["url"]
    watchers = get_watchers_count_selenium(sample_url)
    logging.info(f"Selenium watchers for '{test_movies[0]['title']}': {watchers}")

# Allow nested asyncio loops: patch asyncio so the run_until_complete() and
# asyncio.run() calls further down can be used even when an event loop is
# already running (presumably the notebook kernel's loop, since this file is
# a notebook cell export).
import nest_asyncio
nest_asyncio.apply()

async def fetch_detail(session, url):
    """
    Fetch ``url`` with the given aiohttp session and return the body text.

    Makes up to three attempts with a 20 second total timeout each. Both
    exceptions and non-200 responses count as failed attempts: each is logged
    and followed by a 2 second pause before the next try. Returns None when
    every attempt fails.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    # aiohttp expects a ClientTimeout object; passing a bare number is deprecated.
    timeout = aiohttp.ClientTimeout(total=20)
    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        try:
            async with session.get(url, headers=headers, timeout=timeout) as response:
                if response.status == 200:
                    return await response.text()
                # Previously a non-200 response was retried immediately and silently.
                logging.warning(
                    f"Unexpected status {response.status} fetching {url} (attempt {attempt})"
                )
        except Exception as e:
            # Deliberately broad: any failure is treated as a failed attempt.
            logging.warning(f"Error fetching {url} (attempt {attempt}): {e}")
        if attempt < max_attempts:
            # No point sleeping after the last attempt.
            await asyncio.sleep(2)
    return None

def _find_section_header(soup, *labels):
    """Return the first <h3> whose text exactly equals one of ``labels`` (tried in order), or None."""
    for label in labels:
        header = soup.find("h3", string=label)
        if header:
            return header
    return None


def _sluglist_link_texts(header):
    """Return the link texts of the first div.text-sluglist after ``header``; [] if either is missing."""
    if not header:
        return []
    container = header.find_next("div", class_="text-sluglist")
    if not container:
        return []
    return [a.get_text(strip=True) for a in container.find_all("a")]


async def get_movie_details(session, movie):
    """
    Asynchronously fetches movie details from the movie's page.
    Uses Selenium fallback (synchronously) for watchers count (if not already present)
    and aiohttp for other details.
    Extracts synopsis, genres, themes, and language.

    For genres: first checks for an h3 with "Genres", if not found then "Genre".
    For language: first checks for "Language", then "Primary Language".

    The detail page is fetched first; the Selenium lookup only runs once that
    fetch has succeeded, so no browser page load is spent on a movie that is
    going to be discarded anyway. The Selenium call is synchronous and blocks
    the event loop while it runs.

    Returns the movie dictionary with updated details ("watchers", "synopsis",
    "genres", "themes", "language"; the input dict is updated in place), or
    None when the detail page could not be fetched. Language defaults to
    "Unknown"; the other fields default to empty values.
    """
    # Fetch the movie detail page asynchronously
    html = await fetch_detail(session, movie["url"])
    if not html:
        return None

    # Use existing watchers count if present; otherwise, fetch via Selenium.
    watchers = movie.get("watchers")
    if not watchers:
        watchers = get_watchers_count_selenium(movie["url"])

    soup = BeautifulSoup(html, "html.parser")

    # Synopsis: text of the first <p> after the "Synopsis" heading.
    synopsis = ""
    syn_header = _find_section_header(soup, "Synopsis")
    if syn_header:
        p_tag = syn_header.find_next("p")
        if p_tag:
            synopsis = p_tag.get_text(strip=True)

    # Genres: try "Genres" first, then "Genre".
    genres = _sluglist_link_texts(_find_section_header(soup, "Genres", "Genre"))

    # Themes
    themes = _sluglist_link_texts(_find_section_header(soup, "Themes"))

    # Language: try "Language" first, then "Primary Language"; take the text
    # of the first <div> or <p> that follows the heading.
    language = ""
    language_header = _find_section_header(soup, "Language", "Primary Language")
    if language_header:
        language_elem = language_header.find_next(lambda tag: tag.name in ["div", "p"])
        if language_elem:
            language = language_elem.get_text(strip=True)
    if not language:
        language = "Unknown"

    movie.update({
        "watchers": watchers,
        "synopsis": synopsis,
        "genres": genres,
        "themes": themes,
        "language": language
    })
    return movie

# Example test: fetch full details for the first movie scraped by the
# popular-page smoke test above (test_movies). Skipped when that list is empty.
if test_movies:
    async def test_details():
        # One short-lived HTTP session just for this check.
        async with aiohttp.ClientSession() as session:
            details = await get_movie_details(session, test_movies[0])
            logging.info(f"Details for '{test_movies[0]['title']}': {details}")
    # Drive the coroutine to completion on the current event loop.
    loop = asyncio.get_event_loop()
    loop.run_until_complete(test_details())

async def process_page(session, page_num, min_watchers=1000, existing_movies_by_title=None):
    """
    Scrape one popular-films page and return its movies with full details.

    A movie from the page is fetched when it is not in
    ``existing_movies_by_title`` (or no such mapping was given), or when its
    stored entry lacks genres or language. In the latter case a stored
    watchers count is carried over so it is not looked up again. Movies whose
    stored entry is complete are skipped.

    Returns the detailed movies whose fetch succeeded and whose watchers
    count is at least ``min_watchers``.

    NOTE(review): entries are matched by title only, so two different films
    sharing a title would be treated as the same movie. Confirm whether
    that is acceptable for this dataset.
    """
    logging.info(f"Processing page {page_num}")
    candidates = scrape_letterboxd_popular_page(page_num)

    to_fetch = []
    for candidate in candidates:
        title = candidate.get("title")
        already_known = existing_movies_by_title is not None and title in existing_movies_by_title
        if not already_known:
            to_fetch.append(candidate)
            continue

        stored = existing_movies_by_title[title]
        if stored.get("genres") and stored.get("language"):
            logging.info(f"Skipping movie '{title}' (already scraped with complete details).")
            continue

        logging.info(f"Re-scraping movie '{title}' due to missing details.")
        # Reuse the stored watchers count so it is not fetched a second time.
        if stored.get("watchers"):
            candidate["watchers"] = stored["watchers"]
        to_fetch.append(candidate)

    detailed = await asyncio.gather(
        *(get_movie_details(session, candidate) for candidate in to_fetch)
    )

    # Drop failed fetches and movies below the watchers threshold.
    return [
        entry
        for entry in detailed
        if entry is not None and entry.get("watchers", 0) >= min_watchers
    ]

def _write_json_atomically(path, payload):
    """Write ``payload`` as JSON to a sibling temp file, then atomically swap it into ``path``."""
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)
    # os.replace swaps the file in one step, so an interrupt during the dump
    # above can never leave a half-written dataset at ``path``.
    os.replace(tmp_path, path)


async def build_dataset(max_pages=200, min_watchers=1000, delay=1):
    """
    Scrape popular pages 1..``max_pages`` and merge the results into JSON_FILE.

    Any dataset already stored in JSON_FILE is loaded first and used to skip
    movies that were scraped before. After every page the full dataset is
    saved, so an interrupted run keeps its progress; the coroutine then
    sleeps ``delay`` seconds before the next page.

    Movies are matched by title: a scraped movie whose title is already known
    updates the stored entry in place, otherwise it is appended.

    Returns the complete dataset as a list of movie dicts.
    """
    # Load existing dataset if it exists (shared loader instead of a local copy).
    dataset = load_existing_movies(JSON_FILE)
    if not isinstance(dataset, list):
        # The rest of this function appends to and serialises a list.
        logging.warning(f"Ignoring non-list content of {JSON_FILE}.")
        dataset = []

    # Build a dictionary of already scraped movies keyed by title. The values
    # are the very same dict objects that sit in ``dataset``.
    existing_movies_by_title = {movie["title"]: movie for movie in dataset if "title" in movie}

    async with aiohttp.ClientSession() as session:
        for page in range(1, max_pages + 1):
            page_data = await process_page(session, page, min_watchers, existing_movies_by_title)
            for movie in page_data:
                title = movie.get("title")
                existing = existing_movies_by_title.get(title)
                if existing is not None:
                    # In-place update: ``existing`` is the object stored in
                    # ``dataset``, so no scan of the list is needed.
                    existing.update(movie)
                else:
                    dataset.append(movie)
                    existing_movies_by_title[title] = movie
            # Batch-save after processing each page to preserve progress
            _write_json_atomically(JSON_FILE, dataset)
            logging.info(f"Page {page} processed. Total movies so far: {len(dataset)}")
            await asyncio.sleep(delay)
    return dataset

if __name__ == "__main__":
    # Run the full scrape. The finally clause guarantees the headless browser
    # is shut down even when the run fails or is interrupted (for example by
    # a KeyboardInterrupt); otherwise the Chrome process would be left running.
    try:
        final_dataset = asyncio.run(build_dataset(max_pages=400, min_watchers=1000, delay=1))
        logging.info(f"Final dataset size: {len(final_dataset)}")
    finally:
        driver.quit()
        logging.info("Selenium driver closed.")
    

INFO:root:Loading popular page: https://letterboxd.com/films/popular/page/1/
INFO:root:Sample movies from page 1: [{'title': 'Barbie', 'url': 'https://letterboxd.com/film/barbie/'}, {'title': 'Parasite', 'url': 'https://letterboxd.com/film/parasite-2019/'}, {'title': 'Interstellar', 'url': 'https://letterboxd.com/film/interstellar/'}, {'title': 'Fight Club', 'url': 'https://letterboxd.com/film/fight-club/'}, {'title': 'La La Land', 'url': 'https://letterboxd.com/film/la-la-land/'}]
INFO:root:Tooltip for https://letterboxd.com/film/barbie/: Watched by 5,085,036 members
INFO:root:Selenium watchers for 'Barbie': 5085036
INFO:root:Tooltip for https://letterboxd.com/film/barbie/: Watched by 5,085,038 members
INFO:root:Details for 'Barbie': {'title': 'Barbie', 'url': 'https://letterboxd.com/film/barbie/', 'watchers': 5085038, 'synopsis': 'Barbie and Ken are having the time of their lives in the colorful and seemingly perfect world of Barbie Land. However, when they get a chance to go to the 

KeyboardInterrupt: 