### Load Letterboxd user data (watch history)

In [12]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time
from fake_useragent import UserAgent

In [13]:
# Factory for a headless Chrome WebDriver that presents a random User-Agent
def create_driver_with_random_user_agent():
    """Build a headless Chrome WebDriver with a randomly chosen User-Agent.

    The User-Agent string comes from ``fake_useragent.UserAgent().random`` and
    the chromedriver binary path from ``ChromeDriverManager().install()``.

    Returns:
        The started ``webdriver.Chrome`` instance. The caller is responsible
        for calling ``quit()`` on it.
    """
    options = Options()
    random_agent = UserAgent().random

    # Flags are applied in this exact order.
    flags = (
        f'--user-agent={random_agent}',
        "--disable-extensions",
        "--disable-gpu",
        "--no-sandbox",
        "--headless",  # Run in headless mode (no UI)
    )
    for flag in flags:
        options.add_argument(flag)

    # Set up WebDriver
    chromedriver_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=chromedriver_service, options=options)

# Raised by WebDriverWait.until() when the condition is not met in time;
# used below to recognise "there is no clickable Next link".
from selenium.common.exceptions import TimeoutException

# Instantiate WebDriver
driver = create_driver_with_random_user_agent()

# NOTE: this URL is the user's *watchlist* page, even though the output file
# below is named "watch_history_pages.html".
url = "https://letterboxd.com/ard_s/watchlist/"

all_html = []  # page_source of every visited page, in visit order

try:
    driver.get(url)

    # NOTE(review): termination relies on no clickable element with class
    # "next" being present on the last page. If the site still renders one
    # there, this loop will not end on its own -- confirm against the markup.
    while True:
        try:
            # Wait for the page to fully load
            time.sleep(2)

            # Scroll down to trigger lazy loading
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)

            # Get and store the HTML
            html_content = driver.page_source
            all_html.append(html_content)

            # Find the "Next" button; a timeout means there is none to click.
            # Only TimeoutException is treated as "no more pages" so that
            # Ctrl-C and genuine click failures are not misreported.
            try:
                next_button = WebDriverWait(driver, 5).until(
                    EC.element_to_be_clickable((By.CLASS_NAME, "next"))
                )
            except TimeoutException:
                print("No more pages found. Exiting loop.")
                break

            next_button.click()
            time.sleep(2)  # Allow new content to load

        except Exception as e:
            # Any other WebDriver failure: report it and keep what was collected.
            print(f"Error: {e}")
            break
finally:
    # Always shut the browser down, including on KeyboardInterrupt or a
    # failing driver.get(), so no Chrome process is left behind.
    driver.quit()

# Save all HTML to a file (optional)
with open("watch_history_pages.html", "w", encoding="utf-8") as f:
    f.write("\n".join(all_html))

print("Scraping complete. HTML saved.")


KeyboardInterrupt: 

In [14]:
from bs4 import BeautifulSoup
import json

def _attr(tag, name):
    """Return attribute *name* of *tag*, or None if the tag or attribute is missing."""
    return tag.get(name) if tag is not None else None


# Load the HTML file (all scraped pages concatenated into one document)
with open("watch_history_pages.html", "r", encoding="utf-8") as f:
    html_content = f.read()

# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(html_content, "html.parser")

movies = []

for movie in soup.select("li.poster-container"):
    title_tag = movie.select_one("a.frame")
    poster_tag = movie.select_one("img.image")
    film_slug_tag = movie.select_one("div.film-poster")

    # Attribute reads go through _attr so that one poster with a missing
    # attribute yields None for that field instead of raising KeyError and
    # aborting the whole parse (missing tags were already tolerated).
    title = _attr(title_tag, "data-original-title")
    link = _attr(title_tag, "href")
    poster_url = _attr(poster_tag, "src")
    film_slug = _attr(film_slug_tag, "data-film-slug")

    movies.append({
        "title": title,
        # href is site-relative; prefix the host to build an absolute URL.
        "link": f"https://letterboxd.com{link}" if link else None,
        "poster_url": poster_url,
        "film_slug": film_slug
    })

for movie in movies:
    print(movie)


{'title': 'The Cook, the Thief, His Wife & Her Lover (1989)', 'link': 'https://letterboxd.com/film/the-cook-the-thief-his-wife-her-lover/', 'poster_url': 'https://a.ltrbxd.com/resized/sm/upload/cy/cy/d9/jb/the-cook-the-thief-0-125-0-187-crop.jpg?v=7ca169fca2', 'film_slug': 'the-cook-the-thief-his-wife-her-lover'}
{'title': 'Because My Bike Was There... (1966)', 'link': 'https://letterboxd.com/film/because-my-bike-was-there/', 'poster_url': 'https://a.ltrbxd.com/resized/film-poster/6/8/5/3/4/0/685340-because-my-bike-was-there--0-125-0-187-crop.jpg?v=485eda20cf', 'film_slug': 'because-my-bike-was-there'}
{'title': 'Turkish Delight (1973)', 'link': 'https://letterboxd.com/film/turkish-delight/', 'poster_url': 'https://a.ltrbxd.com/resized/film-poster/3/8/6/3/8/38638-turkish-delight-0-125-0-187-crop.jpg?v=d3e648b137', 'film_slug': 'turkish-delight'}
{'title': 'Inland Empire (2006)', 'link': 'https://letterboxd.com/film/inland-empire/', 'poster_url': 'https://a.ltrbxd.com/resized/film-poste