## Global setup
Setup of everything needed to globally

In [1]:
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

TMDB_API_TOKEN = os.getenv('TMDB_API_TOKEN')

## Update Letterboxd Data

Update the imported Letterboxd data to always use the most up to date files

In [2]:
# Boolean flag to enable/disable downloading
# This should ONLY be set to True if you have an .env file containing valid letterboxd login credentials.
# Otherwise this could possibly render the given data unusable.

UPDATE_DATA = False  # Set True to enable download

### Code updating data should the flag be set to True

In [3]:
import time
import zipfile
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By


if not UPDATE_DATA:
    print("Download flag is OFF. Exiting without downloading.")
    exit(0)
else:
    USERNAME = os.getenv('LETTERBOXD_USERNAME')
    PASSWORD = os.getenv('LETTERBOXD_PASSWORD')
    DOWNLOAD_DIR = os.getenv('DOWNLOAD_DIR')

    # Set up Firefox options
    options = Options()
    options.set_preference("browser.download.folderList", 2)
    options.set_preference("browser.download.dir", DOWNLOAD_DIR)
    options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/zip")
    options.set_preference("pdfjs.disabled", True)

    driver = webdriver.Firefox(options=options)

    try:
        # Step 1: Log in
        print("Logging in...")
        driver.get("https://letterboxd.com/sign-in/")
        time.sleep(2)

        driver.find_element(By.ID, "field-username").send_keys(USERNAME)
        driver.find_element(By.ID, "field-password").send_keys(PASSWORD)
        driver.find_element(By.TAG_NAME, "button").click()

        time.sleep(5)  # Wait for login to complete

        # Step 2: Go to data page
        print("Navigating to export page...")
        driver.get("https://letterboxd.com/settings/data/")
        time.sleep(3)

        # Step 3: Click export button
        print("Clicking export button...")
        export_link = driver.find_element(By.XPATH, "//a[contains(@href, '/user/exportdata')]")

        export_link.click()
        time.sleep(3)

        # Step 4: Click confirm export button
        print("Clicking confirm export button...")
        confirm_export_link = driver.find_element(By.XPATH, "//a[contains(@href, '/data/export/')]")

        confirm_export_link.click()

        print("Waiting for download to complete...")
        time.sleep(15)  # Adjust this depending on your connection speed

    finally:
        driver.quit()
        print("Browser closed.")

    # Step 4: Unzip and Load
    print("Looking for ZIP file in download directory...")
    zip_path = None
    for file in os.listdir(DOWNLOAD_DIR):
        if file.endswith(".zip") and "letterboxd" in file.lower():
            zip_path = os.path.join(DOWNLOAD_DIR, file)
            break

    if not zip_path:
        raise FileNotFoundError("Export ZIP file not found!")

    extract_path = os.path.join(DOWNLOAD_DIR, "letterboxd_export")
    os.makedirs(extract_path, exist_ok=True)

    print(f"Unzipping to {extract_path}...")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)

    # Delete the zip file after extraction
    os.remove(zip_path)
    print(f"Deleted ZIP file: {zip_path}")


Download flag is OFF. Exiting without downloading.


## Setup

Imports and files setup

In [1]:
import pandas as pd

ratings = pd.read_csv("data/letterboxd_export/ratings.csv")
watched = pd.read_csv("data/letterboxd_export/watched.csv")
likes = pd.read_csv("data/letterboxd_export/likes/films.csv")
diary = pd.read_csv("data/letterboxd_export/diary.csv")

orphaned_diary = pd.read_csv("data/letterboxd_export/orphaned/diary.csv")

## See Data

First check on how the tables look

In [None]:
ratings.tail(15)

In [None]:
watched.tail(15)

In [None]:
likes.tail(15)

In [None]:
diary.tail(15)

In [None]:
orphaned_diary.tail(15)

## Connect tables
Connect letterboxd tables to get one table with following data for every movie:
- Title and Year
- Letterboxd URI
- User Rating
- Was the movie liked
- When was the movie last watched
- How many times has the movie been watched

In [2]:
merged = pd.merge(
    watched[['Name', 'Year', 'Letterboxd URI']],
    ratings[['Name', 'Year', 'Letterboxd URI', 'Rating']],
    on=['Name', 'Year', 'Letterboxd URI'],
    how='left'
)

# Add "Liked" information
liked_uris = set(likes["Letterboxd URI"])

merged["Liked"] = merged["Letterboxd URI"].apply(
    lambda uri: "Yes" if uri in liked_uris else "No"
)


# Add "Last Watched" information

# Step 1: Group diary entries by movie and take the latest date as a string
last_watched = diary.groupby(["Name", "Year"])["Watched Date"].max().reset_index()
last_watched.rename(columns={"Watched Date": "Last watched date"}, inplace=True)

# Step 2: Merge with the existing merged DataFrame
merged = pd.merge(
    merged,
    last_watched,
    on=["Name", "Year"],
    how="left"
)

# Add "Times watched" information

# Step 1: Sort diary by "Watched Date" so earliest watches come first
diary_sorted = diary.sort_values("Watched Date")

# Step 2: Define a function to count watches and check if first was a rewatch
def get_times_watched(group):
    count = len(group)
    first_rewatch = group.iloc[0]["Rewatch"] == "Yes"
    return f"{count}+" if first_rewatch else str(count)

# Step 3: Group and apply, excluding group keys from the inner DataFrame
times_watched = diary_sorted.groupby(["Name", "Year"], group_keys=False).apply(
    get_times_watched
).reset_index(name="Times watched")


# Step 4: Merge into the main DataFrame
merged = (pd.merge(
    merged,
    times_watched,
    on=["Name", "Year"],
    how="left"
).rename(columns={'Name': 'Title'}).sort_values(by="Last watched date", ascending=False))

merged.head(15)


  times_watched = diary_sorted.groupby(["Name", "Year"], group_keys=False).apply(


Unnamed: 0,Title,Year,Letterboxd URI,Rating,Liked,Last watched date,Times watched
446,Okja,2017,https://boxd.it/dvXe,4.0,Yes,2025-07-26,2
746,Dominion,2018,https://boxd.it/gXqy,4.0,Yes,2025-07-19,1
745,Superman,2025,https://boxd.it/E9IU,3.5,Yes,2025-07-16,1
744,The Hunger Games: Mockingjay – Part 1,2014,https://boxd.it/4hka,3.0,Yes,2025-07-04,1
743,The Hunger Games: Catching Fire,2013,https://boxd.it/3sAw,4.0,Yes,2025-06-18,1
742,Ballerina,2025,https://boxd.it/jKqG,4.0,Yes,2025-06-15,1
741,The Hunger Games,2012,https://boxd.it/2uds,2.5,No,2025-06-13,1
740,Soldier Monika,2024,https://boxd.it/LhRk,3.5,No,2025-05-09,1
292,The Big Lebowski,1998,https://boxd.it/2b6C,4.5,Yes,2025-05-08,2
739,Eraserhead,1977,https://boxd.it/299u,5.0,Yes,2025-02-22,1


## Connect Letterboxd Data with TMDB data

In [5]:
import requests
from time import sleep
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

TMDB_API_TOKEN = os.getenv('TMDB_API_TOKEN')

HEADERS = {
    "Authorization": "Bearer " + TMDB_API_TOKEN,
    "Content-Type": "application/json;charset=utf-8"
}


def search_movie(title, year=None):
    url = "https://api.themoviedb.org/3/search/movie"
    params = {"query": title}
    if year:
        params["year"] = year
    response = requests.get(url, headers=HEADERS, params=params)
    if response.status_code == 200:
        results = response.json().get("results")
        return results[0] if results else None
    else:
        print(f"Error at {title}: {response.status_code}")
        return None


def get_movie_details(tmdb_id):
    url = f"https://api.themoviedb.org/3/movie/{tmdb_id}"
    response = requests.get(url, headers=HEADERS)
    if response.status_code == 200:
        return response.json()
    else:
        return None


def enrich_dataframe(df):
    base_url = "https://image.tmdb.org/t/p/"
    size = "original"  # Use original size for poster images

    # Create empty columns for TMDB data
    df["tmdb_id"] = None
    df["overview"] = None
    df["genres"] = None
    df["runtime"] = None
    df["vote_average"] = None
    df["poster_url"] = None

    total = len(df)
    print(f"Starting enrichment for {total} films using TMDB data...\n")

    for idx, row in df.iterrows():
        title = row["Title"]
        year = row.get("Year", None)

        print(f"[{idx+1}/{total}] Searching: '{title}' ({year})", end="")

        result = search_movie(title, year)
        if result:
            print(" ✅ Match found")
            details = get_movie_details(result["id"])
            if details:
                df.at[idx, "tmdb_id"] = result["id"]
                df.at[idx, "overview"] = details.get("overview")
                df.at[idx, "genres"] = [g["name"] for g in details.get("genres", [])]
                df.at[idx, "runtime"] = details.get("runtime")
                df.at[idx, "vote_average"] = details.get("vote_average")

                poster_path = details.get("poster_path")
                if poster_path:
                    df.at[idx, "poster_url"] = base_url + size + poster_path
                else:
                    df.at[idx, "poster_url"] = None
            else:
                print(" ⚠️  Details not found")
        else:
            print(" ❌ No match")

        sleep(0.25)  # Avoid rate limit

    print("\n✔️  Enrichment completed.")
    return df



enriched_merged = enrich_dataframe(merged)
# enriched_merged.to_csv("enriched_merged.csv", index=False)



Starting enrichment for 747 films using TMDB data...

[447/747] Searching: 'Okja' (2017) ✅ Match found
[747/747] Searching: 'Dominion' (2018) ✅ Match found
[746/747] Searching: 'Superman' (2025) ✅ Match found
[745/747] Searching: 'The Hunger Games: Mockingjay – Part 1' (2014) ✅ Match found
[744/747] Searching: 'The Hunger Games: Catching Fire' (2013) ✅ Match found
[743/747] Searching: 'Ballerina' (2025) ✅ Match found
[742/747] Searching: 'The Hunger Games' (2012) ✅ Match found
[741/747] Searching: 'Soldier Monika' (2024) ✅ Match found
[293/747] Searching: 'The Big Lebowski' (1998) ✅ Match found
[740/747] Searching: 'Eraserhead' (1977) ✅ Match found
[739/747] Searching: 'The Elephant Man' (1980) ✅ Match found
[738/747] Searching: 'David Lynch: The Art Life' (2016) ✅ Match found
[737/747] Searching: 'Sonic the Hedgehog 3' (2024) ✅ Match found
[736/747] Searching: 'David Lynch Cooks Quinoa' (2007) ❌ No match
[735/747] Searching: 'The Straight Story' (1999) ✅ Match found
[734/747] Searchin

# TODO

- Find out why the last cell can not use variables from the first cell
- Find out if there is a useful way to include orphaned diary entries as well
- TMDB search is often wrong (check the low vote_average column entries)