## Update Letterboxd Data

Refresh the imported Letterboxd data so the notebook always works with the most up-to-date export files.

In [None]:
# Boolean flag that enables/disables downloading a fresh Letterboxd export.
# Only set this to True if you have a .env file containing valid Letterboxd
# login credentials; otherwise the update could render the existing data unusable.

UPDATE_DATA = False  # Set True to enable download

### Code that updates the data if the flag is set to True

In [None]:
import os
import time
import zipfile
from dotenv import load_dotenv
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By


# Download a fresh Letterboxd export with Selenium/Firefox and unpack it into
# DOWNLOAD_DIR/letterboxd_export. Runs only when UPDATE_DATA is True.
if not UPDATE_DATA:
    # Deliberately no exit() here: inside a Jupyter kernel exit() shuts the
    # kernel down, so the remaining cells could not run with the flag off.
    print("Download flag is OFF. Skipping download.")
else:
    # Load environment variables
    load_dotenv()
    USERNAME = os.getenv('LETTERBOXD_USERNAME')
    PASSWORD = os.getenv('LETTERBOXD_PASSWORD')
    DOWNLOAD_DIR = os.getenv('DOWNLOAD_DIR')

    # Fail early, before a browser is launched, with a message that names the
    # missing settings instead of a TypeError from deep inside Selenium/os.
    missing_settings = [
        setting_name
        for setting_name, setting_value in (
            ('LETTERBOXD_USERNAME', USERNAME),
            ('LETTERBOXD_PASSWORD', PASSWORD),
            ('DOWNLOAD_DIR', DOWNLOAD_DIR),
        )
        if not setting_value
    ]
    if missing_settings:
        raise RuntimeError(
            "Missing required environment variable(s): "
            + ", ".join(missing_settings)
            + ". Define them in your .env file before setting UPDATE_DATA = True."
        )

    # Hand Firefox an absolute, existing directory so the download location
    # does not depend on how the browser resolves a relative path.
    DOWNLOAD_DIR = os.path.abspath(DOWNLOAD_DIR)
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)

    # Set up Firefox options
    options = Options()
    options.set_preference("browser.download.folderList", 2)  # 2 = use browser.download.dir
    options.set_preference("browser.download.dir", DOWNLOAD_DIR)
    options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/zip")
    options.set_preference("pdfjs.disabled", True)

    driver = webdriver.Firefox(options=options)

    try:
        # Step 1: Log in
        print("Logging in...")
        driver.get("https://letterboxd.com/sign-in/")
        time.sleep(2)

        driver.find_element(By.ID, "field-username").send_keys(USERNAME)
        driver.find_element(By.ID, "field-password").send_keys(PASSWORD)
        # NOTE(review): this clicks the first <button> on the page; presumably
        # that is the sign-in button - confirm against the current page markup.
        driver.find_element(By.TAG_NAME, "button").click()

        time.sleep(5)  # Wait for login to complete

        # Step 2: Go to data page
        print("Navigating to export page...")
        driver.get("https://letterboxd.com/settings/data/")
        time.sleep(3)

        # Step 3: Click export button
        print("Clicking export button...")
        export_link = driver.find_element(By.XPATH, "//a[contains(@href, '/user/exportdata')]")

        export_link.click()
        time.sleep(3)

        # Step 4: Click confirm export button
        print("Clicking confirm export button...")
        confirm_export_link = driver.find_element(By.XPATH, "//a[contains(@href, '/data/export/')]")

        confirm_export_link.click()

        print("Waiting for download to complete...")
        time.sleep(15)  # Adjust this depending on your connection speed

    finally:
        # Always close the browser, even if a step above failed.
        driver.quit()
        print("Browser closed.")

    # Step 5: Unzip and Load
    print("Looking for ZIP file in download directory...")
    export_archives = [
        os.path.join(DOWNLOAD_DIR, file)
        for file in os.listdir(DOWNLOAD_DIR)
        if file.endswith(".zip") and "letterboxd" in file.lower()
    ]

    if not export_archives:
        raise FileNotFoundError("Export ZIP file not found!")

    # Several matching archives may exist (e.g. left over from an earlier run);
    # take the most recently modified one rather than an arbitrary listing order.
    zip_path = max(export_archives, key=os.path.getmtime)

    extract_path = os.path.join(DOWNLOAD_DIR, "letterboxd_export")
    os.makedirs(extract_path, exist_ok=True)

    print(f"Unzipping to {extract_path}...")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)

    # Delete the zip file after extraction
    os.remove(zip_path)
    print(f"Deleted ZIP file: {zip_path}")


## Setup

Import pandas and load the exported CSV files.

In [None]:
import pandas as pd

# Load the four export tables used below, in the order: ratings, watched,
# liked films, diary.
ratings, watched, likes, diary = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/letterboxd_export/ratings.csv",
        "data/letterboxd_export/watched.csv",
        "data/letterboxd_export/likes/films.csv",
        "data/letterboxd_export/diary.csv",
    )
)

## See Data

A first look at what each table contains.

In [None]:
# Show the last 15 rows of the ratings table
ratings.tail(15)

In [None]:
# Show the last 15 rows of the watched table
watched.tail(15)

In [None]:
# Show the last 15 rows of the liked-films table
likes.tail(15)

In [None]:
# Show the last 15 rows of the diary table
diary.tail(15)

## Connect tables

In [44]:
# Base table: one row per watched film, carrying its rating where one exists
merged = watched[['Name', 'Year', 'Letterboxd URI']].merge(
    ratings[['Name', 'Year', 'Letterboxd URI', 'Rating']],
    on=['Name', 'Year', 'Letterboxd URI'],
    how='left',
)

# "Liked" column: "Yes" when the film's URI appears in the likes table
liked_uris = set(likes["Letterboxd URI"])
merged["Liked"] = [
    "Yes" if uri in liked_uris else "No" for uri in merged["Letterboxd URI"]
]

# "Last watched date" column: the maximum "Watched Date" string per film in
# the diary, joined onto the base table by name and year
last_watched = (
    diary.groupby(["Name", "Year"])["Watched Date"]
    .max()
    .reset_index()
    .rename(columns={"Watched Date": "Last watched date"})
)
merged = merged.merge(last_watched, on=["Name", "Year"], how="left")

# "Times watched" column, part 1: order the diary by "Watched Date" so each
# film's earliest entry comes first
diary_sorted = diary.sort_values(by="Watched Date")

# Step 2: Define a function to count watches and check if first was a rewatch
def get_times_watched(group):
    """Summarise how often one film appears in the diary.

    Args:
        group: DataFrame holding the diary rows of a single film, with a
            "Rewatch" column. The first row is treated as the first watch,
            so the caller is expected to pass rows ordered earliest-first.

    Returns:
        The number of rows as a string, with a trailing "+" when the first
        row is itself marked as a rewatch (``"Yes"``).
    """
    n_entries = len(group)
    first_entry_flag = group["Rewatch"].iloc[0]
    if first_entry_flag == "Yes":
        return f"{n_entries}+"
    return str(n_entries)

# Step 3: Group and apply. Only the "Rewatch" column is selected before
# apply(): the helper needs nothing else, and an explicit selection keeps the
# grouping columns out of each group, which avoids the pandas
# DeprecationWarning about apply() operating on the grouping columns.
times_watched = (
    diary_sorted.groupby(["Name", "Year"], group_keys=False)[["Rewatch"]]
    .apply(get_times_watched)
    .reset_index(name="Times watched")
)


# Step 4: Merge into the main DataFrame
merged = pd.merge(
    merged,
    times_watched,
    on=["Name", "Year"],
    how="left"
)

# Order by last watched date, ascending
merged = merged.sort_values(by="Last watched date", ascending=True)

merged.head(15)


  times_watched = diary_sorted.groupby(["Name", "Year"], group_keys=False).apply(


Unnamed: 0,Name,Year,Letterboxd URI,Rating,Liked,Last watched date,Times watched
124,Frankenweenie,2012,https://boxd.it/bPS,5.0,Yes,2019-12-09,1
0,Bird Box,2018,https://boxd.it/eh1i,3.0,No,2020-06-07,1+
1,The Meyerowitz Stories (New and Selected),2017,https://boxd.it/dSp0,5.0,Yes,2020-06-07,1
2,Marriage Story,2019,https://boxd.it/hJAw,5.0,Yes,2020-06-07,1
3,Zodiac,2007,https://boxd.it/27MO,5.0,Yes,2020-06-07,1
4,Children of Men,2006,https://boxd.it/1Y2i,5.0,Yes,2020-06-09,1
5,Knives Out,2019,https://boxd.it/jWEA,5.0,Yes,2020-06-24,1
6,Jojo Rabbit,2019,https://boxd.it/iEBG,5.0,Yes,2020-06-24,1
7,Event Horizon,1997,https://boxd.it/20d8,2.0,No,2020-06-24,1
9,Easy A,2010,https://boxd.it/17DK,3.5,No,2020-06-26,1
