## Global setup
Set up everything that is needed globally: load the environment variables and read the TMDB API token.

In [None]:
import os
from dotenv import load_dotenv

# Load environment variables (e.g. from a local .env file) so os.getenv can see them
load_dotenv()

# TMDB API token taken from the environment; os.getenv returns None when it is not set
TMDB_API_TOKEN = os.getenv('TMDB_API_TOKEN')

## Update Letterboxd Data

Update the imported Letterboxd data to always use the most up-to-date files

In [None]:
# Boolean flag to enable/disable downloading a fresh Letterboxd export.
# This should ONLY be set to True if you have an .env file containing valid Letterboxd login credentials
# (the download cell reads LETTERBOXD_USERNAME, LETTERBOXD_PASSWORD and DOWNLOAD_DIR from the environment).
# Otherwise, this could possibly render the given data unusable.

UPDATE_DATA = True  # Set True to enable download; set False to skip it

### Code updating data should the flag be set to True

In [None]:
import time
import zipfile
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By


if not UPDATE_DATA:
    # Only report and fall through. Calling exit() here would end the Python
    # session (in Jupyter: shut down the kernel) and break every later cell.
    print("Download flag is OFF. Skipping download.")
else:
    USERNAME = os.getenv('LETTERBOXD_USERNAME')
    PASSWORD = os.getenv('LETTERBOXD_PASSWORD')
    DOWNLOAD_DIR = os.getenv('DOWNLOAD_DIR')

    # Fail early with a readable message instead of typing None into the login
    # form or letting os.listdir(None) silently scan the current directory.
    missing_settings = [
        name
        for name, value in (
            ("LETTERBOXD_USERNAME", USERNAME),
            ("LETTERBOXD_PASSWORD", PASSWORD),
            ("DOWNLOAD_DIR", DOWNLOAD_DIR),
        )
        if not value
    ]
    if missing_settings:
        raise RuntimeError(
            "Missing environment variable(s): " + ", ".join(missing_settings)
        )

    # Hand Firefox an absolute, existing directory as the download target.
    DOWNLOAD_DIR = os.path.abspath(DOWNLOAD_DIR)
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)

    # Set up Firefox options: save ZIP downloads to DOWNLOAD_DIR without prompting
    options = Options()
    options.set_preference("browser.download.folderList", 2)
    options.set_preference("browser.download.dir", DOWNLOAD_DIR)
    options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/zip")
    options.set_preference("pdfjs.disabled", True)

    driver = webdriver.Firefox(options=options)

    try:
        # Step 1: Log in
        print("Logging in...")
        driver.get("https://letterboxd.com/sign-in/")
        time.sleep(2)

        driver.find_element(By.ID, "field-username").send_keys(USERNAME)
        driver.find_element(By.ID, "field-password").send_keys(PASSWORD)
        driver.find_element(By.TAG_NAME, "button").click()

        time.sleep(5)  # Wait for login to complete

        # Step 2: Go to data page
        print("Navigating to export page...")
        driver.get("https://letterboxd.com/settings/data/")
        time.sleep(3)

        # Step 3: Click export button
        print("Clicking export button...")
        export_link = driver.find_element(By.XPATH, "//a[contains(@href, '/user/exportdata')]")

        export_link.click()
        time.sleep(3)

        # Step 4: Click confirm export button
        print("Clicking confirm export button...")
        confirm_export_link = driver.find_element(By.XPATH, "//a[contains(@href, '/data/export/')]")

        confirm_export_link.click()

        print("Waiting for download to complete...")
        time.sleep(15)  # Adjust this depending on your connection speed

    finally:
        # Always close the browser, even when one of the steps above failed
        driver.quit()
        print("Browser closed.")

    # Step 5: Unzip and Load
    print("Looking for ZIP file in download directory...")
    zip_candidates = [
        os.path.join(DOWNLOAD_DIR, file_name)
        for file_name in os.listdir(DOWNLOAD_DIR)
        if file_name.endswith(".zip") and "letterboxd" in file_name.lower()
    ]

    if not zip_candidates:
        raise FileNotFoundError("Export ZIP file not found!")

    # Prefer the most recently modified archive so that a stale export left
    # over from an earlier run is never unpacked instead of the new download.
    zip_path = max(zip_candidates, key=os.path.getmtime)

    extract_path = os.path.join(DOWNLOAD_DIR, "letterboxd_export")
    os.makedirs(extract_path, exist_ok=True)

    print(f"Unzipping to {extract_path}...")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)

    # Delete the zip file after extraction
    os.remove(zip_path)
    print(f"Deleted ZIP file: {zip_path}")


In [None]:
'''
The code in the previous cell is in big parts AI generated by the free version of ChatGPT and was afterwards adapted by me.
These following prompts were used:

    "i want to write a python script that automatically exports my letterboxd data and loads it into my jupyter notebook. can you help me with that?"


    "i can manually download my data from this page
        https://letterboxd.com/settings/data/

        can the script navigate there and download it?"


    "I want to use Firefox and store the credentioals in an .env"


    "add the following functionality:

        there is a simple boolean flag that is per default on false. only if the flag is set to true, the download of the letterboxd data triggers. otherwise it does nothing"


    "its the only button on the website. can I just try to locate any button element?"


    "<a href="/data/export/" class="button -action button-action export-data-button">Export Data</a>

        now i want to locate this button here"


    "i want to write a python script that unzips a file for me and then deletes said file"


    "import os
        import time
        import zipfile
        from dotenv import load_dotenv
        from selenium import webdriver
        from selenium.webdriver.firefox.options import Options
        from selenium.webdriver.common.by import By


        if not UPDATE_DATA:
            print("Download flag is OFF. Exiting without downloading.")
            exit(0)
        else:
            # Load environment variables
            load_dotenv()
            USERNAME = os.getenv('LETTERBOXD_USERNAME')
            PASSWORD = os.getenv('LETTERBOXD_PASSWORD')
            DOWNLOAD_DIR = os.getenv('DOWNLOAD_DIR')

            # Set up Firefox options
            options = Options()
            options.set_preference("browser.download.folderList", 2)
            options.set_preference("browser.download.dir", DOWNLOAD_DIR)
            options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/zip")
            options.set_preference("pdfjs.disabled", True)

            driver = webdriver.Firefox(options=options)

            try:
                # Step 1: Log in
                print("Logging in...")
                driver.get("https://letterboxd.com/sign-in/")
                time.sleep(2)

                driver.find_element(By.ID, "field-username").send_keys(USERNAME)
                driver.find_element(By.ID, "field-password").send_keys(PASSWORD)
                driver.find_element(By.TAG_NAME, "button").click()

                time.sleep(5)  # Wait for login to complete

                # Step 2: Go to data page
                print("Navigating to export page...")
                driver.get("https://letterboxd.com/settings/data/")
                time.sleep(3)

                # Step 3: Click export button
                print("Clicking export button...")
                export_link = driver.find_element(By.XPATH, "//a[contains(@href, '/user/exportdata')]")

                export_link.click()
                time.sleep(3)

                # Step 4: Click confirm export button
                print("Clicking confirm export button...")
                confirm_export_link = driver.find_element(By.XPATH, "//a[contains(@href, '/data/export/')]")

                confirm_export_link.click()

                print("Waiting for download to complete...")
                time.sleep(15)  # Adjust this depending on your connection speed

            finally:
                driver.quit()
                print("Browser closed.")

            # Step 4: Unzip and Load
            print("Looking for ZIP file in download directory...")
            zip_path = None
            for file in os.listdir(DOWNLOAD_DIR):
                if file.endswith(".zip") and "letterboxd" in file.lower():
                    zip_path = os.path.join(DOWNLOAD_DIR, file)
                    break

            if not zip_path:
                raise FileNotFoundError("Export ZIP file not found!")

            extract_path = os.path.join(DOWNLOAD_DIR, "letterboxd_export")
            os.makedirs(extract_path, exist_ok=True)

            print(f"Unzipping to {extract_path}...")
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(extract_path)

            # Delete the zip file after extraction
            os.remove(zip_path)
            print(f"Deleted ZIP file: {zip_path}")



        Zu beginn des else zweiges möchte ich den kompletten order "Users/tschaffel/Documents/PycharmProjects/JupyterProject/data/" löschen."


    ""

'''

## Setup

Imports and files setup

In [None]:
import pandas as pd

# Read the CSV files of the unpacked Letterboxd export into DataFrames.
# NOTE(review): the paths are hard-coded to data/letterboxd_export, while the download
# cell extracts to <DOWNLOAD_DIR>/letterboxd_export — this presumably assumes
# DOWNLOAD_DIR points at "data"; confirm.
ratings = pd.read_csv("data/letterboxd_export/ratings.csv")
watched = pd.read_csv("data/letterboxd_export/watched.csv")
likes = pd.read_csv("data/letterboxd_export/likes/films.csv")
diary = pd.read_csv("data/letterboxd_export/diary.csv")

# Diary file from the export's "orphaned" folder; below it is only previewed
orphaned_diary = pd.read_csv("data/letterboxd_export/orphaned/diary.csv")

## See Data

First check on how the tables look

In [None]:
ratings.tail(10)

In [None]:
watched.tail(10)

In [None]:
likes.tail(10)

In [None]:
diary.tail(10)

In [None]:
orphaned_diary.tail(10)

## Connect tables
Join the Letterboxd tables into a single table that holds the following data for every movie:
- Title and Year
- Letterboxd URI
- User Rating
- Was the movie liked
- When was the movie last watched
- How many times has the movie been watched

In [None]:
# Base table: every watched film, with the user's rating attached where one exists
merged = watched[['Name', 'Year', 'Letterboxd URI']].merge(
    ratings[['Name', 'Year', 'Letterboxd URI', 'Rating']],
    on=['Name', 'Year', 'Letterboxd URI'],
    how='left',
)

# "Liked" column: "Yes" for every film whose URI appears in the likes export
liked_uris = set(likes["Letterboxd URI"])
merged["Liked"] = [
    "Yes" if uri in liked_uris else "No"
    for uri in merged["Letterboxd URI"]
]

# "Last watched date" column: the greatest "Watched Date" string per film in the diary
last_watched = (
    diary.groupby(["Name", "Year"])["Watched Date"]
    .max()
    .reset_index()
    .rename(columns={"Watched Date": "Last watched date"})
)

# Films without a diary entry keep a missing value after the left join
merged = merged.merge(last_watched, on=["Name", "Year"], how="left")

# Preparation for "Times watched": order the diary by "Watched Date" so the
# earliest entry of each film comes first within its group
diary_sorted = diary.sort_values(by="Watched Date")

# Step 2: Define a function to count watches and check if first was a rewatch
def get_times_watched(group):
    """Summarise how often one film appears in the diary.

    Args:
        group: Diary rows of a single film. The caller passes them sorted by
            "Watched Date", so the first row is the earliest logged watch.

    Returns:
        The number of rows as a string, with a trailing "+" when the first
        row is marked as a rewatch ("Rewatch" == "Yes").
    """
    watch_count = len(group)
    earliest_entry = group.iloc[0]
    if earliest_entry["Rewatch"] == "Yes":
        return f"{watch_count}+"
    return str(watch_count)

# Step 3: Group and apply, excluding group keys from the inner DataFrame
times_watched = (
    diary_sorted
    .groupby(["Name", "Year"], group_keys=False)
    .apply(get_times_watched)
    .reset_index(name="Times watched")
)

# Step 4: Attach the counts, rename "Name" to "Title" and show the most
# recently watched films first
merged = merged.merge(times_watched, on=["Name", "Year"], how="left")
merged = merged.rename(columns={'Name': 'Title'})
merged = merged.sort_values(by="Last watched date", ascending=False)

merged.head(15)





In [None]:
'''
The code in the previous cell is in big parts AI generated by the free version of ChatGPT and was afterwards adapted by me.
These following prompts were used:

    "wie kann ich selbst technisch die beiden tabellen verknüpfen?"


    "merged = pd.merge(
            watched,
            ratings[['Name', 'Year', 'Letterboxd URI', 'Rating']],
            on=['Name', 'Year', 'Letterboxd URI'],
            how='left'
        )
        merged.head()

        adaptiere mir diesen code bitte so, dass das CSV file unter "likes/films" auch eingelesen wird. in der resultierenden tabelle gibt es eine neue spalte namens "Liked", in der jeder film, der in "likes/films" enhalten ist, einen eintrag "Yes" bekommt, jeder andere einen eintrag "No" "


    "Anstelle von "ratings", benutze "diary" und mach eine neue Spalte "Anzahl", die beinhaltet wie oft der selbe Film in "diary" vorkommt. als rating soll immer das datumsmäßig letzte verwendet werden."


    "das funktioniert nicht, date rating und anzahl sind alles NaN jetzt"


    "date ist jetzt NaT, rating und number immer noch NaN"

    "merged = pd.merge(
            watched[['Name', 'Year', 'Letterboxd URI']],
            ratings[['Name', 'Year', 'Letterboxd URI', 'Rating']],
            on=['Name', 'Year', 'Letterboxd URI'],
            how='left'
        )

        # Add "Liked" information
        liked_uris = set(likes["Letterboxd URI"])

        merged["Liked"] = merged["Letterboxd URI"].apply(
            lambda uri: "Yes" if uri in liked_uris else "No"
        )

        this is my combined table so far. Now i want to add further information.
        I want to add this:
        First, from the table "diary" i want to add the LAST watched date of the movie. use the column "watched date", and if the movie has multiple entries i want the last one. call this new column "Last watched date". For movies that don't exist in "diary" we will have NaN."


    "I dont want to convert the date to datetime"


    "merged = pd.merge(
            watched[['Name', 'Year', 'Letterboxd URI']],
            ratings[['Name', 'Year', 'Letterboxd URI', 'Rating']],
            on=['Name', 'Year', 'Letterboxd URI'],
            how='left'
        )

        # Add "Liked" information
        liked_uris = set(likes["Letterboxd URI"])

        merged["Liked"] = merged["Letterboxd URI"].apply(
            lambda uri: "Yes" if uri in liked_uris else "No"
        )


        # Add "Last Watched" information

        # Step 1: Group diary entries by movie and take the latest date as a string
        last_watched = diary.groupby(["Name", "Year"])["Watched Date"].max().reset_index()
        last_watched.rename(columns={"Watched Date": "Last watched date"}, inplace=True)

        # Step 2: Merge with the existing merged DataFrame
        merged = pd.merge(
            merged,
            last_watched,
            on=["Name", "Year"],
            how="left"
        )

        this is my code rn

        now I want another thing:
        I want a new column called "Times watched". From diary use the following information:
        - Count the number of entries and display the number in the column "Times watched".
        - If the oldest entry has "Yes" in the "Rewatch" column, then add a "+" after the number. Another way to do this is if every entry has "Yes" in the "Rewatch" column, do whatever is easier."


    "it is not chronologically correct, so i would prefer the safer option"


'''

## Connect Letterboxd Data with TMDB data

In [None]:
import requests
from time import sleep
import os
from dotenv import load_dotenv
import unicodedata
import re

# Load environment variables
load_dotenv()

TMDB_API_TOKEN = os.getenv('TMDB_API_TOKEN')

# Stop here with a readable message when the token is missing or empty.
# Otherwise a missing token fails with an opaque TypeError on the string
# concatenation, and an empty one only shows up later as "no match" for
# every title, because the search treats every non-200 response as no match.
if not TMDB_API_TOKEN:
    raise RuntimeError(
        "TMDB_API_TOKEN is not set. Add it to your .env file before running the TMDB enrichment."
    )

# Headers sent with every TMDB request (bearer-token authentication)
HEADERS = {
    "Authorization": "Bearer " + TMDB_API_TOKEN,
    "Content-Type": "application/json;charset=utf-8"
}

def normalize_title(title):
    """Return a canonical form of ``title`` for exact-match comparison.

    The title is NFKC-normalized, typographic look-alike characters are
    replaced by their plain counterparts, runs of whitespace are collapsed
    to a single space, and the result is stripped and lower-cased.

    Args:
        title: The title to normalize; any falsy value (``None``, ``""``)
            yields an empty string.

    Returns:
        The normalized, lower-case title.
    """
    if not title:
        return ""

    # Unicode compatibility normalization comes first
    folded = unicodedata.normalize("NFKC", title)

    # Single-character look-alikes mapped to the plain spelling
    lookalikes = str.maketrans({
        "–": "-",  # en dash
        "—": "-",  # em dash
        "−": "-",  # minus
        "×": "x",  # multiplication sign
        "’": "'",  # curly apostrophe
        "“": '"',
        "”": '"',
        "…": "...",
        "&": "and",  # optional
    })
    folded = folded.translate(lookalikes)

    # Collapse whitespace runs, trim the ends, compare case-insensitively
    return re.sub(r"\s+", " ", folded).strip().lower()

def search_exact_match(results, search_title):
    """Pick the first search result whose normalized title equals the query.

    Args:
        results: Iterable of result dicts; the title is read from the
            "title" key, falling back to "name".
        search_title: The title that was searched for.

    Returns:
        The first result whose title matches ``search_title`` after
        ``normalize_title`` is applied to both, or ``None`` if none does.
    """
    wanted = normalize_title(search_title)
    return next(
        (
            entry
            for entry in results
            if normalize_title(entry.get("title") or entry.get("name") or "") == wanted
        ),
        None,
    )

def search_movie_or_tv(title, year=None, timeout=10):
    """Find the TMDB entry whose title exactly matches ``title``.

    The movie search is tried first; only when it yields no exact title
    match is the TV search tried.

    Args:
        title: Title to search for.
        year: Optional release year. Sent as ``year`` for movies and as
            ``first_air_date_year`` for TV shows. Values that cannot be
            turned into an integer (``None``, NaN, ...) are ignored.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        The matching result dict with an added "media_type" key
        ("movie" or "tv"), or ``None`` when neither search produced an
        exact match or returned HTTP 200.

    Raises:
        requests.exceptions.RequestException: On network failures,
            including a request exceeding ``timeout``.
    """
    # A pandas row hands a missing year over as float NaN (which is truthy)
    # and a present year as e.g. 2014.0 when the column contains gaps.
    # Coerce to a plain int so the filter sent to TMDB is either valid or absent.
    try:
        year_filter = int(year)
    except (TypeError, ValueError, OverflowError):
        year_filter = None

    # (media type, search endpoint, name of the year parameter), in priority order
    searches = (
        ("movie", "https://api.themoviedb.org/3/search/movie", "year"),
        ("tv", "https://api.themoviedb.org/3/search/tv", "first_air_date_year"),
    )

    for media_type, url, year_param in searches:
        params = {"query": title}
        if year_filter:
            params[year_param] = year_filter

        response = requests.get(url, headers=HEADERS, params=params, timeout=timeout)
        if response.status_code != 200:
            continue

        match = search_exact_match(response.json().get("results", []), title)
        if match:
            match["media_type"] = media_type
            return match

    return None

def get_details(tmdb_id, media_type, timeout=10):
    """Fetch the TMDB detail record of one movie or TV show.

    Args:
        tmdb_id: TMDB id of the entry.
        media_type: Path segment of the endpoint; the callers in this
            notebook pass "movie" or "tv".
        timeout: Seconds to wait for the HTTP request.

    Returns:
        The decoded JSON response, or ``None`` when the status code is not 200.

    Raises:
        requests.exceptions.RequestException: On network failures,
            including a request exceeding ``timeout``.
    """
    url = f"https://api.themoviedb.org/3/{media_type}/{tmdb_id}"
    # A timeout keeps one stalled connection from blocking the caller forever
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    if response.status_code != 200:
        return None
    return response.json()

def get_credits(tmdb_id, media_type, timeout=10):
    """Fetch the TMDB credits (cast and crew) of one movie or TV show.

    Args:
        tmdb_id: TMDB id of the entry.
        media_type: Path segment of the endpoint; the callers in this
            notebook pass "movie" or "tv".
        timeout: Seconds to wait for the HTTP request.

    Returns:
        The decoded JSON response, or ``None`` when the status code is not 200.

    Raises:
        requests.exceptions.RequestException: On network failures,
            including a request exceeding ``timeout``.
    """
    url = f"https://api.themoviedb.org/3/{media_type}/{tmdb_id}/credits"
    # A timeout keeps one stalled connection from blocking the caller forever
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    if response.status_code != 200:
        return None
    return response.json()

def enrich_dataframe(df):
    """Add TMDB metadata columns to ``df`` and return the same frame.

    For every row the title (column "Title") and, if present, the year
    (column "Year") are looked up via ``search_movie_or_tv``. On an exact
    match the details and credits are fetched and written into the row.
    Rows without a match keep ``None`` in all TMDB columns.

    List-valued columns hold strings of the form "name:url" (genres,
    director, actors), "name:url:country" (production_companies) or
    "name:code" (production_countries, spoken_languages).

    Args:
        df: DataFrame with a "Title" column and optionally a "Year" column.
            It is modified in place: the TMDB columns are (re)created on
            every call. Index labels must be unique because cells are
            written with ``df.at``.

    Returns:
        ``df`` itself, with the TMDB columns filled in.
    """
    tmdb_media_base_url = "https://www.themoviedb.org/"
    tmdb_poster_base_url = "https://image.tmdb.org/t/p/"
    # NOTE(review): unlike the other base URLs this one uses http — kept
    # unchanged so previously exported data stays comparable; confirm intent.
    tmdb_person_base_url = "http://www.themoviedb.org/person/"
    tmdb_genre_base_url = "https://www.themoviedb.org/genre/"
    tmdb_company_base_url = "https://www.themoviedb.org/company/"
    size = "original"

    # Create empty columns for TMDB data
    tmdb_columns = [
        "tmdb_url",
        "overview",
        "genres",
        "runtime",
        "vote_average",
        "poster_url",
        "media_type",
        "director",
        "actors",
        "characters",
        "origin_country",
        "original_language",
        "popularity",
        "production_companies",
        "production_countries",
        "spoken_languages",
    ]
    for column in tmdb_columns:
        df[column] = None

    total = len(df)
    print(f"Starting enrichment for {total} titles using TMDB data...\n")

    # Count progress by position, not by index label: the labels are not
    # guaranteed to run from 0 to n-1 (e.g. after the frame has been sorted).
    for position, (idx, row) in enumerate(df.iterrows(), start=1):
        title = row["Title"]
        year = row.get("Year", None)

        print(f"[{position}/{total}] Searching: '{title}' ({year})", end="")

        result = search_movie_or_tv(title, year)
        if result:
            print(" ✅ Match found")
            media_type = result["media_type"]
            tmdb_id = result["id"]

            details = get_details(tmdb_id, media_type)
            credits = get_credits(tmdb_id, media_type)

            if details:
                df.at[idx, "media_type"] = media_type
                df.at[idx, "tmdb_url"] = tmdb_media_base_url + f"{media_type}/" + str(tmdb_id)
                df.at[idx, "overview"] = details.get("overview")
                df.at[idx, "genres"] = [
                    str(g["name"]) + ":" + tmdb_genre_base_url + str(g["id"])
                    for g in details.get("genres", [])
                ]
                df.at[idx, "vote_average"] = details.get("vote_average")
                df.at[idx, "origin_country"] = details.get("origin_country")
                df.at[idx, "original_language"] = details.get("original_language")
                df.at[idx, "popularity"] = details.get("popularity")
                df.at[idx, "production_companies"] = [
                    str(p["name"]) + ":" + tmdb_company_base_url + str(p["id"]) + ":" + str(p["origin_country"])
                    for p in details.get("production_companies", [])
                ]
                df.at[idx, "production_countries"] = [
                    str(p["name"]) + ":" + str(p["iso_3166_1"])
                    for p in details.get("production_countries", [])
                ]
                df.at[idx, "spoken_languages"] = [
                    str(s["english_name"]) + ":" + str(s["iso_639_1"])
                    for s in details.get("spoken_languages", [])
                ]

                # A runtime is only recorded for movies, never for TV shows
                df.at[idx, "runtime"] = details.get("runtime") if media_type == "movie" else None

                poster_path = details.get("poster_path")
                if poster_path:
                    df.at[idx, "poster_url"] = tmdb_poster_base_url + size + poster_path

            if credits:
                # Directors (may be multiple)
                crew = credits.get("crew", [])
                directors = [
                    str(p["name"]) + ":" + tmdb_person_base_url + str(p["id"])
                    for p in crew
                    if p.get("job") == "Director"
                ]
                df.at[idx, "director"] = directors if directors else None

                # Top 5 actors and their characters
                cast = credits.get("cast", [])[:5]
                actor_names = [str(a["name"]) + ":" + tmdb_person_base_url + str(a["id"]) for a in cast]
                character_names = [a["character"] for a in cast]
                df.at[idx, "actors"] = actor_names if actor_names else None
                df.at[idx, "characters"] = character_names if character_names else None
        else:
            print(" ❌ No exact match found")

        # Short pause between titles to stay gentle on the API
        sleep(0.25)

    print("\n✔️  Enrichment completed.")
    return df




# Enrich the merged Letterboxd table with TMDB data. enrich_dataframe adds its
# columns to `merged` itself and returns that same frame, so both names refer
# to one object afterwards. It sleeps 0.25 s per row, so this step is slow.
enriched_merged = enrich_dataframe(merged)
# Persist the enriched table (without the index) for the knowledge-graph step
enriched_merged.to_csv("data/enriched_merged.csv", index=False)


In [None]:
'''
The code in the previous cell is in big parts AI generated by the free version of ChatGPT and was afterwards adapted by me.
These following prompts were used:


    "ich baue einen Knowledge graph basierend auf Letterboxd User-Daten in einem Jupyter Notebook. Diese möchte ich jetzt mit TMDB Daten anreichern. Dazu habe ich mir gerade API Zugang verschafft, ich habe jetzt einen API Read Access Token.
        Wie beginne ich, meine bestehende Tabelle mit weitern Film-Daten anzureichern?"


    "def enrich_dataframe(df):
            new_data = []
            for idx, row in df.iterrows():
                title = row["Title"]
                year = row.get("Year", None)

                result = search_movie(title, year)
                if result:
                    details = get_movie_details(result["id"])
                    if details:
                        new_data.append({
                            "title": title,
                            "year": year,
                            "tmdb_id": result["id"],
                            "overview": details.get("overview"),
                            "genres": [g["name"] for g in details.get("genres", [])],
                            "runtime": details.get("runtime"),
                            "vote_average": details.get("vote_average"),
                            "poster_path": details.get("poster_path")
                        })
                else:
                    new_data.append({
                        "title": title,
                        "year": year,
                        "tmdb_id": None,
                        "overview": None,
                        "genres": None,
                        "runtime": None,
                        "vote_average": None,
                        "poster_path": None
                    })

                sleep(0.25)  # Vermeide Rate-Limits

            return pd.DataFrame(new_data)

        add print statements to this method so the user gets feedback while the method is running"


    "bitte englische kommentare stattdessen"


    "Gerade erstellt die Funktion "enrich_dataframe" eine völlig neue Tabelle mit TMDB daten und beinhaltet aber keine meiner Daten die ich bereits in einer Tabelle habe.

        Das ist mein Code bisher:
        merged = pd.merge(
            watched[['Name', 'Year', 'Letterboxd URI']],
            ratings[['Name', 'Year', 'Letterboxd URI', 'Rating']],
            on=['Name', 'Year', 'Letterboxd URI'],
            how='left'
        )

        # Add "Liked" information
        liked_uris = set(likes["Letterboxd URI"])

        merged["Liked"] = merged["Letterboxd URI"].apply(
            lambda uri: "Yes" if uri in liked_uris else "No"
        )


        # Add "Last Watched" information

        # Step 1: Group diary entries by movie and take the latest date as a string
        last_watched = diary.groupby(["Name", "Year"])["Watched Date"].max().reset_index()
        last_watched.rename(columns={"Watched Date": "Last watched date"}, inplace=True)

        # Step 2: Merge with the existing merged DataFrame
        merged = pd.merge(
            merged,
            last_watched,
            on=["Name", "Year"],
            how="left"
        )

        # Add "Times watched" information

        # Step 1: Sort diary by "Watched Date" so earliest watches come first
        diary_sorted = diary.sort_values("Watched Date")

        # Step 2: Define a function to count watches and check if first was a rewatch
        def get_times_watched(group):
            count = len(group)
            first_rewatch = group.iloc[0]["Rewatch"] == "Yes"
            return f"{count}+" if first_rewatch else str(count)

        # Step 3: Group and apply, excluding group keys from the inner DataFrame
        times_watched = diary_sorted.groupby(["Name", "Year"], group_keys=False).apply(
            get_times_watched
        ).reset_index(name="Times watched")


        # Step 4: Merge into the main DataFrame
        merged = (pd.merge(
            merged,
            times_watched,
            on=["Name", "Year"],
            how="left"
        ).rename(columns={'Name': 'Title'}).sort_values(by="Last watched date", ascending=False))

        merged.head(15)

        Meine Tabelle beinhaltet die Spalten:
        Title, Year, Letterboxd URI, Rating, Liked, Last watched date, Times watched

        Nun möchte ich die TMDB daten an diese tabelle anheften."


    "automatically include the full poster path in the table. use size "original""


    "def search_movie(title, year=None):
            url = "https://api.themoviedb.org/3/search/movie"
            params = {"query": title}
            if year:
                params["year"] = year
            response = requests.get(url, headers=HEADERS, params=params)
            if response.status_code == 200:
                results = response.json().get("results")
                return results[0] if results else None
            else:
                print(f"Error at {title}: {response.status_code}")
                return None

        def get_movie_details(tmdb_id):
            url = f"https://api.themoviedb.org/3/movie/{tmdb_id}"
            response = requests.get(url, headers=HEADERS)
            if response.status_code == 200:
                return response.json()
            else:
                return None


        def enrich_dataframe(df):
            tmdb_base_url = "https://www.themoviedb.org/movie/"
            poster_base_url = "https://image.tmdb.org/t/p/"
            size = "original"  # Use original size for poster images

            # Create empty columns for TMDB data
            df["tmdb_url"] = None
            df["overview"] = None
            df["genres"] = None
            df["runtime"] = None
            df["vote_average"] = None
            df["poster_url"] = None

            total = len(df)
            print(f"Starting enrichment for {total} films using TMDB data...\n")

            for idx, row in df.iterrows():
                title = row["Title"]
                year = row.get("Year", None)

                print(f"[{idx+1}/{total}] Searching: '{title}' ({year})", end="")

                result = search_movie(title, year)
                if result:
                    print(" ✅ Match found")
                    details = get_movie_details(result["id"])
                    if details:
                        df.at[idx, "tmdb_url"] = tmdb_base_url + str(result["id"])
                        df.at[idx, "overview"] = details.get("overview")
                        df.at[idx, "genres"] = [g["name"] for g in details.get("genres", [])]
                        df.at[idx, "runtime"] = details.get("runtime")
                        df.at[idx, "vote_average"] = details.get("vote_average")

                        poster_path = details.get("poster_path")
                        if poster_path:
                            df.at[idx, "poster_url"] = poster_base_url + size + poster_path
                        else:
                            df.at[idx, "poster_url"] = None
                    else:
                        print(" ⚠️  Details not found")
                else:
                    print(" ❌ No match")

                sleep(0.25)  # Avoid rate limit

            print("\n✔️  Enrichment completed.")
            return df


        i want the following changes to this code:

        instead of only searching for movies, it searches for movies and tv shows.
        the algorithm works like this:
        first it searches for it in the movie database always with title and year. from the list of search results it will try to match the title EXACTLY. only if an exact title match fails, it then tries to search for it in the tv show database. again, it tries to match the title exactly."


    "if it is a TV show, i dont want a runtime in my results"


    "add a new column that includes info on whether the list entry is a TV show or a Movie"


    "this works well! I have an additional problem:

        in my testing i found problems with for example these two movies:

        In my letterboxd data, the movie is called
        The Hunger Games: Mockingjay – Part 1

        TMDB has it listed as
        The Hunger Games: Mockingjay - Part 1

        the algorithm is unable to match it, i suspect because of the two different hypthens.

        another case is this, letterboxd has the movie as
        Godzilla × Kong: The New Empire

        whereas TMDB lists it as
        Godzilla x Kong: The New Empire

        the algorithm is unable to match it, again likely because the "x" is different.

        how can I solve this problem?"


    "import requests
        from time import sleep
        import os
        from dotenv import load_dotenv
        import unicodedata
        import re

        # Load environment variables
        load_dotenv()

        TMDB_API_TOKEN = os.getenv('TMDB_API_TOKEN')

        HEADERS = {
            "Authorization": "Bearer " + TMDB_API_TOKEN,
            "Content-Type": "application/json;charset=utf-8"
        }

        def normalize_title(title):
            if not title:
                return ""
            # Unicode normalize, convert to ASCII-compatible
            title = unicodedata.normalize("NFKC", title)

            # Replace common visually similar characters
            substitutions = {
                "–": "-",  # en dash
                "—": "-",  # em dash
                "−": "-",  # minus
                "×": "x",  # multiplication sign
                "’": "'",  # curly apostrophe
                "“": '"',
                "”": '"',
                "…": "...",
                "&": "and",  # optional
            }

            for orig, repl in substitutions.items():
                title = title.replace(orig, repl)

            # Collapse multiple spaces and lowercase
            title = re.sub(r"\s+", " ", title).strip().lower()
            return title

        def search_exact_match(results, search_title):
            norm_search = normalize_title(search_title)
            for r in results:
                tmdb_title = r.get("title") or r.get("name") or ""
                if normalize_title(tmdb_title) == norm_search:
                    return r
            return None

        def search_movie_or_tv(title, year=None):
            # First: try movie search
            movie_url = "https://api.themoviedb.org/3/search/movie"
            params = {"query": title}
            if year:
                params["year"] = year

            response = requests.get(movie_url, headers=HEADERS, params=params)
            if response.status_code == 200:
                results = response.json().get("results", [])
                match = search_exact_match(results, title)
                if match:
                    match["media_type"] = "movie"
                    return match

            # Second: try TV search
            tv_url = "https://api.themoviedb.org/3/search/tv"
            params = {"query": title}
            if year:
                params["first_air_date_year"] = year

            response = requests.get(tv_url, headers=HEADERS, params=params)
            if response.status_code == 200:
                results = response.json().get("results", [])
                match = search_exact_match(results, title)
                if match:
                    match["media_type"] = "tv"
                    return match

            # print(f" Error or no match for '{title}' ")
            return None

        def get_movie_details(tmdb_id):
            url = f"https://api.themoviedb.org/3/movie/{tmdb_id}"
            response = requests.get(url, headers=HEADERS)
            if response.status_code == 200:
                return response.json()
            else:
                return None


        def get_details(tmdb_id, media_type):
            url = f"https://api.themoviedb.org/3/{media_type}/{tmdb_id}"
            response = requests.get(url, headers=HEADERS)
            return response.json() if response.status_code == 200 else None

        def enrich_dataframe(df):
            tmdb_base_url = "https://www.themoviedb.org/"
            poster_base_url = "https://image.tmdb.org/t/p/"
            size = "original"

            # Create empty columns for TMDB data
            df["tmdb_url"] = None
            df["overview"] = None
            df["genres"] = None
            df["runtime"] = None
            df["vote_average"] = None
            df["poster_url"] = None
            df["media_type"] = None  # New column: "movie" or "tv"

            total = len(df)
            print(f"Starting enrichment for {total} titles using TMDB data...\n")

            for idx, row in df.iterrows():
                title = row["Title"]
                year = row.get("Year", None)

                print(f"[{idx+1}/{total}] Searching: '{title}' ({year})", end="")

                result = search_movie_or_tv(title, year)
                if result:
                    print(" ✅ Match found")
                    media_type = result["media_type"]
                    details = get_details(result["id"], media_type)

                    if details:
                        df.at[idx, "media_type"] = media_type
                        df.at[idx, "tmdb_url"] = tmdb_base_url + f"{media_type}/" + str(result["id"])
                        df.at[idx, "overview"] = details.get("overview")
                        df.at[idx, "genres"] = [g["name"] for g in details.get("genres", [])]
                        df.at[idx, "vote_average"] = details.get("vote_average")

                        # Only add runtime for movies
                        if media_type == "movie":
                            df.at[idx, "runtime"] = details.get("runtime")
                        else:
                            df.at[idx, "runtime"] = None

                        poster_path = details.get("poster_path")
                        if poster_path:
                            df.at[idx, "poster_url"] = poster_base_url + size + poster_path
                else:
                    print(" ❌ No exact match found")

                sleep(0.25)

            print("\n✔️  Enrichment completed.")
            return df



        enriched_merged = enrich_dataframe(merged)
        enriched_merged.to_csv("data/enriched_merged.csv", index=False)

        this is my code. Adapt it so that it includes a new column that includes the directors name. another column should include an array of actor names, and another column should include an array of character names."


    "i want to store them like the genres:
        ['Adventure', 'Drama', 'Science Fiction']
        also, it should be possible to have multiple directors.
        please give me an updated "enrich dataframe" function"

'''

## Create Knowledge Graph

In [8]:
import pandas as pd
from ast import literal_eval
import re

# Load the enriched dataset from CSV. The resulting frame `df` is what the
# triple-building loop further down in this cell iterates over.
file_path = "data/enriched_merged.csv"
df = pd.read_csv(file_path)

# Helper: convert TMDB URLs to local IDs like movie123, person456
def get_local_id(tmdb_url):
    """Turn a TMDB URL into a compact local identifier.

    The URL is searched for ``themoviedb.org/<type>/<digits>`` where ``<type>``
    is one of movie, tv, person, genre or company; the result is the type
    immediately followed by the digits (e.g. ``person456``).

    Returns None when `tmdb_url` is not a string or contains no such pattern.
    """
    if isinstance(tmdb_url, str):
        found = re.search(r'themoviedb\.org/(movie|tv|person|genre|company)/(\d+)', tmdb_url)
        if found is not None:
            kind, number = found.group(1), found.group(2)
            return kind + number
    return None

# Exceptions that mean "this list column is missing or malformed for this row".
# Such a column is skipped on purpose (best effort). Everything else propagates,
# including KeyboardInterrupt, which a bare `except:` would have swallowed.
_LIST_COLUMN_ERRORS = (KeyError, ValueError, SyntaxError, TypeError, AttributeError)

# (predicate, column) pairs whose cell value is stored as str(value).
# NOTE(review): a missing value therefore becomes the string "nan"; only
# `runtime` is guarded with pd.isna below — confirm this is intended.
_SCALAR_ATTRIBUTES = [
    ("schema:datePublished", "Year"),
    ("schema:aggregateRating", "vote_average"),
    ("schema:review", "Rating"),
    ("ex:liked", "Liked"),
    ("ex:lastWatched", "Last watched date"),
    ("ex:timesWatched", "Times watched"),
    ("ex:originalLanguage", "original_language"),
    ("ex:popularity", "popularity"),
]


def _add_linked_entities(triples, subj, row, column, predicate, rdf_type,
                         name_predicate="schema:name"):
    """Append triples for a list column whose entries look like ``"<name>:<TMDB url>"``.

    For each entry whose URL resolves through get_local_id, three triples are
    appended to `triples`: (subj, predicate, id), (id, "rdf:type", rdf_type)
    and (id, name_predicate, stripped name). Entries without ":" or without a
    resolvable id are skipped. If the column is missing or cannot be parsed,
    processing of this column stops; triples appended so far are kept.
    """
    try:
        for entry in literal_eval(row[column]):
            if ":" not in entry:
                continue  # skip malformed
            name, url = entry.split(":", 1)
            entity_id = get_local_id(url)
            if not entity_id:
                continue
            triples.append((subj, predicate, entity_id))
            triples.append((entity_id, "rdf:type", rdf_type))
            triples.append((entity_id, name_predicate, name.strip()))
    except _LIST_COLUMN_ERRORS:
        pass  # missing / malformed column: nothing more to add for this row


def _add_coded_entities(triples, subj, row, column, predicate, rdf_type):
    """Append triples for a list column whose entries look like ``"<name>:<code>"``.

    The code itself is used as the entity identifier: (subj, predicate, code),
    (code, "rdf:type", rdf_type) and (code, "schema:name", stripped name).
    Entries without ":" are skipped; a missing or unparsable column is ignored.
    """
    try:
        for entry in literal_eval(row[column]):
            if ":" not in entry:
                continue
            name, code = entry.split(":", 1)
            triples.append((subj, predicate, code))
            triples.append((code, "rdf:type", rdf_type))
            triples.append((code, "schema:name", name.strip()))
    except _LIST_COLUMN_ERRORS:
        pass  # missing / malformed column: nothing more to add for this row


def _add_plain_values(triples, subj, row, column, predicate):
    """Append one (subj, predicate, value) triple per element of a list column.

    A missing or unparsable column is ignored.
    """
    try:
        for value in literal_eval(row[column]):
            triples.append((subj, predicate, value))
    except _LIST_COLUMN_ERRORS:
        pass  # missing / malformed column: nothing more to add for this row


def _add_production_companies(triples, subj, row):
    """Append triples for ``production_companies`` entries ``"<name>:<TMDB url>:<country>"``.

    The name is everything before the first ":", the country everything after
    the last ":", and the URL is what lies in between. Entries without ":" or
    without a resolvable company id are skipped; a missing or unparsable
    column (or an entry that cannot be split) ends processing of the column.
    """
    try:
        for entry in literal_eval(row["production_companies"]):
            if ":" not in entry:
                continue  # skip malformed
            name, rest = entry.split(":", 1)
            url, country = rest.rsplit(":", 1)
            company_id = get_local_id(url)
            if not company_id:
                continue
            triples.append((subj, "schema:productionCompany", company_id))
            triples.append((company_id, "rdf:type", "schema:Company"))
            triples.append((company_id, "schema:name", name.strip()))
            triples.append((company_id, "ex:country", country))
    except _LIST_COLUMN_ERRORS:
        pass  # missing / malformed column: nothing more to add for this row


triples = []

for _, row in df.iterrows():
    tmdb_url = row.get('tmdb_url')
    if not isinstance(tmdb_url, str) or not tmdb_url.startswith("http"):
        continue  # Skip rows with invalid or missing TMDB URL

    # Subject id = media type + last path segment of the TMDB URL
    tmdb_id = tmdb_url.rstrip('/').split("/")[-1]
    media_type = str(row.get("media_type", "movie")).strip().lower()
    if media_type not in ["movie", "tv"]:
        media_type = "movie"  # default fallback (also hit when the cell is NaN -> "nan")
    subj = f"{media_type}{tmdb_id}"

    # Basic movie/tv info
    triples.append((subj, "rdf:type", f"schema:{media_type.capitalize()}"))
    triples.append((subj, "schema:name", row["Title"]))
    for predicate, column in _SCALAR_ATTRIBUTES:
        triples.append((subj, predicate, str(row[column])))

    if not pd.isna(row["runtime"]):
        triples.append((subj, "schema:duration", str(int(row["runtime"]))))

    # People
    _add_linked_entities(triples, subj, row, "director", "schema:director", "schema:Person")
    _add_linked_entities(triples, subj, row, "actors", "schema:actor", "schema:Person")

    # Characters
    _add_plain_values(triples, subj, row, "characters", "schema:character")

    # Genres (named via ex:name rather than schema:name)
    _add_linked_entities(triples, subj, row, "genres", "schema:genre", "schema:Text",
                         name_predicate="ex:name")

    # Countries Of Origin
    _add_plain_values(triples, subj, row, "origin_country", "schema:countryOfOrigin")

    # Production Companies
    _add_production_companies(triples, subj, row)

    # Production Countries
    _add_coded_entities(triples, subj, row, "production_countries",
                        "ex:productionCountry", "schema:Country")

    # Spoken Languages
    _add_coded_entities(triples, subj, row, "spoken_languages",
                        "schema:inLanguage", "schema:Language")


# Convert to DataFrame and export for PyKEEN
triples_df = pd.DataFrame(triples, columns=["subject", "predicate", "object"])
triples_df.to_csv("data/movie_kg_triples.tsv", sep="\t", index=False, header=False)


## Convert to Turtle (.ttl) — TODO: delete this section again, as it is not necessary

In [7]:
# convert_tsv_to_ttl.py

# Tab-separated triples in, Turtle document out
INPUT_FILE, OUTPUT_FILE = "data/movie_kg_triples.tsv", "data/triples.ttl"

# Turtle header lines; the trailing empty string leaves a blank line between
# the prefix declarations and the triples once everything is joined with "\n".
prefixes = """\
@prefix schema: <http://schema.org/> .
@prefix ex: <http://example.org/> .
@prefix tmdb: <https://www.themoviedb.org/> .
@prefix : <http://example.org/> .
""".split("\n")

def escape(term):
    """Return `term` with every space character swapped for an underscore."""
    pieces = term.split(" ")
    return "_".join(pieces)

# Prefixes that terms in the TSV may already carry; such terms are written unchanged.
_TTL_KNOWN_PREFIXES = ("schema:", "ex:")

# Characters that Turtle allows in a prefixed local name only when backslash-escaped.
_TTL_LOCAL_ESCAPES = set("~.-!$&'()*+,;=/?#@%")


def _is_prefixed_name(term):
    """True if `term` is ``schema:<identifier>`` or ``ex:<identifier>``."""
    for prefix in _TTL_KNOWN_PREFIXES:
        if term.startswith(prefix):
            local = term[len(prefix):]
            return local.isascii() and local.isidentifier()
    return False


def _ttl_local_name(term):
    """Make `term` safe as the local part of a Turtle prefixed name.

    Spaces become underscores, reserved punctuation is backslash-escaped and
    anything else outside ASCII letters/digits/"_"/":" is percent-encoded.
    """
    out = []
    for ch in term.replace(" ", "_"):
        if ch in "_:" or (ch.isascii() and ch.isalnum()):
            out.append(ch)
        elif ch in _TTL_LOCAL_ESCAPES:
            out.append("\\" + ch)
        else:
            out.append("".join(f"%{byte:02X}" for byte in ch.encode("utf-8")))
    return "".join(out)


def _ttl_resource(term):
    """Render `term` as a resource: known prefixed names as-is, others as ``:<local>``."""
    if _is_prefixed_name(term):
        return term
    return ":" + _ttl_local_name(term)


def _ttl_literal(text):
    """Render `text` as a double-quoted Turtle string literal."""
    escaped = (
        text.replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )
    return f'"{escaped}"'


def convert_tsv_to_ttl(input_file, output_file, prefix_lines=None):
    """Convert a headerless 3-column TSV of triples into a Turtle file.

    Parameters
    ----------
    input_file : path of the TSV (subject, predicate, object per row).
    output_file : path of the Turtle file to write.
    prefix_lines : optional list of header lines written before the triples;
        defaults to the module-level `prefixes`.

    Rows that do not have exactly three non-empty fields are skipped.
    Term rendering:
      * subjects are resources (``:<local name>``);
      * ``rdf:type`` is written as the Turtle keyword ``a``; predicates that
        already carry a ``schema:``/``ex:`` prefix are kept, others become
        ``:<local name>``;
      * an object is a resource when it is a ``schema:``/``ex:`` name or when
        the same term occurs as a subject somewhere in the file; every other
        object is written as an escaped string literal.
    """
    import csv  # local import so this cell does not depend on another cell's imports

    if prefix_lines is None:
        prefix_lines = prefixes

    # The TSV is written by pandas, which quotes fields containing '"';
    # csv.reader undoes that quoting, a plain split("\t") would not.
    with open(input_file, "r", encoding="utf-8", newline="") as f:
        rows = [[field.strip() for field in row] for row in csv.reader(f, delimiter="\t")]
    rows = [row for row in rows if len(row) == 3 and all(row)]  # Skip malformed lines

    # Anything described as a subject is an entity, so references to it must be resources too
    subjects = {row[0] for row in rows}

    triples = []
    for subj, pred, obj in rows:
        predicate = "a" if pred == "rdf:type" else _ttl_resource(pred)
        if obj in subjects or _is_prefixed_name(obj):
            obj_term = _ttl_resource(obj)
        else:
            obj_term = _ttl_literal(obj)
        triples.append(f"{_ttl_resource(subj)} {predicate} {obj_term} .")

    with open(output_file, "w", encoding="utf-8") as f:
        f.write("\n".join(list(prefix_lines) + triples))

    print(f"✅ Converted {len(triples)} triples to {output_file}")

# Run the conversion with the default input/output paths when this code is
# executed as the main module (the recorded cell output shows it ran here).
if __name__ == "__main__":
    convert_tsv_to_ttl(INPUT_FILE, OUTPUT_FILE)


✅ Converted 44946 triples to data/triples.ttl
