# TMDB Recommendations from KG Triples → Enrich (Algorithm A) → Append as Triples

This notebook:
1) Loads `movie_kg_triples.tsv` (head, rel, tail).
2) Extracts movies and their ratings.
3) Picks the top-N best-rated films.
4) For each, queries **TMDB recommendations**, then **enriches** each recommendation using *Algorithm A* (details + credits).
5) Appends the enriched information **as triples** to the same TSV (after creating a timestamped backup).

In [1]:
import os, re, unicodedata, time, shutil
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv
import requests
from datetime import datetime

# --- Config ---
TOP_N_SEEDS     = 100   # number of KG movies to seed
RECS_PER_SEED   = 10   # number of recommendations per seed to take
SLEEP_SEC       = 0.25 # API politeness
KG_PATH    = Path("../data/kg/triples/movie_kg_triples.tsv")
OUT_PATH   = KG_PATH  # gleiche Datei bleibt Ziel

# Backup-Verzeichnis erstellen, falls es nicht existiert
BACKUP_DIR = KG_PATH.parent / "backups"
BACKUP_DIR.mkdir(parents=True, exist_ok=True)

# Backup-Dateiname mit Zeitstempel
timestamp   = datetime.now().strftime("%Y%m%d_%H%M%S")
BACKUP_PATH = BACKUP_DIR / f"movie_kg_triples_backup_{timestamp}.tsv"

# Load API key
load_dotenv()
TMDB_API_TOKEN = os.getenv('TMDB_API_TOKEN')
assert TMDB_API_TOKEN, "Please set TMDB_API_TOKEN in your environment (.env)."

HEADERS = {
    "Authorization": "Bearer " + TMDB_API_TOKEN,
    "Content-Type": "application/json;charset=utf-8"
}

print("KG file:", KG_PATH)
print("Backup will be written to:", BACKUP_PATH)

KG file: ../data/kg/triples/movie_kg_triples.tsv
Backup will be written to: ../data/kg/triples/backups/movie_kg_triples_backup_20250915_010217.tsv


## Load triples & extract movies with ratings

In [28]:
import numpy as np

triples = pd.read_csv(KG_PATH, sep="\\t", header=None, names=["head","rel","tail"])
print("Triples loaded:", len(triples))

# Bereits vorhandene Filmknoten im KG (an rdf:type = schema:Movie erkennbar)
existing_movies = set(
    triples.loc[
        (triples["rel"] == "rdf:type") & (triples["tail"] == "schema:Movie"),
        "head"
    ].astype(str)
)

# Optional: alle vorhandenen Tripel zum Feingranular-Deduplizieren
existing_triples = set(map(tuple, triples.astype(str).values.tolist()))
print(f"{len(existing_movies)} movies already in KG, {len(existing_triples)} triples total.")

# Helper: parse numeric fields like 'avgVote_8.4', 'personalVote_5.0', 'published_1989', 'runtime116'
def extract_personal_review(s: str | None) -> float | None:
    """Erwartet Tail wie 'personalVote_5.0' (evtl. auch 5,0)."""
    if s is None or (isinstance(s, float) and pd.isna(s)):
        return None
    txt = str(s)
    # 1) Versuche exakte Token-Variante
    m = re.search(r"personalVote[_:\s]*([0-9]+(?:[.,][0-9]+)?)", txt, flags=re.I)
    if m:
        return float(m.group(1).replace(",", "."))
    # 2) Fallback: irgend eine Zahl nach 'personal...'
    m = re.search(r"personal\w*\D*([0-9]+(?:[.,][0-9]+)?)", txt, flags=re.I)
    if m:
        return float(m.group(1).replace(",", "."))
    return None

def extract_avg_vote(s: str | None) -> float | None:
    """Erwartet Tail wie 'avgVote_8.4' (evtl. auch 8,4)."""
    if s is None or (isinstance(s, float) and pd.isna(s)):
        return None
    txt = str(s)
    # 1) Versuche exakte Token-Variante
    m = re.search(r"avgVote[_:\s]*([0-9]+(?:[.,][0-9]+)?)", txt, flags=re.I)
    if m:
        return float(m.group(1).replace(",", "."))
    # 2) Fallback: irgend eine Zahl nach 'avg...'
    m = re.search(r"avg\w*\D*([0-9]+(?:[.,][0-9]+)?)", txt, flags=re.I)
    if m:
        return float(m.group(1).replace(",", "."))
    return None

def extract_year(s: str | None) -> int | None:
    if s is None or (isinstance(s, float) and pd.isna(s)):
        return None
    m = re.search(r"(19|20)\d{2}", str(s))
    return int(m.group(0)) if m else None

# Collect movies
movies = triples[(triples["rel"]=="rdf:type") & (triples["tail"]=="schema:Movie")]["head"].unique()

# Build a small movie table
name_map = triples[triples["rel"]=="schema:name"].set_index("head")["tail"].to_dict()
year_map = triples[triples["rel"]=="schema:datePublished"].set_index("head")["tail"].to_dict()
agg_map  = triples[triples["rel"]=="schema:aggregateRating"].set_index("head")["tail"].to_dict()
rev_map  = triples[triples["rel"]=="schema:review"].set_index("head")["tail"].to_dict()

rows = []
for mid in movies:
    title = name_map.get(mid)
    year  = extract_year(year_map.get(mid))
    personal = extract_personal_review(rev_map.get(mid))   # 0–5
    avg     = extract_avg_vote(agg_map.get(mid))           # 0–10
    rows.append({
        "movie_id": mid, "title": title, "year": year,
        "personal5": personal, "avg10": avg
    })

movie_df = pd.DataFrame(rows)
# personal (0–5) auf 0–10 hochskalieren
movie_df["personal10"] = movie_df["personal5"].apply(lambda x: x*2 if pd.notna(x) else np.nan)

# Endscore: nutze personal10 wenn da, sonst avg10, sonst 0
movie_df["score10"] = movie_df.apply(
    lambda r: r["personal10"] if pd.notna(r["personal10"])
    else (r["avg10"] if pd.notna(r["avg10"]) else 0.0),
    axis=1
)

# Sortierung: Score absteigend, Jahr absteigend
movie_df_sorted = movie_df.sort_values(by=["score10","year"], ascending=[False, False]).reset_index(drop=True)

print(movie_df[["title","personal5","avg10","year","score10"]].head(20))
print(movie_df[["personal5","avg10","score10"]].describe())
print("Anzahl personal vorhanden:", movie_df["personal5"].notna().sum())
print("Anzahl avg vorhanden:", movie_df["avg10"].notna().sum())

Triples loaded: 58616
695 movies already in KG, 41199 triples total.
                                    title  personal5  avg10  year  score10
0                              Twin Peaks        5.0  8.400  1989     10.0
1                              The Matrix        5.0  8.232  1999     10.0
2                                   Alien        5.0  8.164  1979     10.0
3                          Alien: Romulus        4.5  7.179  2024      9.0
4                 AVP: Alien vs. Predator        1.5  5.938  2004      3.0
5             Predator: Killer of Killers        3.0  7.904  2025      6.0
6                                 Weapons        3.0  7.420  2025      6.0
7   The Hunger Games: Mockingjay – Part 2        3.0  6.897  2015      6.0
8                              BlackBerry        4.0  7.127  2023      8.0
9                             Challengers        3.0  6.956  2024      6.0
10                                   Okja        4.0  7.349  2017      8.0
11                             

  triples = pd.read_csv(KG_PATH, sep="\\t", header=None, names=["head","rel","tail"])


## TMDB search & enrich (Algorithm A helpers)

In [29]:
def normalize_title(title):
    if not title:
        return ""
    title = unicodedata.normalize("NFKC", title)
    substitutions = {
        "–": "-", "—": "-", "−": "-",
        "×": "x", "’": "'", "“": '"', "”": '"', "…": "...",
        "&": "and",
    }
    for orig, repl in substitutions.items():
        title = title.replace(orig, repl)
    title = re.sub(r"\\s+", " ", title).strip().lower()
    return title

def search_exact_match(results, search_title):
    norm_search = normalize_title(search_title)
    for r in results:
        tmdb_title = r.get("title") or r.get("name") or ""
        if normalize_title(tmdb_title) == norm_search:
            return r
    return None

def search_movie_or_tv(title, year=None):
    # Try movie
    movie_url = "https://api.themoviedb.org/3/search/movie"
    params = {"query": title}
    if year:
        params["year"] = year
    resp = requests.get(movie_url, headers=HEADERS, params=params)
    if resp.status_code == 200:
        results = resp.json().get("results", [])
        match = search_exact_match(results, title)
        if match:
            match["media_type"] = "movie"
            return match
        if results:
            results[0]["media_type"] = "movie"
            return results[0]

    # Try TV
    tv_url = "https://api.themoviedb.org/3/search/tv"
    params = {"query": title}
    if year:
        params["first_air_date_year"] = year
    resp = requests.get(tv_url, headers=HEADERS, params=params)
    if resp.status_code == 200:
        results = resp.json().get("results", [])
        match = search_exact_match(results, title)
        if match:
            match["media_type"] = "tv"
            return match
        if results:
            results[0]["media_type"] = "tv"
            return results[0]
    return None

def get_details(tmdb_id, media_type):
    url = f"https://api.themoviedb.org/3/{media_type}/{tmdb_id}"
    resp = requests.get(url, headers=HEADERS)
    return resp.json() if resp.status_code == 200 else None

def get_credits(tmdb_id, media_type):
    url = f"https://api.themoviedb.org/3/{media_type}/{tmdb_id}/credits"
    resp = requests.get(url, headers=HEADERS)
    return resp.json() if resp.status_code == 200 else None

def get_recommendations(tmdb_id, media_type, page=1):
    url = f"https://api.themoviedb.org/3/{media_type}/{tmdb_id}/recommendations"
    resp = requests.get(url, headers=HEADERS, params={"page": page})
    return resp.json().get("results", []) if resp.status_code == 200 else []

## Select top-N seeds and fetch TMDB recommendations

In [30]:
seeds = movie_df_sorted.head(TOP_N_SEEDS).copy()
print(f"Using top {len(seeds)} seeds from KG (by personal/avg rating).")
seed_hits = []

for _, row in seeds.iterrows():
    title, year = row["title"], row["year"]
    sr = search_movie_or_tv(title, year)
    if not sr:
        print(f"Seed not found on TMDB: {title} ({year})")
        continue
    seed_hits.append({"kg_movie_id": row["movie_id"], "title": title, "year": year, "tmdb_id": sr["id"], "media_type": sr["media_type"]})
    time.sleep(SLEEP_SEC)

print("Seeds matched on TMDB:", len(seed_hits))
seed_hits[:5]

Using top 100 seeds from KG (by personal/avg rating).
Seeds matched on TMDB: 100


[{'kg_movie_id': 'movie940721',
  'title': 'Godzilla Minus One',
  'year': 2023,
  'tmdb_id': 940721,
  'media_type': 'movie'},
 {'kg_movie_id': 'movie872585',
  'title': 'Oppenheimer',
  'year': 2023,
  'tmdb_id': 872585,
  'media_type': 'movie'},
 {'kg_movie_id': 'movie346698',
  'title': 'Barbie',
  'year': 2023,
  'tmdb_id': 346698,
  'media_type': 'movie'},
 {'kg_movie_id': 'movie961323',
  'title': 'Nimona',
  'year': 2023,
  'tmdb_id': 961323,
  'media_type': 'movie'},
 {'kg_movie_id': 'movie603692',
  'title': 'John Wick: Chapter 4',
  'year': 2023,
  'tmdb_id': 603692,
  'media_type': 'movie'}]

## Enrich recommendations (Algorithm A style)

In [31]:
tmdb_media_base_url = "https://www.themoviedb.org/"
tmdb_poster_base_url = "https://image.tmdb.org/t/p/"
tmdb_person_base_url = "https://www.themoviedb.org/person/"
tmdb_genre_base_url = "https://www.themoviedb.org/genre/"
tmdb_company_base_url = "https://www.themoviedb.org/company/"
size = "original"

recs_enriched = []

for sh in seed_hits:
    sid, mtype = sh["tmdb_id"], sh["media_type"]
    recs = get_recommendations(sid, mtype, page=1)[:RECS_PER_SEED]
    for rec in recs:
        rec_id = rec["id"]
        rec_type = rec.get("media_type") or mtype  # assume same type if missing
        det = get_details(rec_id, rec_type)
        cred = get_credits(rec_id, rec_type)
        if not det:
            continue

        # collect enriched fields like Algorithm A
        title = det.get("title") or det.get("name")
        year  = None
        date_str = det.get("release_date") or det.get("first_air_date")
        if date_str and len(date_str)>=4:
            try: year = int(date_str[:4])
            except: year = None

        genres = [f"{g.get('name')}:{tmdb_genre_base_url}{g.get('id')}" for g in det.get("genres", [])]
        production_companies = [f"{p.get('name')}:{tmdb_company_base_url}{p.get('id')}:{p.get('origin_country')}" for p in det.get("production_companies", [])]
        production_countries = [f"{p.get('name')}:{p.get('iso_3166_1')}" for p in det.get("production_countries", [])]
        spoken_languages = [f"{s.get('english_name')}:{s.get('iso_639_1')}" for s in det.get("spoken_languages", [])]

        directors = []
        actors = []
        if cred:
            crew = cred.get("crew", [])
            directors = [f"{p.get('name')}:{tmdb_person_base_url}{p.get('id')}" for p in crew if p.get("job")=="Director"]
            cast = cred.get("cast", [])[:10]
            actors = [f"{a.get('name')}:{tmdb_person_base_url}{a.get('id')}" for a in cast]

        poster_url = None
        pp = det.get("poster_path")
        if pp: poster_url = tmdb_poster_base_url + size + pp

        recs_enriched.append({
            "seed_tmdb_id": sid,
            "seed_media_type": mtype,
            "tmdb_id": rec_id,
            "media_type": rec_type,
            "title": title,
            "year": year,
            "overview": det.get("overview"),
            "genres": genres or None,
            "runtime": det.get("runtime") if rec_type=="movie" else None,
            "vote_average": det.get("vote_average"),
            "poster_url": poster_url,
            "origin_country": det.get("origin_country"),
            "original_language": det.get("original_language"),
            "popularity": det.get("popularity"),
            "production_companies": production_companies or None,
            "production_countries": production_countries or None,
            "spoken_languages": spoken_languages or None,
            "directors": directors or None,
            "actors": actors or None,
            "tmdb_url": tmdb_media_base_url + f"{rec_type}/" + str(rec_id),
        })
        time.sleep(SLEEP_SEC)

print("Total recommendations enriched:", len(recs_enriched))
len(recs_enriched)

Total recommendations enriched: 1000


[{'seed_tmdb_id': 940721,
  'seed_media_type': 'movie',
  'tmdb_id': 315011,
  'media_type': 'movie',
  'title': 'Shin Godzilla',
  'year': 2016,
  'overview': "When a massive, gilled monster emerges from the deep and tears through the city, the government scrambles to save its citizens.  A rag-tag team of volunteers cuts through a web of red tape to uncover the monster's weakness and its mysterious ties to a foreign superpower.  But time is not on their side - the greatest catastrophe to ever befall the world is about to evolve right before their very eyes.",
  'genres': ['Action:https://www.themoviedb.org/genre/28',
   'Science Fiction:https://www.themoviedb.org/genre/878',
   'Horror:https://www.themoviedb.org/genre/27'],
  'runtime': 120,
  'vote_average': 7.2,
  'poster_url': 'https://image.tmdb.org/t/p/original/jPNShaWZMpVF0iQ7j1dvTuZLD20.jpg',
  'origin_country': ['JP'],
  'original_language': 'ja',
  'popularity': 8.4151,
  'production_companies': ['Toho Pictures:https://www.th

## Convert enriched recommendations to triples

In [32]:
# Build triples for each recommendation
new_triples = []
seen_new_triples = set()  # zum Deduplizieren innerhalb dieser Session

def movie_node(tmdb_id): return f"movie{tmdb_id}"
def person_node(pid):    return f"person{pid}"

def add_triple(h, r, t):
    key = (str(h), str(r), str(t))
    # vermeide Duplikate: nicht erneut hinzufügen, wenn schon im KG ODER schon in dieser Session erzeugt
    if key in existing_triples or key in seen_new_triples:
        return
    seen_new_triples.add(key)
    new_triples.append([h, r, t])

for r in recs_enriched:
    mid = movie_node(r["tmdb_id"])

    # **WICHTIG**: Falls der Film bereits als Knoten (rdf:type Movie) existiert -> komplett überspringen
    if mid in existing_movies:
        continue

    # Film-Basis
    new_triples.append([mid, "rdf:type", "schema:Movie"])
    if r["title"]:
        new_triples.append([mid, "schema:name", r["title"]])
    if r["year"]:
        new_triples.append([mid, "schema:datePublished", f"published_{r['year']}"])
    if r["vote_average"] is not None:
        new_triples.append([mid, "schema:aggregateRating", f"avgVote_{r['vote_average']}"])
    if r["runtime"]:
        try:
            new_triples.append([mid, "schema:duration", f"runtime{int(r['runtime'])}"])
        except Exception:
            pass
    if r["popularity"] is not None:
        new_triples.append([mid, "ex:popularity", f"popul_{r['popularity']}"])
    if r["original_language"]:
        new_triples.append([mid, "ex:originalLanguage", str(r["original_language"])])
    if r["origin_country"]:
        oc = r["origin_country"]
        if isinstance(oc, list):
            for c in oc:
                new_triples.append([mid, "ex:originCountry", str(c)])
        else:
            new_triples.append([mid, "ex:originCountry", str(oc)])

    # Directors
    if r["directors"]:
        for d in r["directors"]:
            name, _, _ = d.partition(":http")
            pid_match = re.search(r"/person/(\d+)", d)
            pid = pid_match.group(1) if pid_match else normalize_title(name).replace(" ","_")
            pnode = person_node(pid)
            new_triples.append([mid, "schema:director", pnode])
            new_triples.append([pnode, "rdf:type", "schema:Person"])
            new_triples.append([pnode, "schema:name", name.strip()])

    # Actors
    if r["actors"]:
        for a in r["actors"]:
            name, _, _ = a.partition(":http")
            pid_match = re.search(r"/person/(\d+)", a)
            pid = pid_match.group(1) if pid_match else normalize_title(name).replace(" ","_")
            pnode = person_node(pid)
            new_triples.append([mid, "schema:actor", pnode])
            new_triples.append([pnode, "rdf:type", "schema:Person"])
            new_triples.append([pnode, "schema:name", name.strip()])

    # Link back: recommended from which seed (optional edge)
    new_triples.append([mid, "ex:recommendedFrom", movie_node(str(r["seed_tmdb_id"]))])

print("New triples to append:", len(new_triples))
len(new_triples)
new_triples

New triples to append: 28638


[['movie437342', 'rdf:type', 'schema:Movie'],
 ['movie437342', 'schema:name', 'The First Omen'],
 ['movie437342', 'schema:datePublished', 'published_2024'],
 ['movie437342', 'schema:aggregateRating', 'avgVote_6.9'],
 ['movie437342', 'schema:duration', 'runtime119'],
 ['movie437342', 'ex:popularity', 'popul_7.9004'],
 ['movie437342', 'ex:originalLanguage', 'en'],
 ['movie437342', 'ex:originCountry', 'US'],
 ['movie437342', 'schema:director', 'person1706480'],
 ['person1706480', 'rdf:type', 'schema:Person'],
 ['person1706480', 'schema:name', 'Arkasha Stevenson'],
 ['movie437342', 'schema:actor', 'person1472584'],
 ['person1472584', 'rdf:type', 'schema:Person'],
 ['person1472584', 'schema:name', 'Nell Tiger Free'],
 ['movie437342', 'schema:actor', 'person202032'],
 ['person202032', 'rdf:type', 'schema:Person'],
 ['person202032', 'schema:name', 'Ralph Ineson'],
 ['movie437342', 'schema:actor', 'person27396'],
 ['person27396', 'rdf:type', 'schema:Person'],
 ['person27396', 'schema:name', 'S

## Append to KG (with backup)

In [33]:
# backup first
shutil.copy2(KG_PATH, BACKUP_PATH)

# append
new_df = pd.DataFrame(new_triples, columns=["head","rel","tail"])
assert len(new_df) > 0, "No new triples to append (all movies existed already?)."

with open(OUT_PATH, "a", encoding="utf-8") as f:
    new_df.to_csv(f, sep="\t", header=False, index=False)

print(f"Appended {len(new_df)} triples to", OUT_PATH)
print("Backup saved to", BACKUP_PATH)

Appended 28638 triples to ../data/kg/movie_kg_triples.tsv
Backup saved to ../data/kg/movie_kg_triples_backup_20250914_213737.tsv
