# TMDB Similar Movies ➜ Re-Ranking mit KG-Embedding

Dieses Notebook:
1. liest deine **Seed-Filme** (aus `data/enriched_merged.csv` oder via manueller TMDB-ID-Liste),
2. holt **ähnliche Filme** von TMDB als **Kandidatenset**,
3. berechnet einen **Metadaten-Similarity-Score**,
4. kombiniert ihn mit **Cosine-Ähnlichkeit aus deinem KG-Embedding**,
5. gibt **Top-5** (oder beliebig) Empfehlungen samt Komponenten-Scores aus und speichert eine CSV.

> **Voraussetzungen**  
> - `data/kg/embeddings/entity_embeddings.csv` existiert und enthält Film-Embeddings (Entity-Namen = Filmtitel).  
> - Internetzugang und ein TMDB-API-Key.  
> - Python 3.9+ mit `pandas`, `numpy`, `requests`.

## 1) Setup

In [None]:
# Optional: nötige Bibliotheken installieren (falls nicht vorhanden)
# %pip install pandas numpy requests

In [9]:
import os
from dotenv import load_dotenv

# TMDB-API-Key setzen (oder sicher im System/Notebook-Tooling hinterlegen)
# os.environ["TMDB_API_TOKEN"] = "DEIN_TMDB_KEY_HIER"
assert "TMDB_API_TOKEN" in os.environ, "Bitte TMDB_API_TOKEN per Umgebungsvariable setzen!"

## 2) Pfade & Parameter

In [10]:
PROJECT_DIR = "."

# Pfade
ENRICHED_CSV = f"{PROJECT_DIR}/data/enriched_merged.csv"
ENTITY_EMB = f"{PROJECT_DIR}/data/kg/embeddings/entity_embeddings.csv"

# Recommender-Parameter
TOPK = 5
PAGES = 2
ALPHA = 0.6  # Mischung: alpha * Cosine + (1-alpha) * Metadaten
WEIGHTS = {
    "genres": 0.25, "keywords": 0.20,
    "cast": 0.20, "director": 0.15,
    "runtime": 0.05, "language": 0.05,
    "popularity": 0.05, "vote": 0.05,
}

# Optional: explizite Seeds statt aus enriched_merged.csv
EXPLICIT_SEEDS = None  # z.B. [603, 238]

## 3) Hilfsfunktionen (TMDB, Similarity, Embeddings)

In [19]:
import re, time, math
from typing import Dict, List, Set
from dataclasses import dataclass
from dotenv import load_dotenv

import pandas as pd
import numpy as np
import requests
import os

load_dotenv()

TMDB_API = "https://api.themoviedb.org/3"

def _api_get(path: str, params: Dict | None = None, retries: int = 3):
    if params is None:
        params = {}

    bearer = os.getenv("TMDB_API_TOKEN")  # <-- genau dein Name aus .env
    if not bearer:
        raise RuntimeError("TMDB_API_TOKEN ist nicht gesetzt (v4 Bearer-Token erwartet).")

    headers = {
        "Authorization": f"Bearer {bearer}",
        "Content-Type": "application/json;charset=utf-8",
    }
    # WICHTIG: KEIN api_key Query-Parameter bei Bearer-Auth!

    for attempt in range(retries):
        r = requests.get(f"{TMDB_API}{path}", headers=headers, params=params, timeout=20)
        if r.status_code == 200:
            return r.json()
        if r.status_code in (429, 500, 502, 503, 504):
            time.sleep(1.5 * (attempt + 1))
            continue
        raise RuntimeError(f"TMDB error {r.status_code}: {r.text}")

    raise RuntimeError("TMDB temporäre Fehler nach Retries.")

def parse_tmdb_id(s: str) -> int | None:
    if pd.isna(s):
        return None
    s = str(s)
    m = re.search(r"/movie/(\d+)", s)
    if m:
        return int(m.group(1))
    if s.isdigit():
        return int(s)
    m = re.search(r"(\d+)", s)
    return int(m.group(1)) if m else None

@dataclass
class MovieMeta:
    id: int
    title: str
    original_title: str | None
    release_year: int | None
    genres: Set[int]
    keywords: Set[int]
    cast_ids: Set[int]
    director_ids: Set[int]
    runtime: int | None
    language: str | None
    popularity: float | None
    vote_average: float | None

def get_movie_meta(movie_id: int) -> MovieMeta:
    base = _api_get(f"/movie/{movie_id}")
    credits = _api_get(f"/movie/{movie_id}/credits")
    kw = _api_get(f"/movie/{movie_id}/keywords")
    genres = {g["id"] for g in (base.get("genres") or [])}
    keywords = {k["id"] for k in (kw.get("keywords") or [])}
    cast_ids = {c["id"] for c in (credits.get("cast") or [])[:20]}
    director_ids = {c["id"] for c in (credits.get("crew") or []) if c.get("job") == "Director"}
    y = None
    if base.get("release_date"):
        try: y = int(base["release_date"][:4])
        except: pass
    return MovieMeta(
        id=movie_id,
        title=base.get("title") or str(movie_id),
        original_title=base.get("original_title"),
        release_year=y,
        genres=genres,
        keywords=keywords,
        cast_ids=cast_ids,
        director_ids=director_ids,
        runtime=base.get("runtime"),
        language=base.get("original_language"),
        popularity=base.get("popularity"),
        vote_average=base.get("vote_average"),
    )

def gather_candidates(seed_ids: List[int], pages_per_seed: int=2) -> Set[int]:
    cands: Set[int] = set()
    for sid in seed_ids:
        for p in range(1, pages_per_seed+1):
            data = _api_get(f"/movie/{sid}/similar", params={"page": p})
            for m in data.get("results", []):
                if m.get("id"):
                    cands.add(m["id"])
    return cands - set(seed_ids)

def jaccard(a: Set, b: Set) -> float:
    if not a and not b:
        return 0.0
    return len(a & b) / (len(a | b) or 1)

def sim_runtime(a: int | None, b: int | None) -> float:
    if not a or not b:
        return 0.0
    return math.exp(-(abs(a-b)**2) / (2 * 30**2))

def sim_numeric(a: float | None, b: float | None, maxdiff: float) -> float:
    if a is None or b is None:
        return 0.0
    return max(0.0, 1.0 - abs(a-b)/maxdiff)

def sim_language(a: str | None, b: str | None) -> float:
    if not a or not b:
        return 0.0
    return 1.0 if a == b else 0.0

def metadata_similarity(seed: MovieMeta, cand: MovieMeta, weights: Dict[str,float]):
    comps = {
        "genres": jaccard(seed.genres, cand.genres),
        "keywords": jaccard(seed.keywords, cand.keywords),
        "cast": jaccard(seed.cast_ids, cand.cast_ids),
        "director": jaccard(seed.director_ids, cand.director_ids),
        "runtime": sim_runtime(seed.runtime, cand.runtime),
        "language": sim_language(seed.language, cand.language),
        "popularity": sim_numeric(seed.popularity, cand.popularity, 50.0),
        "vote": sim_numeric(seed.vote_average, cand.vote_average, 4.0),
    }
    return sum(weights[k]*comps[k] for k in comps), comps

def load_entity_embeddings(path: str):
    df = pd.read_csv(path)
    name_col = "Unnamed: 0" if "Unnamed: 0" in df.columns else df.columns[0]
    names = df[name_col].astype(str).tolist()
    vecs = df.drop(columns=[name_col]).to_numpy(float)
    vecs /= (np.linalg.norm(vecs, axis=1, keepdims=True)+1e-9)
    return {names[i].strip(): vecs[i] for i in range(len(names))}, vecs

def cosine(a, b):
    return float(np.dot(a, b)) if a is not None and b is not None else 0.0

def normalize_title(t: str) -> str:
    return re.sub(r"\s+", " ", (t or "").strip()).lower()

def find_embedding_for_title(table, title, year=None):
    if not title:
        return None
    if title in table:
        return table[title]
    norm = normalize_title(title)
    for k,v in table.items():
        if normalize_title(k) == norm:
            return v
    if year is not None:
        key = f"{title} ({year})"
        if key in table:
            return table[key]
    return None

In [20]:
# sollte ein JSON mit Konfiguration liefern
print(_api_get("/configuration"))
# und ein Filmobjekt:
print(_api_get("/movie/603")["title"])

{'change_keys': ['adult', 'air_date', 'also_known_as', 'alternative_titles', 'biography', 'birthday', 'budget', 'cast', 'certifications', 'character_names', 'created_by', 'crew', 'deathday', 'episode', 'episode_number', 'episode_run_time', 'freebase_id', 'freebase_mid', 'general', 'genres', 'guest_stars', 'homepage', 'images', 'imdb_id', 'languages', 'name', 'network', 'origin_country', 'original_name', 'original_title', 'overview', 'parts', 'place_of_birth', 'plot_keywords', 'production_code', 'production_companies', 'production_countries', 'releases', 'revenue', 'runtime', 'season', 'season_number', 'season_regular', 'spoken_languages', 'status', 'tagline', 'title', 'translations', 'tvdb_id', 'tvrage_id', 'type', 'video', 'videos'], 'images': {'base_url': 'http://image.tmdb.org/t/p/', 'secure_base_url': 'https://image.tmdb.org/t/p/', 'backdrop_sizes': ['w300', 'w780', 'w1280', 'original'], 'logo_sizes': ['w45', 'w92', 'w154', 'w185', 'w300', 'w500', 'original'], 'poster_sizes': ['w92

## 4) Seeds laden, Kandidaten sammeln, Scoring & Top-K anzeigen

In [21]:
if EXPLICIT_SEEDS:
    seed_ids = EXPLICIT_SEEDS
else:
    df_enr = pd.read_csv(ENRICHED_CSV)
    seed_ids = [parse_tmdb_id(r.get("tmdb_url", "")) for _,r in df_enr.iterrows() if parse_tmdb_id(r.get("tmdb_url", ""))]
seed_ids = list(dict.fromkeys(seed_ids))
assert seed_ids, "Keine Seeds gefunden."

seeds_meta = [get_movie_meta(sid) for sid in seed_ids]

emb_table,_ = load_entity_embeddings(ENTITY_EMB)
seed_vecs=[find_embedding_for_title(emb_table, sm.title, sm.release_year) for sm in seeds_meta]
seed_vecs=[v for v in seed_vecs if v is not None]
user_vec = np.mean(np.vstack(seed_vecs),0) if seed_vecs else None
if user_vec is not None: user_vec/= (np.linalg.norm(user_vec)+1e-9)

cand_ids = gather_candidates([sm.id for sm in seeds_meta], pages_per_seed=PAGES)
rows=[]
for cid in cand_ids:
    cm = get_movie_meta(cid)
    best_meta, comps_keep = 0.0, None
    best_seed=None
    for sm in seeds_meta:
        s, comps = metadata_similarity(sm, cm, WEIGHTS)
        if s>best_meta:
            best_meta, comps_keep, best_seed = s, comps, sm.title
    cand_vec = find_embedding_for_title(emb_table, cm.title, cm.release_year)
    cos = cosine(user_vec, cand_vec)
    final = ALPHA*cos + (1-ALPHA)*best_meta
    rows.append({"candidate_title":cm.title,"year":cm.release_year,"cos":round(cos,4),"meta":round(best_meta,4),"final":round(final,4),"seed":best_seed})

df_out=pd.DataFrame(rows).sort_values("final",ascending=False).head(TOPK)
df_out

RuntimeError: TMDB error 404: {"success":false,"status_code":34,"status_message":"The resource you requested could not be found."}

## 5) Ergebnisse speichern

In [None]:
OUT_CSV=f"{PROJECT_DIR}/tmdb_rerank_with_embedding_results.csv"
df_out.to_csv(OUT_CSV,index=False)
print(f"Gespeichert: {OUT_CSV}")