# TMDB Similar Movies (nur Filme) ➜ Re-Ranking mit KG-Embedding

Dieses Notebook:
1. liest **nur Movie-Seeds** (alle `/tv/…` werden ignoriert) aus `data/enriched_merged.csv`,
2. holt **ähnliche Filme** über TMDB `/movie/{id}/similar`,
3. berechnet **Metadaten-Ähnlichkeit** und **Cosine mit deinem KG-Embedding**,
4. kombiniert beides zu einem finalen Score und zeigt **Top-K**.

> Voraussetzungen:
> - `TMDB_API_TOKEN` (v4 Bearer) als Umgebungsvariable (oder in `.env`),
> - `data/kg/embeddings/entity_embeddings.csv` vorhanden,
> - Python ≥ 3.10 (wegen der `X | None`-Typannotationen) mit `pandas`, `numpy`, `requests`, `python-dotenv`.

## 1) Setup

In [None]:
# Optional: Falls nötig, lokal installieren
# %pip install pandas numpy requests python-dotenv

In [1]:
import os, re, time, math
from typing import Dict, List, Set
from dataclasses import dataclass
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests

# Load variables from a local .env file (if present) into the environment.
# This has to happen before the token check below.
load_dotenv()

# Fail fast when no TMDB token is configured: _api_get reads TMDB_API_TOKEN
# from the environment on every request.
# NOTE(review): `assert` is skipped when Python runs with -O.
assert os.getenv("TMDB_API_TOKEN"), "Bitte TMDB_API_TOKEN (v4 Bearer) als Umgebungsvariable oder in .env setzen!"

## 2) Pfade & Parameter

In [2]:
# Root directory that the data paths below are built from.
PROJECT_DIR = "."

# Input files
ENRICHED_CSV = f"{PROJECT_DIR}/data/enriched_merged.csv"
ENTITY_EMB = f"{PROJECT_DIR}/data/kg/embeddings/entity_embeddings.csv"

# Recommender parameters
TOPK = 5     # number of recommendations to keep
PAGES = 2    # pages of TMDB /similar fetched per seed (20 entries per page)
ALPHA = 0.6  # blend: ALPHA * cosine + (1 - ALPHA) * metadata similarity

# Weight of each metadata-similarity component; the weights sum to 1.
WEIGHTS = {
    "genres": 0.25,
    "keywords": 0.20,
    "cast": 0.20,
    "director": 0.15,
    "runtime": 0.05,
    "language": 0.05,
    "popularity": 0.05,
    "vote": 0.05,
}

# Optional explicit seeds (movies only), e.g. [603, 238].
# A falsy value makes the notebook read the seeds from ENRICHED_CSV instead.
EXPLICIT_SEED_MOVIE_IDS = None

## 3) TMDB-API, Parser & Helper (nur Movie!)

In [3]:
TMDB_API = "https://api.themoviedb.org/3"

def _api_get(path: str, params: Dict | None = None, retries: int = 3):
    """Send a GET request to the TMDB v3 API and return the decoded JSON body.

    The bearer token is read from the TMDB_API_TOKEN environment variable.
    Responses with status 429/500/502/503/504, as well as connection errors
    and timeouts, count as temporary failures and are retried with a linearly
    growing pause (1.5 s, 3 s, ...).

    Args:
        path: API path starting with "/", appended to TMDB_API.
        params: Optional query parameters.
        retries: Maximum number of attempts.

    Returns:
        The parsed JSON payload of the first 200 response.

    Raises:
        RuntimeError: On any other HTTP status, or when every attempt ended
            in a temporary failure. If a network exception occurred, the
            most recent one is chained as the cause.
    """
    if params is None:
        params = {}
    bearer = os.getenv("TMDB_API_TOKEN")  # v4 Bearer
    headers = {
        "Authorization": f"Bearer {bearer}",
        "Content-Type": "application/json;charset=utf-8",
    }
    last_error = None
    for attempt in range(retries):
        try:
            r = requests.get(f"{TMDB_API}{path}", headers=headers, params=params, timeout=20)
        except (requests.ConnectionError, requests.Timeout) as exc:
            # Network hiccups are as temporary as a 5xx: retry them instead
            # of letting a non-RuntimeError escape to the callers.
            last_error = exc
        else:
            if r.status_code == 200:
                return r.json()
            if r.status_code not in (429, 500, 502, 503, 504):
                raise RuntimeError(f"TMDB error {r.status_code}: {r.text}")
        # Back off only when another attempt follows.
        if attempt < retries - 1:
            time.sleep(1.5 * (attempt + 1))
    raise RuntimeError("TMDB temporäre Fehler nach Retries.") from last_error

def parse_tmdb_ref_movies_only(s: str):
    """
    Extract a movie id from a TMDB reference; TV references are dropped.

    Returns ("movie", id) on success and (None, None) otherwise.
    - None / NaN input yields (None, None).
    - A URL containing /movie/<digits> yields that id.
    - A URL containing /tv/<digits> is ignored on purpose.
    - A purely numeric string is taken as a movie id.
    - Otherwise the first run of digits in the text is taken as a movie id.
    """
    is_missing = s is None or (isinstance(s, float) and pd.isna(s))
    if is_missing:
        return None, None

    text = str(s)

    url_match = re.search(r"/(movie|tv)/(\d+)", text)
    if url_match is not None:
        media_type = url_match.group(1)
        ref_id = int(url_match.group(2))
        # Only movie links count; a tv link ends the lookup right here.
        return ("movie", ref_id) if media_type == "movie" else (None, None)

    if text.isdigit():
        return "movie", int(text)

    digit_run = re.search(r"(\d+)", text)
    if digit_run is None:
        return None, None
    return "movie", int(digit_run.group(1))

@dataclass
class MovieMeta:
    """Metadata of a single TMDB movie, as assembled by get_movie_meta.

    Field order is part of the positional constructor and must not change.
    """
    # TMDB movie id
    id: int
    # Display title; get_movie_meta falls back to str(id) when TMDB has none
    title: str
    # TMDB "original_title", if present
    original_title: str | None
    # Year parsed from the first four characters of TMDB "release_date"
    release_year: int | None
    # Ids of the movie's TMDB genres
    genres: Set[int]
    # Ids of the movie's TMDB keywords
    keywords: Set[int]
    # Ids of at most the first 20 cast entries
    cast_ids: Set[int]
    # Ids of crew entries whose job is "Director"
    director_ids: Set[int]
    # TMDB "runtime" value (presumably minutes -- confirm against the TMDB docs)
    runtime: int | None
    # TMDB "original_language" value
    language: str | None
    # TMDB "popularity" value
    popularity: float | None
    # TMDB "vote_average" value
    vote_average: float | None

def _year_from(s: str | None) -> int | None:
    if not s: return None
    try: return int(s[:4])
    except: return None

def get_movie_meta(movie_id: int) -> MovieMeta:
    """Fetch details, credits and keywords of a movie and pack them into MovieMeta.

    All three resources are requested in a single call by passing
    append_to_response=credits,keywords to /movie/{id}; TMDB then nests the
    sub-resources under the "credits" and "keywords" keys of the details
    payload. This needs one HTTP request per movie instead of three.

    Args:
        movie_id: TMDB movie id.

    Returns:
        The populated MovieMeta. The title falls back to str(movie_id), the
        cast is limited to the first 20 entries, and directors are the crew
        entries whose job is "Director".

    Raises:
        RuntimeError: Propagated from _api_get when the request fails.
    """
    base = _api_get(
        f"/movie/{movie_id}",
        params={"append_to_response": "credits,keywords"},
    )
    credits = base.get("credits") or {}
    kw = base.get("keywords") or {}

    genres = {g["id"] for g in (base.get("genres") or [])}
    keywords = {k["id"] for k in (kw.get("keywords") or [])}
    cast_ids = {c["id"] for c in (credits.get("cast") or [])[:20]}
    director_ids = {
        c["id"] for c in (credits.get("crew") or []) if c.get("job") == "Director"
    }
    return MovieMeta(
        id=movie_id,
        title=base.get("title") or str(movie_id),
        original_title=base.get("original_title"),
        release_year=_year_from(base.get("release_date")),
        genres=genres,
        keywords=keywords,
        cast_ids=cast_ids,
        director_ids=director_ids,
        runtime=base.get("runtime"),
        language=base.get("original_language"),
        popularity=base.get("popularity"),
        vote_average=base.get("vote_average"),
    )

def gather_candidates_movies_only(seed_movie_ids: List[int], pages_per_seed: int=2) -> Set[int]:
    """Collect the ids TMDB lists as similar to the given seed movies.

    For every seed, up to *pages_per_seed* pages of /movie/{id}/similar are
    fetched. Paging stops early once the response's "total_pages" says there
    is nothing further, so no requests are spent on empty pages.

    Args:
        seed_movie_ids: TMDB ids of the seed movies.
        pages_per_seed: Maximum number of result pages requested per seed.

    Returns:
        The set of candidate movie ids with the seeds themselves removed.

    Raises:
        RuntimeError: Propagated from _api_get when a request fails.
    """
    cands: Set[int] = set()
    for sid in seed_movie_ids:
        for p in range(1, pages_per_seed + 1):
            data = _api_get(f"/movie/{sid}/similar", params={"page": p})
            cands.update(
                int(m["id"]) for m in (data.get("results") or []) if m.get("id")
            )
            total_pages = data.get("total_pages")
            if total_pages is not None and p >= total_pages:
                break  # last available page reached
    return cands - set(seed_movie_ids)

## 4) Similarity & Embeddings (unverändert nützlich)

In [4]:
def jaccard(a: Set, b: Set) -> float:
    """Return the Jaccard index |a & b| / |a | b|; 0.0 when both sets are empty."""
    if a or b:
        shared = len(a & b)
        combined = len(a | b)
        # max(..., 1) keeps the division safe even for a degenerate union.
        return shared / max(combined, 1)
    return 0.0

def sim_runtime(a: int | None, b: int | None) -> float:
    if not a or not b:
        return 0.0
    return math.exp(-(abs(a-b)**2) / (2 * 30**2))

def sim_numeric(a: float | None, b: float | None, maxdiff: float) -> float:
    if a is None or b is None:
        return 0.0
    return max(0.0, 1.0 - abs(a-b)/maxdiff)

def sim_language(a: str | None, b: str | None) -> float:
    if not a or not b:
        return 0.0
    return 1.0 if a == b else 0.0

def metadata_similarity(seed: MovieMeta, cand: MovieMeta, weights: Dict[str,float]):
    """Compute the weighted metadata similarity between a seed and a candidate.

    Args:
        seed: Metadata of the seed movie.
        cand: Metadata of the candidate movie.
        weights: Weight per component; must contain every component key
            (genres, keywords, cast, director, runtime, language,
            popularity, vote).

    Returns:
        A tuple (score, comps): the weighted sum and the dict of the
        individual component similarities.
    """
    comps = {}
    # Set-valued attributes are compared with the Jaccard index.
    set_components = (
        ("genres", "genres"),
        ("keywords", "keywords"),
        ("cast", "cast_ids"),
        ("director", "director_ids"),
    )
    for label, attr in set_components:
        comps[label] = jaccard(getattr(seed, attr), getattr(cand, attr))
    comps["runtime"] = sim_runtime(seed.runtime, cand.runtime)
    comps["language"] = sim_language(seed.language, cand.language)
    comps["popularity"] = sim_numeric(seed.popularity, cand.popularity, 50.0)
    comps["vote"] = sim_numeric(seed.vote_average, cand.vote_average, 4.0)

    score = sum(weights[label] * value for label, value in comps.items())
    return score, comps

def load_entity_embeddings(path: str):
    """Read an embedding CSV and return (name -> unit vector dict, matrix).

    The name column is "Unnamed: 0" when present, otherwise the first
    column; all remaining columns are taken as the vector components. Every
    row is scaled to (approximately) unit length, with a small epsilon
    guarding against zero vectors. Names are stripped of surrounding
    whitespace; for duplicate names the last row wins.
    """
    frame = pd.read_csv(path)
    label_col = "Unnamed: 0" if "Unnamed: 0" in frame.columns else frame.columns[0]
    labels = frame[label_col].astype(str).tolist()

    matrix = frame.drop(columns=[label_col]).to_numpy(float)
    row_norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    matrix = matrix / (row_norms + 1e-9)

    lookup = {label.strip(): row for label, row in zip(labels, matrix)}
    return lookup, matrix

def cosine(a, b):
    """Return the dot product of *a* and *b* as a float; 0.0 if either is None.

    The result is the cosine similarity only when both vectors already have
    unit length.
    """
    if a is None or b is None:
        return 0.0
    return float(np.dot(a, b))

def normalize_title(t: str) -> str:
    """Return *t* trimmed, with whitespace runs collapsed to one space, lower-cased.

    A None or empty title yields "".
    """
    return re.sub(r"\s+", " ", (t or "").strip()).lower()

def find_embedding_for_title(table, title, year=None):
    """Look up the embedding stored for a title.

    Lookup order:
      1. the exact key "<title> (<year>)" when *year* is given,
      2. the exact key "<title>",
      3. the first key whose normalized form equals the normalized title.

    The year-qualified key is tried first so that films sharing a title
    (remakes, for example) resolve to their own entry when the table has
    one, instead of all collapsing onto the bare-title entry.

    Args:
        table: Mapping from entity name to embedding.
        title: Title to look up; an empty or None title yields None.
        year: Optional release year used for the year-qualified key.

    Returns:
        The stored embedding, or None when nothing matches.
    """
    if not title:
        return None
    if year is not None:
        year_key = f"{title} ({year})"
        if year_key in table:
            return table[year_key]
    if title in table:
        return table[title]
    # Fallback: linear scan that ignores case and whitespace differences.
    norm = normalize_title(title)
    for k, v in table.items():
        if normalize_title(k) == norm:
            return v
    return None

## 5) Seeds laden (nur Movies), Kandidaten holen, Scoring, Top-K anzeigen & speichern

In [8]:
# 5.1 Seeds (movies only)
if EXPLICIT_SEED_MOVIE_IDS:
    # dict.fromkeys drops duplicates while keeping the first-seen order.
    seed_movie_ids = list(dict.fromkeys(int(x) for x in EXPLICIT_SEED_MOVIE_IDS))
else:
    df_enr = pd.read_csv(ENRICHED_CSV)
    raw = df_enr.get("tmdb_url", pd.Series(dtype=str)).dropna().tolist()
    seed_movie_ids = []
    for s in raw:
        kind, sid = parse_tmdb_ref_movies_only(s)
        if kind == "movie" and sid:
            seed_movie_ids.append(sid)
    seed_movie_ids = list(dict.fromkeys(seed_movie_ids))

assert seed_movie_ids, "Keine Movie-Seeds gefunden (alle /tv/ wurden ignoriert)."
print(f"Movie-Seeds: {len(seed_movie_ids)}")

# 5.2 Seed metadata: a seed that cannot be fetched is skipped with a warning.
# Network errors are handled like API errors so one hiccup does not abort the run.
seeds_meta = []
for sid in seed_movie_ids:
    try:
        seeds_meta.append(get_movie_meta(sid))
    except (RuntimeError, requests.RequestException) as e:
        print(f"WARN: Seed /movie/{sid} übersprungen: {e}")
assert seeds_meta, "Alle Seeds fielen raus. Prüfe tmdb_url und TMDB_API_TOKEN."

# 5.3 Load embeddings and build the user centroid
emb_table, _ = load_entity_embeddings(ENTITY_EMB)


def _embedding_for(meta):
    """Look up a movie's embedding by title, falling back to its original title."""
    vec = find_embedding_for_title(emb_table, meta.title, meta.release_year)
    if vec is None:
        vec = find_embedding_for_title(emb_table, meta.original_title, meta.release_year)
    return vec


seed_vecs = [v for v in map(_embedding_for, seeds_meta) if v is not None]
user_vec = None
if seed_vecs:
    # Mean of the seed vectors, re-normalized so the dot product stays a cosine.
    user_vec = np.mean(np.vstack(seed_vecs), axis=0)
    user_vec /= (np.linalg.norm(user_vec) + 1e-9)
else:
    print("WARN: Keine Seed-Embeddings gefunden – Cosine fällt auf 0.")

# 5.4 Candidates (movies only)
# Only seeds whose metadata could be fetched are queried: a seed that TMDB
# rejected above would otherwise raise inside the candidate search and abort
# the cell. Every known seed id is still removed from the pool afterwards.
usable_seed_ids = [sm.id for sm in seeds_meta]
cand_ids = gather_candidates_movies_only(usable_seed_ids, pages_per_seed=PAGES)
cand_ids -= set(seed_movie_ids)
print(f"Kandidaten-Pool (Movies): {len(cand_ids)}")

# 5.5 Scoring
rows = []
for cid in cand_ids:
    try:
        cm = get_movie_meta(cid)
    except (RuntimeError, requests.RequestException) as e:
        print(f"WARN: skip /movie/{cid}: {e}")
        continue
    # Keep the seed with the highest metadata similarity to this candidate.
    best_meta = 0.0
    best_seed_title = None
    comps_keep = None
    for sm in seeds_meta:
        s, comps = metadata_similarity(sm, cm, WEIGHTS)
        if s > best_meta:
            best_meta = s
            best_seed_title = sm.title
            comps_keep = comps
    # cosine() yields 0.0 when the centroid or the candidate vector is missing.
    cos = cosine(user_vec, _embedding_for(cm))
    final = ALPHA * cos + (1 - ALPHA) * best_meta
    rows.append({
        "candidate_id": cid,
        "candidate_title": cm.title,
        "year": cm.release_year,
        "cos": round(cos, 4),
        "meta": round(best_meta, 4),
        "final": round(final, 4),
        "seed": best_seed_title,
        **{f"comp_{k}": round(v, 4) for k, v in (comps_keep or {}).items()},
    })

df_out = pd.DataFrame(rows)
if not df_out.empty:
    # An empty frame has no "final" column; sorting it would raise KeyError.
    df_out = df_out.sort_values("final", ascending=False).head(TOPK)
df_out

Movie-Seeds: 693
Kandidaten-Pool (Movies): 6200


Unnamed: 0,candidate_id,candidate_title,year,cos,meta,final,seed,comp_genres,comp_keywords,comp_cast,comp_director,comp_runtime,comp_language,comp_popularity,comp_vote
154,278927,The Jungle Book,2016.0,0.4462,0.4142,0.4334,The Lion King,1.0,0.0,0.0,0.0,0.8517,1.0,0.7804,0.6515
2339,4412,Les Misérables,1958.0,0.369,0.398,0.3806,It's Only the End of the World,1.0,0.0,0.0,0.0,0.0011,1.0,0.9712,0.9872
2342,4415,Les Misérables,1998.0,0.369,0.3912,0.3779,Little Women,0.75,0.087,0.0,0.0,0.9994,1.0,0.8963,0.8312
570,1366,Rocky,1976.0,0.2784,0.4477,0.3461,The Greatest Showman,1.0,0.0323,0.0,0.0,0.8825,1.0,0.9691,0.9725
889,1924,Superman,1978.0,0.214,0.4704,0.3166,Black Panther,1.0,0.1304,0.0,0.0,0.956,1.0,0.9849,0.9455


## 6) Ergebnisse speichern

In [9]:
# Write the Top-K table to a CSV file (without the index column).
OUT_CSV = f"{PROJECT_DIR}/tmdb_rerank_with_embedding_results_movies_only.csv"
df_out.to_csv(path_or_buf=OUT_CSV, index=False)
print(f"Gespeichert: {OUT_CSV}")

Gespeichert: ./tmdb_rerank_with_embedding_results_movies_only.csv
