# TMDB Similar Movies (nur Filme) ➜ Re-Ranking mit KG-Embedding

Dieses Notebook:
1. liest **nur Movie-Seeds** (alle `/tv/…` werden ignoriert) aus `data/enriched_merged.csv`,
2. holt **ähnliche Filme** über TMDB `/movie/{id}/similar`,
3. berechnet **Metadaten-Ähnlichkeit** und **Cosine mit deinem KG-Embedding**,
4. kombiniert beides zu einem finalen Score und zeigt **Top-K**.

> Voraussetzungen:
> - `TMDB_API_TOKEN` (v4 Bearer) als Umgebungsvariable (oder in `.env`),
> - `data/kg/embeddings/entity_embeddings.csv` vorhanden,
> - Python: `pandas`, `numpy`, `requests`.

## 1) Setup

In [9]:
# Optional: Falls nötig, lokal installieren
# %pip install pandas numpy requests python-dotenv

In [10]:
import os, re, time, math
from typing import Dict, List, Set
from dataclasses import dataclass
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import json, pathlib

load_dotenv()

assert os.getenv("TMDB_API_TOKEN"), "Bitte TMDB_API_TOKEN (v4 Bearer) als Umgebungsvariable oder in .env setzen!"

## 2) Pfade & Parameter

In [11]:
PROJECT_DIR = ".."
CACHE_DIR = pathlib.Path("../.tmdb_cache")
CACHE_DIR.mkdir(exist_ok=True)

# Pfade
ENRICHED_CSV = f"{PROJECT_DIR}/data/enriched_merged.csv"
ENTITY_EMB   = f"{PROJECT_DIR}/data/kg/embeddings/entity_embeddings.csv"

# Recommender-Parameter
TOPK   = 200          # Anzahl Empfehlungen
PAGES  = 2          # Anzahl Seiten von TMDB /similar je Seed (20 Einträge/Seite)
ALPHA  = 0.6        # Mischung: alpha * Cosine + (1-alpha) * Metadaten
WEIGHTS = {
    "genres": 0.25, "keywords": 0.20,
    "cast": 0.20, "director": 0.15,
    "runtime": 0.05, "language": 0.05,
    "popularity": 0.05, "vote": 0.05,
}

# Optional: explizite Seeds (nur Movies). Beispiel: [603, 238]
EXPLICIT_SEED_MOVIE_IDS = None

## 3) TMDB-API, Parser & Helper (nur Movie!)

In [12]:
TMDB_API = "https://api.themoviedb.org/3"

def _cache_get(key: str):
    p = CACHE_DIR / (re.sub(r'[^a-zA-Z0-9_.-]+','_', key) + ".json")
    if p.exists():
        try:
            return json.loads(p.read_text(encoding="utf-8"))
        except:
            return None
    return None

def _cache_set(key: str, data: dict):
    p = CACHE_DIR / (re.sub(r'[^a-zA-Z0-9_.-]+','_', key) + ".json")
    p.write_text(json.dumps(data), encoding="utf-8")


def _api_get(path: str, params: Dict | None = None, retries: int = 3):
    if params is None:
        params = {}
    bearer = os.getenv("TMDB_API_TOKEN")
    headers = {"Authorization": f"Bearer {bearer}", "Content-Type": "application/json;charset=utf-8"}

    # Cache-Key bauen
    key = path + "?" + "&".join(f"{k}={v}" for k,v in sorted(params.items()))
    hit = _cache_get(key)
    if hit is not None:
        return hit

    for attempt in range(retries):
        r = requests.get(f"{TMDB_API}{path}", headers=headers, params=params, timeout=20)
        if r.status_code == 200:
            data = r.json()
            _cache_set(key, data)
            return data
        if r.status_code in (429, 500, 502, 503, 504):
            time.sleep(1.5 * (attempt + 1))
            continue
        raise RuntimeError(f"TMDB error {r.status_code}: {r.text}")
    raise RuntimeError("TMDB temporäre Fehler nach Retries.")

def parse_tmdb_ref_movies_only(s: str):
    """
    Extrahiert NUR Movie-IDs: return ("movie", id) oder (None, None).
    - URLs mit /tv/… werden bewusst ignoriert.
    - Reine Zahlen werden als Movie interpretiert.
    """
    if s is None or (isinstance(s, float) and pd.isna(s)):
        return None, None
    s = str(s)
    m = re.search(r"/(movie|tv)/(\d+)", s)
    if m:
        kind, sid = m.group(1), int(m.group(2))
        if kind == "movie":
            return "movie", sid
        return None, None  # tv ignorieren
    if s.isdigit():
        return "movie", int(s)
    m = re.search(r"(\d+)", s)
    return ("movie", int(m.group(1))) if m else (None, None)

@dataclass
class MovieMeta:
    id: int
    title: str
    original_title: str | None
    release_year: int | None
    genres: Set[int]
    keywords: Set[int]
    cast_ids: Set[int]
    director_ids: Set[int]
    runtime: int | None
    language: str | None
    popularity: float | None
    vote_average: float | None

def _year_from(s: str | None) -> int | None:
    if not s: return None
    try: return int(s[:4])
    except: return None

def get_movie_meta(movie_id: int) -> MovieMeta:
    base = _api_get(f"/movie/{movie_id}", params={"append_to_response": "credits,keywords"})
    genres   = {g["id"] for g in (base.get("genres") or [])}
    # bei append_to_response liegen die Felder direkt im base:
    credits  = base.get("credits") or {}
    kw       = base.get("keywords") or {}
    keywords = {k["id"] for k in (kw.get("keywords") or [])}
    cast_ids = {c["id"] for c in (credits.get("cast") or [])[:20]}
    director_ids = {c["id"] for c in (credits.get("crew") or []) if c.get("job") == "Director"}

    def _year(s):
        return int(s[:4]) if s and len(s) >= 4 else None

    return MovieMeta(
        id=movie_id,
        title=base.get("title") or str(movie_id),
        original_title=base.get("original_title"),
        release_year=_year(base.get("release_date")),
        genres=genres,
        keywords=keywords,
        cast_ids=cast_ids,
        director_ids=director_ids,
        runtime=base.get("runtime"),
        language=base.get("original_language"),
        popularity=base.get("popularity"),
        vote_average=base.get("vote_average"),
    )

def gather_candidates_movies_only(seed_movie_ids: List[int], pages_per_seed: int=1,
                                  per_seed_limit: int=10, global_limit: int=1500) -> set[int]:
    cands: list[int] = []
    seen = set()
    for sid in seed_movie_ids:
        added_for_seed = 0
        for p in range(1, pages_per_seed+1):
            data = _api_get(f"/movie/{sid}/similar", params={"page": p})
            for m in data.get("results", []):
                mid = m.get("id")
                if not mid or mid in seen:
                    continue
                cands.append(int(mid))
                seen.add(int(mid))
                added_for_seed += 1
                if per_seed_limit and added_for_seed >= per_seed_limit:
                    break
            if per_seed_limit and added_for_seed >= per_seed_limit:
                break
        if global_limit and len(cands) >= global_limit:
            break
    # Seeds entfernen
    return set(cands) - set(seed_movie_ids)

## 4) Similarity & Embeddings (unverändert nützlich)

In [13]:
def jaccard(a: Set, b: Set) -> float:
    if not a and not b:
        return 0.0
    return len(a & b) / (len(a | b) or 1)

def sim_runtime(a: int | None, b: int | None) -> float:
    if not a or not b:
        return 0.0
    return math.exp(-(abs(a-b)**2) / (2 * 30**2))

def sim_numeric(a: float | None, b: float | None, maxdiff: float) -> float:
    if a is None or b is None:
        return 0.0
    return max(0.0, 1.0 - abs(a-b)/maxdiff)

def sim_language(a: str | None, b: str | None) -> float:
    if not a or not b:
        return 0.0
    return 1.0 if a == b else 0.0

def metadata_similarity(seed: MovieMeta, cand: MovieMeta, weights: Dict[str,float]):
    comps = {
        "genres": jaccard(seed.genres, cand.genres),
        "keywords": jaccard(seed.keywords, cand.keywords),
        "cast": jaccard(seed.cast_ids, cand.cast_ids),
        "director": jaccard(seed.director_ids, cand.director_ids),
        "runtime": sim_runtime(seed.runtime, cand.runtime),
        "language": sim_language(seed.language, cand.language),
        "popularity": sim_numeric(seed.popularity, cand.popularity, 50.0),
        "vote": sim_numeric(seed.vote_average, cand.vote_average, 4.0),
    }
    return sum(weights[k]*comps[k] for k in comps), comps

def load_entity_embeddings(path: str):
    df = pd.read_csv(path)
    name_col = "Unnamed: 0" if "Unnamed: 0" in df.columns else df.columns[0]
    names = df[name_col].astype(str).tolist()
    vecs = df.drop(columns=[name_col]).to_numpy(float)
    vecs /= (np.linalg.norm(vecs, axis=1, keepdims=True)+1e-9)
    return {names[i].strip(): vecs[i] for i in range(len(names))}, vecs

def cosine(a, b):
    return float(np.dot(a, b)) if a is not None and b is not None else 0.0

def normalize_title(t: str) -> str:
    return re.sub(r"\s+", " ", (t or "").strip()).lower()

def find_embedding_for_title(table, title, year=None):
    if not title:
        return None
    if title in table:
        return table[title]
    norm = normalize_title(title)
    for k,v in table.items():
        if normalize_title(k) == norm:
            return v
    if year is not None:
        key = f"{title} ({year})"
        if key in table:
            return table[key]
    return None

## 5) Seeds laden (nur Movies), Kandidaten holen, Scoring, Top-K anzeigen & speichern

In [14]:
# 5.1 Seeds (nur Movies)
if EXPLICIT_SEED_MOVIE_IDS:
    seed_movie_ids = list(dict.fromkeys([int(x) for x in EXPLICIT_SEED_MOVIE_IDS]))
else:
    df_enr = pd.read_csv(ENRICHED_CSV)
    raw = df_enr.get("tmdb_url", pd.Series(dtype=str)).dropna().tolist()
    seed_movie_ids = []
    for s in raw:
        kind, sid = parse_tmdb_ref_movies_only(s)
        if kind == "movie" and sid:
            seed_movie_ids.append(sid)
    seed_movie_ids = list(dict.fromkeys(seed_movie_ids))

assert seed_movie_ids, "Keine Movie-Seeds gefunden (alle /tv/ wurden ignoriert)."
print(f"Movie-Seeds: {len(seed_movie_ids)}")

# 5.2 Seed-Metadaten
seeds_meta = []
for sid in seed_movie_ids:
    try:
        seeds_meta.append(get_movie_meta(sid))
    except RuntimeError as e:
        print(f"WARN: Seed /movie/{sid} übersprungen: {e}")
assert seeds_meta, "Alle Seeds fielen raus. Prüfe tmdb_url und TMDB_API_TOKEN."

# 5.3 Embedding laden & User-Zentroid
emb_table, _ = load_entity_embeddings(ENTITY_EMB)
seed_vecs = []
for sm in seeds_meta:
    v = find_embedding_for_title(emb_table, sm.title, sm.release_year)
    if v is None:
        v = find_embedding_for_title(emb_table, sm.original_title, sm.release_year)
    if v is not None:
        seed_vecs.append(v)
user_vec = None
if seed_vecs:
    user_vec = np.mean(np.vstack(seed_vecs), axis=0)
    user_vec /= (np.linalg.norm(user_vec)+1e-9)
else:
    print("WARN: Keine Seed-Embeddings gefunden – Cosine fällt auf 0.")

# 5.4 Kandidaten (nur Movies)
cand_ids = gather_candidates_movies_only(seed_movie_ids, pages_per_seed=PAGES,
                                         per_seed_limit=10, global_limit=1500)
print(f"Kandidaten-Pool (Movies): {len(cand_ids)}")

# 5.5 Scoring
rows = []
for cid in cand_ids:
    try:
        cm = get_movie_meta(cid)
    except RuntimeError as e:
        print(f"WARN: skip /movie/{cid}: {e}")
        continue
    best_meta = 0.0
    best_seed_title = None
    comps_keep = None
    for sm in seeds_meta:
        s, comps = metadata_similarity(sm, cm, WEIGHTS)
        if s > best_meta:
            best_meta = s
            best_seed_title = sm.title
            comps_keep = comps
    cand_vec = find_embedding_for_title(emb_table, cm.title, cm.release_year)
    if cand_vec is None:
        cand_vec = find_embedding_for_title(emb_table, cm.original_title, cm.release_year)
    cos = cosine(user_vec, cand_vec) if (user_vec is not None and cand_vec is not None) else 0.0
    final = ALPHA * cos + (1 - ALPHA) * best_meta
    rows.append({
        "candidate_id": cid,
        "candidate_title": cm.title,
        "year": cm.release_year,
        "cos": round(cos,4),
        "meta": round(best_meta,4),
        "final": round(final,4),
        "seed": best_seed_title,
        **{f"comp_{k}": round(comps_keep[k],4) for k in (comps_keep or {})}
    })

df_out = pd.DataFrame(rows).sort_values("final", ascending=False).head(TOPK)
df_out

Movie-Seeds: 694
Kandidaten-Pool (Movies): 1444


Unnamed: 0,candidate_id,candidate_title,year,cos,meta,final,seed,comp_genres,comp_keywords,comp_cast,comp_director,comp_runtime,comp_language,comp_popularity,comp_vote
219,929,Godzilla,1998.0,0.3688,0.5221,0.4301,The Day After Tomorrow,0.75,0.0189,0.0000,1.0,0.8825,1.0,0.9615,0.7732
1134,11797,Fright Night,1985.0,0.3051,0.4600,0.3671,Fright Night,1.00,0.0769,0.0278,0.0,0.9994,1.0,0.9916,0.7908
1433,262097,Trio,1997.0,0.3612,0.3630,0.3619,Seven Psychopaths,1.00,0.0000,0.0000,0.0,0.9731,0.0,0.9183,0.3692
1152,11868,Dracula,1958.0,0.2846,0.4735,0.3601,Dracula,1.00,0.1364,0.0000,0.0,0.9651,1.0,0.9900,0.9695
468,1498,Teenage Mutant Ninja Turtles,1990.0,0.2999,0.4414,0.3565,Teenage Mutant Ninja Turtles,0.80,0.3182,0.0000,0.0,0.9651,1.0,0.7857,0.8055
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
525,1662,State of Grace,1990.0,0.0000,0.4478,0.1791,Uncut Gems,1.00,0.0303,0.0000,0.0,0.9994,1.0,0.8853,0.9500
685,1979,Fantastic Four: Rise of the Silver Surfer,2007.0,0.0000,0.4478,0.1791,Venom: Let There Be Carnage,1.00,0.0789,0.0000,0.0,0.9862,1.0,0.9405,0.7135
300,9366,Donnie Brasco,1997.0,0.0000,0.4477,0.1791,Inside Man,1.00,0.0278,0.0000,0.0,0.9978,1.0,0.8796,0.9650
583,1878,Fear and Loathing in Las Vegas,1998.0,0.0000,0.4476,0.1790,The Life Aquatic with Steve Zissou,1.00,0.0000,0.0000,0.0,0.9994,1.0,0.9858,0.9663


## 6) Ergebnisse speichern

In [15]:
OUT_CSV = f"{PROJECT_DIR}/data/kg/rerank_with_embedding_results.csv"
df_out.to_csv(OUT_CSV, index=False)
print(f"Gespeichert: {OUT_CSV}")

Gespeichert: ../data/kg/rerank_with_embedding_results.csv


In [16]:
'''
The code in the previous cells is in big parts AI generated by the free and paid version of ChatGPT and was afterwards heavily adapted by me. Since it is not possible to accurately say which parts were originaly AI generated by wich promt, I have included all prompts that were used on this file here.
These following prompts were used:


    "im file kg_recommender_from_pykeen möchte ich anstelle von selbst eingegeben daten nun ähnliche filme zu den filmen in den triplen aus der tmdb datenbank laden, und für diese filme einen similarity score erstellen. danach möchte ich 5 filme mit dem besten similarity score empfehlen"

    "import re, time, math
        from typing import Dict, List, Set
        from dataclasses import dataclass
        from dotenv import load_dotenv

        import pandas as pd
        import numpy as np
        import requests

        load_dotenv()

        TMDB_API_TOKEN = "https://api.themoviedb.org/3"
        HEADERS = {
            "Authorization": "Bearer " + TMDB_API_TOKEN,
            "Content-Type": "application/json;charset=utf-8"
        }

        def _api_get(path: str, params: Dict=None, retries: int=3):
            if params is None:
                params = {}
            key = os.getenv("TMDB_API_TOKEN")
            if not key:
                raise RuntimeError("TMDB_API_TOKEN ist nicht gesetzt.")
            params = {**params, "api_key": key}
            for attempt in range(retries):
                r = requests.get(f"{TMDB_API_TOKEN}{path}",headers=HEADERS, params=params, timeout=20)
                if r.status_code == 200:
                    return r.json()
                if r.status_code in (429, 500, 502, 503, 504):
                    time.sleep(1.5 * (attempt + 1))
                    continue
                raise RuntimeError(f"TMDB error {r.status_code}: {r.text[:200]}")
            raise RuntimeError("TMDB temporäre Fehler nach Retries.")

        def parse_tmdb_id(s: str) -> int | None:
            if pd.isna(s):
                return None
            s = str(s)
            m = re.search(r"/movie/(\d+)", s)
            if m:
                return int(m.group(1))
            if s.isdigit():
                return int(s)
            m = re.search(r"(\d+)", s)
            return int(m.group(1)) if m else None

        @dataclass
        class MovieMeta:
            id: int
            title: str
            original_title: str | None
            release_year: int | None
            genres: Set[int]
            keywords: Set[int]
            cast_ids: Set[int]
            director_ids: Set[int]
            runtime: int | None
            language: str | None
            popularity: float | None
            vote_average: float | None

        def get_movie_meta(movie_id: int) -> MovieMeta:
            base = _api_get(f"/movie/{movie_id}")
            credits = _api_get(f"/movie/{movie_id}/credits")
            kw = _api_get(f"/movie/{movie_id}/keywords")
            genres = {g["id"] for g in (base.get("genres") or [])}
            keywords = {k["id"] for k in (kw.get("keywords") or [])}
            cast_ids = {c["id"] for c in (credits.get("cast") or [])[:20]}
            director_ids = {c["id"] for c in (credits.get("crew") or []) if c.get("job") == "Director"}
            y = None
            if base.get("release_date"):
                try: y = int(base["release_date"][:4])
                except: pass
            return MovieMeta(
                id=movie_id,
                title=base.get("title") or str(movie_id),
                original_title=base.get("original_title"),
                release_year=y,
                genres=genres,
                keywords=keywords,
                cast_ids=cast_ids,
                director_ids=director_ids,
                runtime=base.get("runtime"),
                language=base.get("original_language"),
                popularity=base.get("popularity"),
                vote_average=base.get("vote_average"),
            )

        def gather_candidates(seed_ids: List[int], pages_per_seed: int=2) -> Set[int]:
            cands: Set[int] = set()
            for sid in seed_ids:
                for p in range(1, pages_per_seed+1):
                    data = _api_get(f"/movie/{sid}/similar", params={"page": p})
                    for m in data.get("results", []):
                        if m.get("id"):
                            cands.add(m["id"])
            return cands - set(seed_ids)

        def jaccard(a: Set, b: Set) -> float:
            if not a and not b:
                return 0.0
            return len(a & b) / (len(a | b) or 1)

        def sim_runtime(a: int | None, b: int | None) -> float:
            if not a or not b:
                return 0.0
            return math.exp(-(abs(a-b)**2) / (2 * 30**2))

        def sim_numeric(a: float | None, b: float | None, maxdiff: float) -> float:
            if a is None or b is None:
                return 0.0
            return max(0.0, 1.0 - abs(a-b)/maxdiff)

        def sim_language(a: str | None, b: str | None) -> float:
            if not a or not b:
                return 0.0
            return 1.0 if a == b else 0.0

        def metadata_similarity(seed: MovieMeta, cand: MovieMeta, weights: Dict[str,float]):
            comps = {
                "genres": jaccard(seed.genres, cand.genres),
                "keywords": jaccard(seed.keywords, cand.keywords),
                "cast": jaccard(seed.cast_ids, cand.cast_ids),
                "director": jaccard(seed.director_ids, cand.director_ids),
                "runtime": sim_runtime(seed.runtime, cand.runtime),
                "language": sim_language(seed.language, cand.language),
                "popularity": sim_numeric(seed.popularity, cand.popularity, 50.0),
                "vote": sim_numeric(seed.vote_average, cand.vote_average, 4.0),
            }
            return sum(weights[k]*comps[k] for k in comps), comps

        def load_entity_embeddings(path: str):
            df = pd.read_csv(path)
            name_col = "Unnamed: 0" if "Unnamed: 0" in df.columns else df.columns[0]
            names = df[name_col].astype(str).tolist()
            vecs = df.drop(columns=[name_col]).to_numpy(float)
            vecs /= (np.linalg.norm(vecs, axis=1, keepdims=True)+1e-9)
            return {names[i].strip(): vecs[i] for i in range(len(names))}, vecs

        def cosine(a, b):
            return float(np.dot(a, b)) if a is not None and b is not None else 0.0

        def normalize_title(t: str) -> str:
            return re.sub(r"\s+", " ", (t or "").strip()).lower()

        def find_embedding_for_title(table, title, year=None):
            if not title:
                return None
            if title in table:
                return table[title]
            norm = normalize_title(title)
            for k,v in table.items():
                if normalize_title(k) == norm:
                    return v
            if year is not None:
                key = f"{title} ({year})"
                if key in table:
                    return table[key]
            return None

        i altered this file slightly to try to resolve this error:
        ---------------------------------------------------------------------------
        RuntimeError                              Traceback (most recent call last)
        Cell In[14], line 9
              6 seed_ids = list(dict.fromkeys(seed_ids))
              7 assert seed_ids, "Keine Seeds gefunden."
        ----> 9 seeds_meta = [get_movie_meta(sid) for sid in seed_ids]
             11 emb_table,_ = load_entity_embeddings(ENTITY_EMB)
             12 seed_vecs=[find_embedding_for_title(emb_table, sm.title, sm.release_year) for sm in seeds_meta]

        Cell In[13], line 63, in get_movie_meta(movie_id)
             62 def get_movie_meta(movie_id: int) -> MovieMeta:
        ---> 63     base = _api_get(f"/movie/{movie_id}")
             64     credits = _api_get(f"/movie/{movie_id}/credits")
             65     kw = _api_get(f"/movie/{movie_id}/keywords")

        Cell In[13], line 32, in _api_get(path, params, retries)
             30         time.sleep(1.5 * (attempt + 1))
             31         continue
        ---> 32     raise RuntimeError(f"TMDB error {r.status_code}: {r.text[:200]}")
             33 raise RuntimeError("TMDB temporäre Fehler nach Retries.")

        RuntimeError: TMDB error 401: {"status_code":7,"status_message":"Invalid API key: You must be granted a valid key.","success":false}

        unfortunately, it didnt work"

    "---------------------------------------------------------------------------
        RuntimeError                              Traceback (most recent call last)
        Cell In[21], line 9
              6 seed_ids = list(dict.fromkeys(seed_ids))
              7 assert seed_ids, "Keine Seeds gefunden."
        ----> 9 seeds_meta = [get_movie_meta(sid) for sid in seed_ids]
             11 emb_table,_ = load_entity_embeddings(ENTITY_EMB)
             12 seed_vecs=[find_embedding_for_title(emb_table, sm.title, sm.release_year) for sm in seeds_meta]

        Cell In[19], line 68, in get_movie_meta(movie_id)
             67 def get_movie_meta(movie_id: int) -> MovieMeta:
        ---> 68     base = _api_get(f"/movie/{movie_id}")
             69     credits = _api_get(f"/movie/{movie_id}/credits")
             70     kw = _api_get(f"/movie/{movie_id}/keywords")

        Cell In[19], line 36, in _api_get(path, params, retries)
             34         time.sleep(1.5 * (attempt + 1))
             35         continue
        ---> 36     raise RuntimeError(f"TMDB error {r.status_code}: {r.text}")
             38 raise RuntimeError("TMDB temporäre Fehler nach Retries.")

        RuntimeError: TMDB error 404: {"success":false,"status_code":34,"status_message":"The resource you requested could not be found."}

        was tu ich mit dieser fehlermeldung?

        das ist der aktuelle stand meines codes:
        import re, time, math
        from typing import Dict, List, Set
        from dataclasses import dataclass
        from dotenv import load_dotenv

        import pandas as pd
        import numpy as np
        import requests
        import os

        load_dotenv()

        TMDB_API = "https://api.themoviedb.org/3"

        def _api_get(path: str, params: Dict | None = None, retries: int = 3):
            if params is None:
                params = {}

            bearer = os.getenv("TMDB_API_TOKEN")  # <-- genau dein Name aus .env
            if not bearer:
                raise RuntimeError("TMDB_API_TOKEN ist nicht gesetzt (v4 Bearer-Token erwartet).")

            headers = {
                "Authorization": f"Bearer {bearer}",
                "Content-Type": "application/json;charset=utf-8",
            }
            # WICHTIG: KEIN api_key Query-Parameter bei Bearer-Auth!

            for attempt in range(retries):
                r = requests.get(f"{TMDB_API}{path}", headers=headers, params=params, timeout=20)
                if r.status_code == 200:
                    return r.json()
                if r.status_code in (429, 500, 502, 503, 504):
                    time.sleep(1.5 * (attempt + 1))
                    continue
                raise RuntimeError(f"TMDB error {r.status_code}: {r.text}")

            raise RuntimeError("TMDB temporäre Fehler nach Retries.")

        def parse_tmdb_id(s: str) -> int | None:
            if pd.isna(s):
                return None
            s = str(s)
            m = re.search(r"/movie/(\d+)", s)
            if m:
                return int(m.group(1))
            if s.isdigit():
                return int(s)
            m = re.search(r"(\d+)", s)
            return int(m.group(1)) if m else None

        @dataclass
        class MovieMeta:
            id: int
            title: str
            original_title: str | None
            release_year: int | None
            genres: Set[int]
            keywords: Set[int]
            cast_ids: Set[int]
            director_ids: Set[int]
            runtime: int | None
            language: str | None
            popularity: float | None
            vote_average: float | None

        def get_movie_meta(movie_id: int) -> MovieMeta:
            base = _api_get(f"/movie/{movie_id}")
            credits = _api_get(f"/movie/{movie_id}/credits")
            kw = _api_get(f"/movie/{movie_id}/keywords")
            genres = {g["id"] for g in (base.get("genres") or [])}
            keywords = {k["id"] for k in (kw.get("keywords") or [])}
            cast_ids = {c["id"] for c in (credits.get("cast") or [])[:20]}
            director_ids = {c["id"] for c in (credits.get("crew") or []) if c.get("job") == "Director"}
            y = None
            if base.get("release_date"):
                try: y = int(base["release_date"][:4])
                except: pass
            return MovieMeta(
                id=movie_id,
                title=base.get("title") or str(movie_id),
                original_title=base.get("original_title"),
                release_year=y,
                genres=genres,
                keywords=keywords,
                cast_ids=cast_ids,
                director_ids=director_ids,
                runtime=base.get("runtime"),
                language=base.get("original_language"),
                popularity=base.get("popularity"),
                vote_average=base.get("vote_average"),
            )

        def gather_candidates(seed_ids: List[int], pages_per_seed: int=2) -> Set[int]:
            cands: Set[int] = set()
            for sid in seed_ids:
                for p in range(1, pages_per_seed+1):
                    data = _api_get(f"/movie/{sid}/similar", params={"page": p})
                    for m in data.get("results", []):
                        if m.get("id"):
                            cands.add(m["id"])
            return cands - set(seed_ids)

        def jaccard(a: Set, b: Set) -> float:
            if not a and not b:
                return 0.0
            return len(a & b) / (len(a | b) or 1)

        def sim_runtime(a: int | None, b: int | None) -> float:
            if not a or not b:
                return 0.0
            return math.exp(-(abs(a-b)**2) / (2 * 30**2))

        def sim_numeric(a: float | None, b: float | None, maxdiff: float) -> float:
            if a is None or b is None:
                return 0.0
            return max(0.0, 1.0 - abs(a-b)/maxdiff)

        def sim_language(a: str | None, b: str | None) -> float:
            if not a or not b:
                return 0.0
            return 1.0 if a == b else 0.0

        def metadata_similarity(seed: MovieMeta, cand: MovieMeta, weights: Dict[str,float]):
            comps = {
                "genres": jaccard(seed.genres, cand.genres),
                "keywords": jaccard(seed.keywords, cand.keywords),
                "cast": jaccard(seed.cast_ids, cand.cast_ids),
                "director": jaccard(seed.director_ids, cand.director_ids),
                "runtime": sim_runtime(seed.runtime, cand.runtime),
                "language": sim_language(seed.language, cand.language),
                "popularity": sim_numeric(seed.popularity, cand.popularity, 50.0),
                "vote": sim_numeric(seed.vote_average, cand.vote_average, 4.0),
            }
            return sum(weights[k]*comps[k] for k in comps), comps

        def load_entity_embeddings(path: str):
            df = pd.read_csv(path)
            name_col = "Unnamed: 0" if "Unnamed: 0" in df.columns else df.columns[0]
            names = df[name_col].astype(str).tolist()
            vecs = df.drop(columns=[name_col]).to_numpy(float)
            vecs /= (np.linalg.norm(vecs, axis=1, keepdims=True)+1e-9)
            return {names[i].strip(): vecs[i] for i in range(len(names))}, vecs

        def cosine(a, b):
            return float(np.dot(a, b)) if a is not None and b is not None else 0.0

        def normalize_title(t: str) -> str:
            return re.sub(r"\s+", " ", (t or "").strip()).lower()

        def find_embedding_for_title(table, title, year=None):
            if not title:
                return None
            if title in table:
                return table[title]
            norm = normalize_title(title)
            for k,v in table.items():
                if normalize_title(k) == norm:
                    return v
            if year is not None:
                key = f"{title} ({year})"
                if key in table:
                    return table[key]
            return None"

    "ich möchte folgende änderungen: für alle serien in tmdb_url soll einfach gar kein tmdb endpoint abgefragt werden"

    "---------------------------------------------------------------------------
        ValueError                                Traceback (most recent call last)
        Cell In[7], line 30
             28 seed_vecs = []
             29 for sm in seeds_meta:
        ---> 30     v = (find_embedding_for_title(emb_table, sm.title, sm.release_year) or
             31          find_embedding_for_title(emb_table, sm.original_title, sm.release_year))
             32     if v is not None:
             33         seed_vecs.append(v)

        ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()"

    "Movie-Seeds: 693
        Kandidaten-Pool (Movies): 6200

        wie kommt diese hohe anzahl an Kandiaten zustande? der code lief fast 2h lang"



'''

'\nThe code in the previous cells is in big parts AI generated by the free and paid version of ChatGPT and was afterwards heavily adapted by me. Since it is not possible to accurately say which parts were originaly AI generated by wich promt, I have included all prompts that were used on this file here.\nThese following prompts were used:\n\n\n    "im file kg_recommender_from_pykeen möchte ich anstelle von selbst eingegeben daten nun ähnliche filme zu den filmen in den triplen aus der tmdb datenbank laden, und für diese filme einen similarity score erstellen. danach möchte ich 5 filme mit dem besten similarity score empfehlen"\n\n    "import re, time, math\n        from typing import Dict, List, Set\n        from dataclasses import dataclass\n        from dotenv import load_dotenv\n\n        import pandas as pd\n        import numpy as np\n        import requests\n\n        load_dotenv()\n\n        TMDB_API_TOKEN = "https://api.themoviedb.org/3"\n        HEADERS = {\n            "Autho