# Film-KG — TSV-only Pipeline + Rekursive Traversierung

Dieses Notebook:
1) Lädt deine `movie_kg_triples.tsv`.
2) **Backup** der TSV-Datei anlegen.
3) Vorverarbeitung: kanonisierte Character-URIs `ex:char/<slug>` + `ex:featuresCharacter` **ins TSV anhängen** (und in den In-Memory-Graph einfügen).
4) Ableitung:
   - `ex:SAME_UNIVERSE` (Self-Join über Character) **nur TSV-Append**
   - `ex:CREATIVE_PAIR` (Director×Actor ≥2) **nur TSV-Append**
5) **Rekursive Traversierung** über `ex:SAME_UNIVERSE` (SPARQL Property Path **und** Python-BFS).

## 0) Setup & Konfiguration

In [1]:
!python -c "import rdflib" 2>/dev/null || pip -q install rdflib==7.0.0
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD
from pathlib import Path
from collections import defaultdict
from itertools import combinations
import shutil, re
from datetime import datetime

DATA_PATH = Path("../data/kg/triples/movie_kg_triples.tsv")  # bestehende KG-Datei (TSV)
OUT_DIR = Path("../data/kg/triples"); OUT_DIR.mkdir(parents=True, exist_ok=True)
BATCH_SIZE = 50000  # Tripel je Ausgabedatei

SCHEMA = Namespace("http://schema.org/")
EX     = Namespace("http://example.org/")
CHAR_NS = Namespace(str(EX) + "char/")

print("Data:", DATA_PATH.resolve())
print("Output:", OUT_DIR.resolve())


Data: /Users/tschaffel/PycharmProjects/letterboxd-KG/data/kg/triples/movie_kg_triples.tsv
Output: /Users/tschaffel/PycharmProjects/letterboxd-KG/data/kg/triples


## 1) Daten laden (robuster TSV-Parser)

In [2]:
g = Graph(); g.bind("schema", SCHEMA); g.bind("ex", EX); g.bind("rdf", RDF)
prefix_map = {"schema": str(SCHEMA), "rdf": str(RDF), "rdfs": str(RDFS), "xsd": str(XSD), "ex": str(EX)}

def parse_term(term: str):
    term = term.strip()
    if len(term) >= 2 and term[0] == '"' and term[-1] == '"':
        return Literal(term[1:-1])
    if term.startswith("http://") or term.startswith("https://"):
        return URIRef(term)
    if ":" in term:
        pfx, local = term.split(":", 1)
        if pfx in prefix_map:
            return URIRef(prefix_map[pfx] + local)
    return Literal(term)

count = 0
with DATA_PATH.open("r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        parts = line.split("\t")
        if len(parts) != 3:
            continue
        s, p, o = map(parse_term, parts)
        g.add((s, p, o))
        count += 1
print("Geladene Tripel:", count)
print("Beispiel-Tripel:")
for i, (s,p,o) in enumerate(g):
    print("-", s, p, o)
    if i >= 4: break


Geladene Tripel: 58601
Beispiel-Tripel:
- person57551 http://schema.org/name Mario Cantone
- movie10567 http://schema.org/aggregateRating avgVote_6.548
- movie9928 http://schema.org/genre genre35
- movie809 http://schema.org/character Page / Elf / Nobleman / Nobleman's Son (voice)
- person5649 http://www.w3.org/1999/02/22-rdf-syntax-ns#type http://schema.org/Person


## 2) Backup & TSV-Append-Helfer

In [3]:
BACKUP_DIR = OUT_DIR / "backups"; BACKUP_DIR.mkdir(parents=True, exist_ok=True)
BACKUP_PATH = BACKUP_DIR / f"movie_kg_triples_backup_{datetime.now().strftime('%Y%m%d_%H%M%S')}.tsv"

# Backup anlegen
shutil.copy2(DATA_PATH, BACKUP_PATH)
print("Backup gespeichert:", BACKUP_PATH)

def term_to_str(term):
    if isinstance(term, URIRef):
        # Versuche Prefix-Kurzform
        for pfx, ns in prefix_map.items():
            if str(term).startswith(ns):
                return f"{pfx}:{str(term)[len(ns):]}"
        return str(term)
    elif isinstance(term, Literal):
        s = str(term).replace('"', '\"')
        return f'"{s}"'
    else:
        return str(term)

def append_triples_tsv(triples):
    with DATA_PATH.open("a", encoding="utf-8") as f:
        for (s,p,o) in triples:
            f.write(term_to_str(s) + "\t" + term_to_str(p) + "\t" + term_to_str(o) + "\n")


Backup gespeichert: ../data/kg/triples/backups/movie_kg_triples_backup_20250915_025203.tsv


## 3) Vorverarbeitung: Character-URIs + `ex:featuresCharacter` (TSV-only)

In [4]:
canon_re = re.compile(r"\s*\([^)]*\)")
def canonize(text: str) -> str:
    s = canon_re.sub("", text)
    return s.strip().lower()

def slugify(text: str) -> str:
    t = re.sub(r"[^a-z0-9]+", "-", text)
    t = re.sub(r"-+", "-", t).strip("-")
    return t or "x"

char_uri_by_canon = {}
added_nodes = 0; added_edges = 0

new_feature_triples = []  # für optionales Mitschreiben in TSV

for f, _, ch in g.triples((None, SCHEMA.character, None)):
    c = canonize(str(ch))
    if not c:
        continue
    uri = char_uri_by_canon.get(c)
    if uri is None:
        uri = URIRef(CHAR_NS + slugify(c))
        char_uri_by_canon[c] = uri
        if (uri, RDF.type, EX.Character) not in g:
            g.add((uri, RDF.type, EX.Character))
            g.add((uri, EX.canonName, Literal(c)))
            new_feature_triples.append((uri, RDF.type, EX.Character))
            new_feature_triples.append((uri, EX.canonName, Literal(c)))
            added_nodes += 1
    if (f, EX.featuresCharacter, uri) not in g:
        g.add((f, EX.featuresCharacter, uri))
        new_feature_triples.append((f, EX.featuresCharacter, uri))
        added_edges += 1

print("Neue Character-Knoten:", added_nodes)
print("Neue featuresCharacter-Kanten:", added_edges)

# Optional: diese neuen Vorverarbeitungs-Tripel direkt ins TSV anhängen
#if new_feature_triples:
#    append_triples_tsv(new_feature_triples)
#    print("Vorverarbeitungs-Tripel ins TSV angehängt:", len(new_feature_triples))


Neue Character-Knoten: 5392
Neue featuresCharacter-Kanten: 6645


## 4) SAME_UNIVERSE (Self-Join über Character) — **TSV-only**

In [5]:
from itertools import combinations
films_by_char = defaultdict(list)
for f, _, c in g.triples((None, EX.featuresCharacter, None)):
    films_by_char[c].append(f)

new_su_triples = []
seen = set()
for char_uri, films in films_by_char.items():
    if len(films) < 2:
        continue
    films_sorted = sorted(set(films), key=str)
    for f1, f2 in combinations(films_sorted, 2):
        key = (str(f1), str(f2))
        if key in seen:
            continue
        seen.add(key)
        triple = (f1, EX.sameUniverse, f2)
        new_su_triples.append(triple)
        g.add(triple)  # auch in-memory für Traversierung

append_triples_tsv(new_su_triples)
print("SAME_UNIVERSE Tripel angehängt:", len(new_su_triples))


SAME_UNIVERSE Tripel angehängt: 1576


## 5) CREATIVE_PAIR (Director×Actor ≥2) — **TSV-only**

In [6]:
pair_counts = defaultdict(int)
for f, _, d in g.triples((None, SCHEMA.director, None)):
    for _, _, a in g.triples((f, SCHEMA.actor, None)):
        pair_counts[(d, a)] += 1

new_cp_triples = []
for (d, a), n in pair_counts.items():
    if n >= 2:
        new_cp_triples.append((d, EX.creativePair, a))
        new_cp_triples.append((d, EX.creativePairRoles, Literal("Director,Actor")))
        new_cp_triples.append((d, EX.creativePairCount, Literal(n)))
        g.add((d, EX.creativePair, a))
        g.add((d, EX.creativePairRoles, Literal("Director,Actor")))
        g.add((d, EX.creativePairCount, Literal(n)))

append_triples_tsv(new_cp_triples)
print("CREATIVE_PAIR Tripel angehängt:", len(new_cp_triples))


CREATIVE_PAIR Tripel angehängt: 1644


## 6) Rekursive Traversierung über `ex:SAME_UNIVERSE`
Wir bieten **zwei Wege**:

**A) SPARQL Property Path** (`ex:SAME_UNIVERSE+`) — bequem, wenn du einen Seed kennst.

**B) Python-BFS** — schnell und vielseitig (z. B. mit Tiefenlimit).

In [None]:
# TODO

### 6A) SPARQL Property Path

In [16]:
def sparql_same_universe(seed_uri_str: str, limit=200):
    seed = URIRef(seed_uri_str) if seed_uri_str.startswith("http") else URIRef(str(EX) + seed_uri_str)
    q = (
        "PREFIX ex: <http://example.org/>\n"
        "SELECT DISTINCT ?g WHERE {\n"
        f"  VALUES ?seed {{ <{seed}> }}\n"
        "  ?seed ex:sameUniverse+ ?g .\n"
        "} LIMIT " + str(limit)
    )
    return [str(row[0]) for row in g.query(q)]

# Beispiel (anpassen!):
results = sparql_same_universe("movie181808")
print(len(results), "Filme im selben Universe")
results[:10]


0 Filme im selben Universe


[]

### 6B) Python-BFS (mit optionalem Tiefenlimit)

In [17]:
from collections import deque

def bfs_same_universe(seed_uri_str: str, max_hops=None):
    seed = URIRef(seed_uri_str) if seed_uri_str.startswith("http") else URIRef(str(EX) + seed_uri_str)
    # Adjazenz (beide Richtungen behandeln, falls du die Kante nur einseitig erzeugst)
    adj = defaultdict(set)
    for s, _, o in g.triples((None, EX.sameUniverse, None)):
        adj[s].add(o)
        adj[o].add(s)

    seen = {seed}
    q = deque([(seed, 0)])
    order = []
    while q:
        node, d = q.popleft()
        order.append((node, d))
        if max_hops is not None and d >= max_hops:
            continue
        for nei in adj.get(node, []):
            if nei not in seen:
                seen.add(nei)
                q.append((nei, d+1))
    return order

# Beispiel (anpassen!):
comp = bfs_same_universe("movie181808", max_hops=6)
print("Gefundene Filme:", len(comp))
[str(n) for n,_ in comp[:10]]


Gefundene Filme: 1


['http://example.org/movie181808']

**Fertig!** Dieses Notebook hängt nur an die TSV-Datei an (nach Backup) und erzeugt **keine** zusätzlichen `.nt`/`.ttl` Files. Für die Traversierung kannst du entweder SPARQL (Property Path) oder den BFS-Helfer verwenden.