
# LO2 with **Datalog**: Exclude Already-Watched Movies from Recommendations

We apply the logical rule in **Datalog** form:

> **Rule:** `recommendedFor(U, N, Y) :- candidateFor(U, N, Y) ∧ not watched(U, N, Y)`

This notebook tries to use `pyDatalog` for a declarative specification. If `pyDatalog` isn't available
in the current environment, it falls back to a tiny in-notebook Datalog-style evaluator that
implements the same rule for this specific use case.

**Inputs**
- `data/letterboxd_export/watched.csv` (must contain columns like `Name`, `Year`)
- `data/kg/tmdb_rerank_with_embedding_results_movies_only.csv` (may contain `candidate_title` instead of `Name`; we auto-detect)

**Outputs**
- `data/kg/rerank_filtered_by_LO2_datalog.csv` — filtered recommendations via Datalog
- (optional) `data/kg/recommended_materialized_datalog.ttl` — RDF triples `:recommendedFor`


In [1]:

import pandas as pd
from pathlib import Path

# Auto-detect project root (directory that contains 'data')
here = Path.cwd()
candidate = here
while candidate != candidate.parent and not (candidate / "data").exists():
    candidate = candidate.parent
project_root = candidate if (candidate / "data").exists() else Path("../logical")
print("Detected project_root:", project_root.resolve())

watched_path = project_root / "data" / "letterboxd_export" / "watched.csv"
candidates_path = project_root / "data" / "kg" / "tmdb_rerank_with_embedding_results_movies_only.csv"
out_csv = project_root / "data" / "kg" / "rerank_filtered_by_LO2_datalog.csv"


Detected project_root: /Users/tschaffel/PycharmProjects/letterboxd-KG


## Load & Normalize Data

In [2]:

watched = pd.read_csv(watched_path)
recs = pd.read_csv(candidates_path)

# Normalize column names
watched.columns = [c.lower() for c in watched.columns]
recs.columns = [c.lower() for c in recs.columns]

def pick(colnames, options):
    for o in options:
        if o in colnames:
            return o
    return None

# Pick columns for watched.csv (keep defaults but robust to variants)
watched_name_col = pick(watched.columns, ["name","film name","title"])
watched_year_col = pick(watched.columns, ["year","release year","release_year"])
assert watched_name_col is not None, "watched.csv must include a 'Name' (or 'Film Name'/'Title') column"
assert watched_year_col is not None, "watched.csv must include a 'Year' (or 'Release Year') column"

# Pick columns for candidates CSV (support 'candidate_title')
recs_name_col = pick(recs.columns, ["candidate_title","name","title","movie_title","original_title"])
recs_year_col = pick(recs.columns, ["year","release_year","candidate_year","releaseyear","year_x","year_y"])
assert recs_name_col is not None, "candidates CSV must include a candidate title column (e.g., 'candidate_title'/'Name'/'Title')"
assert recs_year_col is not None, "candidates CSV must include a year column (e.g., 'year'/'release_year')"

print("watched uses:", watched_name_col, "/", watched_year_col)
print("candidates use:", recs_name_col, "/", recs_year_col)

# Normalize values
watched["name_norm"] = watched[watched_name_col].astype(str).str.strip().str.lower()
watched["year_str"] = watched[watched_year_col].astype(str).str.extract(r"(\d{4})", expand=False).fillna(
    watched[watched_year_col].astype(str).str.strip()
)

recs["name_norm"] = recs[recs_name_col].astype(str).str.strip().str.lower()
recs["year_str"] = recs[recs_year_col].astype(str).str.extract(r"(\d{4})", expand=False).fillna(
    recs[recs_year_col].astype(str).str.strip()
)

watched_pairs = set(zip(watched["name_norm"], watched["year_str"]))
print("Loaded watched unique pairs:", len(watched_pairs))
print("Loaded candidate rows:", len(recs))
recs.head(3)


watched uses: name / year
candidates use: candidate_title / year
Loaded watched unique pairs: 754
Loaded candidate rows: 15


Unnamed: 0,candidate_id,candidate_title,year,cos,meta,final,seed,comp_genres,comp_keywords,comp_cast,comp_director,comp_runtime,comp_language,comp_popularity,comp_vote,name_norm,year_str
0,1924,Superman,1978.0,0.4339,0.4698,0.4483,Black Panther,1.0,0.1304,0.0,0.0,0.956,1.0,0.9719,0.9457,superman,1978
1,1498,Teenage Mutant Ninja Turtles,1990.0,0.3336,0.4414,0.3767,Teenage Mutant Ninja Turtles,0.8,0.3182,0.0,0.0,0.9651,1.0,0.7857,0.8055,teenage mutant ninja turtles,1990
2,11868,Dracula,1958.0,0.3113,0.4735,0.3762,Dracula,1.0,0.1364,0.0,0.0,0.9651,1.0,0.99,0.9695,dracula,1958


## Datalog Approach (pyDatalog if available)

In [3]:

use_pyDatalog = False
try:
    from pyDatalog import pyDatalog
    use_pyDatalog = True
    print("pyDatalog is available — using it.")
except Exception as e:
    print("pyDatalog not available, will use a tiny fallback evaluator. Error:", e)


pyDatalog is available — using it.


In [4]:

USER = "tobias"

if use_pyDatalog:
    from pyDatalog import pyDatalog
    pyDatalog.clear()
    pyDatalog.create_terms('watched, candidateFor, recommendedFor, U, N, Y')
    
    # Add EDB facts
    for n, y in watched_pairs:
        +watched(USER, n, y)
    for _, row in recs.iterrows():
        +candidateFor(USER, row["name_norm"], row["year_str"])
    
    # Stratified negation-as-failure rule
    recommendedFor(U, N, Y) <= candidateFor(U, N, Y) & ~watched(U, N, Y)
    
    # Query results
    q = recommendedFor(USER, N, Y)
    df_res = q.to_pandas()
    df_res.columns = ["user", "name_norm", "year_str"]
    # Join back to original recs to keep all columns
    filtered_recs = recs.merge(df_res[["name_norm","year_str"]], on=["name_norm","year_str"], how="inner")
else:
    # Fallback: Tiny evaluator for the same rule in this specific case.
    # recommendedFor := candidateFor - watched  (by (name_norm, year_str))
    candidate_pairs = list(zip(recs["name_norm"], recs["year_str"]))
    keep_mask = [pair not in watched_pairs for pair in candidate_pairs]
    filtered_recs = recs.loc[keep_mask].copy()

print("Filtered recommendations via Datalog rule:", len(filtered_recs), " / original:", len(recs))
filtered_recs.head(10)


TypeError: 'DataFrame' object is not callable

## Save Results

In [None]:

out_csv.parent.mkdir(parents=True, exist_ok=True)
filtered_recs.to_csv(out_csv, index=False)
print("Saved:", out_csv.resolve())


## (Optional) Materialize `:recommendedFor` Triples (RDF)

In [None]:

try:
    from rdflib import Graph, Namespace, URIRef, Literal, RDF

    EX = Namespace("http://example.org/")
    g = Graph()
    g.bind("ex", EX)

    user = EX.user_tobias

    for _, row in filtered_recs.iterrows():
        movie_uri = URIRef(f"http://example.org/movie/{row['name_norm']}_{row['year_str']}")
        g.add((movie_uri, RDF.type, EX.Movie))
        g.add((movie_uri, EX.recommendedFor, user))
        label = row.get(recs_name_col, row['name_norm'])
        g.add((movie_uri, EX.label, Literal(f"{label} ({row['year_str']})")))

    ttl_out = project_root / "data" / "kg" / "recommended_materialized_datalog.ttl"
    g.serialize(destination=str(ttl_out), format="turtle")
    print("Wrote TTL:", ttl_out.resolve())
except Exception as e:
    print("Skipping RDF materialization:", e)


## Quick Summary

In [None]:

summary = {
    "candidates_total": len(recs),
    "watched_total": int(len(watched)),
    "watched_unique_pairs": int(len(watched_pairs)),
    "filtered_total": int(len(filtered_recs)),
    "removed_by_rule": int(len(recs) - len(filtered_recs)),
    "used_engine": "pyDatalog" if use_pyDatalog else "fallback (set-difference)"
}
summary
