
# LO2 with **Datalog** (Robust Version): Exclude Already-Watched Movies

We apply the logical rule in **Datalog** form and collect results via `pyDatalog.ask(...)`:

> **Rule:** `recommendedFor(U, N, Y) :- candidateFor(U, N, Y) ∧ not watched_fact(U, N, Y)`

**Inputs**
- `data/letterboxd_export/watched.csv` (columns like `Name`, `Year`)
- `data/kg/tmdb_rerank_with_embedding_results_movies_only.csv` (e.g., `candidate_title`, `year`)

**Outputs**
- `data/kg/rerank_filtered_by_LO2_datalog.csv`


In [1]:

import pandas as pd
from pathlib import Path

# Auto-detect project root (directory that contains 'data')
here = Path.cwd()
candidate = here
while candidate != candidate.parent and not (candidate / "data").exists():
    candidate = candidate.parent
project_root = candidate if (candidate / "data").exists() else Path("../logical")
print("Detected project_root:", project_root.resolve())

watched_path = project_root / "data" / "letterboxd_export" / "watched.csv"
candidates_path = project_root / "data" / "kg" / "tmdb_rerank_with_embedding_results_movies_only.csv"
out_csv = project_root / "data" / "kg" / "rerank_filtered_by_LO2_datalog.csv"


Detected project_root: /Users/tschaffel/PycharmProjects/letterboxd-KG


## Load & Normalize Data

In [2]:

watched_df = pd.read_csv(watched_path)
recs_df = pd.read_csv(candidates_path)

# Normalize column names
watched_df.columns = [c.lower() for c in watched_df.columns]
recs_df.columns = [c.lower() for c in recs_df.columns]

def pick(colnames, options):
    for o in options:
        if o in colnames:
            return o
    return None

# Pick columns for watched.csv
watched_name_col = pick(watched_df.columns, ["name","film name","title"])
watched_year_col = pick(watched_df.columns, ["year","release year","release_year"])
assert watched_name_col is not None, "watched.csv must include a name-like column"
assert watched_year_col is not None, "watched.csv must include a year-like column"

# Pick columns for candidates CSV (support 'candidate_title')
recs_name_col = pick(recs_df.columns, ["candidate_title","name","title","movie_title","original_title"])
recs_year_col = pick(recs_df.columns, ["year","release_year","candidate_year","releaseyear","year_x","year_y"])
assert recs_name_col is not None, "candidates CSV must include a title column (e.g., 'candidate_title')"
assert recs_year_col is not None, "candidates CSV must include a year column"

print("watched uses:", watched_name_col, "/", watched_year_col)
print("candidates use:", recs_name_col, "/", recs_year_col)

# Normalize values
watched_df["name_norm"] = watched_df[watched_name_col].astype(str).str.strip().str.lower()
watched_df["year_str"] = watched_df[watched_year_col].astype(str).str.extract(r"(\d{4})", expand=False).fillna(
    watched_df[watched_year_col].astype(str).str.strip()
)

recs_df["name_norm"] = recs_df[recs_name_col].astype(str).str.strip().str.lower()
recs_df["year_str"] = recs_df[recs_year_col].astype(str).str.extract(r"(\d{4})", expand=False).fillna(
    recs_df[recs_year_col].astype(str).str.strip()
)

watched_pairs = set(zip(watched_df["name_norm"], watched_df["year_str"]))
print("Loaded watched unique pairs:", len(watched_pairs))
print("Loaded candidate rows:", len(recs_df))
recs_df.head(3)


watched uses: name / year
candidates use: candidate_title / year
Loaded watched unique pairs: 754
Loaded candidate rows: 15


Unnamed: 0,candidate_id,candidate_title,year,cos,meta,final,seed,comp_genres,comp_keywords,comp_cast,comp_director,comp_runtime,comp_language,comp_popularity,comp_vote,name_norm,year_str
0,1924,Superman,1978.0,0.4339,0.4698,0.4483,Black Panther,1.0,0.1304,0.0,0.0,0.956,1.0,0.9719,0.9457,superman,1978
1,1498,Teenage Mutant Ninja Turtles,1990.0,0.3336,0.4414,0.3767,Teenage Mutant Ninja Turtles,0.8,0.3182,0.0,0.0,0.9651,1.0,0.7857,0.8055,teenage mutant ninja turtles,1990
2,11868,Dracula,1958.0,0.3113,0.4735,0.3762,Dracula,1.0,0.1364,0.0,0.0,0.9651,1.0,0.99,0.9695,dracula,1958


## Datalog Rule Application (with safe fallback)

In [3]:

use_pyDatalog = False
try:
    from pyDatalog import pyDatalog, Variable
    use_pyDatalog = True
    print("pyDatalog is available — using it.")
except Exception as e:
    print("pyDatalog not available, using fallback. Error:", e)
    use_pyDatalog = False

USER = "tobias"

if use_pyDatalog:
    pyDatalog.clear()
    # Create predicate and variable symbols
    pyDatalog.create_terms('watched_fact, candidateFor, recommendedFor, U, N, Y')
    
    # Add EDB facts
    for n, y in watched_pairs:
        +watched_fact(USER, n, y)
    for _, row in recs_df.iterrows():
        +candidateFor(USER, row["name_norm"], row["year_str"])
    
    # Rule with negation-as-failure
    recommendedFor(U, N, Y) <= candidateFor(U, N, Y) & ~watched_fact(U, N, Y)
    
    # Query via ask(), collect answers as list of [N, Y]
    ans = pyDatalog.ask('recommendedFor("{USER}", N, Y)'.format(USER=USER))
    if ans is None:
        rec_pairs = []
    else:
        rec_pairs = ans.answers  # list of [N, Y]
    
    # Build DataFrame and join back to recs_df to keep all columns
    if rec_pairs:
        rec_pairs_df = pd.DataFrame(rec_pairs, columns=["name_norm","year_str"])
        filtered_recs = recs_df.merge(rec_pairs_df, on=["name_norm","year_str"], how="inner")
    else:
        filtered_recs = recs_df.iloc[0:0].copy()
else:
    # Fallback: set difference on (name_norm, year_str)
    candidate_pairs = list(zip(recs_df["name_norm"], recs_df["year_str"]))
    keep_mask = [pair not in watched_pairs for pair in candidate_pairs]
    filtered_recs = recs_df.loc[keep_mask].copy()

print("Filtered recommendations:", len(filtered_recs), "/", len(recs_df))
filtered_recs.head(10)


pyDatalog not available, using fallback. Error: cannot import name 'Variable' from 'pyDatalog' (/opt/anaconda3/envs/letterboxd-KG/lib/python3.12/site-packages/pyDatalog/__init__.py)
Filtered recommendations: 15 / 15


Unnamed: 0,candidate_id,candidate_title,year,cos,meta,final,seed,comp_genres,comp_keywords,comp_cast,comp_director,comp_runtime,comp_language,comp_popularity,comp_vote,name_norm,year_str
0,1924,Superman,1978.0,0.4339,0.4698,0.4483,Black Panther,1.0,0.1304,0.0,0.0,0.956,1.0,0.9719,0.9457,superman,1978
1,1498,Teenage Mutant Ninja Turtles,1990.0,0.3336,0.4414,0.3767,Teenage Mutant Ninja Turtles,0.8,0.3182,0.0,0.0,0.9651,1.0,0.7857,0.8055,teenage mutant ninja turtles,1990
2,11868,Dracula,1958.0,0.3113,0.4735,0.3762,Dracula,1.0,0.1364,0.0,0.0,0.9651,1.0,0.99,0.9695,dracula,1958
3,11797,Fright Night,1985.0,0.288,0.46,0.3568,Fright Night,1.0,0.0769,0.0278,0.0,0.9994,1.0,0.9916,0.7908,fright night,1985
4,262097,Trio,1997.0,0.3501,0.363,0.3553,Seven Psychopaths,1.0,0.0,0.0,0.0,0.9731,0.0,0.9183,0.3692,trio,1997
5,11122,India,1993.0,0.2222,0.4412,0.3098,Summer Storm,1.0,0.037,0.0,0.0,0.9651,1.0,0.9614,0.75,india,1993
6,10889,Gloria,1980.0,0.1594,0.4452,0.2737,Good Time,1.0,0.0526,0.0,0.0,0.7827,1.0,0.9956,0.9152,gloria,1980
7,2661,Batman,1966.0,0.1428,0.4507,0.266,21 Jump Street,1.0,0.0606,0.0,0.0,0.9912,1.0,0.8997,0.8805,batman,1966
8,1227770,Taylor Tomlinson: Have It All,2024.0,0.0,0.6468,0.2587,Hannah Gadsby: Douglas,1.0,1.0,0.0,0.0,0.9802,1.0,0.9857,0.9695,taylor tomlinson: have it all,2024
9,671652,Taylor Tomlinson: Quarter-Life Crisis,2020.0,0.0,0.6461,0.2584,Hannah Gadsby: Douglas,1.0,1.0,0.0,0.0,0.935,1.0,0.9966,0.9895,taylor tomlinson: quarter-life crisis,2020


## Save Results

In [None]:

out_csv.parent.mkdir(parents=True, exist_ok=True)
filtered_recs.to_csv(out_csv, index=False)
print("Saved:", out_csv.resolve())
