
# LO2 with **Datalog** (Robust Version): Exclude Already-Watched Movies

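We filter the candidate list with a logical rule written in **Datalog** form and collect the results via `pyDatalog.ask(...)`. A candidate counts as already watched when its trimmed, lower-cased title and its four-digit year both match an entry in `watched.csv`. If `pyDatalog` cannot be imported, the same filter is applied as a plain set difference on those (title, year) pairs.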

> **Rule:** `recommendedFor(U, N, Y) :- candidateFor(U, N, Y) ∧ not watched_fact(U, N, Y)`

**Inputs**
- `data/letterboxd_export/watched.csv` (columns like `Name`, `Year`)
- `data/kg/tmdb_rerank_with_embedding_results_movies_only.csv` (e.g., `candidate_title`, `year`)

**Outputs**
- `data/kg/rerank_filtered_by_LO2_datalog.csv`


In [4]:

import pandas as pd
from pathlib import Path

# Locate the project root: the nearest directory, starting at the current
# working directory and moving upwards, that contains a 'data' folder.
here = Path.cwd()
project_root = next(
    (folder for folder in (here, *here.parents) if (folder / "data").exists()),
    Path("."),  # no 'data' folder anywhere up the tree: fall back to the cwd
)
print("Detected project_root:", project_root.resolve())

# Input and output files, all anchored on the detected root.
data_dir = project_root / "data"
kg_dir = data_dir / "kg"
watched_path = data_dir / "letterboxd_export" / "watched.csv"
candidates_path = kg_dir / "tmdb_rerank_with_embedding_results_movies_only.csv"
out_csv = kg_dir / "rerank_filtered_by_LO2_datalog.csv"


Detected project_root: /Users/tschaffel/PycharmProjects/letterboxd-KG


## Load & Normalize Data

In [5]:

# Read the watched list and the re-ranked candidate list.
watched_df = pd.read_csv(watched_path)
recs_df = pd.read_csv(candidates_path)

# Lower-case every header so the column lookup below is case-insensitive.
for frame in (watched_df, recs_df):
    frame.columns = frame.columns.str.lower()

def pick(colnames, options):
    """Return the first entry of ``options`` that occurs in ``colnames``.

    ``options`` is scanned in order, so earlier entries take priority.
    Returns ``None`` when no entry of ``options`` is found in ``colnames``.
    """
    return next((name for name in options if name in colnames), None)

def _add_match_keys(frame, title_col, year_col):
    """Add the ``name_norm`` and ``year_str`` matching columns to ``frame`` in place.

    ``name_norm`` is the title cast to text, stripped and lower-cased.
    ``year_str`` is the first run of four digits found in the year cast to
    text (so ``1978.0`` becomes ``1978``); when no such run exists, the
    stripped year text itself is used instead.
    """
    frame["name_norm"] = frame[title_col].astype(str).str.strip().str.lower()
    year_text = frame[year_col].astype(str)
    frame["year_str"] = year_text.str.extract(r"(\d{4})", expand=False).fillna(
        year_text.str.strip()
    )


# Resolve the title/year columns of watched.csv; list order is lookup priority.
watched_name_col = pick(watched_df.columns, ["name","film name","title"])
watched_year_col = pick(watched_df.columns, ["year","release year","release_year"])
assert watched_name_col is not None, "watched.csv must include a name-like column"
assert watched_year_col is not None, "watched.csv must include a year-like column"

# Same for the candidates CSV ('candidate_title' is tried first).
recs_name_col = pick(recs_df.columns, ["candidate_title","name","title","movie_title","original_title"])
recs_year_col = pick(recs_df.columns, ["year","release_year","candidate_year","releaseyear","year_x","year_y"])
assert recs_name_col is not None, "candidates CSV must include a title column (e.g., 'candidate_title')"
assert recs_year_col is not None, "candidates CSV must include a year column"

print("watched uses:", watched_name_col, "/", watched_year_col)
print("candidates use:", recs_name_col, "/", recs_year_col)

# Derive the comparable (title, year) keys on both frames.
_add_match_keys(watched_df, watched_name_col, watched_year_col)
_add_match_keys(recs_df, recs_name_col, recs_year_col)

# Unique (name_norm, year_str) tuples of everything already watched.
watched_pairs = set(
    watched_df[["name_norm", "year_str"]].itertuples(index=False, name=None)
)
print("Loaded watched unique pairs:", len(watched_pairs))
print("Loaded candidate rows:", len(recs_df))
recs_df.head(3)


watched uses: name / year
candidates use: candidate_title / year
Loaded watched unique pairs: 754
Loaded candidate rows: 100


Unnamed: 0,candidate_id,candidate_title,year,cos,meta,final,seed,comp_genres,comp_keywords,comp_cast,comp_director,comp_runtime,comp_language,comp_popularity,comp_vote,name_norm,year_str
0,1924,Superman,1978.0,0.4339,0.4685,0.4477,Black Panther,1.0,0.1304,0.0,0.0,0.956,1.0,0.9456,0.9457,superman,1978
1,841,Dune,1984.0,0.3683,0.4558,0.4033,Star Wars: Episode I - The Phantom Menace,1.0,0.0625,0.0,0.0,0.9994,1.0,0.9569,0.909,dune,1984
2,10730,King Kong,1976.0,0.3074,0.4265,0.355,King Kong,1.0,0.08,0.0,0.0,0.6065,1.0,0.9334,0.6692,king kong,1976


## Datalog Rule Application (with safe fallback)

In [6]:

# pyDatalog is optional: when it cannot be imported, the same rule is applied
# below as a plain set difference so the notebook still runs end to end.
try:
    from pyDatalog import pyDatalog
    use_pyDatalog = True
    print("pyDatalog is available — using it.")
except Exception as e:  # deliberately broad: any import-time failure selects the fallback
    use_pyDatalog = False
    print("pyDatalog not available, using fallback. Error:", e)

USER = "tobias"

# One (name_norm, year_str) key per candidate row, in row order. Both branches
# turn these keys into a boolean mask over recs_df, so the result keeps
# recs_df's columns, row order and index regardless of which branch ran.
candidate_pairs = list(zip(recs_df["name_norm"], recs_df["year_str"]))

if use_pyDatalog:
    pyDatalog.clear()
    # Create predicate and variable symbols
    pyDatalog.create_terms('watched_fact, candidateFor, recommendedFor, U, N, Y')

    # Add EDB facts
    for n, y in watched_pairs:
        +watched_fact(USER, n, y)
    for n, y in candidate_pairs:
        +candidateFor(USER, n, y)

    # Rule with negation-as-failure
    recommendedFor(U, N, Y) <= candidateFor(U, N, Y) & ~watched_fact(U, N, Y)

    # Query via ask(); a None result is treated as "no answers".
    ans = pyDatalog.ask('recommendedFor("{USER}", N, Y)'.format(USER=USER))
    if ans is None:
        recommended_pairs = set()
    else:
        # Each answer carries the bindings for (N, Y), in that order.
        recommended_pairs = {tuple(answer) for answer in ans.answers}

    keep_mask = [pair in recommended_pairs for pair in candidate_pairs]
else:
    # Fallback: set difference on (name_norm, year_str)
    keep_mask = [pair not in watched_pairs for pair in candidate_pairs]

filtered_recs = recs_df.loc[keep_mask].copy()

print("Filtered recommendations:", len(filtered_recs), "/", len(recs_df))
filtered_recs.head(10)


pyDatalog is available — using it.
Filtered recommendations: 100 / 100


Unnamed: 0,candidate_id,candidate_title,year,cos,meta,final,seed,comp_genres,comp_keywords,comp_cast,comp_director,comp_runtime,comp_language,comp_popularity,comp_vote,name_norm,year_str
0,1924,Superman,1978.0,0.4339,0.4685,0.4477,Black Panther,1.0,0.1304,0.0,0.0,0.956,1.0,0.9456,0.9457,superman,1978
1,841,Dune,1984.0,0.3683,0.4558,0.4033,Star Wars: Episode I - The Phantom Menace,1.0,0.0625,0.0,0.0,0.9994,1.0,0.9569,0.909,dune,1984
2,10730,King Kong,1976.0,0.3074,0.4265,0.355,King Kong,1.0,0.08,0.0,0.0,0.6065,1.0,0.9334,0.6692,king kong,1976
3,262606,Talkback,1987.0,0.3445,0.3612,0.3512,Aftersun,1.0,0.04,0.0,0.0,0.21,1.0,0.8545,0.0,talkback,1987
4,929,Godzilla,1998.0,0.1888,0.524,0.3229,The Day After Tomorrow,0.75,0.0189,0.0,1.0,0.8825,1.0,0.9986,0.7732,godzilla,1998
5,869,Planet of the Apes,2001.0,0.2096,0.4423,0.3027,Starship Troopers,1.0,0.0645,0.0,0.0,0.956,1.0,0.9627,0.6683,planet of the apes,2001
6,620778,Jim Gaffigan: Quality Time,2019.0,0.0,0.6424,0.257,Hannah Gadsby: Douglas,1.0,1.0,0.0,0.0,0.995,1.0,0.9913,0.8625,jim gaffigan: quality time,2019
7,439107,Katherine Ryan: In Trouble,2017.0,0.0,0.6379,0.2552,Hannah Gadsby: Douglas,1.0,1.0,0.0,0.0,0.956,1.0,0.9901,0.8125,katherine ryan: in trouble,2017
8,901,City Lights,1931.0,0.0,0.6371,0.2548,Modern Times,1.0,0.0769,0.1111,1.0,1.0,1.0,0.9939,0.9955,city lights,1931
9,330457,Frozen II,2019.0,0.0,0.636,0.2544,Frozen,0.8,0.1622,0.2903,1.0,0.9994,1.0,0.9116,0.9995,frozen ii,2019


## Save Results

In [None]:

# Make sure the output folder exists, then write the filtered candidates
# without the DataFrame index column.
target_dir = out_csv.parent
target_dir.mkdir(parents=True, exist_ok=True)
filtered_recs.to_csv(path_or_buf=out_csv, index=False)
print("Saved:", out_csv.resolve())
