
# LO2 with **Datalog** (Fixed Version): Exclude Already-Watched Movies

We apply the logical rule in **Datalog** form:

> **Rule:** `recommendedFor(U, N, Y) :- candidateFor(U, N, Y) ∧ not watched_fact(U, N, Y)`

This version fixes the **naming conflict** between the DataFrame `watched_df` and the Datalog predicate `watched_fact`.


In [1]:

import pandas as pd
from pathlib import Path

# Auto-detect project root (directory that contains 'data')
here = Path.cwd()
candidate = here
while candidate != candidate.parent and not (candidate / "data").exists():
    candidate = candidate.parent
project_root = candidate if (candidate / "data").exists() else Path("../logical")
print("Detected project_root:", project_root.resolve())

watched_path = project_root / "data" / "letterboxd_export" / "watched.csv"
candidates_path = project_root / "data" / "kg" / "tmdb_rerank_with_embedding_results_movies_only.csv"
out_csv = project_root / "data" / "kg" / "rerank_filtered_by_LO2_datalog.csv"


Detected project_root: /Users/tschaffel/PycharmProjects/letterboxd-KG


In [2]:

watched_df = pd.read_csv(watched_path)
recs_df = pd.read_csv(candidates_path)

watched_df.columns = [c.lower() for c in watched_df.columns]
recs_df.columns = [c.lower() for c in recs_df.columns]

def pick(colnames, options):
    for o in options:
        if o in colnames:
            return o
    return None

watched_name_col = pick(watched_df.columns, ["name","film name","title"])
watched_year_col = pick(watched_df.columns, ["year","release year","release_year"])
recs_name_col = pick(recs_df.columns, ["candidate_title","name","title","movie_title","original_title"])
recs_year_col = pick(recs_df.columns, ["year","release_year","candidate_year","releaseyear","year_x","year_y"])

assert watched_name_col and watched_year_col, "watched.csv must have name/year"
assert recs_name_col and recs_year_col, "candidates must have name/year"

watched_df["name_norm"] = watched_df[watched_name_col].astype(str).str.strip().str.lower()
watched_df["year_str"] = watched_df[watched_year_col].astype(str).str.extract(r"(\d{4})", expand=False).fillna(
    watched_df[watched_year_col].astype(str).str.strip()
)
recs_df["name_norm"] = recs_df[recs_name_col].astype(str).str.strip().str.lower()
recs_df["year_str"] = recs_df[recs_year_col].astype(str).str.extract(r"(\d{4})", expand=False).fillna(
    recs_df[recs_year_col].astype(str).str.strip()
)

watched_pairs = set(zip(watched_df["name_norm"], watched_df["year_str"]))
print("Loaded watched unique pairs:", len(watched_pairs))
print("Loaded candidate rows:", len(recs_df))


Loaded watched unique pairs: 754
Loaded candidate rows: 15


In [3]:

use_pyDatalog = False
try:
    from pyDatalog import pyDatalog
    use_pyDatalog = True
except:
    pass

USER = "tobias"

if use_pyDatalog:
    from pyDatalog import pyDatalog
    pyDatalog.clear()
    pyDatalog.create_terms('watched_fact, candidateFor, recommendedFor, U, N, Y')
    for n,y in watched_pairs:
        +watched_fact(USER,n,y)
    for _,row in recs_df.iterrows():
        +candidateFor(USER,row["name_norm"],row["year_str"])
    recommendedFor(U,N,Y) <= candidateFor(U,N,Y) & ~watched_fact(U,N,Y)
    q = recommendedFor(USER,N,Y)
    df_res = q.to_pandas()
    df_res.columns=["user","name_norm","year_str"]
    filtered_recs = recs_df.merge(df_res[["name_norm","year_str"]],on=["name_norm","year_str"],how="inner")
else:
    candidate_pairs = list(zip(recs_df["name_norm"], recs_df["year_str"]))
    keep=[pair not in watched_pairs for pair in candidate_pairs]
    filtered_recs = recs_df.loc[keep].copy()

print("Filtered:",len(filtered_recs),"/",len(recs_df))
filtered_recs.head()


AttributeError: 'Query' object has no attribute 'to_pandas'

In [None]:

out_csv.parent.mkdir(parents=True,exist_ok=True)
filtered_recs.to_csv(out_csv,index=False)
print("Saved:",out_csv)
