
# LO2 with **Datalog** — Robust (Patched)
We keep Datalog for the **core logic** (exclude watched, mark watchlist),
and compute **genre/director likes/dislikes** directly in Python (stable).

**Datalog rules:** `recommendedBase(U,N,Y) :- candidateFor(U,N,Y), not watched_fact(U,N,Y).` and `watchBoost(U,N,Y) :- recommendedBase(U,N,Y), onWatchlist(U,N,Y).`


In [1]:

import pandas as pd
import ast, re
from pathlib import Path

# Project root = closest directory, starting at the working directory and walking
# upwards, that contains a "data" folder; fall back to "." when none is found.
here = Path.cwd()
candidate = here
while True:
    if candidate == candidate.parent or (candidate / "data").exists():
        break
    candidate = candidate.parent
if (candidate / "data").exists():
    project_root = candidate
else:
    project_root = Path(".")
print("Detected project_root:", project_root.resolve())

# Input files
watched_path = project_root.joinpath("data", "letterboxd_export", "watched.csv")
watchlist_path = project_root.joinpath("data", "letterboxd_export", "watchlist.csv")
candidates_path = project_root.joinpath("data", "kg", "tmdb_rerank_with_embedding_results_movies_only.csv")

# Enriched ratings file: prefer the uploaded copy when it exists, else the local one
enriched_uploaded = Path("/mnt/data/0bf2f757-dc8c-43d8-9f82-d2705737b4fe.csv")
enriched_local = project_root.joinpath("data", "enriched_merged.csv")
if enriched_uploaded.exists():
    enriched_path = enriched_uploaded
else:
    enriched_path = enriched_local
print("Using enriched file:", enriched_path)

# Output file
out_csv = project_root.joinpath("data", "kg", "rerank_LO2_watchlist_likes_dislikes_PATCHED.csv")


Detected project_root: /Users/tschaffel/PycharmProjects/letterboxd-KG
Using enriched file: /Users/tschaffel/PycharmProjects/letterboxd-KG/data/enriched_merged.csv


## Load & Normalize

In [2]:

# Read the four input tables from the paths configured in the first cell
watched_df, watchlist_df, recs_df, enriched_df = (
    pd.read_csv(csv_path)
    for csv_path in (watched_path, watchlist_path, candidates_path, enriched_path)
)

# Lower-case every header so the column lookups that follow are case-insensitive
for df in (watched_df, watchlist_df, recs_df, enriched_df):
    df.columns = df.columns.str.lower()

def pick(cols, opts):
    """Return the first entry of ``opts`` that is contained in ``cols``, or ``None`` if there is none."""
    return next((option for option in opts if option in cols), None)

# Resolve the actual column name in each table from a list of known aliases.
# Headers were lower-cased right after loading, so every alias is lower-case too.
watched_name_col = pick(watched_df.columns, ["name","film name","title"])
watched_year_col = pick(watched_df.columns, ["year","release year","release_year"])
watch_name_col   = pick(watchlist_df.columns, ["name","film name","title","candidate_title","movie","movie_title","original_title"])
watch_year_col   = pick(watchlist_df.columns, ["year","release year","release_year","releaseyear"])
recs_name_col    = pick(recs_df.columns, ["candidate_title","name","title","movie_title","original_title"])
recs_year_col    = pick(recs_df.columns, ["year","release_year","candidate_year","releaseyear","year_x","year_y"])
en_title_col     = pick(enriched_df.columns, ["title","name"])
en_year_col      = pick(enriched_df.columns, ["year","release_year"])
en_rating_col    = pick(enriched_df.columns, ["rating","myrating","rating10","rating_10"])
en_genres_col    = pick(enriched_df.columns, ["genres","genre","tmdb_genres"])
en_dir_col       = pick(enriched_df.columns, ["director","directors","tmdb_directors"])

# Fail fast and say which lookups came back empty, instead of a bare AssertionError
_resolved_cols = {
    "watched name": watched_name_col,
    "watched year": watched_year_col,
    "watchlist name": watch_name_col,
    "watchlist year": watch_year_col,
    "candidates name": recs_name_col,
    "candidates year": recs_year_col,
    "enriched title": en_title_col,
    "enriched year": en_year_col,
    "enriched rating": en_rating_col,
    "enriched genres": en_genres_col,
    "enriched director": en_dir_col,
}
_missing_cols = [label for label, col in _resolved_cols.items() if not col]
assert not _missing_cols, "No matching column found for: " + ", ".join(_missing_cols)

def norm_name(s):
    """Return the values of ``s`` as strings, trimmed of surrounding whitespace and lower-cased."""
    as_text = s.astype(str)
    return as_text.str.strip().str.lower()
def norm_year(s):
    """Return the first run of four digits found in each value of ``s`` (as a string).

    Values without four consecutive digits fall back to their stripped string form.
    """
    as_text = s.astype(str)
    four_digits = as_text.str.extract(r"(\d{4})", expand=False)
    return four_digits.fillna(as_text.str.strip())

# Add the join keys "name_norm" and "year_str" to each of the four tables, in place
for df_, ncol, ycol in zip(
    (watched_df, watchlist_df, recs_df, enriched_df),
    (watched_name_col, watch_name_col, recs_name_col, en_title_col),
    (watched_year_col, watch_year_col, recs_year_col, en_year_col),
):
    df_["name_norm"] = norm_name(df_[ncol])
    df_["year_str"] = norm_year(df_[ycol])

# parse list-like fields to pure lists of names (strip URLs)
def parse_list(cell):
    """Turn a list-like cell into a plain list of names.

    Accepted inputs:
      * a missing value (``None`` / NaN)            -> ``[]``
      * an already-parsed ``list`` or ``tuple``     -> processed item by item
      * text holding a Python literal, e.g. ``"['Name: https://...', ...]"``;
        a list/tuple literal is processed item by item, a single string
        literal is treated as a one-item list, any other literal -> ``[]``
      * text that is not a valid literal            -> every ``'name:`` prefix
        found by a regex is salvaged

    For string items everything from the first ``':'`` onwards is dropped
    (presumably a trailing URL, per the comment above this function) and the
    rest is stripped; non-string items are converted with ``str``. Items that
    end up empty are skipped.
    """
    if isinstance(cell, (list, tuple)):
        # Already parsed: calling pd.isna on a sequence would not give a single bool
        items = cell
    else:
        if pd.isna(cell):
            return []
        txt = str(cell)
        try:
            parsed = ast.literal_eval(txt)
        except Exception:
            # Not a valid literal (e.g. truncated text): salvage the quoted "name:" prefixes
            return [h.strip() for h in re.findall(r"'([^':]+):", txt) if h.strip()]
        if isinstance(parsed, str):
            items = [parsed]
        elif isinstance(parsed, (list, tuple)):
            items = parsed
        else:
            return []

    names = []
    for it in items:
        name = it.split(':', 1)[0].strip() if isinstance(it, str) else str(it)
        if name:
            names.append(name)
    return names

# Parse the raw genre and director cells of the enriched table into lists of names
for parsed_col, raw_col in (("genre_list", en_genres_col), ("director_list", en_dir_col)):
    enriched_df[parsed_col] = enriched_df[raw_col].apply(parse_list)

# aggregate metadata per (name_norm, year_str)
def set_union(series_of_lists):
    """Merge an iterable of lists into one sorted list of unique members.

    List entries contribute their elements, missing entries are ignored, and any
    other entry is added as a single member after conversion with ``str``.
    """
    merged = set()
    for entry in series_of_lists:
        if isinstance(entry, list):
            merged |= set(entry)
        elif not pd.isna(entry):
            merged.add(str(entry))
    return sorted(merged)

# Collapse the enriched table to one row per (name_norm, year_str), taking the
# union of the genre lists and of the director lists of rows that share a key.
agg_meta = (enriched_df
            .groupby(["name_norm","year_str"], as_index=False)
            .agg(genre_list=("genre_list", set_union),
                 director_list=("director_list", set_union)))

# Attach that metadata to the candidates. With how="left" every candidate is kept;
# candidates whose key is absent from the enriched table get NaN in both list columns.
recs_df = recs_df.merge(agg_meta, on=["name_norm","year_str"], how="left")

# Report how many candidates actually received metadata, so an empty join is
# visible here rather than only as all-False boost/penalty flags further down.
_meta_hits = int(recs_df["genre_list"].notna().sum())
print(f"Candidates with enriched metadata: {_meta_hits}/{len(recs_df)}")
if len(recs_df) and _meta_hits == 0:
    print("WARNING: no candidate matched the enriched file on (name_norm, year_str); "
          "genre/director lists will be empty for every candidate.")

# force to lists
def ensure_list(x):
    """Return ``x`` unchanged if it is a list, ``[]`` if it is missing, else ``[str(x)]``."""
    if not isinstance(x, list):
        x = [] if pd.isna(x) else [str(x)]
    return x
# After the left merge, unmatched candidates hold NaN; make every cell a list
for list_col in ("genre_list", "director_list"):
    recs_df[list_col] = recs_df[list_col].apply(ensure_list)

# (name_norm, year_str) keys of the watched films and of the watchlist
watched_pairs = set(
    watched_df[["name_norm", "year_str"]].itertuples(index=False, name=None)
)
watchlist_pairs = set(
    watchlist_df[["name_norm", "year_str"]].itertuples(index=False, name=None)
)


## Preferences (likes/dislikes) from ratings

In [10]:

# Keep only rows with a usable rating. Ratings are coerced to numbers first, so that
# text-typed or malformed values become NaN and are dropped together with the
# missing ones instead of breaking the comparison and the mean below.
rated = enriched_df.copy()
rated[en_rating_col] = pd.to_numeric(rated[en_rating_col], errors="coerce")
rated = rated.dropna(subset=[en_rating_col])

# Guess the rating scale from the largest rating present: above 5 means a
# 10-point scale, otherwise a 5-point scale. Thresholds follow the scale.
rmax = rated[en_rating_col].max()
scale = 10.0 if rmax > 5 else 5.0
like_th = 7.0 if scale == 10.0 else 3.5
dislike_th = 3.0 if scale == 10.0 else 1.5
min_count = 2  # minimum number of rated rows before a genre/director is judged


def _rating_stats(frame, list_col, label):
    """Explode ``list_col`` and return ``(long_frame, stats)``.

    ``long_frame`` has one row per (film row, list member), with rows lacking a
    member removed. ``stats`` has one row per member with the ``mean`` and
    ``count`` of the rating column; the member column is named ``label``.
    """
    long_frame = frame.explode(list_col).dropna(subset=[list_col])
    stats = (long_frame.groupby(list_col)[en_rating_col]
             .agg(['mean', 'count'])
             .reset_index()
             .rename(columns={list_col: label}))
    return long_frame, stats


def _members(stats, label, keep):
    """Return the set of ``stats[label]`` values with at least ``min_count`` ratings
    whose mean rating satisfies the predicate ``keep``."""
    enough = stats['count'] >= min_count
    return set(stats.loc[keep(stats['mean']) & enough, label])


genres_long, g_stats = _rating_stats(rated, "genre_list", "genre")
dirs_long, d_stats = _rating_stats(rated, "director_list", "director")

liked_genres    = _members(g_stats, "genre",    lambda mean: mean >= like_th)
disliked_genres = _members(g_stats, "genre",    lambda mean: mean <= dislike_th)
liked_dirs      = _members(d_stats, "director", lambda mean: mean >= like_th)
disliked_dirs   = _members(d_stats, "director", lambda mean: mean <= dislike_th)

print("Scale:", scale, "| like_th:", like_th, "| dislike_th:", dislike_th)


Scale: 5.0 | like_th: 3.5 | dislike_th: 1.5


{'Ang Lee',
 'Benny Safdie',
 'Bo Burnham',
 'Byron Howard',
 'Chad Stahelski',
 'Chris Buck',
 'Christopher Nolan',
 'Christopher Storer',
 'Dan Trachtenberg',
 'David Fincher',
 'David Lynch',
 'Dean DeBlois',
 'Denis Villeneuve',
 'Elizabeth Chai Vasarhelyi',
 'Gore Verbinski',
 'Greta Gerwig',
 'Guillermo del Toro',
 'Hayao Miyazaki',
 'James Cameron',
 'James Gunn',
 'Jeff Fowler',
 'Jennifer Yuh Nelson',
 'Jimmy Chin',
 'Josh Safdie',
 'Julia Ducournau',
 'M. Night Shyamalan',
 'Matt Reeves',
 'Noah Baumbach',
 'Patty Jenkins',
 'Paul King',
 'Peter Lord',
 'Quentin Tarantino',
 'Ridley Scott',
 'Sam Raimi',
 'Stanley Kubrick',
 'Steven Spielberg',
 'Taika Waititi',
 'Tim Burton',
 'Tom McGrath',
 'Wes Anderson'}

## Datalog core (only recommended & watchBoost)

In [4]:

# pyDatalog is an optional dependency: try to import it and remember whether that
# worked. When the import fails, the pure-Python branch at the bottom is used.
use_pyDatalog = False
try:
    from pyDatalog import pyDatalog
    use_pyDatalog = True
    print("pyDatalog is available — using it.")
except Exception as e:
    print("pyDatalog not available, fallback will be used:", e)

# Constant used as the first argument (U) of every fact and query below
USER = "tobias"

if use_pyDatalog:
    # Start from an empty knowledge base and declare the predicates and variables
    pyDatalog.clear()
    pyDatalog.create_terms('watched_fact, candidateFor, onWatchlist, '
                           'recommendedBase, recommended, watchBoost, U,N,Y')
    # Facts: one per watched film, watchlist entry and candidate row,
    # each keyed by (USER, name_norm, year_str)
    for n,y in watched_pairs:   +watched_fact(USER,n,y)
    for n,y in watchlist_pairs: +onWatchlist(USER,n,y)
    for _,row in recs_df.iterrows():
        +candidateFor(USER,row["name_norm"],row["year_str"])
    # Rules:
    #   recommendedBase = candidate that has no watched_fact
    #   recommended     = recommendedBase (no extra condition)
    #   watchBoost      = recommendedBase that is also on the watchlist
    recommendedBase(U,N,Y) <= candidateFor(U,N,Y) & ~watched_fact(U,N,Y)
    recommended(U,N,Y)     <= recommendedBase(U,N,Y)
    watchBoost(U,N,Y)      <= recommendedBase(U,N,Y) & onWatchlist(U,N,Y)

    def qset(s):
        """Run the query string ``s`` through ``pyDatalog.ask`` and return its answers
        as a set of tuples; an empty set when the result is falsy.

        NOTE(review): presumably ``ans.answers`` holds one row per solution with the
        values of the free variables (here N, Y) - confirm against the pyDatalog docs.
        """
        ans = pyDatalog.ask(s)
        return set(tuple(x) for x in (ans.answers if ans else []))

    all_pairs   = qset(f'recommended("{USER}", N, Y)')
    watch_pairs = qset(f'watchBoost("{USER}", N, Y)')

    # Keep the candidate rows whose (name_norm, year_str) was derived as recommended;
    # with no answers, keep the columns but no rows.
    if all_pairs:
        rec_df = pd.DataFrame(list(all_pairs), columns=["name_norm","year_str"])
        out = recs_df.merge(rec_df, on=["name_norm","year_str"], how="inner")
    else:
        out = recs_df.iloc[0:0].copy()

    # True for rows whose key is among the watchBoost answers
    out["watchlist_priority"] = list(map(lambda p: p in watch_pairs, zip(out["name_norm"], out["year_str"])))

else:
    # Fallback: pure pandas for 'recommended' and 'watchlist'
    # Same two conditions as the rules above, evaluated with set membership:
    # drop candidates whose key is in watched_pairs, flag those in watchlist_pairs.
    cand_pairs = list(zip(recs_df["name_norm"], recs_df["year_str"]))
    keep_mask = [pair not in watched_pairs for pair in cand_pairs]
    out = recs_df.loc[keep_mask].copy()
    out["watchlist_priority"] = list(map(lambda p: p in watchlist_pairs, zip(out["name_norm"], out["year_str"])))


pyDatalog is available — using it.


## Compute boosts/penalties (Python) & Score

In [11]:

def any_in(candidate_list, prefer_set):
    """Return True when at least one element of ``candidate_list`` is in ``prefer_set``.

    A falsy ``candidate_list`` counts as empty. A ``TypeError`` raised while
    iterating or testing membership (non-iterable input, unhashable element)
    yields False.
    """
    try:
        for member in (candidate_list or []):
            if member in prefer_set:
                return True
    except TypeError:
        return False
    return False

# Flag each remaining candidate: does any of its genres/directors belong to the
# liked or disliked sets derived from the ratings?
out["genre_boost"]      = out["genre_list"].apply(lambda lst: any_in(lst, liked_genres))
out["director_boost"]   = out["director_list"].apply(lambda lst: any_in(lst, liked_dirs))
out["genre_penalty"]    = out["genre_list"].apply(lambda lst: any_in(lst, disliked_genres))
out["director_penalty"] = out["director_list"].apply(lambda lst: any_in(lst, disliked_dirs))

# Weights: watchlist, liked genre, liked director, disliked genre, disliked director.
# Boost flags add their weight to the score, penalty flags subtract it.
w_watch, w_glike, w_dlike, w_gbad, w_dbad = 2, 1, 2, 1, 2
out["score"] = (out["watchlist_priority"].astype(int)*w_watch +
                out["genre_boost"].astype(int)*w_glike +
                out["director_boost"].astype(int)*w_dlike -
                out["genre_penalty"].astype(int)*w_gbad -
                out["director_penalty"].astype(int)*w_dbad)

# Highest score first; a "rank" column, when present, breaks ties (lowest first).
sort_cols, ascending = ["score"], [False]
if "rank" in out.columns:
    sort_cols.append("rank")
    ascending.append(True)
# Scores take only a few distinct values, so ties are common. A stable sort keeps
# tied candidates in the order they already had in `out`; the default single-key
# algorithm (quicksort) gives no such guarantee.
out_sorted = (out.sort_values(by=sort_cols, ascending=ascending, kind="mergesort")
                 .reset_index(drop=True))

Unnamed: 0,candidate_id,candidate_title,year,cos,meta,final,seed,comp_genres,comp_keywords,comp_cast,...,name_norm,year_str,genre_list,director_list,watchlist_priority,genre_boost,director_boost,genre_penalty,director_penalty,score
0,29437,Rabid,1977.0,0.0000,0.6046,0.2418,Crimes of the Future,1.0000,0.0909,0.0000,...,rabid,1977,[],[],True,False,False,False,False,2
1,9493,Twins,1988.0,0.0000,0.4727,0.1891,Ghostbusters II,0.5000,0.0286,0.0000,...,twins,1988,[],[],True,False,False,False,False,2
2,838330,Not Okay,2022.0,0.0000,0.4606,0.1843,The Meyerowitz Stories (New and Selected),1.0000,0.0769,0.0000,...,not okay,2022,[],[],True,False,False,False,False,2
3,466420,Killers of the Flower Moon,2023.0,0.0000,0.4601,0.1840,The Wolf of Wall Street,0.5000,0.0435,0.0256,...,killers of the flower moon,2023,[],[],True,False,False,False,False,2
4,3146,The War of the Gargantuas,1966.0,0.0000,0.5059,0.2024,Godzilla,0.5000,0.1429,0.1111,...,the war of the gargantuas,1966,[],[],True,False,False,False,False,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,10539,James and the Giant Peach,1996.0,0.0000,0.5081,0.2033,Coraline,0.7500,0.0769,0.0000,...,james and the giant peach,1996,[],[],False,False,False,False,False,0
96,19398,Simon,2004.0,0.0465,0.4419,0.2046,Trio,1.0000,0.0000,0.0000,...,simon,2004,[],[],False,False,False,False,False,0
97,75,Mars Attacks!,1996.0,0.0000,0.5156,0.2062,Dark Shadows,0.6667,0.0294,0.0000,...,mars attacks!,1996,[],[],False,False,False,False,False,0
98,326359,Frozen Fever,2015.0,0.0000,0.5293,0.2117,Frozen,0.6000,0.1765,0.3333,...,frozen fever,2015,[],[],False,False,False,False,False,0


## Save & Summary

In [6]:

# Write the re-ranked candidates next to the other knowledge-graph outputs
out_csv.parent.mkdir(parents=True, exist_ok=True)
out_sorted.to_csv(out_csv, index=False)
print("Saved:", out_csv.resolve())

# Title column for the top-5 preview: "candidate_title", then "name" (the two the
# lookup used before), then whatever title column was resolved for the candidates,
# then the normalised name. An empty preview is returned if none of them exists,
# rather than failing on a missing column.
_title_col = next(
    (col for col in ("candidate_title", "name", recs_name_col, "name_norm")
     if col in out_sorted.columns),
    None,
)
_top5 = list(out_sorted[_title_col].head(5).astype(str)) if _title_col else []

summary = {
    "candidates_total": int(len(recs_df)),
    "recommended_total": int(len(out_sorted)),
    "watchlist_priority_true": int(out_sorted["watchlist_priority"].sum()),
    "genre_boost_true": int(out_sorted["genre_boost"].sum()),
    "director_boost_true": int(out_sorted["director_boost"].sum()),
    "genre_penalty_true": int(out_sorted["genre_penalty"].sum()),
    "director_penalty_true": int(out_sorted["director_penalty"].sum()),
    "top5": _top5,
}
summary


Saved: /Users/tschaffel/PycharmProjects/letterboxd-KG/data/kg/rerank_LO2_watchlist_likes_dislikes_PATCHED.csv


{'candidates_total': 100,
 'recommended_total': 100,
 'watchlist_priority_true': 9,
 'genre_boost_true': 0,
 'director_boost_true': 0,
 'genre_penalty_true': 0,
 'director_penalty_true': 0,
 'top5': ['Rabid',
  'Twins',
  'Not Okay',
  'Killers of the Flower Moon',
  'The War of the Gargantuas']}