
# LO2 with **Datalog** — Watchlist + Likes/Dislikes (No `film_genre/3`)

This version **avoids** the `film_genre/3` and `director_fact/3` predicates inside Datalog
(by **materializing** the boosts/penalties in Python and asserting them as facts).
This sidesteps the `Predicate without definition: film_genre/3` error.

**Datalog facts asserted:**
- `watched_fact(U,N,Y)`
- `candidateFor(U,N,Y)`
- `onWatchlist(U,N,Y)`
- `genreBoost(U,N,Y)`  (precomputed in Python)
- `dirBoost(U,N,Y)`    (precomputed in Python)
- `genrePenalty(U,N,Y)` (precomputed in Python)
- `dirPenalty(U,N,Y)`   (precomputed in Python)

**Rules:**
```
recommendedBase(U,N,Y) <= candidateFor(U,N,Y) & ~watched_fact(U,N,Y)
recommended(U,N,Y)     <= recommendedBase(U,N,Y)
watchBoost(U,N,Y)      <= recommendedBase(U,N,Y) & onWatchlist(U,N,Y)
```
Scoring is done in Python from the flags.


In [1]:

import pandas as pd
import ast, re
from pathlib import Path

# Auto-detect project root
here = Path.cwd()
candidate = here
while candidate != candidate.parent and not (candidate / "data").exists():
    candidate = candidate.parent
project_root = candidate if (candidate / "data").exists() else Path("../logical")
print("Detected project_root:", project_root.resolve())

# Paths
watched_path   = project_root / "data" / "letterboxd_export" / "watched.csv"
watchlist_path = project_root / "data" / "letterboxd_export" / "watchlist.csv"
candidates_path= project_root / "data" / "kg" / "tmdb_rerank_with_embedding_results_movies_only.csv"

# enriched-merged uploaded in this session:
enriched_uploaded = Path("/mnt/data/0bf2f757-dc8c-43d8-9f82-d2705737b4fe.csv")
enriched_local = project_root / "data" / "enriched_merged.csv"
enriched_path = enriched_uploaded if enriched_uploaded.exists() else enriched_local
print("Using enriched file:", enriched_path)

out_csv = project_root / "data" / "kg" / "rerank_LO2_watchlist_likes_dislikes_NO_FILMGENRE.csv"


Detected project_root: /Users/tschaffel/PycharmProjects/letterboxd-KG
Using enriched file: /Users/tschaffel/PycharmProjects/letterboxd-KG/data/enriched_merged.csv


In [2]:

# Load data
watched_df = pd.read_csv(watched_path)
watchlist_df = pd.read_csv(watchlist_path)
recs_df = pd.read_csv(candidates_path)
enriched_df = pd.read_csv(enriched_path)

# Normalize column names
for df in (watched_df, watchlist_df, recs_df, enriched_df):
    df.columns = [c.lower() for c in df.columns]

def pick(colnames, options):
    for o in options:
        if o in colnames:
            return o
    return None

# Columns
watched_name_col = pick(watched_df.columns, ["name","film name","title"])
watched_year_col = pick(watched_df.columns, ["year","release year","release_year"])
watch_name_col   = pick(watchlist_df.columns, ["name","film name","title","candidate_title","movie","movie_title","original_title"])
watch_year_col   = pick(watchlist_df.columns, ["year","release year","release_year","releaseyear"])
recs_name_col    = pick(recs_df.columns, ["candidate_title","name","title","movie_title","original_title"])
recs_year_col    = pick(recs_df.columns, ["year","release_year","candidate_year","releaseyear","year_x","year_y"])

en_title_col = pick(enriched_df.columns, ["title","name"])
en_year_col  = pick(enriched_df.columns, ["year","release_year"])
en_rating_col= pick(enriched_df.columns, ["rating","myrating","rating10","rating_10"])
en_genres_col= pick(enriched_df.columns, ["genres","genre","tmdb_genres"])
en_dir_col   = pick(enriched_df.columns, ["director","directors","tmdb_directors"])

assert all([watched_name_col, watched_year_col, watch_name_col, watch_year_col, recs_name_col, recs_year_col, en_title_col, en_year_col, en_rating_col, en_genres_col, en_dir_col]), "Missing required columns"

def norm_name(s): return s.astype(str).str.strip().str.lower()
def norm_year(s): 
    return s.astype(str).str.extract(r"(\d{4})", expand=False).fillna(s.astype(str).str.strip())

for df_, ncol, ycol in [(watched_df, watched_name_col, watched_year_col),
                        (watchlist_df, watch_name_col, watch_year_col),
                        (recs_df, recs_name_col, recs_year_col),
                        (enriched_df, en_title_col, en_year_col)]:
    df_["name_norm"] = norm_name(df_[ncol])
    df_["year_str"]  = norm_year(df_[ycol])

# Parse lists
def parse_list_of_colon_pairs(cell):
    if pd.isna(cell): return []
    txt = str(cell)
    try:
        lst = ast.literal_eval(txt)
        out = []
        for it in lst if isinstance(lst, list) else []:
            if isinstance(it, str):
                out.append(it.split(':',1)[0].strip())
        return out
    except Exception:
        return re.findall(r"'([^':]+):", txt)

enriched_df["genre_list"] = enriched_df[en_genres_col].apply(parse_list_of_colon_pairs)
enriched_df["director_list"] = enriched_df[en_dir_col].apply(parse_list_of_colon_pairs)

# Aggregate per (name_norm, year_str)
def set_union(series_of_lists):
    s = set()
    for lst in series_of_lists:
        if isinstance(lst, list):
            s.update(lst)
    return sorted(s)

agg_meta = (enriched_df
            .groupby(["name_norm","year_str"], as_index=False)
            .agg(genre_list=("genre_list", set_union),
                 director_list=("director_list", set_union)))

recs_df = recs_df.merge(agg_meta, on=["name_norm","year_str"], how="left")

watched_pairs   = set(zip(watched_df["name_norm"], watched_df["year_str"]))
watchlist_pairs = set(zip(watchlist_df["name_norm"], watchlist_df["year_str"]))


In [4]:

# Compute likes & dislikes from ratings
rated = enriched_df.dropna(subset=[en_rating_col]).copy()
rmax = rated[en_rating_col].max()
scale = 10.0 if rmax > 5 else 5.0
like_threshold = 7.0 if scale == 10.0 else 3.5
dislike_threshold = 3.0 if scale == 10.0 else 1.5
min_count = 2

genres_long = rated.explode("genre_list").dropna(subset=["genre_list"])
dirs_long   = rated.explode("director_list").dropna(subset=["director_list"])

g_stats = (genres_long.groupby("genre_list")[en_rating_col]
           .agg(['mean','count']).reset_index().rename(columns={'genre_list':'genre'}))
d_stats = (dirs_long.groupby("director_list")[en_rating_col]
           .agg(['mean','count']).reset_index().rename(columns={'director_list':'director'}))

liked_genres    = set(g_stats[(g_stats['mean']>=like_threshold)    & (g_stats['count']>=min_count)]['genre'])
disliked_genres = set(g_stats[(g_stats['mean']<=dislike_threshold) & (g_stats['count']>=min_count)]['genre'])

liked_dirs      = set(d_stats[(d_stats['mean']>=like_threshold)    & (d_stats['count']>=min_count)]['director'])
disliked_dirs   = set(d_stats[(d_stats['mean']<=dislike_threshold) & (d_stats['count']>=min_count)]['director'])

print("Scale:", scale, "| like_threshold:", like_threshold, "| dislike_threshold:", dislike_threshold)
print("liked genres (n):", len(liked_genres), "| disliked genres (n):", len(disliked_genres))
print("liked directors (n):", len(liked_dirs), "| disliked directors (n):", len(disliked_dirs))


Scale: 5.0 | like_threshold: 3.5 | dislike_threshold: 1.5
liked genres (n): 17 | disliked genres (n): 0
liked directors (n): 40 | disliked directors (n): 0


In [6]:

# Precompute boosts/penalties per candidate (name_norm, year_str)
def has_any(lst, S): 
    return any(x in S for x in (lst or []))

recs_df["genre_boost"]     = recs_df["genre_list"].apply(lambda lst: has_any(lst, liked_genres))
recs_df["director_boost"]  = recs_df["director_list"].apply(lambda lst: has_any(lst, liked_dirs))
recs_df["genre_penalty"]   = recs_df["genre_list"].apply(lambda lst: has_any(lst, disliked_genres))
recs_df["director_penalty"]= recs_df["director_list"].apply(lambda lst: has_any(lst, disliked_dirs))

# We'll assert these as facts in Datalog later.


TypeError: 'float' object is not iterable

## Datalog: assert facts and apply core rules

In [7]:

use_pyDatalog = False
try:
    from pyDatalog import pyDatalog
    use_pyDatalog = True
    print("pyDatalog is available — using it.")
except Exception as e:
    print("pyDatalog not available, fallback will be used:", e)

USER = "tobias"

if use_pyDatalog:
    pyDatalog.clear()
    pyDatalog.create_terms('watched_fact, candidateFor, onWatchlist, '
                           'genreBoost, dirBoost, genrePenalty, dirPenalty, '
                           'recommendedBase, recommended, watchBoost, U,N,Y')

    for n,y in watched_pairs:   +watched_fact(USER,n,y)
    for n,y in watchlist_pairs: +onWatchlist(USER,n,y)
    for _,row in recs_df.iterrows():
        +candidateFor(USER,row["name_norm"],row["year_str"])

    # Assert boosts/penalties as *facts* (no inner predicates needed)
    for _,row in recs_df[recs_df["genre_boost"]==True].iterrows():
        +genreBoost(USER,row["name_norm"],row["year_str"])
    for _,row in recs_df[recs_df["director_boost"]==True].iterrows():
        +dirBoost(USER,row["name_norm"],row["year_str"])
    for _,row in recs_df[recs_df["genre_penalty"]==True].iterrows():
        +genrePenalty(USER,row["name_norm"],row["year_str"])
    for _,row in recs_df[recs_df["director_penalty"]==True].iterrows():
        +dirPenalty(USER,row["name_norm"],row["year_str"])

    # Rules
    recommendedBase(U,N,Y) <= candidateFor(U,N,Y) & ~watched_fact(U,N,Y)
    recommended(U,N,Y)     <= recommendedBase(U,N,Y)
    watchBoost(U,N,Y)      <= recommendedBase(U,N,Y) & onWatchlist(U,N,Y)

    def qset(s): 
        ans = pyDatalog.ask(s); 
        return set(tuple(x) for x in (ans.answers if ans else []))

    all_pairs   = qset(f'recommended("{USER}", N, Y)')
    watch_pairs = qset(f'watchBoost("{USER}", N, Y)')
    g_like      = qset(f'genreBoost("{USER}", N, Y)')
    d_like      = qset(f'dirBoost("{USER}", N, Y)')
    g_bad       = qset(f'genrePenalty("{USER}", N, Y)')
    d_bad       = qset(f'dirPenalty("{USER}", N, Y)')

    if all_pairs:
        rec_df = pd.DataFrame(list(all_pairs), columns=["name_norm","year_str"])
        out = recs_df.merge(rec_df, on=["name_norm","year_str"], how="inner")
    else:
        out = recs_df.iloc[0:0].copy()

    def flag(df, S, col):
        df[col] = list(map(lambda p: p in S, zip(df["name_norm"], df["year_str"])))
    flag(out, watch_pairs, "watchlist_priority")
    flag(out, g_like, "genre_boost")
    flag(out, d_like, "director_boost")
    flag(out, g_bad,  "genre_penalty")
    flag(out, d_bad,  "director_penalty")

else:
    # Pure pandas fallback
    cand_pairs = list(zip(recs_df["name_norm"], recs_df["year_str"]))
    keep_mask = [pair not in watched_pairs for pair in cand_pairs]
    out = recs_df.loc[keep_mask].copy()
    out["watchlist_priority"] = list(map(lambda p: p in watchlist_pairs, zip(out["name_norm"], out["year_str"])))
    out["genre_boost"] = recs_df["genre_boost"]
    out["director_boost"] = recs_df["director_boost"]
    out["genre_penalty"] = recs_df["genre_penalty"]
    out["director_penalty"] = recs_df["director_penalty"]


pyDatalog is available — using it.


KeyError: 'genre_boost'

In [8]:

# Score and save
w_watch, w_glike, w_dlike, w_gbad, w_dbad = 2, 1, 2, 1, 2
out["score"] = (out["watchlist_priority"].astype(int)*w_watch +
                out["genre_boost"].astype(int)*w_glike +
                out["director_boost"].astype(int)*w_dlike -
                out["genre_penalty"].astype(int)*w_gbad -
                out["director_penalty"].astype(int)*w_dbad)

sort_cols, ascending = ["score"], [False]
if "rank" in out.columns: sort_cols.append("rank"); ascending.append(True)

out_sorted = out.sort_values(by=sort_cols, ascending=ascending).reset_index(drop=True)
out_csv.parent.mkdir(parents=True, exist_ok=True)
out_sorted.to_csv(out_csv, index=False)
print("Saved:", out_csv.resolve())

summary = {
    "candidates_total": int(len(recs_df)),
    "recommended_total": int(len(out_sorted)),
    "watchlist_priority_true": int(out_sorted["watchlist_priority"].sum()),
    "genre_boost_true": int(out_sorted["genre_boost"].sum()),
    "director_boost_true": int(out_sorted["director_boost"].sum()),
    "genre_penalty_true": int(out_sorted["genre_penalty"].sum()),
    "director_penalty_true": int(out_sorted["director_penalty"].sum()),
}
summary


NameError: name 'out' is not defined