
# LO2 with **Datalog** — Robust (Patched)
We keep Datalog for the **core logic** (exclude watched, mark watchlist),
and compute **genre/director likes/dislikes** directly in Python (stable).

**Datalog:** `recommendedBase(U,N,Y) :- candidateFor(U,N,Y) ∧ not watched_fact(U,N,Y)`


In [44]:

import pandas as pd
import ast, re
from pathlib import Path

# Locate the project root: the closest directory at or above the current
# working directory that contains a "data" folder; fall back to "." if none does.
here = Path.cwd()
candidate = next(
    (folder for folder in (here, *here.parents) if (folder / "data").exists()),
    None,
)
project_root = Path(".") if candidate is None else candidate
print("Detected project_root:", project_root.resolve())

# Input locations, all relative to the project root
export_dir = project_root / "data" / "letterboxd_export"
kg_dir = project_root / "data" / "kg"
watched_path   = export_dir / "watched.csv"
watchlist_path = export_dir / "watchlist.csv"
candidates_path= kg_dir / "rerank_with_embedding_results.csv"

# Prefer the enriched/merged file uploaded in this session; otherwise use the local copy
enriched_uploaded = Path("/mnt/data/0bf2f757-dc8c-43d8-9f82-d2705737b4fe.csv")
enriched_local = project_root / "data" / "enriched_merged.csv"
enriched_path = enriched_local
if enriched_uploaded.exists():
    enriched_path = enriched_uploaded
print("Using enriched file:", enriched_path)

# Destination of the re-ranked candidate list
out_csv = kg_dir / "rerank_by_logical_rules.csv"


Detected project_root: /Users/tschaffel/PycharmProjects/letterboxd-KG
Using enriched file: /Users/tschaffel/PycharmProjects/letterboxd-KG/data/enriched_merged.csv


## Load & Normalize

In [45]:

# Read the four input tables: watched films, watchlist, ranked candidates
# and the enriched metadata/ratings table.
watched_df, watchlist_df, recs_df, enriched_df = (
    pd.read_csv(csv_path)
    for csv_path in (watched_path, watchlist_path, candidates_path, enriched_path)
)

# Lower-case every column name so the column lookups below are case-insensitive
for frame in (watched_df, watchlist_df, recs_df, enriched_df):
    frame.columns = list(map(str.lower, frame.columns))

def pick(cols, opts):
    """Return the first entry of ``opts`` that is contained in ``cols``.

    ``opts`` is scanned in order, so earlier entries take precedence.
    Returns ``None`` when no entry of ``opts`` is found in ``cols``.
    """
    return next((name for name in opts if name in cols), None)

# Resolve, for each table, which actual column plays each role. Every pick()
# call lists the accepted (already lower-cased) spellings in order of preference.
watched_name_col = pick(watched_df.columns, ["name","film name","title"])
watched_year_col = pick(watched_df.columns, ["year","release year","release_year"])
watch_name_col   = pick(watchlist_df.columns, ["name","film name","title","candidate_title","movie","movie_title","original_title"])
watch_year_col   = pick(watchlist_df.columns, ["year","release year","release_year","releaseyear"])
recs_name_col    = pick(recs_df.columns, ["candidate_title","name","title","movie_title","original_title"])
recs_year_col    = pick(recs_df.columns, ["year","release_year","candidate_year","releaseyear","year_x","year_y"])
en_title_col     = pick(enriched_df.columns, ["title","name"])
en_year_col      = pick(enriched_df.columns, ["year","release_year"])
en_rating_col    = pick(enriched_df.columns, ["rating","myrating","rating10","rating_10"])
en_genres_col    = pick(enriched_df.columns, ["genres","genre","tmdb_genres"])
en_dir_col       = pick(enriched_df.columns, ["director","directors","tmdb_directors"])

# Fail early and name the roles that could not be resolved. An explicit raise
# (instead of a bare assert) is not stripped under ``python -O`` and tells the
# reader which input file needs attention.
_required_cols = {
    "watched: name": watched_name_col,
    "watched: year": watched_year_col,
    "watchlist: name": watch_name_col,
    "watchlist: year": watch_year_col,
    "candidates: name": recs_name_col,
    "candidates: year": recs_year_col,
    "enriched: title": en_title_col,
    "enriched: year": en_year_col,
    "enriched: rating": en_rating_col,
    "enriched: genres": en_genres_col,
    "enriched: director": en_dir_col,
}
_missing_cols = [role for role, col in _required_cols.items() if not col]
if _missing_cols:
    raise ValueError("No matching column found for: " + ", ".join(_missing_cols))

def norm_name(s):
    """Return Series ``s`` as text, stripped of surrounding whitespace and lower-cased."""
    as_text = s.astype(str)
    return as_text.str.strip().str.lower()
def norm_year(s):
    """Return Series ``s`` as year strings.

    Each value is converted to text; the first run of four digits found in it
    is used as the year. Values without such a run keep their stripped text.
    """
    as_text = s.astype(str)
    four_digits = as_text.str.extract(r"(\d{4})", expand=False)
    return four_digits.fillna(as_text.str.strip())

# Attach the normalized join keys ("name_norm", "year_str") to all four tables,
# each computed from that table's own title and year column.
_key_sources = (
    (watched_df, watched_name_col, watched_year_col),
    (watchlist_df, watch_name_col, watch_year_col),
    (recs_df, recs_name_col, recs_year_col),
    (enriched_df, en_title_col, en_year_col),
)
for frame, title_col, release_col in _key_sources:
    frame["name_norm"] = norm_name(frame[title_col])
    frame["year_str"] = norm_year(frame[release_col])

# parse list-like fields to pure lists of names (strip URLs)
def parse_list(cell):
    """Parse a list-like metadata cell into a plain list of names.

    Accepted inputs:
      * missing values (NaN/None) -> ``[]``
      * an existing ``list``/``tuple`` -> processed item by item
      * the text of a Python list/tuple literal, e.g.
        ``"['Drama:https://...', 'Comedy:https://...']"``
      * the text of a single quoted string literal, e.g. ``"'Drama'"``
      * a damaged literal: names are recovered with the ``'Name:`` pattern
      * plain delimited text such as ``"Drama, Comedy"`` -> split on commas

    For string items everything from the first ``:`` on is discarded (this is
    how the URL part of ``Name:URL`` entries is removed) and the remainder is
    stripped; non-string items are converted with ``str()``. Empty names are
    dropped.

    Returns:
        list[str]: the extracted names, in input order.
    """
    if isinstance(cell, (list, tuple)):
        parsed = cell
    else:
        if pd.isna(cell):
            return []
        txt = str(cell)
        try:
            parsed = ast.literal_eval(txt)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a valid literal. First try to salvage 'Name:URL' entries.
            hits = [h.strip() for h in re.findall(r"'([^':]+):", txt)]
            looks_bracketed = txt.lstrip()[:1] in ("[", "(", "{")
            if hits or looks_bracketed:
                return [h for h in hits if h]
            # Plain text such as "Drama, Comedy": treat commas as separators
            # instead of silently returning an empty list.
            return [part.strip() for part in txt.split(",") if part.strip()]

    if isinstance(parsed, str):
        parsed = [parsed]
    if not isinstance(parsed, (list, tuple)):
        # Scalars, dicts etc. carry no usable list of names.
        return []

    names = []
    for item in parsed:
        name = item.split(":", 1)[0].strip() if isinstance(item, str) else str(item)
        if name:
            names.append(name)
    return names

# Turn the raw genre/director cells of the enriched table into lists of names
for _target_col, _source_col in (("genre_list", en_genres_col),
                                 ("director_list", en_dir_col)):
    enriched_df[_target_col] = enriched_df[_source_col].apply(parse_list)

# aggregate metadata per (name_norm, year_str)
def set_union(series_of_lists):
    """Merge an iterable of lists into one sorted list of distinct values.

    List entries contribute all of their elements. Any other entry is skipped
    when it is missing (``pd.isna``) and otherwise contributes its ``str()``
    form as a single value.
    """
    merged = set()
    for entry in series_of_lists:
        if isinstance(entry, list):
            merged |= set(entry)
        elif not pd.isna(entry):
            merged.add(str(entry))
    return sorted(merged)

# One row per (name_norm, year_str) in the enriched table, with the union of
# all genres / directors recorded for that key ...
_meta_keys = ["name_norm", "year_str"]
agg_meta = enriched_df.groupby(_meta_keys, as_index=False).agg(
    genre_list=pd.NamedAgg(column="genre_list", aggfunc=set_union),
    director_list=pd.NamedAgg(column="director_list", aggfunc=set_union),
)

# ... attached to the candidates; candidates without a matching key keep NaN here
recs_df = pd.merge(recs_df, agg_meta, how="left", on=_meta_keys)

# force to lists
def ensure_list(x):
    """Coerce ``x`` to a list.

    Lists are returned unchanged, missing values (``pd.isna``) become ``[]``,
    and any other value becomes a one-element list holding ``str(x)``.
    """
    if not isinstance(x, list):
        x = [] if pd.isna(x) else [str(x)]
    return x
for _list_col in ("genre_list", "director_list"):
    recs_df[_list_col] = recs_df[_list_col].apply(ensure_list)

# (name_norm, year_str) keys of the films already watched / on the watchlist
watched_pairs = {
    (title, year)
    for title, year in zip(watched_df["name_norm"], watched_df["year_str"])
}
watchlist_pairs = {
    (title, year)
    for title, year in zip(watchlist_df["name_norm"], watchlist_df["year_str"])
}


## Preferences (likes/dislikes) from ratings

In [46]:

# Only rows that carry a rating contribute to the preference statistics
rated = enriched_df.dropna(subset=[en_rating_col]).copy()
rmax = rated[en_rating_col].max()

# Infer the rating scale from the largest rating seen and pick matching thresholds
if rmax > 5:
    scale, like_th, dislike_th = 10.0, 7.0, 3.0
else:
    scale, like_th, dislike_th = 5.0, 3.5, 1.5
min_count = 2  # a genre/director needs at least this many rated films to count


def _explode_rated(list_col):
    """One row per (rated film, list entry); rows without an entry are dropped."""
    return rated.explode(list_col).dropna(subset=[list_col])


def _rating_stats(long_df, list_col, label):
    """Mean rating and number of ratings per distinct value of ``list_col``."""
    per_value = long_df.groupby(list_col)[en_rating_col].agg(['mean','count'])
    return per_value.reset_index().rename(columns={list_col: label})


genres_long = _explode_rated("genre_list")
dirs_long   = _explode_rated("director_list")

g_stats = _rating_stats(genres_long, "genre_list", 'genre')
d_stats = _rating_stats(dirs_long, "director_list", 'director')

# Liked = mean rating at/above like_th, disliked = mean at/below dislike_th,
# in both cases only with enough rated films behind the mean.
_g_enough = g_stats['count'] >= min_count
_d_enough = d_stats['count'] >= min_count
liked_genres    = set(g_stats.loc[(g_stats['mean'] >= like_th) & _g_enough, 'genre'])
disliked_genres = set(g_stats.loc[(g_stats['mean'] <= dislike_th) & _g_enough, 'genre'])
liked_dirs      = set(d_stats.loc[(d_stats['mean'] >= like_th) & _d_enough, 'director'])
disliked_dirs   = set(d_stats.loc[(d_stats['mean'] <= dislike_th) & _d_enough, 'director'])

print("Scale:", scale, "| like_th:", like_th, "| dislike_th:", dislike_th)


Scale: 5.0 | like_th: 3.5 | dislike_th: 1.5


## Datalog core (only recommended & watchBoost)

In [47]:

# Probe for pyDatalog; any failure while importing it selects the pandas fallback
try:
    from pyDatalog import pyDatalog
except Exception as e:
    use_pyDatalog = False
    print("pyDatalog not available, fallback will be used:", e)
else:
    use_pyDatalog = True
    print("pyDatalog is available — using it.")

# User constant that all Datalog facts and queries are keyed by
USER = "tobias"


if use_pyDatalog:
    # Start from a cleared pyDatalog state (presumably so that re-running this
    # cell does not keep facts from an earlier run), then declare the predicate
    # names and the logic variables U, N, Y used in the facts and rules below.
    pyDatalog.clear()
    pyDatalog.create_terms('watched_fact, candidateFor, onWatchlist, '
                           'recommendedBase, recommended, watchBoost, U,N,Y')
    # Facts: one per watched film, per watchlist film and per candidate row,
    # all keyed by USER and the normalized (name, year) pair.
    # NOTE(review): if watched_pairs or watchlist_pairs is empty, the matching
    # predicate gets no fact at all; pyDatalog may then raise
    # "Predicate without definition" when the rules are queried — confirm.
    for n,y in watched_pairs:   +watched_fact(USER,n,y)
    for n,y in watchlist_pairs: +onWatchlist(USER,n,y)
    for _,row in recs_df.iterrows():
        +candidateFor(USER,row["name_norm"],row["year_str"])
    # Rules:
    #   recommendedBase = candidate that is not among the watched facts
    #   recommended     = recommendedBase (no further restriction)
    #   watchBoost      = recommendedBase that is also on the watchlist
    recommendedBase(U,N,Y) <= candidateFor(U,N,Y) & ~watched_fact(U,N,Y)
    recommended(U,N,Y)     <= recommendedBase(U,N,Y)
    watchBoost(U,N,Y)      <= recommendedBase(U,N,Y) & onWatchlist(U,N,Y)

    def qset(s):
        """Run the pyDatalog query string ``s`` and return its answers as a set of tuples.

        Returns an empty set when ``pyDatalog.ask`` yields a falsy result.
        """
        ans = pyDatalog.ask(s)
        return set(tuple(x) for x in (ans.answers if ans else []))

    # USER is fixed in the query, so each answer is a (name_norm, year_str) tuple
    all_pairs   = qset(f'recommended("{USER}", N, Y)')
    watch_pairs = qset(f'watchBoost("{USER}", N, Y)')

    if all_pairs:
        # Keep only the candidate rows whose key pair was derived as recommended
        rec_df = pd.DataFrame(list(all_pairs), columns=["name_norm","year_str"])
        out = recs_df.merge(rec_df, on=["name_norm","year_str"], how="inner")
    else:
        # Nothing recommended: empty frame that still has the candidate columns
        out = recs_df.iloc[0:0].copy()

    # True for rows whose key pair was derived by the watchBoost rule
    out["watchlist_priority"] = list(map(lambda p: p in watch_pairs, zip(out["name_norm"], out["year_str"])))

else:
    # Fallback: pure pandas for 'recommended' and 'watchlist'
    cand_pairs = list(zip(recs_df["name_norm"], recs_df["year_str"]))
    # Drop every candidate whose key pair is in the watched set
    keep_mask = [pair not in watched_pairs for pair in cand_pairs]
    out = recs_df.loc[keep_mask].copy()
    # True for remaining rows whose key pair is on the watchlist
    out["watchlist_priority"] = list(map(lambda p: p in watchlist_pairs, zip(out["name_norm"], out["year_str"])))


pyDatalog is available — using it.


## Compute boosts/penalties (Python) & Score

In [48]:

def any_in(candidate_list, prefer_set):
    """Return True if at least one element of ``candidate_list`` is in ``prefer_set``.

    A falsy ``candidate_list`` (``None``, empty list) yields False. Any
    ``TypeError`` raised while testing (for example a non-iterable value such
    as a float NaN, or an unhashable element) also yields False.
    """
    try:
        for item in (candidate_list if candidate_list else ()):
            if item in prefer_set:
                return True
    except TypeError:
        pass
    return False

# Flag each remaining candidate: does it share a genre/director with the
# liked sets (boost) or with the disliked sets (penalty)?
out["genre_boost"]      = out["genre_list"].apply(lambda lst: any_in(lst, liked_genres))
out["director_boost"]   = out["director_list"].apply(lambda lst: any_in(lst, liked_dirs))
out["genre_penalty"]    = out["genre_list"].apply(lambda lst: any_in(lst, disliked_genres))
out["director_penalty"] = out["director_list"].apply(lambda lst: any_in(lst, disliked_dirs))

# Weights: watchlist hit, liked genre, liked director, disliked genre, disliked director
w_watch, w_glike, w_dlike, w_gbad, w_dbad = 2, 1, 2, 1, 2
# Score = weighted sum of the boolean flags; penalties are subtracted
out["score"] = (out["watchlist_priority"].astype(int)*w_watch +
                out["genre_boost"].astype(int)*w_glike +
                out["director_boost"].astype(int)*w_dlike -
                out["genre_penalty"].astype(int)*w_gbad -
                out["director_penalty"].astype(int)*w_dbad)

# Highest score first; an existing "rank" column breaks ties (lower rank first)
sort_cols, ascending = ["score"], [False]
if "rank" in out.columns:
    sort_cols.append("rank")
    ascending.append(True)
# A stable sort keeps candidates with equal sort keys in their incoming order.
# pandas' default (quicksort) gives no such guarantee when sorting on a single
# column, which would shuffle tied candidates when there is no "rank" column.
out_sorted = out.sort_values(by=sort_cols, ascending=ascending, kind="mergesort").reset_index(drop=True)

## Save & Summary

In [49]:

# Write the re-ranked candidates, creating the output directory if needed
out_csv.parent.mkdir(parents=True, exist_ok=True)
out_sorted.to_csv(out_csv, index=False)
print("Saved:", out_csv.resolve())

# Counts of the flags set above, plus the titles of the five best-scored rows.
# The titles are read from recs_name_col, the title column already resolved
# for the candidates table, so the preview also works when that column is not
# literally called "candidate_title" or "name".
summary = {
    "candidates_total": int(len(recs_df)),
    "recommended_total": int(len(out_sorted)),
    "watchlist_priority_true": int(out_sorted["watchlist_priority"].sum()),
    "genre_boost_true": int(out_sorted["genre_boost"].sum()),
    "director_boost_true": int(out_sorted["director_boost"].sum()),
    "genre_penalty_true": int(out_sorted["genre_penalty"].sum()),
    "director_penalty_true": int(out_sorted["director_penalty"].sum()),
    "top5": out_sorted[recs_name_col].head(5).astype(str).tolist(),
}
summary


Saved: /Users/tschaffel/PycharmProjects/letterboxd-KG/data/kg/rerank_by_logical_rules.csv


{'candidates_total': 200,
 'recommended_total': 200,
 'watchlist_priority_true': 17,
 'genre_boost_true': 0,
 'director_boost_true': 0,
 'genre_penalty_true': 0,
 'director_penalty_true': 0,
 'top5': ['Breakfast on Pluto',
  'The Matrix Resurrections',
  'The Last King of Scotland',
  'The Talented Mr. Ripley',
  'Sunshine']}

In [50]:
'''
The code in the previous cells was in large part AI-generated with the free and paid versions of ChatGPT and was afterwards heavily adapted by me. Since it is not possible to say accurately which parts were originally AI-generated by which prompt, I have included all prompts that were used on this file here.
These following prompts were used:

    ""Beispiel: Zeig, dass du mit Regeln ausschließen kannst, dass dir Filme empfohlen werden, die du schon gesehen hast – das ist eine logische Restriktion, kein Embedding-Thema." ich würde gern damit beginnen"

    "ja bitte, mach das. bitte gib das notebook als .ipynb file"

    "kannst du in dem jupyter notebook datalog für die anwendung der regeln verwenden?"

    "Ja, Genre und Regisseur Regeln sollen auch noch rein. Dafür sollte ich zuerst erkennen, welche Regisseure und welche Genres in den Daten gut bewertet wurden, oder? Also dafür sollte ich wahrscheinlich mein Ebedding verwenden?"

    "pyDatalog is available — using it.
        ---------------------------------------------------------------------------
        AttributeError                            Traceback (most recent call last)
        Cell In[6], line 48
             46 all_pairs   = qset(f'recommended("{USER}", N, Y)')
             47 watch_pairs = qset(f'watchBoost("{USER}", N, Y)')
        ---> 48 g_like      = qset(f'genreBoost("{USER}", N, Y)')
             49 d_like      = qset(f'dirBoost("{USER}", N, Y)')
             50 g_bad       = qset(f'genrePenalty("{USER}", N, Y)')

        Cell In[6], line 43, in qset(s)
             42 def qset(s):
        ---> 43     ans = pyDatalog.ask(s);
             44     return set(tuple(x) for x in (ans.answers if ans else []))

        File /opt/anaconda3/envs/letterboxd-KG/lib/python3.12/site-packages/pyDatalog/pyDatalog.py:111, in ask(code)
            109 def ask(code):
            110     """returns the result of the query contained in the code string"""
        --> 111     return pyParser.ask(code)

        File /opt/anaconda3/envs/letterboxd-KG/lib/python3.12/site-packages/pyDatalog/pyParser.py:841, in ask(code)
            839 add_symbols(code.co_names, newglobals)
            840 parsed_code = eval(code, newglobals)
        --> 841 a = parsed_code.ask()
            842 return Answer.make(a)

        File /opt/anaconda3/envs/letterboxd-KG/lib/python3.12/site-packages/pyDatalog/pyParser.py:577, in Query.ask(self)
            576 def ask(self):
        --> 577     self._data = Body(self.pre_calculations, self).ask()
            578     self.todo = None
            579     return self._data

        File /opt/anaconda3/envs/letterboxd-KG/lib/python3.12/site-packages/pyDatalog/pyParser.py:693, in Body.ask(self)
            691 """ resolve the query and determine the values of its variables"""
            692 literal = self.literal()
        --> 693 self._data = literal.lua.ask()
            694 literal.todo, self.todo = None, None
            695 - (literal <= self) # delete the temporary clause

        File /opt/anaconda3/envs/letterboxd-KG/lib/python3.12/site-packages/pyDatalog/pyEngine.py:513, in Literal.ask(self)
            511 todo, arg = (SEARCH, (Ts.Goal, ))
            512 while todo:
        --> 513     todo, arg = todo(*arg)
            515 if Ts.Goal.facts is True:
            516     return True

        File /opt/anaconda3/envs/letterboxd-KG/lib/python3.12/site-packages/pyDatalog/pyEngine.py:812, in Subgoal.search(self)
            808         raise util.DatalogError("Error: right hand side of comparison must be bound: %s"
            809                             % literal.pred.id, None, None)
            810     return self.next_step()
        --> 812 raise AttributeError("Predicate without definition (or error in resolver): %s" % literal.pred.id)

        AttributeError: Predicate without definition (or error in resolver): film_genre/3"

    "---------------------------------------------------------------------------
        NameError                                 Traceback (most recent call last)
        Cell In[3], line 5
              2 def has_any(lst, S):
              3     return any(x in S for x in (lst or []))
        ----> 5 recs_df["genre_boost"]     = recs_df["genre_list"].apply(lambda lst: has_any(lst, liked_genres))
              6 recs_df["director_boost"]  = recs_df["director_list"].apply(lambda lst: has_any(lst, liked_dirs))
              7 recs_df["genre_penalty"]   = recs_df["genre_list"].apply(lambda lst: has_any(lst, disliked_genres))

        File /opt/anaconda3/envs/letterboxd-KG/lib/python3.12/site-packages/pandas/core/series.py:4924, in Series.apply(self, func, convert_dtype, args, by_row, **kwargs)
           4789 def apply(
           4790     self,
           4791     func: AggFuncType,
           (...)   4796     **kwargs,
           4797 ) -> DataFrame | Series:
           4798     """
           4799     Invoke function on values of Series.
           4800
           (...)   4915     dtype: float64
           4916     """
           4917     return SeriesApply(
           4918         self,
           4919         func,
           4920         convert_dtype=convert_dtype,
           4921         by_row=by_row,
           4922         args=args,
           4923         kwargs=kwargs,
        -> 4924     ).apply()

        File /opt/anaconda3/envs/letterboxd-KG/lib/python3.12/site-packages/pandas/core/apply.py:1427, in SeriesApply.apply(self)
           1424     return self.apply_compat()
           1426 # self.func is Callable
        -> 1427 return self.apply_standard()

        File /opt/anaconda3/envs/letterboxd-KG/lib/python3.12/site-packages/pandas/core/apply.py:1507, in SeriesApply.apply_standard(self)
           1501 # row-wise access
           1502 # apply doesn't have a na_action keyword and for backward compat reasons
           1503 # we need to give na_action="ignore" for categorical data.
           1504 # TODO: remove the na_action="ignore" when that default has been changed in
           1505 #  Categorical (GH51645)."

    "---------------------------------------------------------------------------
        TypeError                                 Traceback (most recent call last)
        Cell In[5], line 5
              2 def has_any(lst, S):
              3     return any(x in S for x in (lst or []))
        ----> 5 recs_df["genre_boost"]     = recs_df["genre_list"].apply(lambda lst: has_any(lst, liked_genres))
              6 recs_df["director_boost"]  = recs_df["director_list"].apply(lambda lst: has_any(lst, liked_dirs))
              7 recs_df["genre_penalty"]   = recs_df["genre_list"].apply(lambda lst: has_any(lst, disliked_genres))

        File /opt/anaconda3/envs/letterboxd-KG/lib/python3.12/site-packages/pandas/core/series.py:4924, in Series.apply(self, func, convert_dtype, args, by_row, **kwargs)
           4789 def apply(
           4790     self,
           4791     func: AggFuncType,
           (...)   4796     **kwargs,
           4797 ) -> DataFrame | Series:
           4798     """
           4799     Invoke function on values of Series.
           4800
           (...)   4915     dtype: float64
           4916     """
           4917     return SeriesApply(
           4918         self,
           4919         func,
           4920         convert_dtype=convert_dtype,
           4921         by_row=by_row,
           4922         args=args,
           4923         kwargs=kwargs,
        -> 4924     ).apply()

        File /opt/anaconda3/envs/letterboxd-KG/lib/python3.12/site-packages/pandas/core/apply.py:1427, in SeriesApply.apply(self)
           1424     return self.apply_compat()
           1426 # self.func is Callable
        -> 1427 return self.apply_standard()

        File /opt/anaconda3/envs/letterboxd-KG/lib/python3.12/site-packages/pandas/core/apply.py:1507, in SeriesApply.apply_standard(self)
           1501 # row-wise access
           1502 # apply doesn't have a na_action keyword and for backward compat reasons
           1503 # we need to give na_action="ignore" for categorical data.
           1504 # TODO: remove the na_action="ignore" when that default has been changed in
           1505 #  Categorical (GH51645).
           1506 action = "ignore" if isinstance(obj.dtype, CategoricalDtype) else None
        -> 1507 mapped = obj._map_values(
           1508     mapper=curried, na_action=action, convert=self.convert_dtype
           1509 )
           1511 if len(mapped) and isinstance(mapped[0], ABCSeries):
           1512     # GH#43986 Need to do list(mapped) in order to get treated as nested
           1513     #  See also GH#25959 regarding EA support
           1514     return obj._constructor_expanddim(list(mapped), index=obj.index)

        File /opt/anaconda3/envs/letterboxd-KG/lib/python3.12/site-packages/pandas/core/base.py:921, in IndexOpsMixin._map_values(self, mapper, na_action, convert)
            918 if isinstance(arr, ExtensionArray):
            919     return arr.map(mapper, na_action=na_action)
        --> 921 return algorithms.map_array(arr, mapper, na_action=na_action, convert=convert)

        File /opt/anaconda3/envs/letterboxd-KG/lib/python3.12/site-packages/pandas/core/algorithms.py:1743, in map_array(arr, mapper, na_action, convert)
           1741 values = arr.astype(object, copy=False)
           1742 if na_action is None:
        -> 1743     return lib.map_infer(values, mapper, convert=convert)
           1744 else:
           1745     return lib.map_infer_mask(
           1746         values, mapper, mask=isna(values).view(np.uint8), convert=convert
           1747     )

        File lib.pyx:2972, in pandas._libs.lib.map_infer()

        Cell In[5], line 5, in <lambda>(lst)
              2 def has_any(lst, S):
              3     return any(x in S for x in (lst or []))
        ----> 5 recs_df["genre_boost"]     = recs_df["genre_list"].apply(lambda lst: has_any(lst, liked_genres))
              6 recs_df["director_boost"]  = recs_df["director_list"].apply(lambda lst: has_any(lst, liked_dirs))
              7 recs_df["genre_penalty"]   = recs_df["genre_list"].apply(lambda lst: has_any(lst, disliked_genres))

        Cell In[5], line 3, in has_any(lst, S)
              2 def has_any(lst, S):
        ----> 3     return any(x in S for x in (lst or []))

        TypeError: 'float' object is not iterable"

    "---------------------------------------------------------------------------
        AttributeError                            Traceback (most recent call last)
        Cell In[5], line 41
             39 all_pairs   = qset(f'recommended("{USER}", N, Y)')
             40 watch_pairs = qset(f'watchBoost("{USER}", N, Y)')
        ---> 41 g_like      = qset(f'genreBoost("{USER}", N, Y)')
             42 d_like      = qset(f'dirBoost("{USER}", N, Y)')
             43 g_bad       = qset(f'genrePenalty("{USER}", N, Y)')

        Cell In[5], line 36, in qset(s)
             35 def qset(s):
        ---> 36     ans = pyDatalog.ask(s)
             37     return set(tuple(x) for x in (ans.answers if ans else []))

        File /opt/anaconda3/envs/letterboxd-KG/lib/python3.12/site-packages/pyDatalog/pyDatalog.py:111, in ask(code)
            109 def ask(code):
            110     """returns the result of the query contained in the code string"""
        --> 111     return pyParser.ask(code)

        File /opt/anaconda3/envs/letterboxd-KG/lib/python3.12/site-packages/pyDatalog/pyParser.py:841, in ask(code)
            839 add_symbols(code.co_names, newglobals)
            840 parsed_code = eval(code, newglobals)
        --> 841 a = parsed_code.ask()
            842 return Answer.make(a)

        File /opt/anaconda3/envs/letterboxd-KG/lib/python3.12/site-packages/pyDatalog/pyParser.py:577, in Query.ask(self)
            576 def ask(self):
        --> 577     self._data = Body(self.pre_calculations, self).ask()
            578     self.todo = None
            579     return self._data

        File /opt/anaconda3/envs/letterboxd-KG/lib/python3.12/site-packages/pyDatalog/pyParser.py:693, in Body.ask(self)
            691 """ resolve the query and determine the values of its variables"""
            692 literal = self.literal()
        --> 693 self._data = literal.lua.ask()
            694 literal.todo, self.todo = None, None
            695 - (literal <= self) # delete the temporary clause

        File /opt/anaconda3/envs/letterboxd-KG/lib/python3.12/site-packages/pyDatalog/pyEngine.py:513, in Literal.ask(self)
            511 todo, arg = (SEARCH, (Ts.Goal, ))"

'''

'\nThe code in the previous cells is in big parts AI generated by the free and paid version of ChatGPT and was afterwards heavily adapted by me. Since it is not possible to accurately say which parts were originaly AI generated by wich promt, I have included all prompts that were used on this file here.\nThese following prompts were used:\n\n    ""Beispiel: Zeig, dass du mit Regeln ausschließen kannst, dass dir Filme empfohlen werden, die du schon gesehen hast – das ist eine logische Restriktion, kein Embedding-Thema." ich würde gern damit beginnen"\n\n    "ja bitte, mach das. bitte gib das notebook als .ipynb file"\n\n    "kannst du in dem jupyter notebook datalog für die anwendung der regeln verwenden?"\n\n    "Ja, Genre und Regisseur Regeln sollen auch noch rein. Dafür sollte ich zuerst erkennen, welche Regisseure und welche Genres in den Daten gut bewertet wurden, oder? Also dafür sollte ich wahrscheinlich mein Ebedding verwenden?"\n\n    "pyDatalog is available — using it.\n      