## Recommender Systems Notebook

### Setup & Demo Data

We implement some  common recommender algorithms used in production :

- Popularity baseline
- Content-based TF-IDF
- Item-Item Co-visitation
- Collaborative Filtering (kNN) user-based and item-based
- Collaborative Filtering Matrix Factorization with Tensorflow
- Two-Tower Retrieval with Tensorflow

### Imports

In [94]:
# Numerical computing
import numpy as np

# Data handling
import pandas as pd

# For clean "struct-like" models (optional)
from dataclasses import dataclass

# Typing clarity (optional but good practice)
from typing import Dict, List, Tuple, Optional, Callable

# Useful for co-visitation counting
from collections import defaultdict, Counter

# Content-based TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from twisted.python.reflect import prefixedMethods

# Reproducibility
np.random.seed(42)


### Demo dataset generator

In [95]:
def make_demo_data(
        n_users: int = 30,
        n_items: int = 60,
        n_categories: int = 6,
        ratings_per_user: int = 12,
        session_len: int = 8,
        seed: int = 42,
):
    """
    Creates a toy dataset that behaves like a real recommendation dataset.

    Users:
      - each user prefers one category

    Items:
      - each item belongs to one category
      - each item has a text description (category-specific keywords)

    Ratings:
      - user gives higher ratings to items in their preferred category

    Sessions:
      - implicit sequences of interacted items (mostly from preferred category)
    """

    rng = np.random.RandomState(seed)

    # Assign each iteam a category ID
    item_category = rng.randint(0, n_categories, size=n_items)

    # Assign each user a preferred category
    user_pref = rng.randint(0, n_categories, size=n_users)

    # Words to generate item descriptions per category
    category_words = {
        0: [  # Action / Adventure
            "action", "fast", "adventure", "hero", "battle", "explosion",
            "chase", "mission", "fight", "weapon", "danger", "rescue"
        ],
        1: [  # Romance / Drama
            "romance", "love", "drama", "heart", "relationship", "emotion",
            "passion", "kiss", "betrayal", "wedding", "tearful", "affection"
        ],
        2: [  # Sci-Fi
            "scifi", "space", "future", "alien", "robot", "technology",
            "galaxy", "time", "experiment", "spaceship", "cyber", "planet"
        ],
        3: [  # Comedy
            "comedy", "funny", "joke", "laugh", "humor", "awkward",
            "satire", "parody", "prank", "clumsy", "ridiculous", "smile"
        ],
        4: [  # Horror
            "horror", "scary", "ghost", "dark", "monster", "fear",
            "nightmare", "blood", "curse", "haunted", "evil", "scream"
        ],
        5: [  # Documentary
            "documentary", "history", "facts", "nature", "real",
            "science", "culture", "wildlife", "investigation", "education",
            "truth", "archive"
        ],
    }

    # Create item text and titles
    item_text: Dict[int, str] = {}
    item_title: Dict[int, str] = {}
    for i in range(n_items):
        category = int(item_category[i])
        words = category_words[category]

        desc = " ".join(rng.choice(words, size=5, replace=False))

        item_text[i] = desc
        item_title[i] = f"Item_{i:02d}_Category_{category:02d}"

    # Build ratings as a list of (user_id,item_id,rating)
    ratings: List[Tuple[int, int, float]] = []
    for u in range(n_users):
        # Items in user's preferred category
        preferred_items = np.where(item_category == user_pref[u])[0]

        # Items Not in preferred category
        other_items = np.where(item_category != user_pref[u])[0]

        # Choose ~70% from preferred and ~30% from others
        n_pref = int(ratings_per_user * 0.7)
        n_other = ratings_per_user - n_pref

        # Choose without replacement
        chosen_pref = rng.choice(preferred_items, size=min(n_pref, len(preferred_items)), replace=False)
        chosen_other = rng.choice(other_items, size=min(n_other, len(other_items)), replace=False)

        chosen = np.concatenate((chosen_pref, chosen_other))
        rng.shuffle(chosen)

        for item_id in chosen:
            # Base rating is higher if matches preference
            base = 4.2 if item_category[item_id] == user_pref[u] else 2.8

            # Add Gaussian noise and clip into [1...5]
            r = np.clip(rng.normal(base, 0.6), 1.0, 5.0)

            # Round to 0.1 to look more realistic
            ratings.append((u, int(item_id), float(np.round(r, 1))))

    # Build the sessions ( view/click sequences)

    sessions: List[List[int]] = []

    for u in range(n_users):
        # sample from preferred category with replacement ( views  can repeat )
        pref_items = np.where(item_category == user_pref[u])[0]
        session = rng.choice(pref_items, size=session_len, replace=True)
        sessions.append(session)

    # Item metadata table
    items_df = pd.DataFrame({
        "item_id": np.arange(n_items),
        "title": [item_title[i] for i in range(n_items)],
        "category": item_category,
        "description": [item_text[i] for i in range(n_items)],
    })

    # Ratings dataframe
    ratings_df = pd.DataFrame(
        ratings,
        columns=["user_id", "item_id", "rating"]
    )

    return ratings_df, sessions, items_df, item_text

In [96]:
ratings_df, sessions, items_df, item_text = make_demo_data()

In [97]:
ratings_df.head(10)

Unnamed: 0,user_id,item_id,rating
0,0,39,4.4
1,0,32,5.0
2,0,47,2.6
3,0,24,2.4
4,0,40,3.2
5,0,59,4.9
6,0,7,2.9
7,0,0,4.2
8,0,38,5.0
9,0,50,3.6


In [98]:
items_df.head(10)

Unnamed: 0,item_id,title,category,description
0,0,Item_00_Category_03,3,smile joke awkward comedy ridiculous
1,1,Item_01_Category_04,4,fear scary haunted monster ghost
2,2,Item_02_Category_02,2,technology experiment cyber alien spaceship
3,3,Item_03_Category_04,4,scream haunted evil dark ghost
4,4,Item_04_Category_04,4,blood dark scary curse evil
5,5,Item_05_Category_01,1,wedding relationship love emotion heart
6,6,Item_06_Category_02,2,experiment scifi robot space spaceship
7,7,Item_07_Category_02,2,cyber planet scifi galaxy space
8,8,Item_08_Category_02,2,technology time space galaxy future
9,9,Item_09_Category_04,4,haunted fear scream nightmare blood


###

### Train/Test Split & Metrics

- For each user keep 1 rating as a test , the rest remains for training

In [99]:
def leave_last_one_out_split(ratings: pd.DataFrame, seed: int = 42) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    For each user, hold out 1 interaction for test.
    """
    train_parts = []  # list of train chunks for each user
    test_rows = []    # list of single held-out rows for each user

    # Group ratings by user
    for user_id, group in ratings.groupby("user_id"):
        # Shuffle this user's ratings so "last one out" isn't biased by item_id ordering
        group = group.sample(frac=1.0, random_state=seed)

        # Last row becomes test
        test_rows.append(group.iloc[-1])

        # All except last become train
        train_parts.append(group.iloc[:-1])

    train_df = pd.concat(train_parts).reset_index(drop=True)
    test_df = pd.DataFrame(test_rows).reset_index(drop=True)
    return train_df, test_df


def precision_recall_at_k(recs: List[int], relevant: set, k: int) -> Tuple[float, float]:
    """
    Compute precision@k and recall@k.

    - precision@k: fraction of recommended items (top k) that are relevant
    - recall@k: fraction of relevant items that appear in top k
    """
    top_k = recs[:k]                            # keep only top-k recommendations
    hits = sum(1 for x in top_k if x in relevant)  # count matches with relevant set

    precision = hits / k
    recall = hits / max(1, len(relevant))
    return precision, recall


def evaluate_model(
    recommend_fn: Callable[[int, set, int], List[int]],
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    k: int = 10,
    name: str = "model"
):
    """
    Evaluates a recommender function using leave-last-one-out test.

    recommend_fn signature: (user_id, seen_set, k) -> list[item_id]
    """
    # Build helper dicts for speed:
    # - seen items per user (from train)
    seen_by_user = train_df.groupby("user_id")["item_id"].apply(set).to_dict()

    # - test relevant item per user (from test)
    relevant_by_user = test_df.groupby("user_id")["item_id"].apply(list).to_dict()

    precisions = []
    recalls = []

    for u, rel_items in relevant_by_user.items():
        relevant_set = set(rel_items)             # relevant items (here only 1)
        seen_set = seen_by_user.get(u, set())     # seen in training

        recs = recommend_fn(u, seen_set, k)       # top-k recommendations
        p, r = precision_recall_at_k(recs, relevant_set, k)

        precisions.append(p)
        recalls.append(r)

    print(f"\n{name} @ {k}")
    print(f"Precision@{k}: {np.mean(precisions):.3f}")
    print(f"Recall@{k}:    {np.mean(recalls):.3f}")


def show_recommendations(user_id: int, recs: List[int], title: str):
    """Prints recommended item titles and categories."""
    print(f"\n{title} (user={user_id})")
    display(items_df.set_index("item_id").loc[recs][["title", "category"]])





In [100]:
# Split the data
train_df, test_df = leave_last_one_out_split(ratings_df)

# Useful global counts
n_users = int(ratings_df["user_id"].max() + 1)
n_items = int(ratings_df["item_id"].max() + 1)

print("Train size:", len(train_df), "Test size:", len(test_df))

Train size: 328 Test size: 30


#### User Id

In [101]:
u_demo = 2

### Popularity Baseline Algorithm


In [102]:
class PopularityRecommender:
    """
    Recommends items based on global popularity (interaction count).
    Very common baseline and fallback in production.
    """

    def __init__(self):
        self.ranked_items : List[int] = []

    def fit(self,train_df:pd.DataFrame):
        # How much an item appears in train
        counts = train_df['item_id'].value_counts()

        self.ranked_items = [int(item_id) for item_id in counts.index]
    def recommend(self,user_id:int , seen:set,k:int) -> List[int]:
        recs = []

        for item_id in self.ranked_items:
            if item_id not in seen:
                recs.append(item_id)
            if len(recs) == k:
                break
        return recs

pop = PopularityRecommender()
pop.fit(train_df)

seen_demo = set(train_df[train_df.user_id == u_demo].item_id)
recs_demo = pop.recommend(u_demo, seen_demo, k=10)
show_recommendations(u_demo, recs_demo, "Popularity recommendations")

# Evaluate
evaluate_model(lambda u, seen, k: pop.recommend(u, seen, k), train_df, test_df, k=10, name="Popularity")


Popularity recommendations (user=2)


Unnamed: 0_level_0,title,category
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1
50,Item_50_Category_05,5
17,Item_17_Category_05,5
18,Item_18_Category_01,1
16,Item_16_Category_05,5
33,Item_33_Category_03,3
56,Item_56_Category_01,1
12,Item_12_Category_05,5
5,Item_05_Category_01,1
36,Item_36_Category_05,5
47,Item_47_Category_03,3



Popularity @ 10
Precision@10: 0.030
Recall@10:    0.300


Those numbers mean: with the **Most Popular** baseline, **only ~3% of the top-10 recommendations are correct** on average (≈0.3 “hits” per user), but it still manages to include each user’s **one held-out test item** in the top-10 for about **30% of users** (that’s what Recall@10 = 0.300 means in your leave-one-out setup).


### Content-Based TF-IDF Algorithm

In [103]:
from typing import Set


class ContentTFIDFRecommender:
    """
    Content-based TF-IDF Recommender (Text Similarity Recommender)

    Idea:
    - Each item has a text description (bag of words).
    - We convert every item's text into a TF-IDF vector.
      (A vector where each dimension corresponds to a word; the value says how important
       that word is for that item compared to all other items.)
    - For a user, we take the TF-IDF vectors of the items they already interacted with
      (their "history") and average them → this becomes the user's "profile vector".
      (This profile captures what words/topics the user seems to like.)
    - We then compute cosine similarity between the user profile and ALL items.
      Items with the highest similarity are the recommendations.
    - We exclude items the user already saw/rated (seen set), so we don’t recommend them again.

    Output:
    - recommend(...) returns a list of item_ids (length k), sorted from most relevant to least.
    """

    def __init__(self, max_features: int = 5000):
        # Limit how many words/features TF-IDF keeps (top max_features most useful words).
        self.max_features = max_features

        # Keeps item IDs in the exact same order as rows in the TF-IDF matrix X.
        # Example: item_ids[0] corresponds to X[0], item_ids[1] -> X[1], etc.
        self.item_ids: List[int] = []

        # Turns text -> TF-IDF vectors.
        # stop_words='english' removes common words like "the", "and", "is" etc.
        self.vectorizer = TfidfVectorizer(
            max_features=self.max_features,
            stop_words="english"
        )

        # TF-IDF matrix: shape = (num_items, num_features)
        # Row = one item, Columns = words/features
        # Usually a sparse matrix because most words are 0 for a given item.
        self.X = None

    def fit(self, item_text: Dict[int, str]) -> None:
        """
        Train/prepare the model on item descriptions.

        Parameters
        ----------
        item_text : dict[item_id -> description]
            Example: {0: "space alien future robot", 1: "romance love heart drama", ...}

        What happens:
        1) We fix an ordering of item IDs (sorted).
        2) We build a list of item descriptions in that exact order.
        3) We fit TF-IDF on all descriptions and store the resulting matrix in self.X.

        After this:
        - self.item_ids maps row index -> item_id
        - self.X[row_index] is the TF-IDF vector for that item_id
        """
        # Ensure consistent stable ordering so row indices are reproducible.
        self.item_ids = sorted(item_text.keys())

        # Create a list of texts in the same order as item_ids.
        texts = [item_text[i] for i in self.item_ids]

        # Learn vocabulary + transform text to TF-IDF matrix.
        # X: (n_items x n_features)
        self.X = self.vectorizer.fit_transform(texts)

    def recommend(
        self,
        user_id: int,
        seen: Set[int],
        k: int,
        user_history: List[int]
    ) -> List[int]:
        """
        Recommend top-k items for a user using TF-IDF + cosine similarity.

        Parameters
        ----------
        user_id : int
            Not used by the algorithm itself here (history already represents the user),
            but kept for a consistent recommender interface.
        seen : set[int]
            Items already interacted with in training. We exclude these from output.
        k : int
            Number of recommendations to return.
        user_history : list[int]
            Items the user interacted with in the past (training interactions).

        Returns
        -------
        list[int]
            Top-k recommended item_ids (most similar first).

        Flow (high level):
        1) Convert user_history item IDs -> row indices in self.X
        2) Build user profile vector = average of TF-IDF vectors of history items
        3) Compute cosine similarity(user_profile, every_item_vector)
        4) Set similarity of "seen" items to -1 (so they will never be recommended)
        5) Take the top-k highest similarity items
        """
        # -------------------------
        # (0) Cold start: no history
        # -------------------------
        if not user_history:
            # Without history, we can't build a user profile from content.
            return []

        # ---------------------------------------------------------
        # (1) Convert item IDs -> TF-IDF row indices (in self.X)
        # ---------------------------------------------------------
        # self.item_ids is the mapping: row_index -> item_id
        # We need the inverse mapping: item_id -> row_index, but here we use list.index(...)
        # which is correct but slower (can be optimized later).
        idxs = [self.item_ids.index(i) for i in user_history]

        # ---------------------------------------------------------
        # (2) Build user profile vector (average TF-IDF of history)
        # ---------------------------------------------------------
        # self.X[idxs] selects the rows for the user's history items:
        # shape: (len(history), n_features)
        #
        # mean(axis=0) averages over history items to create one "taste vector":
        # shape: (1, n_features)
        user_vec = self.X[idxs].mean(axis=0)

        # Convert potential np.matrix to a normal numpy array (sklearn dislikes np.matrix).
        # Keep it 2D: (1, n_features) so cosine_similarity is happy.
        user_vec = np.asarray(user_vec).reshape(1, -1)

        # ---------------------------------------------------------
        # (3) Similarity between user profile and ALL items
        # ---------------------------------------------------------
        # cosine_similarity compares user_vec to each item vector in self.X:
        # result shape: (1, n_items)
        sims = cosine_similarity(user_vec, self.X).ravel()  # -> (n_items,)

        # ---------------------------------------------------------
        # (4) Exclude already seen items
        # ---------------------------------------------------------
        # We set their similarity to -1 so they drop to the bottom.
        # (Cosine similarity for TF-IDF is normally between 0 and 1.)
        for it in seen:
            if it in self.item_ids:
                sims[self.item_ids.index(it)] = -1.0

        # ---------------------------------------------------------
        # (5) Choose the top-k highest similarity items
        # ---------------------------------------------------------
        # argsort sorts ascending, so we sort by -sims (descending).
        top_idx = np.argsort(-sims)[:k]

        # Convert TF-IDF row indices back to item IDs.
        return [self.item_ids[i] for i in top_idx]


# Train content model
cb = ContentTFIDFRecommender(max_features=5000)
cb.fit(item_text)

# Demo
history_demo = list(train_df[train_df.user_id == u_demo].item_id)
recs_demo = cb.recommend(u_demo, seen_demo, k=10, user_history=history_demo)
show_recommendations(u_demo, recs_demo, "Content TF-IDF recommendations")

# Evaluate with fallback to popularity if needed
def cb_recommend_fn(u, seen, k):
    history = list(train_df[train_df.user_id == u].item_id)
    recs = cb.recommend(u, seen, k, history)
    return recs if recs else pop.recommend(u, seen, k)

evaluate_model(cb_recommend_fn, train_df, test_df, k=10, name="Content TF-IDF")


Content TF-IDF recommendations (user=2)


Unnamed: 0_level_0,title,category
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1
29,Item_29_Category_02,2
30,Item_30_Category_02,2
6,Item_06_Category_02,2
17,Item_17_Category_05,5
0,Item_00_Category_03,3
50,Item_50_Category_05,5
40,Item_40_Category_00,0
24,Item_24_Category_05,5
12,Item_12_Category_05,5
58,Item_58_Category_03,3



Content TF-IDF @ 10
Precision@10: 0.080
Recall@10:    0.800


These results mean that the **Content TF-IDF recommender** places the user’s held-out test item in the **top-10 recommendations for about 80% of users** (Recall@10 = 0.800), and on average **0.8 of the 10 recommended items are actually relevant** (Precision@10 = 0.080), which is a **substantial improvement over the popularity baseline** and shows that matching item descriptions to a user’s past content effectively captures user preferences in this dataset.


### Item-Item Co-visitation

- From sessions we count how often item A appears together with item B and we recommend neighbors of the last seen item

In [104]:
class ItemItemCoVisitation:
    """
    Item-Item Co-Visitation Recommender (Session Co-occurrence Graph)

    What this model does:
    - It learns which items tend to appear together in the SAME session.
      Example: if item 12 and item 33 are often viewed in the same session,
      then 33 becomes a strong "neighbor" recommendation for 12.

    This is commonly used in real systems as:
      "Customers who viewed this also viewed..."
      "Frequently bought together..."

    Output:
    - A graph/dictionary:
        graph[item_a] = [most common co-occurring items with item_a]
      sorted by how frequently they co-occur in sessions.
    """

    def __init__(self,top_k_per_item:int = 50):
         # For each item, keep only the top-K most frequent neighbors
        self.top_k_per_item = top_k_per_item

        # The learned co-visitation graph:
        self.graph : Dict[int,List[int]] = {}

    def fit(self,sessions:List[List[int]]) -> None:
        """
        Build co-visitation counts from session data.

        sessions:
            A list of sessions; each session is a list of item_ids
            Example session: [10, 10, 3, 7]  (views can repeat)

        Training logic:
        1) For each session, take UNIQUE items (avoid counting duplicates in the same session).
        2) For every pair (a,b) that appear together, increment counts[a][b].
        3) For each item a, store its top_k_per_item neighbors by count.
        """
        # counts[a] is a Counter holding co-occurrence counts with other items b
        # counts[a][b] = number of sessions where both a and b appeared
        counts = defaultdict(Counter)

        for session in sessions:

            # Remove duplicates
            unique_items = list(dict.fromkeys(session))

            # Count co-occurences for all pairs in this session

            for a in unique_items:
                for b in unique_items:
                    if a != b:
                        counts[a][b] += 1



        # Convert the counts into a neighbour list graph

        self.graph = {}

        for a, ctr in counts.items():
             self.graph[int(a)] = [int(b) for b, _ in ctr.most_common(self.top_k_per_item)]


    def recommend(
        self,
        user_id: int,
        seen: set,
        k: int,
        last_item: Optional[int]
    ) -> List[int]:
        """
        Recommend items based on the user's last interacted item.

        Intuition:
        - If the user's last_item is X, recommend "neighbors of X" (items often seen with X).
        - This is a SHORT-TERM / session-context recommender (not long-term user taste).

        Inputs:
        - last_item: the last item the user interacted with (context)
        - seen: items already seen by the user in training; exclude them
        - k: how many items to return

        Returns:
        - Up to k recommended item_ids
        """
        # Without context no prediction possible
        if last_item is None:
            return []

        # Get candidate neighbors from the graph
        candidates = self.graph.get(int(last_item), [])

        # Filter out items user already saw and return top-k
        return [i for i in candidates if i not in seen][:k]


covis = ItemItemCoVisitation(top_k_per_item=50)
covis.fit(sessions)

# Use the last training interaction as context (what they most recently viewed)
last_item_demo = history_demo[-1] if history_demo else None

recs_demo = covis.recommend(
    user_id=u_demo,
    seen=seen_demo,
    k=10,
    last_item=last_item_demo
)

show_recommendations(
    u_demo,
    recs_demo,
    f"Co-visitation recommendations (last_item={last_item_demo})"
)


# ------------------------------------------
# Evaluate: co-visitation with popularity fallback
# ------------------------------------------
def covis_recommend_fn(u, seen, k):
    """
    Wrapper that matches evaluate_model(...) signature.

    - Build last_item from this user's train history.
    - If co-visitation can't recommend (no history or empty neighbor list),
      fall back to the popularity baseline so evaluation always returns k items.
    """
    history = list(train_df[train_df.user_id == u].item_id)
    last_item = history[-1] if history else None

    recs = covis.recommend(u, seen, k, last_item)
    return recs if recs else pop.recommend(u, seen, k)


evaluate_model(covis_recommend_fn, train_df, test_df, k=10, name="Item-Item Co-visitation")


Co-visitation recommendations (last_item=38) (user=2)


Unnamed: 0_level_0,title,category
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1
15,Item_15_Category_03,3
57,Item_57_Category_03,3
32,Item_32_Category_03,3
22,Item_22_Category_03,3
26,Item_26_Category_03,3
39,Item_39_Category_03,3
58,Item_58_Category_03,3



Item-Item Co-visitation @ 10
Precision@10: 0.043
Recall@10:    0.433


Item-Item Co-visitation achieves high recall (43.3%) but low precision (4.3%) at @10, which is expected and desirable because it is designed as a recall-focused candidate generator rather than a final ranking model.


### Collaborative Filtering

- User-based kNN CF
- Item-based kNN CF

In [105]:
def build_dense_rating_matrix(train_df:pd.DataFrame,n_users:int,n_items: int) -> np.ndarray:
    """
    Build a dense user-item rating matrix R.

    Goal:
      Convert a "long" ratings table like:
         user_id | item_id | rating
      into a matrix:
         R[user, item] = rating

    Output:
      R shape: (n_users, n_items)
      - R[u, i] = rating if user u rated item i in TRAIN
      - R[u, i] = NaN if user u never rated item i (unknown/missing)

    Why NaN?
      We want to distinguish:
        - "unknown rating" (missing)   -> NaN
        - "real rating value" (1..5)   -> float
    """

    # Fill everything with Nan initially
    R = np.full((n_users, n_items), np.nan,dtype=float)

    for row in train_df.itertuples(index=False):
        u = int(row.user_id)
        i = int(row.item_id)
        r = float(row.rating)

        R[u,i] = r

    return R

def nanmean_safe(x:np.ndarray) -> float:
     """
    Compute mean while ignoring NaNs.

    Why needed?
      Users will have NaNs for items they never rated.
      np.mean would produce NaN if any NaNs exist,
      so we use np.nanmean.

    Edge case:
      If the entire vector is NaN (user has no ratings),
      np.nanmean returns NaN. In that case return 0.0.
    """

     m = np.nanmean(x)
     return float(m) if not np.isnan(m) else 0.0

In [106]:
def pearson_sim_nan(a: np.ndarray, b: np.ndarray) -> float:
    """
    Pearson correlation similarity for rating vectors with NaNs.

    Use case:
      USER-based CF typically uses Pearson correlation:
      - It compares rating patterns AFTER mean-centering.
      - This handles users with different rating scales
        (e.g., strict user vs generous user).

    How it works:
      1) Find overlap positions where BOTH users rated the same items.
      2) Mean-center both users over only that overlap.
      3) Compute correlation (dot / norms).

    Returns:
      similarity in [-1, 1]
      0.0 if not enough overlap or zero variance
    """
    # Only compare where both have real ratings
    mask = ~np.isnan(a) & ~np.isnan(b)

    # If fewer than 2 overlapping ratings, correlation isn't meaningful
    if mask.sum() < 2:
        return 0.0

    x = a[mask]
    y = b[mask]

    # Mean-center within the overlap (Pearson requirement)
    x = x - x.mean()
    y = y - y.mean()

    denom = np.sqrt((x * x).sum()) * np.sqrt((y * y).sum())
    if denom == 0:
        return 0.0

    return float((x * y).sum() / denom)

def cosine_sim_nan(a: np.ndarray, b: np.ndarray) -> float:
    """
    Cosine similarity for vectors with NaNs, using only overlap positions.

    Use case:
      ITEM-based CF often uses cosine similarity (especially for implicit feedback
      or when you don't want mean-centering).

    Returns:
      similarity in [0, 1] typically for non-negative data,
      but can be negative if values can be negative.
      0.0 if not enough overlap or zero norm.
    """
    mask = ~np.isnan(a) & ~np.isnan(b)
    if mask.sum() < 2:
        return 0.0

    x = a[mask]
    y = b[mask]

    denom = np.sqrt((x * x).sum()) * np.sqrt((y * y).sum())
    if denom == 0:
        return 0.0

    return float((x * y).sum() / denom)

In [107]:
class UserKNNCF:
    """
    User-based kNN Collaborative Filtering (CF)

    Core idea:
      Users who rated items similarly in the past will rate new items similarly.

    Predict rating(u, i) using neighbors:
      1) Find other users v who rated item i
      2) Compute similarity s(u, v) (Pearson correlation)
      3) Combine their ratings with a weighted average

    We use a mean-centered formula:
      pred(u,i) = mean(u) + sum_v s(u,v) * (r(v,i) - mean(v)) / sum_v |s(u,v)|

    Why mean-centering?
      - Some users rate high overall (4-5), others rate low (2-3).
      - Pearson + mean-centering focuses on *preference patterns* not scale.
    """

    def __init__(self,k_neighbors:int = 30):

        self.k = k_neighbors

        # Rating matrix stored after fit()
        self.R: Optional[np.ndarray] = None

    def fit(self, R: np.ndarray) -> None:
        """
        Store the training rating matrix R.
        No "learning" parameters here — kNN is mostly lazy evaluation.
        """
        self.R = R

    def predict_rating(self, user: int, item: int) -> float:
        """
        Predict a rating for (user, item), even if user never rated it.

        Steps:
          A) If user already rated this item in TRAIN -> return that rating.
          B) Otherwise:
             - compute user's mean rating
             - find neighbors who rated this item
             - compute Pearson similarity to each neighbor
             - aggregate neighbor contributions (mean-centered)
        """
        R = self.R
        assert R is not None, "Call fit() before predict_rating()."

        user = int(user)
        item = int(item)

        # A) Known rating shortcut (already rated in train)
        if not np.isnan(R[user, item]):
            return float(R[user, item])

        # B1) Baseline: user's average rating
        user_mean = nanmean_safe(R[user])

        # B2) Collect candidate neighbors: users who rated this item
        sims: List[Tuple[int, float]] = []

        for v in range(R.shape[0]):
            if v == user:
                continue

            # neighbor must have rated the target item
            if np.isnan(R[v, item]):
                continue

            # similarity between user and neighbor v
            s = pearson_sim_nan(R[user], R[v])

            # keep only non-zero sims (0 means no useful overlap or no variance)
            if s != 0.0:
                sims.append((v, s))

        # If nobody rated the item (or no similarity), fallback to user mean
        if not sims:
            return user_mean

        # B3) Keep top-K neighbors by ABS(similarity)
        # (both strong positive and strong negative correlations are "strong")
        sims.sort(key=lambda x: abs(x[1]), reverse=True)
        sims = sims[: self.k]

        # B4) Weighted mean-centered aggregation
        num = 0.0
        den = 0.0

        for v, s in sims:
            v_mean = nanmean_safe(R[v])

            # neighbor's "deviation" from their mean on this item
            # if neighbor rated above their mean, that's a positive signal
            num += s * (R[v, item] - v_mean)

            # use abs(s) to keep denominator positive and stable
            den += abs(s)

        # B5) Final prediction
        # If den is 0 , fallback to user mean
        return float(user_mean + num / den) if den != 0 else user_mean

    def recommend(self, user_id: int, seen: set, k: int, n_items: int) -> List[int]:
        """
        Recommend top-k items for user_id by predicting ratings for all unseen items.

        Steps:
          1) For every item i the user has not seen:
             score(i) = predicted_rating(user_id, i)
          2) Sort items by score descending
          3) Return top-k item IDs

        Note:
          This is O(n_items * n_users) per user in the worst case (slow for large data),
          but fine for a toy dataset / learning exercise.
        """
        scores = []

        for i in range(n_items):
            if i in seen:
                continue

            pred = self.predict_rating(user_id, i)
            scores.append((i, pred))

        # Sort by predicted rating high -> low
        scores.sort(key=lambda x: x[1], reverse=True)

        # Return top-k item IDs
        return [int(i) for i, _ in scores[:k]]


In [108]:
R_train = build_dense_rating_matrix(train_df, n_users, n_items)

user_knn = UserKNNCF(k_neighbors=30)
user_knn.fit(R_train)

# Demo recommendations for one user
recs_demo = user_knn.recommend(user_id=u_demo, seen=seen_demo, k=10, n_items=n_items)
show_recommendations(u_demo, recs_demo, "User-based kNN CF recommendations")

# Evaluate (using your leave-one-out pipeline)
evaluate_model(
    recommend_fn=lambda u, seen, k: user_knn.recommend(u, seen, k, n_items),
    train_df=train_df,
    test_df=test_df,
    k=10,
    name="User-kNN CF"
)


User-based kNN CF recommendations (user=2)


Unnamed: 0_level_0,title,category
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1
25,Item_25_Category_04,4
48,Item_48_Category_00,0
32,Item_32_Category_03,3
53,Item_53_Category_00,0
6,Item_06_Category_02,2
1,Item_01_Category_04,4
56,Item_56_Category_01,1
9,Item_09_Category_04,4
0,Item_00_Category_03,3
54,Item_54_Category_01,1



User-kNN CF @ 10
Precision@10: 0.037
Recall@10:    0.367


### Item-based kNN CF

In [109]:
class ItemKNNCF:
    """
    Item-based kNN Collaborative Filtering (CF)

    Core idea:
      Items that are rated similarly by many users are "similar".
      If a user liked (rated high) items similar to item i, then the user will
      likely also like item i.

    Prediction strategy for rating(u, i):
      1) Look at items j that the user u HAS rated.
      2) Compute similarity(sim(i, j)) between the target item i and each item j
         using their rating vectors across users (columns of R).
      3) Weighted average of the user's ratings on those neighbors:
            pred(u,i) = sum_j sim(i,j) * r(u,j) / sum_j |sim(i,j)|

    Notes:
      - We use cosine similarity on item vectors (common for item-based CF).
      - We handle missing ratings (NaN) by computing cosine on overlap only (cosine_sim_nan).
      - If we can't compute any neighbors, we fall back to the user's mean rating.
    """

    def __init__(self, k_neighbors: int = 50):
        # Max number of similar items to use as neighbors
        self.k = k_neighbors

        # Rating matrix: shape (n_users, n_items)
        # R[u, i] = rating or NaN if missing
        self.R: Optional[np.ndarray] = None

    def fit(self, R: np.ndarray) -> None:
        """
        Store the training rating matrix.
        Item-based kNN is also "lazy": it doesn't learn parameters,
        it just uses R at prediction time.
        """
        self.R = R

    def predict_rating(self, user: int, item: int) -> float:
        """
        Predict rating for a specific (user, item).

        Steps:
          A) If user already rated item in training -> return that rating.
          B) Otherwise:
             - Take the target item's rating vector across users: R[:, item]
             - Find neighbor items j that this user rated (R[user, j] not NaN)
             - Compute sim(target_item, item_j)
             - Predict using weighted average of the user's ratings on those neighbors
        """
        R = self.R
        assert R is not None, "Call fit() before predict_rating()."

        # Make sure indices are real ints (avoids NumPy indexing errors)
        user = int(user)
        item = int(item)

        # A) If already rated, no need to predict
        if not np.isnan(R[user, item]):
            return float(R[user, item])

        # B0) Baseline fallback: user's average rating (ignoring NaNs)
        user_mean = nanmean_safe(R[user])

        # B1) Target item vector = ratings of this item by all users
        # shape: (n_users,)
        target_item_vec = R[:, item]

        # Collect (neighbor_item_id, similarity) pairs
        sims: List[Tuple[int, float]] = []

        # Loop over all possible neighbor items j
        for j in range(R.shape[1]):
            if j == item:
                continue

            # Only consider neighbor items that the user has rated
            # (otherwise user has no signal about item j)
            if np.isnan(R[user, j]):
                continue

            # Similarity between items i and j using their rating patterns across users
            # Uses only overlapping users who rated both items (handled inside cosine_sim_nan)
            s = cosine_sim_nan(target_item_vec, R[:, j])

            if s != 0.0:
                sims.append((j, s))

        # If no similar neighbor items found, fallback
        if not sims:
            return user_mean

        # Keep top-K neighbors by strength of similarity
        sims.sort(key=lambda x: abs(x[1]), reverse=True)
        sims = sims[: self.k]

        # B2) Weighted average of user's ratings on neighbor items
        num = 0.0
        den = 0.0

        for j, s in sims:
            # user’s rating on neighbor item j influences prediction,
            # scaled by similarity between j and the target item
            num += s * R[user, j]
            den += abs(s)

        # B3) Final prediction
        # If den=0 for some edge case, fallback to user mean
        return float(num / den) if den != 0 else user_mean

    def recommend(self, user_id: int, seen: set, k: int, n_items: int) -> List[int]:
        """
        Recommend top-k items for user_id.

        Strategy:
          - For each item i not in seen:
              score(i) = predict_rating(user_id, i)
          - Sort by score descending
          - Return top-k item IDs
        """
        user_id = int(user_id)
        seen = set(int(x) for x in seen)

        scores = []

        for i in range(int(n_items)):
            if i in seen:
                continue

            pred = self.predict_rating(user_id, i)
            scores.append((i, pred))

        # Sort high-to-low predicted rating
        scores.sort(key=lambda x: x[1], reverse=True)

        return [int(i) for i, _ in scores[:k]]

In [110]:
# -------------------------
# Train and run the model
# -------------------------

R_train = build_dense_rating_matrix(train_df, n_users, n_items)

item_knn = ItemKNNCF(k_neighbors=50)
item_knn.fit(R_train)

# Demo: recommend for one user
recs_demo = item_knn.recommend(user_id=u_demo, seen=seen_demo, k=10, n_items=n_items)
show_recommendations(u_demo, recs_demo, "Item-based kNN CF recommendations")

# Evaluate with your leave-one-out pipeline
evaluate_model(
    recommend_fn=lambda u, seen, k: item_knn.recommend(u, seen, k, n_items),
    train_df=train_df,
    test_df=test_df,
    k=10,
    name="Item-kNN CF"
)



Item-based kNN CF recommendations (user=2)


Unnamed: 0_level_0,title,category
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1
15,Item_15_Category_03,3
3,Item_03_Category_04,4
14,Item_14_Category_01,1
23,Item_23_Category_01,1
31,Item_31_Category_01,1
45,Item_45_Category_00,0
46,Item_46_Category_01,1
52,Item_52_Category_01,1
55,Item_55_Category_04,4
42,Item_42_Category_04,4



Item-kNN CF @ 10
Precision@10: 0.033
Recall@10:    0.333


### Collaborative Filtering in TensorFlow(Matrix Factorization)

In [111]:

# Goal:
#   We want to learn to predict ratings:
#       rating(user, item) ≈ model(user, item)
#
#   Then for recommendations:
#       "score every item for this user" -> pick top-K unseen items
#
# MF idea (very common in recommender systems):
#   - Give every user a learnable vector (embedding)
#   - Give every item a learnable vector (embedding)
#   - If user vector and item vector match well (dot product is big),
#     user is predicted to like the item.
#
# Also add "biases":
#   - Some users rate high overall (user bias)
#   - Some items are liked by everyone (item bias)
#   - Plus a global average rating (mu)
#
# Prediction formula:
#   r_hat(u, i) = mu + b_user[u] + b_item[i] + dot(user_vec[u], item_vec[i])
# ============================================================

import tensorflow as tf


# ------------------------------------------------------------
# PART 1) Convert pandas DataFrame -> tf.data.Dataset
# ------------------------------------------------------------
def make_tf_rating_dataset(
    df: pd.DataFrame,
    batch_size: int = 256,
    shuffle: bool = True
) -> tf.data.Dataset:
    """
    TensorFlow training usually expects batches of tensors.

    Your ratings are in a DataFrame like:
        user_id | item_id | rating

    This function converts it into a TensorFlow dataset that yields batches:
        inputs  = (user_ids_batch, item_ids_batch)
        labels  = ratings_batch

    Why do we do this?
    - TensorFlow trains fastest on batches (not one row at a time)
    - tf.data lets TF stream data efficiently (shuffle, batch, prefetch)

    Important:
    - user_id and item_id must be integers for Embedding layers
    - rating must be float (we train a regression model)
    """

    # Convert DataFrame columns to numpy arrays with correct dtypes:
    # - Embedding layers need integer indices (int32 is standard)
    user_ids = df["user_id"].to_numpy().astype(np.int32)   # shape [N]
    item_ids = df["item_id"].to_numpy().astype(np.int32)   # shape [N]

    # Ratings are continuous values -> float32
    ratings  = df["rating"].to_numpy().astype(np.float32)  # shape [N]

    # Create dataset of individual examples:
    # Each example looks like: ((user_id, item_id), rating)
    ds = tf.data.Dataset.from_tensor_slices(((user_ids, item_ids), ratings))

    # Shuffle training data so the model doesn’t see users/items in the same order every epoch
    # (this helps training generalize and not learn ordering patterns)
    if shuffle:
        ds = ds.shuffle(buffer_size=len(df), seed=42)

    # Batch = group many examples together:
    # Instead of one (user,item)->rating, you get a batch of 256 examples at a time.
    # Prefetch overlaps data preparation and model training (speed boost).
    ds = ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)

    return ds


# ------------------------------------------------------------
# PART 2) Define the Matrix Factorization model
# ------------------------------------------------------------
class TFMatrixFactorization(tf.keras.Model):
    """
    A TensorFlow model that predicts ratings from (user_id, item_id).

    Key TensorFlow concept:
    - Embedding layer is basically a "lookup table":
        Embedding(n_users, k) means:
            you store a matrix of shape [n_users, k]
            user_id=7 returns row 7 (a vector of length k)

    So we learn:
    - user_emb[user_id] -> user vector (preferences)
    - item_emb[item_id] -> item vector (properties)
    - user_bias[user_id] -> scalar
    - item_bias[item_id] -> scalar
    - mu -> scalar (global mean)
    """

    def __init__(self, n_users: int, n_items: int, k: int = 32):
        super().__init__()

        # This creates a learnable table:
        # user_emb_matrix: [n_users, k]
        # When you pass user_ids [B], you get vectors [B, k]
        self.user_emb = tf.keras.layers.Embedding(input_dim=n_users, output_dim=k)

        # Same for items:
        # item_emb_matrix: [n_items, k]
        self.item_emb = tf.keras.layers.Embedding(input_dim=n_items, output_dim=k)

        # Bias tables (1 number per user/item):
        self.user_bias = tf.keras.layers.Embedding(input_dim=n_users, output_dim=1)
        self.item_bias = tf.keras.layers.Embedding(input_dim=n_items, output_dim=1)

        # Global mean rating (one trainable scalar)
        self.mu = tf.Variable(0.0, dtype=tf.float32, trainable=True)

    def call(self, inputs):
        """
        The forward pass: given (user_ids, item_ids), output predicted ratings.

        inputs:
            (user_ids, item_ids)
            user_ids: shape [B]
            item_ids: shape [B]

        output:
            predicted ratings: shape [B]
        """
        user_ids, item_ids = inputs

        # 1) Look up the user vectors and item vectors
        # u_vec: [B, k], i_vec: [B, k]
        u_vec = self.user_emb(user_ids)
        i_vec = self.item_emb(item_ids)

        # 2) Dot product between user and item vectors
        # elementwise multiply u_vec * i_vec -> [B, k]
        # sum over k -> [B, 1]
        dot = tf.reduce_sum(u_vec * i_vec, axis=1, keepdims=True)

        # 3) Look up user and item biases -> [B, 1]
        b_u = self.user_bias(user_ids)
        b_i = self.item_bias(item_ids)

        # 4) Combine everything into the final prediction
        # pred: [B, 1]
        pred = self.mu + b_u + b_i + dot

        # Convert shape [B, 1] -> [B]
        return tf.squeeze(pred, axis=1)


# ------------------------------------------------------------
# PART 3) Build train/test datasets
# ------------------------------------------------------------
train_ds = make_tf_rating_dataset(train_df, batch_size=256, shuffle=True)
test_ds  = make_tf_rating_dataset(test_df,  batch_size=256, shuffle=False)

# Create the MF model
tf_mf = TFMatrixFactorization(n_users=n_users, n_items=n_items, k=32)

# Initialize global mean mu to average training rating
# (This helps training start from a reasonable baseline)
tf_mf.mu.assign(float(train_df["rating"].mean()))

# Optimizer: controls how model parameters are updated
# Adam is a popular default optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

# Regularization: discourage huge embedding values (helps avoid overfitting)
l2_reg = 1e-5


# ------------------------------------------------------------
# PART 4) One training step (one batch update)
# ------------------------------------------------------------
@tf.function
def train_step(batch_inputs, batch_ratings):
    """
    This function does ONE learning update on ONE batch.

    TensorFlow concepts:
    - GradientTape tracks operations to compute gradients (derivatives).
    - gradients tell us how to change parameters to reduce loss.
    - optimizer.apply_gradients updates the weights.

    Inputs:
      batch_inputs  = (user_ids_batch, item_ids_batch), shape [B]
      batch_ratings = true ratings, shape [B]
    """
    with tf.GradientTape() as tape:
        # Forward pass: model predicts ratings for this batch
        preds = tf_mf(batch_inputs)  # shape [B]

        # Compute error (loss):
        # MSE = mean((true - predicted)^2)
        mse = tf.reduce_mean(tf.square(batch_ratings - preds))

        # L2 penalty: sum of squared weights for all trainable variables
        # This includes embeddings + biases + mu
        l2 = tf.add_n([tf.nn.l2_loss(v) for v in tf_mf.trainable_variables])

        # Total loss: fit the data + keep weights small
        loss = mse + l2_reg * l2

    # Compute gradients of loss with respect to all model parameters
    grads = tape.gradient(loss, tf_mf.trainable_variables)

    # Apply gradients (update weights)
    optimizer.apply_gradients(zip(grads, tf_mf.trainable_variables))

    return loss


# ------------------------------------------------------------
# PART 5) Training loop over epochs
# ------------------------------------------------------------
#
# Epoch = one pass over the full training dataset.
# Each epoch contains many batches.
#
for epoch in range(10):
    batch_losses = []

    # train_ds yields batches:
    #   batch_inputs  = (user_ids_batch, item_ids_batch)
    #   batch_ratings = ratings_batch
    for (batch_inputs, batch_ratings) in train_ds:
        loss_val = train_step(batch_inputs, batch_ratings)
        batch_losses.append(float(loss_val.numpy()))

    # Print average loss across all batches in this epoch
    print(f"TF-MF Epoch {epoch+1} | loss={np.mean(batch_losses):.4f}")


TF-MF Epoch 1 | loss=0.8039
TF-MF Epoch 2 | loss=0.8318
TF-MF Epoch 3 | loss=0.8050
TF-MF Epoch 4 | loss=0.6996
TF-MF Epoch 5 | loss=0.6760
TF-MF Epoch 6 | loss=0.6340
TF-MF Epoch 7 | loss=0.5480
TF-MF Epoch 8 | loss=0.4863
TF-MF Epoch 9 | loss=0.4314
TF-MF Epoch 10 | loss=0.3583


In [112]:
def tf_mf_recommend(user_id: int, seen: set, k: int, n_items: int) -> List[int]:
    """
    Generate top-k recommendations for one user using the trained TF MF model.

    How MF recommend works:
    - The MF model can predict a rating for ANY (user, item) pair:
        tf_mf((user_ids, item_ids)) -> predicted_ratings
    - To recommend, we score ALL items for the user and take the highest predictions
      among the items the user has not already seen.

    This is a brute-force recommender:
    - It predicts scores for all n_items in one shot.
    - Totally fine for a toy dataset (60 items).
    - For real systems (millions of items), you'd use ANN / candidate generation.
    """

    # Make sure types are correct for TensorFlow embedding lookups
    user_id = int(user_id)
    seen = set(int(x) for x in seen)

    # ---------------------------------------------------------
    # (1) Build candidate item IDs: [0, 1, 2, ..., n_items-1]
    # ---------------------------------------------------------
    item_ids = np.arange(n_items, dtype=np.int32)  # shape [n_items]

    # ---------------------------------------------------------
    # (2) Build a user_id array of the same length
    # ---------------------------------------------------------
    # We want to predict scores for:
    #   (user_id, item_0), (user_id, item_1), ..., (user_id, item_{n_items-1})
    #
    # So we create:
    #   user_ids = [user_id, user_id, ..., user_id]  (length n_items)
    user_ids = np.full(shape=(n_items,), fill_value=user_id, dtype=np.int32)  # shape [n_items]

    # ---------------------------------------------------------
    # (3) Predict ratings for ALL items in one model call
    # ---------------------------------------------------------
    # tf_mf expects a tuple: (user_ids, item_ids)
    # It returns predicted ratings (floats) for each pair
    #
    # preds shape = [n_items]
    preds = tf_mf((user_ids, item_ids)).numpy()

    # ---------------------------------------------------------
    # (4) Remove items the user already saw
    # ---------------------------------------------------------
    # We do that by assigning a very low score so they never appear in top-k.
    for it in seen:
        # it is an item_id, so it is also the index in preds (because item_ids = 0..n_items-1)
        preds[it] = -1e9

    # ---------------------------------------------------------
    # (5) Pick top-k items by predicted rating
    # ---------------------------------------------------------
    # np.argsort sorts ascending, so use -preds for descending.
    top_items = np.argsort(-preds)[:k]

    # Return item IDs as Python ints
    return [int(i) for i in top_items]


# -------------------------
# Demo: recommendations for one user
# -------------------------
recs_demo = tf_mf_recommend(user_id=u_demo, seen=seen_demo, k=10, n_items=n_items)
show_recommendations(u_demo, recs_demo, "TensorFlow MF (Collaborative Filtering) recommendations")


# -------------------------
# Evaluate using your leave-one-out pipeline
# -------------------------
evaluate_model(
    recommend_fn=lambda u, seen, k: tf_mf_recommend(u, seen, k, n_items),
    train_df=train_df,
    test_df=test_df,
    k=10,
    name="TF Matrix Factorization CF"
)



TensorFlow MF (Collaborative Filtering) recommendations (user=2)


Unnamed: 0_level_0,title,category
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1
46,Item_46_Category_01,1
51,Item_51_Category_01,1
56,Item_56_Category_01,1
23,Item_23_Category_01,1
57,Item_57_Category_03,3
25,Item_25_Category_04,4
5,Item_05_Category_01,1
31,Item_31_Category_01,1
4,Item_04_Category_04,4
1,Item_01_Category_04,4



TF Matrix Factorization CF @ 10
Precision@10: 0.047
Recall@10:    0.467


### Two Tower Retrieval

In [113]:
# Two-Tower is a RETRIEVAL model (not a rating predictor).
# - It learns embeddings so that "positive" user-item pairs have high similarity.
# - It is typically trained on IMPLICIT feedback (click/view/buy), not explicit ratings.
#
# Architecture:
#   user tower: user_id -> user embedding vector u
#   item tower: item_id -> item embedding vector v
#   score(u,i) = dot(u, v)
#
# Training objective:
# - Binary classification with negative sampling:
#     label=1 for observed (user,item)
#     label=0 for randomly sampled (user,non-interacted-item)
# - Use sigmoid cross entropy on logits = dot(u,v)
#
# Why L2 normalize?
# - After normalization, dot(u,v) becomes cosine similarity.
# - Keeps scores bounded and stabilizes training.
# =========================

import numpy as np
import pandas as pd
import tensorflow as tf
from typing import List


class TwoTower(tf.keras.Model):
    """
    Two-Tower retrieval model.

    This is used in production recommender systems for candidate generation:
    - retrieve top-N items for a user from a huge catalog quickly

    Conceptually:
      - We learn a user embedding space and an item embedding space.
      - We train them so that true interactions (positives) are close.
      - Non-interactions (negatives) are far.

    Implementation here:
      - Both towers are simple Embedding layers (no extra features).
      - In real-world setups, towers often include:
          * user/item features (age, country, category, text embeddings, etc.)
          * deep MLP layers
          * context features (time, device, query, etc.)
    """

    def __init__(self, n_users: int, n_items: int, dim: int = 32):
        super().__init__()

        # Embedding lookup tables:
        # user_emb: [n_users, dim], item_emb: [n_items, dim]
        self.user_emb = tf.keras.layers.Embedding(n_users, dim)
        self.item_emb = tf.keras.layers.Embedding(n_items, dim)

    def call(self, user_ids, item_ids):
        """
        Forward pass returns a similarity score (logit) for each (user,item) pair.

        Inputs:
          user_ids: shape [B]  (batch of users)
          item_ids: shape [B]  (batch of items aligned with users)

        Output:
          logits/scores: shape [B]
          (These are unbounded real numbers; we treat them as logits for sigmoid.)
        """

        # 1) Lookup embeddings:
        # u: [B, dim], v: [B, dim]
        u = self.user_emb(user_ids)
        v = self.item_emb(item_ids)

        # 2) Normalize:
        # After normalization:
        #   dot(u,v) == cosine_similarity(u,v)
        # This avoids embedding norms becoming a "cheat" to inflate dot products.
        u = tf.nn.l2_normalize(u, axis=-1)
        v = tf.nn.l2_normalize(v, axis=-1)

        # 3) Dot product (per row): sum(u * v) over dim -> [B]
        # For each pair in the batch, we get one similarity score.
        return tf.reduce_sum(u * v, axis=-1)


# ------------------------------------------------------------
# Helper: Convert explicit ratings -> implicit positives
# ------------------------------------------------------------
def build_implicit_training_data(train_df: pd.DataFrame) -> np.ndarray:
    """
    Convert explicit ratings into implicit positive interactions.

    In implicit retrieval, we don't train on "rating value".
    We only train on:
      - positive = user interacted with item (exists in train)
      - negative = user did not interact with item (sampled)

    Here we treat ANY (user,item) in train_df as a positive interaction.

    Output:
      pos_pairs: ndarray shape [N_pos, 2]
        pos_pairs[:,0] = user_id
        pos_pairs[:,1] = item_id
    """
    pos = train_df[["user_id", "item_id"]].drop_duplicates()
    return pos.to_numpy().astype(np.int32)


pos_pairs = build_implicit_training_data(train_df)


# ------------------------------------------------------------
# Helper: Build dataset with negative sampling
# ------------------------------------------------------------
def build_two_tower_dataset(
    pos_pairs: np.ndarray,
    n_items: int,
    neg_per_pos: int = 3,
    batch_size: int = 256,
    seed: int = 42
) -> tf.data.Dataset:
    """
    Build tf.data dataset for two-tower training using negative sampling.

    For each positive pair (u, pos_item):
      - yield (u, pos_item, label=1)
      - sample neg_per_pos random items as negatives:
          yield (u, neg_item, label=0)

    Why negative sampling?
      In implicit data we usually only observe positives (clicks/views).
      We must invent negatives by sampling items the user did not interact with.

    Important note:
      Random negatives are "easy negatives".
      In production, you'd often use "hard negatives":
        items from same category / popular items / items retrieved by a baseline model,
      to improve ranking quality.
    """
    rng = np.random.default_rng(seed)

    users = pos_pairs[:, 0]
    pos_items = pos_pairs[:, 1]

    def gen():
        for u, pi in zip(users, pos_items):
            # Positive interaction example
            yield int(u), int(pi), 1.0

            # Negative interactions: same user, random items not equal to the positive item
            for _ in range(neg_per_pos):
                ni = int(rng.integers(0, n_items))
                while ni == int(pi):
                    ni = int(rng.integers(0, n_items))
                yield int(u), ni, 0.0

    # Build TF dataset from Python generator
    ds = tf.data.Dataset.from_generator(
        gen,
        output_signature=(
            tf.TensorSpec(shape=(), dtype=tf.int32),   # user_id
            tf.TensorSpec(shape=(), dtype=tf.int32),   # item_id
            tf.TensorSpec(shape=(), dtype=tf.float32), # label
        )
    )

    # Shuffle + batch + prefetch
    return ds.shuffle(5000, seed=seed).batch(batch_size).prefetch(tf.data.AUTOTUNE)


two_tower_ds = build_two_tower_dataset(pos_pairs, n_items=n_items, neg_per_pos=3)

# Create model + optimizer
tt = TwoTower(n_users=n_users, n_items=n_items, dim=32)
tt_opt = tf.keras.optimizers.Adam(learning_rate=0.01)


# ------------------------------------------------------------
# Training step: binary classification on (u,i)
# ------------------------------------------------------------
@tf.function
def two_tower_train_step(user_ids, item_ids, labels):
    """
    One optimization step.

    We treat the dot-product scores as logits for sigmoid:
      p(positive | u,i) = sigmoid(score(u,i))

    Loss:
      sigmoid_cross_entropy(labels, logits)

    If label=1:
      we want logits to be large positive (sigmoid close to 1)
    If label=0:
      we want logits to be large negative (sigmoid close to 0)
    """
    with tf.GradientTape() as tape:
        logits = tt(user_ids, item_ids)  # shape [B]

        # Per-example loss (vector [B]) -> mean scalar
        loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits)
        loss = tf.reduce_mean(loss)

    grads = tape.gradient(loss, tt.trainable_variables)
    tt_opt.apply_gradients(zip(grads, tt.trainable_variables))
    return loss


# Train a few epochs (toy example)
for epoch in range(10):
    losses = []
    for user_ids, item_ids, labels in two_tower_ds:
        loss_val = two_tower_train_step(user_ids, item_ids, labels)
        losses.append(float(loss_val.numpy()))
    print(f"Two-Tower Epoch {epoch+1} | loss={np.mean(losses):.4f}")

Two-Tower Epoch 1 | loss=0.6911
Two-Tower Epoch 2 | loss=0.6430
Two-Tower Epoch 3 | loss=0.6131
Two-Tower Epoch 4 | loss=0.5805
Two-Tower Epoch 5 | loss=0.5675
Two-Tower Epoch 6 | loss=0.5558
Two-Tower Epoch 7 | loss=0.5404
Two-Tower Epoch 8 | loss=0.5620
Two-Tower Epoch 9 | loss=0.5523
Two-Tower Epoch 10 | loss=0.5364


In [114]:
# Recommendation with Two-Tower:
# - For a given user u:
#     score(u,i) = dot(u_vec, i_vec) for all items i
# - Take top-K highest scores (excluding seen)
#
# This is brute force scoring (O(n_items)).
# Real production retrieval:
# - precompute item embeddings and index them with ANN (FAISS/ScaNN)
# - query the index with user embedding to retrieve top-N fast
# =========================

def two_tower_recommend(user_id: int, seen: set, k: int, n_items: int) -> List[int]:
    """
    Recommend top-k items for a user using the trained two-tower model.

    Steps:
      1) compute user embedding u_vec
      2) compute all item embeddings i_vec (for all items)
      3) compute scores = dot(u_vec, i_vec) for each item
      4) remove already seen items
      5) return top-k

    Note:
      This assumes item IDs are 0..n_items-1 so we can index scores by item_id.
    """
    user_id = int(user_id)
    seen = set(int(x) for x in seen)

    # 1) User embedding: [1, dim]
    u = tf.constant([user_id], dtype=tf.int32)
    u_vec = tt.user_emb(u)
    u_vec = tf.nn.l2_normalize(u_vec, axis=-1)

    # 2) All item embeddings: [n_items, dim]
    item_ids = tf.range(n_items, dtype=tf.int32)
    i_vec = tt.item_emb(item_ids)
    i_vec = tf.nn.l2_normalize(i_vec, axis=-1)

    # 3) Compute scores for all items:
    #    scores[i] = dot(u_vec, i_vec[i])
    # u_vec: [1, dim]
    # i_vec^T: [dim, n_items]
    # result: [1, n_items] -> squeeze -> [n_items]
    scores = tf.squeeze(tf.matmul(u_vec, i_vec, transpose_b=True)).numpy()

    # 4) Exclude seen items by forcing score very low
    for it in seen:
        scores[it] = -1e9

    # 5) Top-K by score descending
    top = np.argsort(-scores)[:k]
    return [int(i) for i in top]


# Demo
recs_demo = two_tower_recommend(u_demo, seen_demo, k=10, n_items=n_items)
show_recommendations(u_demo, recs_demo, "Two-Tower recommendations")

# Evaluate using your pipeline
evaluate_model(
    lambda u, seen, k: two_tower_recommend(u, seen, k, n_items),
    train_df,
    test_df,
    k=10,
    name="Two-Tower Retrieval (TF)"
)


Two-Tower recommendations (user=2)


Unnamed: 0_level_0,title,category
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1
50,Item_50_Category_05,5
36,Item_36_Category_05,5
16,Item_16_Category_05,5
56,Item_56_Category_01,1
54,Item_54_Category_01,1
17,Item_17_Category_05,5
24,Item_24_Category_05,5
5,Item_05_Category_01,1
25,Item_25_Category_04,4
1,Item_01_Category_04,4



Two-Tower Retrieval (TF) @ 10
Precision@10: 0.050
Recall@10:    0.500
