## Recommender Systems Notebook

### Setup & Demo Data

We implement several common recommender algorithms used in production:

- Popularity baseline
- Content-based TF-IDF
- Item-Item Co-visitation
- Collaborative Filtering (kNN), user-based and item-based
- Collaborative Filtering via Matrix Factorization with TensorFlow
- Two-Tower Retrieval with TensorFlow

### Imports

In [1]:
# Numerical computing
import numpy as np

# Data handling
import pandas as pd

# For clean "struct-like" models (optional)
from dataclasses import dataclass

# Typing clarity (optional but good practice)
from typing import Dict, List, Tuple, Optional, Callable

# Useful for co-visitation counting
from collections import defaultdict, Counter

# Content-based TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# NOTE(review): `prefixedMethods` is not referenced anywhere in the visible
# code, and Twisted is unrelated to the algorithms listed above. This looks
# like an accidental IDE auto-import; confirm and remove, otherwise the
# notebook fails to start on machines without Twisted installed.
from twisted.python.reflect import prefixedMethods

# Reproducibility: seeds NumPy's *global* RNG. make_demo_data() builds its own
# RandomState from its `seed` argument, so it does not depend on this call.
np.random.seed(42)


### Demo dataset generator

In [9]:
def make_demo_data(
        n_users: int = 30,
        n_items: int = 60,
        n_categories: int = 6,
        ratings_per_user: int = 12,
        session_len: int = 8,
        seed: int = 42,
) -> Tuple[pd.DataFrame, List[List[int]], pd.DataFrame, Dict[int, str]]:
    """
    Create a toy dataset that behaves like a real recommendation dataset.

    Users:
      - each user prefers one category

    Items:
      - each item belongs to one category
      - each item has a text description (category-specific keywords)

    Ratings:
      - user gives higher ratings to items in their preferred category
      - a user may receive fewer than `ratings_per_user` ratings when their
        preferred category (or the remainder) does not hold enough items

    Sessions:
      - implicit sequences of interacted items, drawn with replacement from
        the user's preferred category; if that category has no items, the
        session is drawn from all items instead

    Args:
        n_users: number of users (ids 0..n_users-1).
        n_items: number of items (ids 0..n_items-1).
        n_categories: number of categories; categories beyond the six
            built-in keyword sets get synthetic keywords.
        ratings_per_user: target number of ratings per user (~70% preferred).
        session_len: number of item views per session.
        seed: seed of the private RandomState; same seed -> same dataset.

    Returns:
        (ratings_df, sessions, items_df, item_text) where
          - ratings_df has columns user_id, item_id, rating
          - sessions is one list of plain-int item ids per user
          - items_df has columns item_id, title, category
          - item_text maps item_id -> description string
    """

    # Private generator: results do not depend on (or disturb) the global
    # NumPy RNG. The order of the draws below is deliberately fixed so that a
    # given seed keeps producing the same dataset.
    rng = np.random.RandomState(seed)

    # Assign each item a category ID
    item_category = rng.randint(0, n_categories, size=n_items)

    # Assign each user a preferred category
    user_pref = rng.randint(0, n_categories, size=n_users)

    # Words to generate item descriptions per category
    category_words = {
        0: ["action", "fast", "adventure", "hero"],
        1: ["romance", "love", "drama", "heart"],
        2: ["scifi", "space", "future", "alien"],
        3: ["comedy", "funny", "joke", "laugh"],
        4: ["horror", "scary", "ghost", "dark"],
        5: ["documentary", "history", "facts", "nature"],
    }

    # Create item text and titles
    item_text: Dict[int, str] = {}
    item_title: Dict[int, str] = {}
    for i in range(n_items):
        category = int(item_category[i])

        words = category_words.get(category)
        if words is None:
            # n_categories > 6: no hand-written vocabulary exists, so build a
            # distinct synthetic one instead of failing with KeyError.
            words = [f"category{category}word{k}" for k in range(4)]

        item_text[i] = " ".join(rng.choice(words, size=10, replace=True))
        item_title[i] = f"Item_{i:02d}_Category_{category:02d}"

    # Choose ~70% from preferred and ~30% from others (same for every user)
    n_pref = int(ratings_per_user * 0.7)
    n_other = ratings_per_user - n_pref

    # Build ratings as a list of (user_id, item_id, rating)
    ratings: List[Tuple[int, int, float]] = []
    for u in range(n_users):
        # Items in / not in the user's preferred category
        preferred_items = np.where(item_category == user_pref[u])[0]
        other_items = np.where(item_category != user_pref[u])[0]

        # Choose without replacement, capped by how many items are available
        chosen_pref = rng.choice(
            preferred_items, size=min(n_pref, len(preferred_items)), replace=False
        )
        chosen_other = rng.choice(
            other_items, size=min(n_other, len(other_items)), replace=False
        )

        chosen = np.concatenate((chosen_pref, chosen_other))
        rng.shuffle(chosen)

        for item_id in chosen:
            # Base rating is higher if the item matches the preference
            base = 4.2 if item_category[item_id] == user_pref[u] else 2.8

            # Add Gaussian noise and clip into [1...5]
            r = np.clip(rng.normal(base, 0.6), 1.0, 5.0)

            # Round to 0.1 to look more realistic
            ratings.append((u, int(item_id), float(np.round(r, 1))))

    # Build the sessions (view/click sequences)
    sessions: List[List[int]] = []
    all_items = np.arange(n_items)
    for u in range(n_users):
        # Sample from preferred category with replacement (views can repeat)
        pool = np.where(item_category == user_pref[u])[0]
        if pool.size == 0:
            # Nobody published an item in this user's category: fall back to
            # the whole catalogue rather than raising on an empty population.
            pool = all_items
        if pool.size == 0:
            # No items at all -> the only valid session is an empty one.
            sessions.append([])
            continue

        session = rng.choice(pool, size=session_len, replace=True)
        # Store plain Python ints so the value matches List[List[int]].
        sessions.append(session.tolist())

    # Item metadata table
    items_df = pd.DataFrame({
        "item_id": np.arange(n_items),
        "title": [item_title[i] for i in range(n_items)],
        "category": item_category,
    })

    # Ratings dataframe
    ratings_df = pd.DataFrame(
        ratings,
        columns=["user_id", "item_id", "rating"],
    )

    return ratings_df, sessions, items_df, item_text

In [10]:
# Build the demo dataset with the default arguments (30 users, 60 items, seed 42).
ratings_df, sessions, items_df, item_text = make_demo_data()

In [14]:
# Preview the first 10 (user_id, item_id, rating) rows.
ratings_df.head(10)

Unnamed: 0,user_id,item_id,rating
0,0,39,4.0
1,0,33,4.8
2,0,16,2.9
3,0,6,3.0
4,0,57,3.4
5,0,32,2.2
6,0,38,4.9
7,0,10,5.0
8,0,34,2.6
9,0,19,4.3


In [16]:
# Preview the first 10 rows of the item metadata table (item_id, title, category).
items_df.head(10)

Unnamed: 0,item_id,title,category
0,0,Item_00_Category_03,3
1,1,Item_01_Category_04,4
2,2,Item_02_Category_02,2
3,3,Item_03_Category_04,4
4,4,Item_04_Category_04,4
5,5,Item_05_Category_01,1
6,6,Item_06_Category_02,2
7,7,Item_07_Category_02,2
8,8,Item_08_Category_02,2
9,9,Item_09_Category_04,4
