Imports

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import random
from collections import defaultdict

from loader import load_to_dict, load_user_likes, save_likes

  _warn(("h5py is running against HDF5 {0} when it was built against {1}, "


Data Loading

In [44]:
# --- Data selection ---------------------------------------------------------
DATA_FILE = "review-Washington.json.gz"

POS_THRESHOLD = 4               # ratings >= this value count as positive feedback
MIN_INTERACTIONS_PER_USER = 2   # users with fewer positive interactions are dropped
MAX_USERS = 100_000             # cap on distinct users kept (None disables the cap)
MAX_ITEMS = 100_000             # cap on distinct items kept (None disables the cap)

# --- Model and training -----------------------------------------------------
LATENT_DIM = 20
LEARNING_RATE = 1e-2
REG_LAMBDA = 1e-5
NSAMPLES_PER_BATCH = 50_000
N_TRAIN_STEPS = 100
TOP_K = 10

# --- Reproducibility --------------------------------------------------------
# Seed every random source used in this notebook: TensorFlow, NumPy and
# the standard-library `random` module.
RANDOM_SEED = 42
tf.random.set_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

In [6]:
# Read the raw review records from DATA_FILE and wrap them in a DataFrame.
raw_data = load_to_dict(DATA_FILE)
df = pd.DataFrame(data=raw_data)

# Display the row count next to a preview of the first rows.
# NOTE(review): the preview includes the reviewer `name` column; consider
# clearing this output before sharing the notebook.
df.shape[0], df.head()

(16541734,
                  user_id              name           time  rating  \
 0  101193190259063567920       Ian Hampton  1627207184860     5.0   
 1  103093043835388050629     Casper Steele  1626907411534     2.0   
 2  111014066796803341223         Judy Maes  1613028426989     5.0   
 3  112375894485126147782     Jengibre Caro  1590119768856     3.0   
 4  111724423355988809570  Daniel Hernandez  1536710665852     5.0   
 
                                                 text  pics  resp  \
 0  Drivers are helpful with directions or assista...  None  None   
 1  Drivers say security has bathroom key. Securit...  None  None   
 2  It's a CTRAN transit center with schedules to ...  None  None   
 3  Positives: usually clean, convenient location,...  None  None   
 4  Neat little Transit Center with lots of schedu...  None  None   
 
                                  gmap_id  
 0  0x5495ae7d3bf7d097:0xbcbc06152a3ccebc  
 1  0x5495ae7d3bf7d097:0xbcbc06152a3ccebc  
 2  0x5495ae7d3bf7d

In [46]:
# Keep only reviews whose rating reaches POS_THRESHOLD (implicit positive feedback).
df_pos = df.loc[df["rating"].ge(POS_THRESHOLD)].copy()

# Count positive reviews per user and keep users with enough of them.
user_counts = df_pos["user_id"].value_counts()
eligible_users = user_counts.loc[user_counts.ge(MIN_INTERACTIONS_PER_USER)].index

df_pos = df_pos.loc[df_pos["user_id"].isin(eligible_users)].reset_index(drop=True)

# Rows kept, distinct users, distinct items.
df_pos.shape[0], df_pos["user_id"].nunique(), df_pos["gmap_id"].nunique()

(11886874, 1325104, 118903)

In [45]:
# Cap the user set at the first MAX_USERS distinct ids (None means no cap).
unique_users = df_pos["user_id"].unique()
unique_users = unique_users if MAX_USERS is None else unique_users[:MAX_USERS]
df_pos = df_pos.loc[df_pos["user_id"].isin(unique_users)]

# Cap the item set the same way, using only the rows of the retained users.
unique_items = df_pos["gmap_id"].unique()
unique_items = unique_items if MAX_ITEMS is None else unique_items[:MAX_ITEMS]
df_pos = df_pos.loc[df_pos["gmap_id"].isin(unique_items)].reset_index(drop=True)

# Dense integer indices for the embedding tables. The user map is built from
# the capped user list, so a user whose rows were all removed by the item cap
# still receives an index.
user_id_to_idx = dict(zip(unique_users, range(len(unique_users))))
item_id_to_idx = dict(zip(unique_items, range(len(unique_items))))

df_pos = df_pos.assign(
    user_idx=df_pos["user_id"].map(user_id_to_idx),
    item_idx=df_pos["gmap_id"].map(item_id_to_idx),
)

num_users = len(user_id_to_idx)
num_items = len(item_id_to_idx)

num_users, num_items

(50000, 50000)

In [35]:
# 1. Order the rows so that each user's held-out interaction comes last:
#    chronologically when a "time" column exists, otherwise a seeded shuffle.
if "time" in df_pos.columns:
    df_pos = df_pos.sort_values(by=["user_idx", "time"])
else:
    df_pos = df_pos.sample(frac=1.0, random_state=RANDOM_SEED)

# 2. Only users with at least 2 interactions can give one up for testing.
user_counts = df_pos["user_idx"].value_counts()
multi_users = user_counts.loc[user_counts.ge(2)].index

# 3. For those users, the last row (in the order from step 1) is the test row.
test_df = (
    df_pos.loc[df_pos["user_idx"].isin(multi_users)]
    .groupby("user_idx")
    .tail(1)
)

# 4. Everything else is training data.
train_df = df_pos.drop(index=test_df.index)

# 5. Give both splits a fresh 0..n-1 index.
train_df, test_df = (part.reset_index(drop=True) for part in (train_df, test_df))

len(train_df), len(test_df)

(220034, 41098)

In [36]:
# (user_idx, item_idx, rating) triples, one per training row, as plain Python values.
interactions_train = [
    triple
    for triple in zip(
        train_df["user_idx"].astype(int).tolist(),
        train_df["item_idx"].astype(int).tolist(),
        train_df["rating"].tolist(),
    )
]

# user_idx -> set of item_idx seen in training (the rating `r` is not needed here).
items_per_user_train = defaultdict(set)
for u, i, r in interactions_train:
    items_per_user_train[u].add(i)

# Every item index, used later as the sampling pool and the candidate list.
all_items = [*range(num_items)]

len(interactions_train), len(all_items)


(220034, 50000)

In [37]:
class BPRbatch(tf.keras.Model):
    """Bayesian Personalized Ranking model with an item bias and latent factors.

    The score of item ``i`` for user ``u`` is
    ``betaI[i] + sum(gammaU[u] * gammaI[i])``.
    """

    def __init__(self, K, lamb, n_users=None, n_items=None):
        """Create the trainable parameters.

        Args:
            K: Number of latent dimensions for the user and item factors.
            lamb: Coefficient applied to the L2 penalty returned by ``reg()``.
            n_users: Number of rows in the user factor table. When ``None``
                the module-level ``num_users`` is used, so existing
                ``BPRbatch(K, lamb)`` calls behave as before.
            n_items: Number of entries in the item bias / item factor tables.
                When ``None`` the module-level ``num_items`` is used.
        """
        super().__init__()
        self.lamb = lamb

        # Explicit sizes decouple the model from notebook-global state; the
        # globals remain the default for backward compatibility.
        if n_users is None:
            n_users = num_users
        if n_items is None:
            n_items = num_items

        # Global item bias
        self.betaI = self.add_weight(
            name="betaI",
            shape=(n_items,),
            initializer=tf.random_normal_initializer(stddev=0.001),
            trainable=True,
        )

        # User latent factors
        self.gammaU = self.add_weight(
            name="gammaU",
            shape=(n_users, K),
            initializer=tf.random_normal_initializer(stddev=0.001),
            trainable=True,
        )

        # Item latent factors
        self.gammaI = self.add_weight(
            name="gammaI",
            shape=(n_items, K),
            initializer=tf.random_normal_initializer(stddev=0.001),
            trainable=True,
        )

    def score(self, sampleU, sampleI):
        """Return the preference score for each (user, item) index pair.

        Args:
            sampleU: User indices (cast to int32 before lookup).
            sampleI: Item indices (cast to int32 before lookup), paired
                element-wise with ``sampleU``.

        Returns:
            Tensor with one score per pair.
        """
        u = tf.cast(sampleU, tf.int32)
        i = tf.cast(sampleI, tf.int32)

        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)

        # Item bias plus the dot product of the user and item factors.
        x_ui = beta_i + tf.reduce_sum(gamma_u * gamma_i, axis=1)
        return x_ui

    def call(self, sampleU, sampleI, sampleJ):
        """Return the mean BPR loss for (user, positive item, negative item) triples.

        Args:
            sampleU: User indices.
            sampleI: Indices of items treated as positives for those users.
            sampleJ: Indices of items treated as negatives for those users.

        Returns:
            Scalar tensor: the mean of ``-log sigmoid(x_ui - x_uj)``.
        """
        x_ui = self.score(sampleU, sampleI)
        x_uj = self.score(sampleU, sampleJ)
        # BPR loss: -log σ(x_ui - x_uj)
        loss = -tf.reduce_mean(tf.math.log_sigmoid(x_ui - x_uj))
        return loss

    def reg(self):
        """Return ``lamb`` times the summed L2 loss of all three parameter tables."""
        return self.lamb * (
            tf.nn.l2_loss(self.betaI)
            + tf.nn.l2_loss(self.gammaU)
            + tf.nn.l2_loss(self.gammaI)
        )


In [38]:
# Module-level optimizer and model; trainingStepBPR reads `optimizer` as a global.
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
modelBPR = BPRbatch(K=LATENT_DIM, lamb=REG_LAMBDA)

def trainingStepBPR(model, interactions, items_per_user, items, Nsamples):
    """Run one BPR gradient step on a freshly sampled batch and return its objective.

    Args:
        model: Callable as ``model(users, pos_items, neg_items)`` returning a
            loss tensor, and exposing ``reg()`` and ``trainable_variables``.
        interactions: Sequence of ``(user, item, rating)`` triples to sample
            positives from; the rating is ignored.
        items_per_user: Mapping from user to the collection of items that must
            not be drawn as negatives for that user.
        items: Sequence of item indices to draw negatives from.
        Nsamples: Number of (user, positive, negative) triples in the batch.

    Returns:
        The batch loss plus the regularisation term, as a Python float.

    Notes:
        Gradients are applied through the module-level ``optimizer``.
        Negatives are drawn by rejection sampling, so the inner loop does not
        terminate for a user whose excluded set covers every entry of ``items``.
    """
    users, positives, negatives = [], [], []

    for _ in range(Nsamples):
        user, pos_item, _rating = random.choice(interactions)
        excluded = items_per_user[user]

        # Redraw until the candidate is not among this user's excluded items.
        while True:
            neg_item = random.choice(items)
            if neg_item not in excluded:
                break

        users.append(user)
        positives.append(pos_item)
        negatives.append(neg_item)

    # Python lists -> int32 tensors
    users_tf, positives_tf, negatives_tf = (
        tf.convert_to_tensor(column, dtype=tf.int32)
        for column in (users, positives, negatives)
    )

    with tf.GradientTape() as tape:
        loss = model(users_tf, positives_tf, negatives_tf) + model.reg()

    variables = model.trainable_variables
    gradients = tape.gradient(loss, variables)

    # Drop variables that received no gradient before handing off to the optimizer.
    usable_pairs = []
    for grad, var in zip(gradients, variables):
        if grad is not None:
            usable_pairs.append((grad, var))

    if not usable_pairs:
        print("Warning: no gradients to apply this step.")
    else:
        optimizer.apply_gradients(usable_pairs)

    return float(loss.numpy())


In [39]:
# Smoke test on a small batch. Note that this still applies one real gradient
# update to modelBPR.
test_loss = trainingStepBPR(
    model=modelBPR,
    interactions=interactions_train,
    items_per_user=items_per_user_train,
    items=all_items,
    Nsamples=10_000,  # mini testing
)

print(f"One mini-batch objective: {test_loss:.4f}")

One mini-batch objective: 0.6932


In [40]:
# Run N_TRAIN_STEPS sampled mini-batch updates, reporting the objective every 5th step.
for step in range(N_TRAIN_STEPS):
    obj = trainingStepBPR(
        model=modelBPR,
        interactions=interactions_train,
        items_per_user=items_per_user_train,
        items=all_items,
        Nsamples=NSAMPLES_PER_BATCH,
    )
    if (step + 1) % 5:
        continue
    print(f"Step {step + 1}, objective = {obj:.4f}")

Step 5, objective = 0.6821
Step 10, objective = 0.6698
Step 15, objective = 0.6582
Step 20, objective = 0.6478
Step 25, objective = 0.6372
Step 30, objective = 0.6271
Step 35, objective = 0.6190
Step 40, objective = 0.6075
Step 45, objective = 0.5976
Step 50, objective = 0.5863
Step 55, objective = 0.5735
Step 60, objective = 0.5607
Step 65, objective = 0.5485
Step 70, objective = 0.5430
Step 75, objective = 0.5339
Step 80, objective = 0.5292
Step 85, objective = 0.5265
Step 90, objective = 0.5233
Step 95, objective = 0.5193
Step 100, objective = 0.5164


Evaluation


In [41]:
# Group the held-out items by user: user_idx -> list of item_idx.
# The columns are iterated directly instead of using DataFrame.iterrows(),
# which builds a Series per row, upcasts the integer index columns alongside
# the float `rating`, and rebinds `_` (IPython's last-output variable).
test_items_per_user = defaultdict(list)
for u, i in zip(
    test_df["user_idx"].astype(int).tolist(),
    test_df["item_idx"].astype(int).tolist(),
):
    test_items_per_user[u].append(i)

def evaluate_hit_rate_at_k(model, train_items_per_user, test_items_per_user, items, k):
    """Compute HitRate@k over the users that have held-out items.

    For each user, every item in ``items`` that is not among the user's
    training items is scored with ``model.score``. The user counts as a hit
    when at least one of their held-out items is among the ``k`` highest
    scoring candidates.

    Args:
        model: Object whose ``score(users, items)`` returns a value with a
            ``.numpy()`` method yielding one score per (user, item) pair.
        train_items_per_user: Mapping from user to the items seen in training.
            Users missing from the mapping are treated as having none; the
            mapping is not modified.
        test_items_per_user: Mapping from user to their held-out items.
        items: Sequence of all item indices (converted to an int32 array).
        k: Size of the recommendation list. When it exceeds the number of
            candidates, all candidates are used; a value <= 0 yields no hits.

    Returns:
        ``hits / evaluated_users`` as a float, or ``None`` when no user could
        be evaluated.
    """
    users = list(test_items_per_user.keys())
    hits = 0
    total = 0

    # Convert items to a NumPy array once
    all_items_array = np.array(items, dtype=np.int32)

    for idx, u in enumerate(users):
        # .get() instead of [] so a defaultdict is not grown with empty
        # entries for users that have no training items.
        train_items = train_items_per_user.get(u, ())

        # Mask out training items
        candidate_mask = ~np.isin(all_items_array, list(train_items))
        candidate_items = all_items_array[candidate_mask]

        n_candidates = len(candidate_items)
        if n_candidates == 0:
            continue

        u_list = np.full(n_candidates, u, dtype=np.int32)
        scores = model.score(u_list, candidate_items).numpy()

        if k <= 0:
            top_k_items = set()
        elif k >= n_candidates:
            # np.argpartition rejects kth >= n, and with so few candidates
            # every one of them is in the top k anyway.
            top_k_items = set(candidate_items.tolist())
        else:
            top_k_idx = np.argpartition(-scores, k - 1)[:k]
            top_k_items = set(candidate_items[top_k_idx].tolist())

        test_items = set(test_items_per_user[u])  # usually size 1
        if top_k_items & test_items:
            hits += 1

        total += 1

        # Progress print every 1000 users so long runs show they are moving
        if (idx + 1) % 1000 == 0:
            print(f"Evaluated {idx + 1}/{len(users)} users...")

    if total == 0:
        return None
    return hits / total

In [42]:
# Score the trained model on the held-out interactions.
hit_at_k = evaluate_hit_rate_at_k(
    model=modelBPR,
    train_items_per_user=items_per_user_train,
    test_items_per_user=test_items_per_user,
    items=all_items,
    k=TOP_K,
)

# evaluate_hit_rate_at_k returns None when no user could be evaluated.
if hit_at_k is None:
    print("No users with test items to evaluate.")
else:
    print(f"HitRate@{TOP_K}: {hit_at_k:.4f}")


Evaluated 1000/41098 users...
Evaluated 2000/41098 users...
Evaluated 3000/41098 users...
Evaluated 4000/41098 users...
Evaluated 5000/41098 users...
Evaluated 6000/41098 users...
Evaluated 7000/41098 users...
Evaluated 8000/41098 users...
Evaluated 9000/41098 users...
Evaluated 10000/41098 users...
Evaluated 11000/41098 users...
Evaluated 12000/41098 users...
Evaluated 13000/41098 users...
Evaluated 14000/41098 users...
Evaluated 15000/41098 users...
Evaluated 16000/41098 users...
Evaluated 17000/41098 users...
Evaluated 18000/41098 users...
Evaluated 19000/41098 users...
Evaluated 20000/41098 users...
Evaluated 21000/41098 users...
Evaluated 22000/41098 users...
Evaluated 23000/41098 users...
Evaluated 24000/41098 users...
Evaluated 25000/41098 users...
Evaluated 26000/41098 users...
Evaluated 27000/41098 users...
Evaluated 28000/41098 users...
Evaluated 29000/41098 users...
Evaluated 30000/41098 users...
Evaluated 31000/41098 users...
Evaluated 32000/41098 users...
Evaluated 33000/4