Imports

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import random
import shutil
from collections import defaultdict


from loader import load_to_dict, load_user_likes, save_likes

  _warn(("h5py is running against HDF5 {0} when it was built against {1}, "


Variables and Hyperparameters

In [36]:
# Gzipped JSON review dump handed to load_to_dict in the data-loading cell.
DATA_FILE = "review-Washington_10.json.gz"

# --- Data settings -----------------------------------------------------------
# Reviews with rating >= POS_THRESHOLD are treated as positive interactions.
POS_THRESHOLD = 4
# Users need at least this many positive reviews to be kept.
MIN_INTERACTIONS_PER_USER = 2
# Caps on the distinct users / items kept for the BPR subset (first-seen order);
# set to None to disable the cap.
MAX_USERS = 150000
MAX_ITEMS = 150000

# --- Model / training hyperparameters ----------------------------------------
# Width of the user and item latent vectors (passed to BPRbatch as K).
LATENT_DIM = 20
# Step size handed to the Adam optimizer.
LEARNING_RATE = 0.05
# Coefficient of the L2 penalty (passed to BPRbatch as lamb).
REG_LAMBDA = 1e-5
# Number of (user, positive, negative) triples sampled per training step.
NSAMPLES_PER_BATCH = 20000
# Number of training steps run by the main training loop.
N_TRAIN_STEPS = 250
# Recommendation list length; also the k used for HitRate@k.
TOP_K = 30


# Seed every RNG the notebook touches (Python, NumPy, TensorFlow) so that
# sampling and weight initialisation are repeatable from a fresh kernel.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

Data Loading

In [7]:
# load_to_dict is a project helper (loader.py); presumably it parses the gzipped
# JSON into something pd.DataFrame accepts — verify against loader.py.
raw_data = load_to_dict(DATA_FILE)

# One row per review.
df = pd.DataFrame(raw_data)

# Row count plus a preview of the first rows.
# NOTE(review): the preview includes the `name` column (reviewer names);
# consider clearing this output before sharing the notebook.
len(df), df.head()

(10192020,
                  user_id              name           time  rating  \
 0  103093043835388050629     Casper Steele  1626907411534       2   
 1  111014066796803341223         Judy Maes  1613028426989       5   
 2  111724423355988809570  Daniel Hernandez  1536710665852       5   
 3  115331094085411087904         Lumi Nosa  1484359806540       5   
 4  106906383883851362879    James Conright  1500643741079       1   
 
                                                 text  pics  resp  \
 0  Drivers say security has bathroom key. Securit...  None  None   
 1  It's a CTRAN transit center with schedules to ...  None  None   
 2  Neat little Transit Center with lots of schedu...  None  None   
 3                              Very helpful.  Thanks  None  None   
 4                             They don't sell tacos.  None  None   
 
                                  gmap_id  
 0  0x5495ae7d3bf7d097:0xbcbc06152a3ccebc  
 1  0x5495ae7d3bf7d097:0xbcbc06152a3ccebc  
 2  0x5495ae7d3bf7d

In [8]:
# Positive feedback only: keep reviews rated at or above POS_THRESHOLD.
df_pos_full = df.loc[df["rating"].ge(POS_THRESHOLD)].copy()

# Count positive reviews per user, then keep the users that reach the minimum.
user_counts = df_pos_full["user_id"].value_counts()
eligible_users = user_counts[user_counts.ge(MIN_INTERACTIONS_PER_USER)].index

df_pos_full = df_pos_full.loc[df_pos_full["user_id"].isin(eligible_users)]
df_pos_full = df_pos_full.reset_index(drop=True)

# Summary of what survived both filters.
print("Full positive set (after MIN_INTERACTIONS_PER_USER):")
print(
    "rows:", len(df_pos_full),
    "unique users:", df_pos_full["user_id"].nunique(),
    "unique items:", df_pos_full["gmap_id"].nunique(),
)

Full positive set (after MIN_INTERACTIONS_PER_USER):
rows: 8407046 unique users: 331442 unique items: 72412


Building BPR Subset

In [24]:
# Work on a copy so the full positive set stays intact for re-runs.
df_pos = df_pos_full.copy()

# Cap the user population at the first MAX_USERS distinct users, in order of
# first appearance (a slice bound of None keeps every user).
unique_users = df_pos["user_id"].unique()[:MAX_USERS]
df_pos = df_pos.loc[df_pos["user_id"].isin(unique_users)]

# Same cap for items, applied to whatever the user filter left behind.
unique_items = df_pos["gmap_id"].unique()[:MAX_ITEMS]
df_pos = df_pos.loc[df_pos["gmap_id"].isin(unique_items)].reset_index(drop=True)

# The item filter may have removed rows, so recompute the surviving ids.
subset_users = df_pos["user_id"].unique()
subset_items = df_pos["gmap_id"].unique()

# Dense 0..n-1 indices <-> raw ids, numbered in order of first appearance.
idx_to_user_id = dict(enumerate(subset_users))
idx_to_item_id = dict(enumerate(subset_items))
user_id_to_idx = {raw_id: idx for idx, raw_id in idx_to_user_id.items()}
item_id_to_idx = {raw_id: idx for idx, raw_id in idx_to_item_id.items()}

df_pos["user_idx"] = df_pos["user_id"].map(user_id_to_idx)
df_pos["item_idx"] = df_pos["gmap_id"].map(item_id_to_idx)

# Embedding-table sizes used by the model.
num_users = len(user_id_to_idx)
num_items = len(item_id_to_idx)

print("BPR subset:")
print("num_users:", num_users, "num_items:", num_items)

BPR subset:
num_users: 150000 num_items: 72396


Training

In [25]:
# 1. Order each user's rows chronologically so that "last row" means "most
#    recent interaction"; without a timestamp column, use a seeded shuffle.
has_timestamp = "time" in df_pos.columns
df_pos = (
    df_pos.sort_values(["user_idx", "time"])
    if has_timestamp
    else df_pos.sample(frac=1.0, random_state=RANDOM_SEED)
)

# 2. Only users with two or more interactions can give one up for testing.
user_counts = df_pos["user_idx"].value_counts()
multi_users = user_counts[user_counts >= 2].index

# 3. Leave-one-out: the final row of each such user becomes the test example.
held_out_pool = df_pos.loc[df_pos["user_idx"].isin(multi_users)]
test_df = held_out_pool.groupby("user_idx").tail(1)

# 4. Every row that was not held out is training data; renumber both frames.
train_df = df_pos.drop(test_df.index).reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

len(train_df), len(test_df)

(4982429, 150000)

In [26]:
# (user_idx, item_idx, rating) triples, one per training interaction.
train_user_col = train_df["user_idx"].astype(int).tolist()
train_item_col = train_df["item_idx"].astype(int).tolist()
train_rating_col = train_df["rating"].tolist()
interactions_train = list(zip(train_user_col, train_item_col, train_rating_col))
# The column lists were only scaffolding for the zip; release them.
del train_user_col, train_item_col, train_rating_col

# Per-user set of training items, used to reject already-seen negatives.
items_per_user_train = defaultdict(set)
for u, i, r in interactions_train:
    items_per_user_train[u].add(i)

# Every item index the model knows about, as a plain list for random.choice.
all_items = [*range(num_items)]

len(interactions_train), len(all_items)


(4982429, 72396)

BPR Model

In [37]:
class BPRbatch(tf.keras.Model):
    """Matrix-factorisation scorer trained with the BPR pairwise loss.

    The score of user ``u`` for item ``i`` is
    ``betaI[i] + sum(gammaU[u] * gammaI[i])``.

    The parameter tables are sized from the notebook-level ``num_users`` and
    ``num_items``, which must be defined before the class is instantiated.
    """

    def __init__(self, K, lamb):
        """Create the three trainable parameter tables.

        Args:
            K: Number of latent dimensions per user / item.
            lamb: Coefficient applied to the L2 penalty returned by ``reg``.
        """
        super().__init__()
        self.lamb = lamb

        def new_table(name, shape):
            # Every table gets its own small-variance normal initializer.
            return self.add_weight(
                name=name,
                shape=shape,
                initializer=tf.random_normal_initializer(stddev=0.001),
                trainable=True,
            )

        # Creation order is item bias, user factors, item factors.
        self.betaI = new_table("betaI", (num_items,))
        self.gammaU = new_table("gammaU", (num_users, K))
        self.gammaI = new_table("gammaI", (num_items, K))

    def score(self, sampleU, sampleI):
        """Return one score per (user, item) index pair.

        Args:
            sampleU: User indices (cast to int32 before lookup).
            sampleI: Item indices (cast to int32 before lookup), paired
                element-wise with ``sampleU``.

        Returns:
            Tensor holding, for each pair, the item bias plus the dot product
            of the user and item latent vectors.
        """
        user_idx = tf.cast(sampleU, tf.int32)
        item_idx = tf.cast(sampleI, tf.int32)

        item_bias = tf.nn.embedding_lookup(self.betaI, item_idx)
        user_vec = tf.nn.embedding_lookup(self.gammaU, user_idx)
        item_vec = tf.nn.embedding_lookup(self.gammaI, item_idx)

        # Row-wise dot product between each user vector and its item vector.
        affinity = tf.reduce_sum(user_vec * item_vec, axis=1)
        return item_bias + affinity

    def call(self, sampleU, sampleI, sampleJ):
        """Return the mean BPR loss, ``-log sigmoid(x_ui - x_uj)``, over the batch.

        Args:
            sampleU: User indices.
            sampleI: Indices of the item each user should rank higher.
            sampleJ: Indices of the item each user should rank lower.
        """
        preference_gap = self.score(sampleU, sampleI) - self.score(sampleU, sampleJ)
        return -tf.reduce_mean(tf.math.log_sigmoid(preference_gap))

    def reg(self):
        """Return ``lamb`` times the summed ``tf.nn.l2_loss`` of all three tables."""
        penalty = (
            tf.nn.l2_loss(self.betaI)
            + tf.nn.l2_loss(self.gammaU)
            + tf.nn.l2_loss(self.gammaI)
        )
        return self.lamb * penalty


Training Steps

In [38]:
# Adam optimizer used for every training step. trainingStepBPR reads this
# module-level name directly instead of receiving it as an argument.
optimizer = tf.keras.optimizers.Adam(LEARNING_RATE)
# Fresh, untrained model; re-running this cell discards any learned weights.
modelBPR = BPRbatch(LATENT_DIM, REG_LAMBDA)

def trainingStepBPR(model, interactions, items_per_user, items, Nsamples):
    """Sample a batch of BPR triples and apply one optimizer update.

    Uses the module-level ``optimizer`` to apply the gradients.

    Args:
        model: Callable as ``model(users, pos_items, neg_items)`` returning the
            batch loss, and exposing ``reg()`` and ``trainable_variables``.
        interactions: Sequence of ``(user, item, rating)`` triples to draw
            positives from; the rating is ignored.
        items_per_user: Mapping user -> collection of items the user already
            interacted with; negatives are never drawn from it.
        items: Sequence of all item indices to draw negatives from.
        Nsamples: Number of triples in the batch.

    Returns:
        The batch objective (loss plus regulariser) as a Python float.
    """
    batch_users, batch_pos, batch_neg = [], [], []

    for _ in range(Nsamples):
        # Positive pair: a uniformly drawn observed interaction.
        user, pos_item, _rating = random.choice(interactions)
        seen = items_per_user[user]

        # Negative item: redraw until the user has not interacted with it.
        while True:
            neg_item = random.choice(items)
            if neg_item not in seen:
                break

        batch_users.append(user)
        batch_pos.append(pos_item)
        batch_neg.append(neg_item)

    # Python lists -> int32 index tensors.
    users_tf, pos_tf, neg_tf = (
        tf.convert_to_tensor(column, dtype=tf.int32)
        for column in (batch_users, batch_pos, batch_neg)
    )

    with tf.GradientTape() as tape:
        loss = model(users_tf, pos_tf, neg_tf) + model.reg()

    variables = model.trainable_variables
    grads = tape.gradient(loss, variables)

    # Keep only the variables that actually received a gradient.
    grads_and_vars = []
    for grad, var in zip(grads, variables):
        if grad is not None:
            grads_and_vars.append((grad, var))

    if not grads_and_vars:
        print("Warning: no gradients to apply this step.")
    else:
        optimizer.apply_gradients(grads_and_vars)

    return float(loss.numpy())


In [39]:
# Smoke test: run a single, smaller training step to confirm the pipeline works
# end to end. This is a real update — trainingStepBPR applies the gradients —
# so the model is no longer untouched afterwards.
# With near-zero initial weights the score difference is ~0, so the BPR loss
# starts close to -log(sigmoid(0)) = ln 2 ≈ 0.693.
test_loss = trainingStepBPR(
    modelBPR,
    interactions_train,
    items_per_user_train,
    all_items,
    Nsamples=10000  # smaller batch than NSAMPLES_PER_BATCH, just for this check
)

print(f"One mini-batch objective: {test_loss:.4f}")

One mini-batch objective: 0.6932


In [40]:
# Main optimisation loop: one freshly sampled batch per step, with the batch
# objective reported after every tenth step.
for step in range(N_TRAIN_STEPS):
    obj = trainingStepBPR(
        model=modelBPR,
        interactions=interactions_train,
        items_per_user=items_per_user_train,
        items=all_items,
        Nsamples=NSAMPLES_PER_BATCH,
    )
    if (step + 1) % 10 != 0:
        continue
    print(f"Step {step + 1}, objective = {obj:.4f}")

Step 10, objective = 0.6138
Step 20, objective = 0.5707
Step 30, objective = 0.5509
Step 40, objective = 0.5434
Step 50, objective = 0.5327
Step 60, objective = 0.5361
Step 70, objective = 0.5325
Step 80, objective = 0.5304
Step 90, objective = 0.5305
Step 100, objective = 0.5296
Step 110, objective = 0.5344
Step 120, objective = 0.5361
Step 130, objective = 0.5343
Step 140, objective = 0.5354
Step 150, objective = 0.5396
Step 160, objective = 0.5404
Step 170, objective = 0.5430
Step 180, objective = 0.5449
Step 190, objective = 0.5438
Step 200, objective = 0.5450
Step 210, objective = 0.5511
Step 220, objective = 0.5494
Step 230, objective = 0.5495
Step 240, objective = 0.5480
Step 250, objective = 0.5448


Grabbing Oregon Split

In [31]:
# load_user_likes is a project helper (loader.py). Downstream code calls
# .items() / .get() on the results and looks the values up in item_id_to_idx,
# so these are presumably dicts of user_id -> collection of gmap_ids — verify
# against loader.py.
# "Revealed" likes are excluded from each user's recommendation candidates;
# the keys of "hidden" likes define which users get recommendations.
users_revealed_likes = load_user_likes("users_revealed_likes.json")
users_hidden_likes = load_user_likes("users_hidden_likes.json")

print("Oregon users with revealed likes:", len(users_revealed_likes))
print("Oregon users with hidden likes:", len(users_hidden_likes))

Oregon users with revealed likes: 204512
Oregon users with hidden likes: 204512


In [None]:
def build_bpr_recommendations_for_oregon(model, k):
    """Build top-k recommendations for the Oregon users the model has an embedding for.

    Reads these notebook-level names: ``num_items``, ``users_hidden_likes``,
    ``users_revealed_likes``, ``user_id_to_idx``, ``item_id_to_idx`` and
    ``idx_to_item_id``.

    Users (keys of ``users_hidden_likes``) that are missing from
    ``user_id_to_idx`` are skipped, because no embedding exists for them.
    For every other user, all items known to the model are candidates except
    that user's revealed likes.

    Each user is scored exactly once; the top-k entries are ordered from the
    scores already computed rather than by scoring the shortlist again.

    Args:
        model: Object whose ``score(user_indices, item_indices)`` returns a
            tensor-like with a ``.numpy()`` method yielding one score per pair.
        k: Maximum number of recommendations per user.

    Returns:
        dict mapping user_id -> list of gmap_ids, best-scored first. The list
        is empty when no candidate remains after removing revealed likes.
    """
    recs = {}

    # All item indices that the model knows about (the training-subset items).
    all_item_indices = np.arange(num_items, dtype=np.int32)

    for user_id in users_hidden_likes:
        # No index means the user never appeared in training: no embedding.
        u_idx = user_id_to_idx.get(user_id)
        if u_idx is None:
            continue

        # Revealed likes the model knows about must not be recommended again.
        revealed_item_indices = [
            item_id_to_idx[g]
            for g in users_revealed_likes.get(user_id, [])
            if g in item_id_to_idx
        ]

        if revealed_item_indices:
            keep = ~np.isin(all_item_indices, revealed_item_indices)
            candidate_item_indices = all_item_indices[keep]
        else:
            # Never mutated below, so sharing the full array is safe.
            candidate_item_indices = all_item_indices

        if len(candidate_item_indices) == 0:
            recs[user_id] = []
            continue

        # Score every candidate item for this user in one call.
        u_list = np.full(len(candidate_item_indices), u_idx, dtype=np.int32)
        scores = model.score(u_list, candidate_item_indices).numpy()

        # Positions (into the candidate array) of the k best-scored items.
        if len(candidate_item_indices) <= k:
            top_pos = np.arange(len(candidate_item_indices))
        else:
            top_pos = np.argpartition(-scores, k - 1)[:k]

        # Order the shortlist by descending score. A stable sort keeps ties in
        # shortlist order, like list.sort(reverse=True) did.
        ranked_pos = top_pos[np.argsort(-scores[top_pos], kind="stable")]

        # Map item indices back to the original gmap_ids.
        recs[user_id] = [
            idx_to_item_id[i] for i in candidate_item_indices[ranked_pos].tolist()
        ]

    return recs


Evaluation


In [None]:
# Held-out item(s) per user, keyed by user index.
# Iterating the two index columns as plain lists avoids DataFrame.iterrows(),
# which builds a Series for every row, and avoids rebinding `_` (IPython's
# last-output name) as a loop variable.
test_items_per_user = defaultdict(list)
for u, i in zip(
    test_df["user_idx"].astype(int).tolist(),
    test_df["item_idx"].astype(int).tolist(),
):
    test_items_per_user[u].append(i)

def evaluate_hit_rate_at_k(model, train_items_per_user, test_items_per_user, items, k):
    """Compute HitRate@k: the share of users with a held-out item in their top-k.

    For each user in ``test_items_per_user`` the candidates are all ``items``
    minus that user's training items. The user counts as a hit when any of
    their test items is among the k best-scored candidates.

    Args:
        model: Object whose ``score(user_indices, item_indices)`` returns a
            tensor-like with a ``.numpy()`` method yielding one score per pair.
        train_items_per_user: Mapping user index -> collection of item indices
            seen in training. Users missing from the mapping are treated as
            having no training items; the mapping is not modified.
        test_items_per_user: Mapping user index -> list of held-out item indices.
        items: Sequence of all item indices.
        k: Length of the recommendation list. When a user has no more than
            ``k`` candidates, every candidate is in the list.

    Returns:
        ``hits / total`` as a float, or ``None`` when no user could be
        evaluated. Users with no candidate items are skipped.
    """
    users = list(test_items_per_user.keys())
    hits = 0
    total = 0

    # Convert items to a NumPy array once.
    all_items_array = np.array(items, dtype=np.int32)

    for idx, u in enumerate(users):
        # .get (rather than [u]) does not insert empty entries into a
        # defaultdict and also works with a plain dict.
        train_items = train_items_per_user.get(u, ())

        # Mask out training items.
        candidate_mask = ~np.isin(all_items_array, list(train_items))
        candidate_items = all_items_array[candidate_mask]

        if len(candidate_items) == 0:
            continue

        if k >= len(candidate_items):
            # argpartition requires kth < len, so it would raise here; with
            # this few candidates the whole set is the top-k anyway.
            top_k_items = set(candidate_items.tolist())
        else:
            u_list = np.full(len(candidate_items), u, dtype=np.int32)
            scores = model.score(u_list, candidate_items).numpy()
            # Unordered top-k is enough: only membership matters for a hit.
            top_k_idx = np.argpartition(-scores, k - 1)[:k]
            top_k_items = set(candidate_items[top_k_idx].tolist())

        if not top_k_items.isdisjoint(test_items_per_user[u]):
            hits += 1

        total += 1

        # Progress print every 10000 users so we know it's moving.
        if (idx + 1) % 10000 == 0:
            print(f"Evaluated {idx+ 1}/{len(users)} users...")

    if total == 0:
        return None
    return hits / total

In [41]:
print("Training finished, now building BPR recommendations for Oregon users...")

bpr_recs_original = build_bpr_recommendations_for_oregon(
    modelBPR,
    TOP_K,
)

print("Number of Oregon users with BPR recs:", len(bpr_recs_original))

# Save in the same format as baseline_recommendation_per_user.json.
# NOTE(review): judging by its logged output, save_likes writes under eval/ —
# confirm in loader.py.
save_likes("bpr_recommendation_per_user.json", bpr_recs_original)
print("Saved BPR recs to bpr_recommendation_per_user.json")

# Back up the baseline file before it is overwritten below — but only once.
# An unconditional copy would, on a second run of this cell, copy the already
# overwritten (BPR) file over the backup and lose the original baseline.
# Mode "x" creates the backup only if it does not exist yet; the source is
# opened first so a missing baseline file does not leave an empty backup.
BASELINE_RECS_PATH = "eval/baseline_recommendation_per_user.json"
BASELINE_BACKUP_PATH = "eval/baseline_recommendation_per_user_baseline_backup.json"
try:
    with open(BASELINE_RECS_PATH, "rb") as src, open(BASELINE_BACKUP_PATH, "xb") as dst:
        shutil.copyfileobj(src, dst)
except FileExistsError:
    print("Backup already exists, keeping baseline_recommendation_per_user_baseline_backup.json as is")
else:
    print("Backed up original baseline recs to baseline_recommendation_per_user_baseline_backup.json")

# Overwrite the file eval.ipynb uses with BPR recs
save_likes("baseline_recommendation_per_user.json", bpr_recs_original)
print("Overwrote baseline_recommendation_per_user.json with BPR recs")


Training finished, now building BPR recommendations for Oregon users...
Number of Oregon users with BPR recs: 10965
Saved to  eval/bpr_recommendation_per_user.json
Saved BPR recs to bpr_recommendation_per_user.json
Backed up original baseline recs to baseline_recommendation_per_user_baseline_backup.json
Saved to  eval/baseline_recommendation_per_user.json
Overwrote baseline_recommendation_per_user.json with BPR recs


In [42]:
# Leave-one-out evaluation on the held-out interaction of every test user.
print(TOP_K)
hit_at_k = evaluate_hit_rate_at_k(
    model=modelBPR,
    train_items_per_user=items_per_user_train,
    test_items_per_user=test_items_per_user,
    items=all_items,
    k=TOP_K,
)

# The evaluator returns None when there was nobody to evaluate.
if hit_at_k is None:
    print("No users with test items to evaluate.")
else:
    print(f"HitRate@{TOP_K}: {hit_at_k:.4f}")


30


Evaluated 10000/150000 users...
Evaluated 20000/150000 users...
Evaluated 30000/150000 users...
Evaluated 40000/150000 users...
Evaluated 50000/150000 users...
Evaluated 60000/150000 users...
Evaluated 70000/150000 users...
Evaluated 80000/150000 users...
Evaluated 90000/150000 users...
Evaluated 100000/150000 users...
Evaluated 110000/150000 users...
Evaluated 120000/150000 users...
Evaluated 130000/150000 users...
Evaluated 140000/150000 users...
Evaluated 150000/150000 users...
HitRate@30: 0.0091
