Imports

In [199]:
import pandas as pd
import numpy as np
import tensorflow as tf
import random
from collections import defaultdict

from loader import load_to_dict, load_user_likes, save_likes

Data Loading

In [200]:
DATA_FILE = "review-Oregon_10.json.gz"

POS_THRESHOLD = 4
MIN_INTERACTIONS_PER_USER = 1
MAX_USERS = 100000
MAX_ITEMS = 100000

LATENT_DIM = 64
LEARNING_RATE = 0.01
REG_LAMBDA = 1e-5
NSAMPLES_PER_BATCH = 50000
N_TRAIN_STEPS = 250
TOP_K = 30

RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

In [201]:
raw_data = load_to_dict(DATA_FILE)

df = pd.DataFrame(raw_data)

len(df), df.head()

(6270332,
                  user_id               name           time  rating  \
 0  116238557567455956213          Mike Alan  1607013385211       5   
 1  116988773242398246268   Jennifer Zumwalt  1607571121183       5   
 2  118003928746583471938  Courtney Saunders  1582762237754       5   
 3  118003928746583471938  Courtney Saunders  1582762237754       5   
 4  103097287486867336142       Becky proulx  1584064731306       5   
 
                                                 text  pics  \
 0  Every staff member in this office is absolutel...  None   
 1  They are amazing. Always so kinda and helpful....  None   
 2  With three kids and myself, our family has bee...  None   
 3  With three kids and myself, our family has bee...  None   
 4  Three of our children have received their brac...  None   
 
                                                 resp  \
 0                                               None   
 1                                               None   
 2  {'time'

In [202]:
df_pos = df[df["rating"] >= POS_THRESHOLD].copy()

user_counts = df_pos["user_id"].value_counts()
eligible_users = user_counts[user_counts >= MIN_INTERACTIONS_PER_USER].index

df_pos = df_pos[df_pos["user_id"].isin(eligible_users)].reset_index(drop=True)

len(df_pos), df_pos["user_id"].nunique(), df_pos["gmap_id"].nunique()

(5180611, 204512, 47119)

In [203]:
unique_users = df_pos["user_id"].unique()
if MAX_USERS is not None:
    unique_users = unique_users[:MAX_USERS]

df_pos = df_pos[df_pos["user_id"].isin(unique_users)]

unique_items = df_pos["gmap_id"].unique()
if MAX_ITEMS is not None:
    unique_items = unique_items[:MAX_ITEMS]

df_pos = df_pos[df_pos["gmap_id"].isin(unique_items)].reset_index(drop=True)

user_id_to_idx = {u: idx for idx, u in enumerate(unique_users)}
item_id_to_idx = {i: idx for idx, i in enumerate(unique_items)}

df_pos["user_idx"] = df_pos["user_id"].map(user_id_to_idx)
df_pos["item_idx"] = df_pos["gmap_id"].map(item_id_to_idx)

num_users = len(user_id_to_idx)
num_items = len(item_id_to_idx)

num_users, num_items

(100000, 47107)

In [204]:
# Shuffle rows (if no timestamp, just randomize)
if "time" in df_pos.columns:
    df_pos = df_pos.sort_values(["user_idx", "time"])
else:
    df_pos = df_pos.sample(frac=1.0, random_state=RANDOM_SEED)

train_rows = []
test_rows = []

# Split each user's interactions 80:20
for user_id, group in df_pos.groupby("user_id"):
    idx_list = group.index.tolist()
    random.shuffle(idx_list)
    
    split_point = max(1, int(0.8 * len(idx_list)))  # ensure at least 1 row in train
    train_rows.extend(idx_list[:split_point])
    test_rows.extend(idx_list[split_point:])

train_df = df_pos.loc[train_rows].reset_index(drop=True)
test_df = df_pos.loc[test_rows].reset_index(drop=True)


len(train_df), len(test_df)

(2671266, 717261)

In [205]:
# Get a translation dictionary so we know the user and location id we're referring to by their idx
idx_to_user_id = {idx: u for u, idx in user_id_to_idx.items()}
idx_to_item_id = {idx: i for i, idx in item_id_to_idx.items()}

In [206]:
interactions_train = list(
    zip(
        train_df["user_idx"].astype(int).tolist(),
        train_df["item_idx"].astype(int).tolist(),
        train_df["rating"].tolist()
    )
)

items_per_user_train = defaultdict(set)
for u, i, r in interactions_train:
    items_per_user_train[u].add(i)

all_items = list(range(num_items))

len(interactions_train), len(all_items)


(2671266, 47107)

In [207]:
class BPRbatch(tf.keras.Model):
    def __init__(self, K, lamb):
        super().__init__()
        self.lamb = lamb

        # Global item bias
        self.betaI = self.add_weight(
            name="betaI",
            shape=(num_items,),
            initializer=tf.random_normal_initializer(stddev=0.001),
            trainable=True,
        )

        # User latent factors
        self.gammaU = self.add_weight(
            name="gammaU",
            shape=(num_users, K),
            initializer=tf.random_normal_initializer(stddev=0.001),
            trainable=True,
        )

        # Item latent factors
        self.gammaI = self.add_weight(
            name="gammaI",
            shape=(num_items, K),
            initializer=tf.random_normal_initializer(stddev=0.001),
            trainable=True,
        )

    def score(self, sampleU, sampleI):
        # sampleU, sampleI are index tensors
        u = tf.cast(sampleU, tf.int32)
        i = tf.cast(sampleI, tf.int32)

        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)

        x_ui = beta_i + tf.reduce_sum(gamma_u * gamma_i, axis=1)
        return x_ui

    def call(self, sampleU, sampleI, sampleJ):
        x_ui = self.score(sampleU, sampleI)
        x_uj = self.score(sampleU, sampleJ)
        # BPR loss: -log σ(x_ui - x_uj)
        loss = -tf.reduce_mean(tf.math.log_sigmoid(x_ui - x_uj))
        return loss

    def reg(self):
        return self.lamb * (
            tf.nn.l2_loss(self.betaI)
            + tf.nn.l2_loss(self.gammaU)
            + tf.nn.l2_loss(self.gammaI)
        )


In [208]:
optimizer = tf.keras.optimizers.Adam(LEARNING_RATE)
modelBPR = BPRbatch(LATENT_DIM, REG_LAMBDA)

def trainingStepBPR(model, interactions, items_per_user, items, Nsamples):
    sampleU, sampleI, sampleJ = [], [], []

    for _ in range(Nsamples):
        u, i, r = random.choice(interactions)
        j = random.choice(items)
        while j in items_per_user[u]:
            j = random.choice(items)

        sampleU.append(u)
        sampleI.append(i)
        sampleJ.append(j)

    # Convert lists → tensors
    sampleU_tf = tf.convert_to_tensor(sampleU, dtype=tf.int32)
    sampleI_tf = tf.convert_to_tensor(sampleI, dtype=tf.int32)
    sampleJ_tf = tf.convert_to_tensor(sampleJ, dtype=tf.int32)

    with tf.GradientTape() as tape:
        loss = model(sampleU_tf, sampleI_tf, sampleJ_tf)
        loss += model.reg()

    grads = tape.gradient(loss, model.trainable_variables)

    # Pair gradients with variables, skipping any None grads just in case
    grads_and_vars = [
        (g, v) for g, v in zip(grads, model.trainable_variables) if g is not None
    ]

    if grads_and_vars:
        optimizer.apply_gradients(grads_and_vars)
    else:
        print("Warning: no gradients to apply this step.")

    return float(loss.numpy())


In [209]:
test_loss = trainingStepBPR(
    modelBPR,
    interactions_train,
    items_per_user_train,
    all_items,
    Nsamples=10000  # mini testing
)

print(f"One mini-batch objective: {test_loss:.4f}")

One mini-batch objective: 0.6932


In [210]:
for step in range(N_TRAIN_STEPS):
    obj = trainingStepBPR(
        modelBPR,
        interactions_train,
        items_per_user_train,
        all_items,
        NSAMPLES_PER_BATCH,
    )
    if (step + 1) % 5 == 0:
        print(f"Step {step + 1}, objective = {obj:.4f}")

Step 5, objective = 0.6807
Step 10, objective = 0.6663
Step 15, objective = 0.6528
Step 20, objective = 0.6395
Step 25, objective = 0.6289
Step 30, objective = 0.6177
Step 35, objective = 0.6077
Step 40, objective = 0.6001
Step 45, objective = 0.5909
Step 50, objective = 0.5851
Step 55, objective = 0.5767
Step 60, objective = 0.5723
Step 65, objective = 0.5665
Step 70, objective = 0.5606
Step 75, objective = 0.5578
Step 80, objective = 0.5542
Step 85, objective = 0.5511
Step 90, objective = 0.5474
Step 95, objective = 0.5442
Step 100, objective = 0.5418
Step 105, objective = 0.5401
Step 110, objective = 0.5370
Step 115, objective = 0.5355
Step 120, objective = 0.5353
Step 125, objective = 0.5331
Step 130, objective = 0.5332
Step 135, objective = 0.5317
Step 140, objective = 0.5335
Step 145, objective = 0.5310
Step 150, objective = 0.5263
Step 155, objective = 0.5296
Step 160, objective = 0.5281
Step 165, objective = 0.5263
Step 170, objective = 0.5238
Step 175, objective = 0.5251
Step 

Evaluation


In [211]:
test_items_per_user = defaultdict(list)
for _, row in test_df.iterrows():
    u = int(row["user_idx"])
    i = int(row["item_idx"])
    test_items_per_user[u].append(i)

def evaluate_hit_rate_at_k(model, train_items_per_user, test_items_per_user, items, k):
    users = list(test_items_per_user.keys())
    hits = 0
    total = 0

    # Convert items to a NumPy array once
    all_items_array = np.array(items, dtype=np.int32)

    for idx, u in enumerate(users):
        train_items = train_items_per_user[u]  # this is already a set

        # Mask out training items
        candidate_mask = ~np.isin(all_items_array, list(train_items))
        candidate_items = all_items_array[candidate_mask]

        if len(candidate_items) == 0:
            continue

        u_list = np.full(len(candidate_items), u, dtype=np.int32)
        scores = model.score(u_list, candidate_items).numpy()

        top_k_idx = np.argpartition(-scores, k - 1)[:k]
        top_k_items = set(candidate_items[top_k_idx])

        test_items = set(test_items_per_user[u])  # usually size 1
        if top_k_items & test_items:
            hits += 1

        total += 1

        # Optional: progress print every 1000 users so you know it's moving
        if (idx + 1) % 1000 == 0:
            print(f"Evaluated {idx + 1}/{len(users)} users...")

    if total == 0:
        return None
    return hits / total

def get_top_k_recommendations(model, train_items_per_user, test_items_per_user, items, k, idx_to_user_id, idx_to_item_id):
    """
    Returns: dict of user_id -> list of top-k gmap_ids, excluding training items.
    """
    recs = {}
    users = list(test_items_per_user.keys())
    all_items_array = np.array(items, dtype=np.int32)

    for idx, u in enumerate(users):
        user_id = idx_to_user_id[u]

        train_items = train_items_per_user[u]

        # Candidate items = all items not already interacted with
        candidate_mask = ~np.isin(all_items_array, list(train_items))
        candidate_items = all_items_array[candidate_mask]

        if len(candidate_items) == 0:
            recs[user_id] = []
            continue

        # Score all candidate items
        u_list = np.full(len(candidate_items), u, dtype=np.int32)
        scores = model.score(u_list, candidate_items).numpy()

        # Sort scores and get top-k
        top_idx = np.argsort(-scores)[:k]  # sorted descending
        top_item_indices_sorted = candidate_items[top_idx]

        # Map indices back to gmap_ids
        top_item_ids = [idx_to_item_id[i] for i in top_item_indices_sorted]

        recs[user_id] = top_item_ids

    return recs


In [212]:
# hit_at_k = evaluate_hit_rate_at_k(
#     modelBPR,
#     items_per_user_train,
#     test_items_per_user,
#     all_items,
#     TOP_K,
# )

# print(f"HitRate@{TOP_K}: {hit_at_k:.4f}" if hit_at_k is not None else "No users with test items to evaluate.")

recommendation = get_top_k_recommendations(
    modelBPR,
    items_per_user_train,
    test_items_per_user,
    all_items,
    TOP_K,
    idx_to_user_id,
    idx_to_item_id
)


In [213]:
print(list(recommendation.keys())[1])
print(recommendation[list(recommendation.keys())[1]])

100000304052561681853
['0x54ea92b04ed31491:0xf497b5d87639810', '0x54eb2c9d759f3191:0xe3b93c57067a899a', '0x54c5b55eda22e2a9:0x1f87d215776a4838', '0x54bff8cd95e81b37:0x20ebe35b7c3faec2', '0x54b8c632c16b82f3:0xc2979d3396b38cf0', '0x54955c451f42e579:0xf61e6c7552a070a1', '0x54c0b0d955ef6c6b:0x506a85cae9ce9896', '0x54c0e23c13ef825d:0x9d110bf00de7aa33', '0x87ed7de3f0a078b3:0x6c233c3709ef1cc4', '0x54950a2eb856b805:0xe7e720b09eddfc26', '0x54c11de4265cd325:0xa3a7d53c0aae45d3', '0x54bfedda86d4a279:0x3d55c036fa2ccf71', '0x54b8a6aaaaaaaaab:0x8440cb69bc8cdd24', '0x549618d8d788f7cd:0xe1cbc1f79d7ed701', '0x54bffe899860d069:0x477233dbe7157358', '0x54cf7bb81d124cd7:0x2e2cf576e39bfd1', '0x54be1eaaa37312a3:0x6738b4912b8ca4f2', '0x54bfff0c4f6796df:0x6eae6fd7e411983', '0x5495a40b739922e1:0x1386e3022fac1181', '0x54959ad6cca60deb:0xc5a043e2f635f', '0x5495a2ae617645a7:0x9975eb0f90219342', '0x54bfff731a23c4c9:0x6309553425b0e578', '0x54c11c1a4910471f:0xc03728bb729561f5', '0x54bff8c93cfa3507:0xa6c31cf6985002d6',

In [214]:
# Save the recommendations to a file so it can be used in eval.ipynb
save_likes("bpr_one_state_recommendation_per_user.json", recommendation)

# Also save the hidden likes
users_hidden_likes = defaultdict(list)

for _, row in test_df.iterrows():
    user_id = idx_to_user_id[row["user_idx"]]
    item_id = idx_to_item_id[row["item_idx"]]
    users_hidden_likes[user_id].append(item_id)

# If you want a normal dict instead of defaultdict
users_hidden_likes = dict(users_hidden_likes)

save_likes("users_hidden_likes.json", users_hidden_likes)

Saved to  eval/bpr_one_state_recommendation_per_user.json
Saved to  eval/users_hidden_likes.json
