In [222]:
import tensorflow as tf
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

# Load the raw interaction log; the CSV has no header row, so column names are supplied here.
raw_columns = ['user_id', 'stream_id', 'streamer_username', 'time_start', 'time_stop']
data = pd.read_csv('100k_a.csv', names=raw_columns)

# Integer-encode streamer names in order of first appearance in the raw file
# (this must happen before sorting so the codes do not depend on the sort).
streamer_codes, uniques = pd.factorize(data['streamer_username'])

# Shift user ids down by one and order the log chronologically with a fresh 0..n-1 index.
data = (
    data
    .assign(streamer_id=streamer_codes, user_id=lambda frame: frame['user_id'] - 1)
    .sort_values('time_start')
    .reset_index(drop=True)
)
data

Unnamed: 0,user_id,stream_id,streamer_username,time_start,time_stop,streamer_id
0,11586,33827617344,miltontpike1,0,5,1866
1,13762,33827755632,rekinss,0,1,6845
2,13762,33827475024,airon29,0,1,18105
3,13762,33827351664,tonytubo,0,1,4949
4,13762,33827169440,eliasmerk,0,1,47618
...,...,...,...,...,...,...
3051728,8975,34415693328,purple_hs,6147,6148,727
3051729,29709,34414041536,forsen,6147,6148,202
3051730,41485,34416038384,rekkles,6147,6148,2524
3051731,84280,34413422016,dlxowns45,6147,6148,2190


In [223]:
# Chronological 80/20 split: `data` is ordered by time_start, so the earliest 80% of
# rows form the training set and the remaining rows are held out for testing.
split_point = int(0.8 * len(data))
# (debugging alternative kept from the original: split_point = 100000)

# NOTE(review): shuffled_data is not read by any visible cell; the split below slices
# the time-ordered `data`, not this shuffled copy. Confirm whether it is still needed.
shuffled_data = data.sample(frac=1, random_state=42).reset_index()

train_data, test_data = data.iloc[:split_point], data.iloc[split_point:]
len(train_data), len(test_data)

(2441386, 610347)

In [224]:
# Distinct users and streamers seen in the training window, as plain Python lists
# in order of first appearance.
userIDs = pd.unique(train_data['user_id']).tolist()
itemIDs = pd.unique(train_data['streamer_id']).tolist()
len(userIDs)

98184

In [225]:
# Every (user_id, streamer_id) pair of the training rows, in row order.
pair_columns = (train_data['user_id'], train_data['streamer_id'])
trainInteractions = list(zip(*pair_columns))

# user_id -> set of streamer ids that user watched during the training window.
watched_by_user = train_data.groupby('user_id')['streamer_id']
user_consumed_items = watched_by_user.apply(set).to_dict()
len(trainInteractions)

2441386

In [226]:
# Embedding-table sizes are taken from the full log (train and test rows alike), so
# every id that occurs anywhere in `data` has a row in the factor matrices.
num_users = 1 + int(data['user_id'].max())
num_items = 1 + int(data['streamer_id'].max())

# Training pairs as a 2-column array: column 0 = user id, column 1 = streamer id.
interactionsArr = np.asarray(trainInteractions)

class MFModel(tf.keras.Model):
    """Latent-factor recommender trained with a pairwise (BPR-style) ranking loss.

    Every user and every item owns a K-dimensional vector; the score of a
    (user, item) pair is the dot product of the two vectors.

    Args:
        K: Number of latent dimensions per user/item.
        lamb: Weight of the L2 penalty computed by ``reg``.
        n_users: Number of rows of the user table. When omitted, the
            module-level ``num_users`` is used.
        n_items: Number of rows of the item table. When omitted, the
            module-level ``num_items`` is used.
    """

    def __init__(self, K, lamb, n_users=None, n_items=None):
        super(MFModel, self).__init__()
        # Fall back to the notebook-level sizes so MFModel(K, lamb) keeps working.
        n_users = num_users if n_users is None else n_users
        n_items = num_items if n_items is None else n_items
        # Factors start as small Gaussian noise (stddev 0.1).
        self.gammaU = tf.Variable(tf.random.normal([n_users, K], stddev=0.1))
        self.gammaI = tf.Variable(tf.random.normal([n_items, K], stddev=0.1))
        self.lamb = lamb

    def predict(self, u, i):
        """Return the dot-product score of user ``u`` and item ``i``.

        NOTE: this shadows ``tf.keras.Model.predict`` with a different signature.
        """
        p = tf.tensordot(self.gammaU[u], self.gammaI[i], 1)
        return p

    def recommend(self, u, N=10):
        """Return ``(item_indices, scores)`` of the ``N`` best-scored items for one user.

        ``u`` is a single user index. All items are ranked; items the user
        already interacted with are not filtered out.
        """
        u = tf.convert_to_tensor(u, dtype=tf.int64)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        # (num_items, K) @ (K, 1) -> one score per item.
        interaction_scores = tf.matmul(self.gammaI, tf.expand_dims(gamma_u, axis=-1))
        scores = tf.squeeze(interaction_scores, axis=-1)
        top_N = tf.math.top_k(scores, k=N)
        return top_N.indices.numpy(), top_N.values.numpy()

    def reg(self):
        """Return ``lamb`` times the L2 loss of both complete factor matrices."""
        return self.lamb * (tf.nn.l2_loss(self.gammaU) + tf.nn.l2_loss(self.gammaI))

    def score(self, sampleU, sampleI):
        """Return the element-wise scores for aligned batches of users and items."""
        u = tf.convert_to_tensor(sampleU, dtype=tf.int64)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int64)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
        x_ui = tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)
        return x_ui

    def call(self, inputs):
        """Return the mean pairwise ranking loss for ``(users, positives, negatives)``."""
        sampleU, sampleI, sampleJ = inputs
        x_ui = self.score(sampleU, sampleI)
        x_uj = self.score(sampleU, sampleJ)
        # log_sigmoid is evaluated stably; log(sigmoid(x)) turns into log(0) = -inf
        # once sigmoid underflows for strongly negative score differences.
        return -tf.reduce_mean(tf.math.log_sigmoid(x_ui - x_uj))

    def train_step(self, data):
        """Run one optimisation step on a ``(u, i, j)`` batch; used by ``model.fit``.

        Returns a dict with the (unscaled) regularised loss under the key "loss".
        """
        u, i, j = data
        factors = [self.gammaU, self.gammaI]

        with tf.GradientTape() as tape:
            loss = self.call((u, i, j)) + self.reg()
            # Optimizers exposing scale_loss (e.g. the loss-scale wrapper Keras installs
            # under a mixed_float16 policy) divide the gradients by their loss scale when
            # applying them, so the loss must be scaled before differentiation.
            # Optimizers without the method are used unchanged.
            scale_loss = getattr(self.optimizer, "scale_loss", None)
            scaled_loss = scale_loss(loss) if callable(scale_loss) else loss

        gradients = tape.gradient(scaled_loss, factors)
        self.optimizer.apply_gradients(zip(gradients, factors))

        # Report the unscaled value so the logged loss stays comparable across policies.
        return {"loss": loss}

In [232]:
from tensorflow.keras import mixed_precision

# 1. Enable mixed precision by setting the global Keras dtype policy to 'mixed_float16'.
# NOTE(review): MFModel keeps its factors in plain tf.Variable objects and overrides
# train_step, so it is unclear whether this policy yields any speedup here, and the
# custom train_step has to cooperate with whatever loss scaling Keras applies under
# this policy — confirm both before relying on it.
mixed_precision.set_global_policy('mixed_float16')

def samplePositiveBatch(interactionsArr, batch_size):
    """Pick ``batch_size`` observed interactions uniformly at random, with replacement.

    Args:
        interactionsArr: 2-D array whose column 0 holds user ids and column 1 item ids.
        batch_size: Number of rows to draw.

    Returns:
        Tuple ``(sampleU, sampleI)`` with the user ids and item ids of the drawn rows.
    """
    picked_rows = np.random.choice(len(interactionsArr), batch_size)
    drawn_users = interactionsArr[picked_rows, 0]
    drawn_items = interactionsArr[picked_rows, 1]
    return drawn_users, drawn_items

def sampleNegativeBatch(sampleU, sampleI, user_consumed_arrays, num_items, Prepeat=0.5):
    """Draw one negative item ``j != i`` for every positive ``(u, i)`` pair.

    With probability ``Prepeat`` the negative is drawn uniformly from the user's
    own history with ``i`` removed. Otherwise, or when the history offers no
    item other than ``i`` (or the user has no history entry), it is drawn
    uniformly from the catalogue ``[0, num_items)`` with ``i`` rejected.

    Args:
        sampleU: Sequence of user ids, one per positive pair.
        sampleI: Sequence of positive item ids, aligned with ``sampleU``.
        user_consumed_arrays: Mapping (or indexable) from user id to a sequence
            of item ids that user has consumed.
        num_items: Size of the item catalogue.
        Prepeat: Probability of trying the user's history first.

    Returns:
        ``np.ndarray`` of dtype int64 and length ``len(sampleU)``.

    Raises:
        ValueError: If a catalogue draw is needed but item ``i`` is the only
            item in the catalogue (the rejection loop could never finish).
    """
    batch_size = len(sampleU)
    sampleJ = np.zeros(batch_size, dtype=np.int64)

    for k in range(batch_size):
        u = sampleU[k]
        i = sampleI[k]

        # "Repeat" negative: another item from the user's own history.
        if np.random.rand() < Prepeat:
            try:
                consumed = user_consumed_arrays[u]
            except (KeyError, IndexError):
                consumed = None  # unknown user: use a catalogue draw instead

            if consumed is not None:
                # Removing i up front replaces the rejection loop, which never
                # terminated when every history entry equalled i.
                consumed = np.asarray(consumed)
                candidates = consumed[consumed != i]
                if candidates.size > 0:
                    sampleJ[k] = candidates[np.random.randint(candidates.size)]
                    continue
            # No usable history: fall through to the catalogue draw below.

        # "Novel" negative: any catalogue item except i.
        if num_items == 1 and i == 0:
            raise ValueError("cannot sample a negative: item i is the only item in the catalogue")
        j = np.random.randint(0, num_items)
        while j == i:  # rejection sampling; terminates because another item exists
            j = np.random.randint(0, num_items)
        sampleJ[k] = j

    return sampleJ

# Adapter that lets the NumPy-based negative sampler be fed with eager tensors.
def negative_sampling_wrapper(u, i):
    """Return negative item ids for tensors ``u`` (users) and ``i`` (positive items).

    Both tensors are converted to int64 NumPy arrays and passed to
    ``sampleNegativeBatch`` together with the module-level
    ``user_consumed_arrays`` and ``num_items``; ``Prepeat`` is fixed at 0.5.
    """
    users = u.numpy().astype(np.int64)
    positives = i.numpy().astype(np.int64)
    return sampleNegativeBatch(users, positives, user_consumed_arrays, num_items, Prepeat=0.5)


# Turn each user's watched-streamer set into an int64 array, the form the negative
# sampler indexes and draws from.
user_consumed_arrays = {
    uid: np.array(list(watched), dtype=np.int64)
    for uid, watched in user_consumed_items.items()
}

print("Pre-computing negative samples for the entire dataset...")
train_u, train_i = interactionsArr[:, 0], interactionsArr[:, 1]

# One negative per training interaction, drawn a single time here and then reused
# unchanged by every epoch.
train_j = sampleNegativeBatch(train_u, train_i, user_consumed_arrays, num_items, Prepeat=0.5)

# Build the tf.data input pipeline from the already-materialised (u, i, j) arrays;
# no tf.py_function is needed because the negatives are precomputed.
def create_fast_dataset(u, i, j, batch_size, shuffle_buffer_size=100000):
    """Return a shuffled, batched and prefetched dataset of ``(u, i, j)`` triples.

    Args:
        u: User ids, one per training triple.
        i: Positive item ids, aligned with ``u``.
        j: Negative item ids, aligned with ``u``.
        batch_size: Number of triples per batch.
        shuffle_buffer_size: Size of the shuffle buffer. tf.data only shuffles
            uniformly when the buffer is at least as large as the dataset; with
            a smaller buffer, elements are mixed only with their neighbours in
            input order. Pass ``len(u)`` for a full shuffle.

    Returns:
        A ``tf.data.Dataset`` that reshuffles on every iteration.
    """
    return (
        tf.data.Dataset.from_tensor_slices((u, i, j))
        .shuffle(buffer_size=shuffle_buffer_size, reshuffle_each_iteration=True)
        .batch(batch_size)
        .prefetch(tf.data.AUTOTUNE)
    )

# 3. Initialize and train
# LAMB is the L2 weight passed to MFModel; LR is the learning rate given to Adam.
LAMB = 0.000005
LR = 0.1
BATCH_SIZE = 2**8 # 256 triples per optimisation step

# K=20 latent dimensions per user/item.
model = MFModel(K=20, lamb=LAMB)
model.compile(optimizer=tf.keras.optimizers.Adam(LR), jit_compile=True)

train_ds = create_fast_dataset(train_u, train_i, train_j, BATCH_SIZE)

print("Starting optimized training...")
# Keep the returned History object: the plotting cell reads history.history['loss'].
history = model.fit(train_ds, epochs=10)

Pre-computing negative samples for the entire dataset...
Starting optimized training...
Epoch 1/10
[1m1272/9537[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m54s[0m 7ms/step - loss: 0.8120

KeyboardInterrupt: 

In [None]:
# Plot the per-epoch training loss recorded in `history` by the training cell.
# This cell only visualises; it does not call model.fit again.
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.plot(history.history['loss'])
ax.set_title('Training Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
plt.show()

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
### SAVE THE MODEL ###
# Persist the trained factor matrices as .npy files in the working directory.
# (A betaI save was left disabled in the original: np.save('betaI.npy', model.betaI.numpy()))
user_factors = model.gammaU.numpy()
item_factors = model.gammaI.numpy()
np.save('gammaU.npy', user_factors)
np.save('gammaI.npy', item_factors)

In [None]:
# Recreate a model with the same K (and the same table sizes) as the trained one so
# the saved factor matrices fit its variables.
load_model = MFModel(K=20, lamb=LAMB)  # lamb is only read by reg(), which scoring does not call
# Overwrite the freshly initialised factors with the saved ones.
# Disabled: MFModel defines no betaI attribute.
# load_model.betaI.assign(np.load('betaI.npy'))
load_model.gammaU.assign(np.load('gammaU.npy'))
load_model.gammaI.assign(np.load('gammaI.npy'))
# load_model.score() / load_model.predict() can now be used for inference.
# NOTE(review): the evaluation cell reads `model`, not `load_model` — confirm which is intended.

<tf.Variable 'UnreadVariable' shape=(162625, 20) dtype=float32, numpy=
array([[-8.86068121e-02,  3.03519852e-02,  9.68260393e-02, ...,
         2.43059799e-01,  2.88234019e+00,  1.21501759e-01],
       [-1.15319360e-02, -1.42626604e-03,  2.89392658e-03, ...,
        -1.32554248e-01,  1.79231837e-01,  1.26726385e-02],
       [-3.75783741e-02,  4.94230678e-03, -2.90561408e-01, ...,
        -6.49818033e-02,  1.67742372e+00, -1.23492397e-01],
       ...,
       [ 4.74854466e-03,  2.13176031e-02,  4.09641266e-02, ...,
         2.03448720e-02, -6.37830496e-02, -1.59771722e-02],
       [ 2.90319207e-04, -1.40437915e-03,  1.16859016e-03, ...,
        -4.86185960e-02,  3.90350615e-04, -3.25675402e-03],
       [ 8.52638696e-05,  3.06422031e-03, -2.82874302e-04, ...,
        -5.61612724e-05,  1.45954476e-03, -1.14599585e-04]],
      shape=(162625, 20), dtype=float32)>

In [231]:
# Evaluate the model with Hit@K over the held-out interactions (batched).
test_pairs = list(zip(test_data['user_id'], test_data['streamer_id']))

# A set makes the "was this pair seen in training?" check O(1) per test pair.
trainInteractionsSet = set(trainInteractions)

# Hits and totals, split by whether the test pair also occurs in the training data
# ("repeat") or not ("novel").
hitkNovel = 0
hitkNovelTotal = 0
hitkRepeat = 0
hitkRepeatTotal = 0

EVAL_BATCH_SIZE = 2**11
PROGRESS_EVERY_BATCHES = 125  # 125 * 2**11 = 256000 pairs between progress lines
num_test = len(test_pairs)
K_TOP = 1  # a test pair is a hit when its item is among the K_TOP best-scored items

print(f"Starting evaluation on {num_test} pairs...")

for batch_no, start in enumerate(range(0, num_test, EVAL_BATCH_SIZE), start=1):
    batch = test_pairs[start : start + EVAL_BATCH_SIZE]
    u_batch = [pair[0] for pair in batch]

    # User factors for this batch: (BatchSize, K)
    u_tensor = tf.convert_to_tensor(u_batch, dtype=tf.int64)
    batch_gamma_u = tf.nn.embedding_lookup(model.gammaU, u_tensor)

    # Scores against every item: (BatchSize, K) x (K, NumItems) -> (BatchSize, NumItems)
    batch_scores = tf.matmul(batch_gamma_u, model.gammaI, transpose_b=True)

    # Only the indices of the K_TOP best items are needed.
    top_indices = tf.math.top_k(batch_scores, k=K_TOP).indices.numpy()

    for (uid, iid), pred_items in zip(batch, top_indices):
        hit = int(iid in pred_items)
        if (uid, iid) in trainInteractionsSet:
            hitkRepeat += hit
            hitkRepeatTotal += 1
        else:
            hitkNovel += hit
            hitkNovelTotal += 1

    if batch_no % PROGRESS_EVERY_BATCHES == 0:
        print(f"Evaluated {min(start + EVAL_BATCH_SIZE, num_test)}...")

# Report NaN rather than crashing when a bucket received no test pairs.
novel_rate = hitkNovel / hitkNovelTotal if hitkNovelTotal else float('nan')
repeat_rate = hitkRepeat / hitkRepeatTotal if hitkRepeatTotal else float('nan')
print(f'Hit@{K_TOP} prediction accuracy for novel interactions:', novel_rate)
print(f'Hit@{K_TOP} prediction accuracy for repeat interactions:', repeat_rate)

Starting evaluation on 610347 pairs...
Evaluated 256000...
Evaluated 512000...
Hit@1 prediction accuracy for novel interactions: 0.0
Hit@1 prediction accuracy for repeat interactions: 1.71488837505752e-05
