In [143]:
import tensorflow as tf
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

# Load the raw interaction log. Column names are supplied explicitly, so the
# first line of the file is read as data rather than as a header.
data = pd.read_csv('100k_a.csv', names=['user_id', 'stream_id', 'streamer_username', 'time_start', 'time_stop'])
# Map every streamer username to a dense integer id (ids follow the order in
# which usernames first appear in the file).
data['streamer_id'], uniques = pd.factorize(data['streamer_username'])
# start indexing at 0 instead of 1 (assumes the raw user ids are 1-based)
data['user_id'] = data['user_id'] - 1
# Order the log chronologically so later cells can split it by time.
# A stable sort keeps rows that share a time_start in their original file
# order; the default algorithm gives no such guarantee, which would make
# "the last interaction of a user" ill-defined whenever there are ties.
data = data.sort_values('time_start', kind='mergesort').reset_index(drop=True)
data

Unnamed: 0,user_id,stream_id,streamer_username,time_start,time_stop,streamer_id
0,11586,33827617344,miltontpike1,0,5,1866
1,13762,33827755632,rekinss,0,1,6845
2,13762,33827475024,airon29,0,1,18105
3,13762,33827351664,tonytubo,0,1,4949
4,13762,33827169440,eliasmerk,0,1,47618
...,...,...,...,...,...,...
3051728,8975,34415693328,purple_hs,6147,6148,727
3051729,29709,34414041536,forsen,6147,6148,202
3051730,41485,34416038384,rekkles,6147,6148,2524
3051731,84280,34413422016,dlxowns45,6147,6148,2190


In [144]:
# Number of trailing distinct start times that form the test window.
split_point = 250

# `data` is expected to be ordered by time_start, so the tail of the
# unique() result holds the most recent start times.
start_times = data['time_start'].unique()
last_start_times = start_times[-split_point:]

# One boolean mask decides the side of the split for every row.
in_test_window = data['time_start'].isin(last_start_times)
train_data = data[~in_test_window]
test_data = data[in_test_window]

# The two parts must cover the full log exactly once.
assert len(train_data) + len(test_data) == len(data)

# Within the test window keep a single row per user: the last one in row order.
test_data = test_data.drop_duplicates(subset=['user_id'], keep='last')

test_data

Unnamed: 0,user_id,stream_id,streamer_username,time_start,time_stop,streamer_id
2925064,67470,34395278048,shrimp9710,5898,5899,2189
2925076,5530,34394569824,enchatin,5898,5900,21835
2925085,73313,34394440032,jukes,5898,5901,117
2925087,37797,34392164880,couragejd,5898,5900,103
2925104,8423,34393587008,trumpsc,5898,5900,6344
...,...,...,...,...,...,...
3051728,8975,34415693328,purple_hs,6147,6148,727
3051729,29709,34414041536,forsen,6147,6148,202
3051730,41485,34416038384,rekkles,6147,6148,2524
3051731,84280,34413422016,dlxowns45,6147,6148,2190


In [145]:
# Distinct streamer ids and user ids that occur in the training split,
# as plain Python lists (order of first appearance).
itemIDs = pd.unique(train_data['streamer_id']).tolist()
userIDs = pd.unique(train_data['user_id']).tolist()
(len(itemIDs), len(userIDs))

(159263, 99822)

In [146]:
# Every training row as a (user_id, streamer_id) tuple, duplicates included.
trainInteractions = list(
    zip(train_data['user_id'], train_data['streamer_id'])
)
# user_id -> set of streamer ids that user watched in the training split.
userInteractions = (
    train_data
    .groupby('user_id')['streamer_id']
    .apply(set)
    .to_dict()
)

In [153]:
### DO NOT DELETE THIS CODE BLOCK ###
# Embedding-table sizes used by MFModel: ids are 0-based, so each table needs
# (largest id seen in the training split) + 1 rows.
num_users, num_items = (
    int(train_data[column].max()) + 1 for column in ('user_id', 'streamer_id')
)


class MFModel(tf.keras.Model):
    """Matrix-factorisation recommender trained with a pairwise (BPR-style) loss.

    The compatibility of user ``u`` with item ``i`` is
    ``betaI[i] + dot(gammaU[u], gammaI[i])``.  The embedding tables are sized
    from the module-level ``num_users`` and ``num_items``.
    """

    def __init__(self, K, lamb):
        """Create randomly initialised parameters.

        Args:
            K: number of latent dimensions of the user/item embeddings.
            lamb: coefficient applied to the L2 penalty returned by ``reg``.
        """
        super(MFModel, self).__init__()
        # Per-item bias, shape (num_items,)
        self.betaI = tf.Variable(tf.random.normal([num_items], stddev=0.001))
        # User and item embeddings, shapes (num_users, K) and (num_items, K)
        self.gammaU = tf.Variable(tf.random.normal([num_users, K], stddev=0.001))
        self.gammaI = tf.Variable(tf.random.normal([num_items, K], stddev=0.001))
        # Regularization coefficient
        self.lamb = lamb

    def predict(self, u, i):
        """Return the score of a single (user index, item index) pair."""
        p = self.betaI[i] + tf.tensordot(self.gammaU[u], self.gammaI[i], 1)
        return p

    def recommend(self, u, N=10, availItems=None, userTime=None):
        """Return the top-N items for user ``u``.

        Args:
            u: user index. An index outside the user table falls back to the
                mean user embedding.
            N: maximum number of items to return.
            availItems: optional sequence of item ids to rank. When given,
                only these items are scored and the returned ids are drawn
                from it. Ids outside the item table are dropped because the
                model has no parameters for them.
            userTime: accepted for backward compatibility; it is not used.

        Returns:
            ``(item_ids, scores)`` as numpy arrays, best item first. Fewer
            than N entries are returned when fewer candidates exist.
        """
        # Static table sizes; named so they do not shadow the module globals.
        n_users = int(self.gammaU.shape[0])
        n_items = int(self.gammaI.shape[0])

        if u >= n_users or u < 0:
            # Unknown user: use the average user vector
            gamma_u = tf.reduce_mean(self.gammaU, axis=0)
        else:
            gamma_u = tf.nn.embedding_lookup(self.gammaU, u)

        if availItems is not None:
            candidates = np.asarray(availItems, dtype=np.int64).reshape(-1)
            in_table = (candidates >= 0) & (candidates < n_items)
            candidates = candidates[in_table]
            # Score only the candidate rows instead of the whole catalogue.
            cand_gamma = tf.gather(self.gammaI, candidates)
            cand_scores = tf.gather(self.betaI, candidates) + tf.reduce_sum(
                tf.multiply(cand_gamma, gamma_u), 1
            )
            top_N = tf.math.top_k(cand_scores, k=min(N, len(candidates)))
            # top_k indexes into `candidates`; map back to real item ids.
            return candidates[top_N.indices.numpy()], top_N.values.numpy()

        # (num_items, K) * (K,) summed over K -> (num_items,), same shape as betaI
        scores = self.betaI + tf.reduce_sum(tf.multiply(self.gammaI, gamma_u), 1)
        top_N = tf.math.top_k(scores, k=min(N, n_items))
        return top_N.indices.numpy(), top_N.values.numpy()

    # Regularizer
    def reg(self):
        """Return ``lamb`` times the summed ``tf.nn.l2_loss`` of all parameters."""
        return self.lamb * (tf.nn.l2_loss(self.betaI) +
                            tf.nn.l2_loss(self.gammaU) +
                            tf.nn.l2_loss(self.gammaI))

    def score(self, sampleU, sampleI):
        """Return the scores of paired users and items, one per position."""
        u = tf.convert_to_tensor(sampleU, dtype=tf.int64)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int64)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
        x_ui = beta_i + tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)
        return x_ui

    def call(self, sampleU, sampleI, sampleJ):
        """Return the mean pairwise loss ``-log sigmoid(x_ui - x_uj)``.

        ``sampleI`` holds the positive items and ``sampleJ`` the negatives for
        the users in ``sampleU``.
        """
        x_ui = self.score(sampleU, sampleI)
        x_uj = self.score(sampleU, sampleJ)
        # log_sigmoid is the numerically stable form of log(sigmoid(x)), which
        # underflows to log(0) = -inf for strongly negative differences.
        return -tf.reduce_mean(tf.math.log_sigmoid(x_ui - x_uj))

In [154]:
import random
# Negative sampling function with controlled repetition
def sample_negative(u, i, user_consumed_train, all_items, Prepeat=0.5):
    """Draw a negative item for the positive pair ``(u, i)``.

    With probability ``Prepeat`` the negative is taken from the items the user
    has consumed (a "repeat" negative); otherwise it is taken from the whole
    catalogue.

    Args:
        u: user id of the positive pair.
        i: item id of the positive pair; it is never returned as long as
            another candidate exists.
        user_consumed_train: mapping user id -> collection of consumed items.
        all_items: non-empty sequence of every item id that may be sampled.
        Prepeat: probability of sampling from the user's consumed items.

    Returns:
        The sampled negative item id.
    """
    # `random` drives every draw so that a single seed controls the sampler.
    if random.random() < Prepeat:
        # Exclude the positive up front: a user whose history is only `i`
        # would otherwise get `i` back, a pair that carries no training signal.
        repeats = [item for item in user_consumed_train.get(u, ()) if item != i]
        if repeats:
            return random.choice(repeats)
        # No usable repeat candidate (or unknown user): sample from the catalogue.

    negItem = random.choice(all_items)
    # Rejection-sample the positive away; a one-item catalogue has no alternative.
    while negItem == i and len(all_items) > 1:
        negItem = random.choice(all_items)

    return negItem

In [None]:
# Train the model
import random
# Seed every RNG that influences training (Python sampling, numpy sampling,
# TensorFlow parameter initialisation) so a rerun reproduces the same model.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# 20 latent dimensions, L2 coefficient 1e-5; Adam with learning rate 5e-4.
model = MFModel(K=20, lamb=0.00001)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0005)

def trainingStep(model, interactions, Nsamples=2**8):
    """Run one optimisation step on a freshly sampled mini-batch.

    Args:
        model: the MFModel being trained.
        interactions: sequence of (user_id, item_id) positive pairs to sample from.
        Nsamples: number of (user, positive, negative) triples in the batch.

    Returns:
        The regularised objective of this batch as a numpy scalar.

    Relies on the module-level ``optimizer``, ``userInteractions`` and
    ``itemIDs``.
    """
    # Batch construction is plain Python and needs no gradient tracking,
    # so it is done before the tape is opened.
    sampleU, sampleI, sampleJ = [], [], []
    for _ in range(Nsamples):
        u, i = random.choice(interactions)  # positive sample
        j = sample_negative(u, i, userInteractions, itemIDs, Prepeat=0.5)
        sampleU.append(u)
        sampleI.append(i)
        sampleJ.append(j)

    params = [model.betaI, model.gammaU, model.gammaI]
    with tf.GradientTape() as tape:
        loss = model.call(sampleU, sampleI, sampleJ)
        loss += model.reg()

    gradients = tape.gradient(loss, params)
    optimizer.apply_gradients(zip(gradients, params))

    return loss.numpy()

# 1000 optimisation steps; the objective is reported on every 100th step.
for i in range(1000):
    obj = trainingStep(model, trainInteractions)
    if (i + 1) % 100 != 0:
        continue
    # !s keeps the str() rendering of the numpy scalar.
    print(f"iteration {i + 1}, objective = {obj!s}")
    

iteration 10, objective = 0.6930116
iteration 20, objective = 0.693044
iteration 30, objective = 0.6928355
iteration 40, objective = 0.69252014
iteration 50, objective = 0.692701
iteration 60, objective = 0.69229233
iteration 70, objective = 0.6918272
iteration 80, objective = 0.69226575
iteration 90, objective = 0.692529
iteration 100, objective = 0.6920325
iteration 110, objective = 0.6919879
iteration 120, objective = 0.69125396
iteration 130, objective = 0.6911702
iteration 140, objective = 0.69068205
iteration 150, objective = 0.69074553
iteration 160, objective = 0.69118565
iteration 170, objective = 0.6903305
iteration 180, objective = 0.6905602
iteration 190, objective = 0.69029963
iteration 200, objective = 0.69038504
iteration 210, objective = 0.68919474
iteration 220, objective = 0.68979913
iteration 230, objective = 0.6913753
iteration 240, objective = 0.6886865
iteration 250, objective = 0.6891781
iteration 260, objective = 0.68960863
iteration 270, objective = 0.6896234
i

In [157]:
### SAVE THE MODEL ###
# Persist each learned parameter array as '<attribute name>.npy'.
for _param_name in ('betaI', 'gammaU', 'gammaI'):
    np.save(_param_name + '.npy', getattr(model, _param_name).numpy())
# Number of rows in the test split.
len(test_data)

44334

In [None]:
# In a new Python file or cell, recreate the model with the same sizes.
# K must match the saved arrays; lamb only affects reg(), so it is irrelevant
# for inference. The literal used at training time is repeated here instead of
# the undefined name LAMB, which raised NameError.
load_model = MFModel(K=20, lamb=0.00001)
# Overwrite the random initial parameters with the saved embeddings
load_model.betaI.assign(np.load('betaI.npy'))
load_model.gammaU.assign(np.load('gammaU.npy'))
load_model.gammaI.assign(np.load('gammaI.npy'))
# Now use load_model.score() or load_model.predict() for inference

In [147]:
# One row per distinct (streamer, start time) combination in the full log.
tempdf = data.loc[:, ['streamer_id', 'time_start']].drop_duplicates()
# time_start -> list of streamer ids that appear with that start time.
availMap = (
    tempdf
    .groupby('time_start')['streamer_id']
    .apply(list)
    .to_dict()
)

In [158]:
# Evaluating the model with hit@1
# One (user, true streamer, start time) triple per test row; all rows are used.
test_pairs = list(zip(test_data['user_id'], test_data['streamer_id'], test_data['time_start']))

# Debugging leftover: rows of one particular streamer. Not used by the evaluation.
specific_value = 161
filtered_rows = data[data['streamer_id'] == specific_value]


def _hit_rate(hits, total):
    """Return hits / total, or NaN when the bucket is empty."""
    return hits / total if total else float('nan')


# Hits and totals are tracked separately for pairs already seen in training
# ("repeat") and pairs that are new ("novel").
hitkNovel = 0
hitkNovelTotal = 0
hitkRepeat = 0
hitkRepeatTotal = 0

trainInteractionsSet = set(trainInteractions)
for processed, (uid, iid, user_time) in enumerate(test_pairs, start=1):
    # Rank only the streamers that appear in the log at this start time
    availItems = availMap[user_time]

    topItems, _ = model.recommend(uid, N=1, availItems=availItems, userTime=user_time)
    isHit = iid in topItems

    if (uid, iid) in trainInteractionsSet:
        hitkRepeat += int(isHit)
        hitkRepeatTotal += 1
    else:
        hitkNovel += int(isHit)
        hitkNovelTotal += 1

    if processed % 1000 == 0:
        print("Processed " + str(processed) + " test interactions.")


print('Hit@1 prediction accuracy for novel interactions:', _hit_rate(hitkNovel, hitkNovelTotal))
print('Hit@1 prediction accuracy for repeat interactions:', _hit_rate(hitkRepeat, hitkRepeatTotal))
print('Hit@1 prediction accuracy total', _hit_rate(hitkRepeat + hitkNovel, hitkRepeatTotal + hitkNovelTotal))

Processed 1000 test interactions.
Processed 2000 test interactions.
Processed 3000 test interactions.
Processed 4000 test interactions.
Processed 5000 test interactions.
Processed 6000 test interactions.
Processed 7000 test interactions.
Processed 8000 test interactions.
Processed 9000 test interactions.
Processed 10000 test interactions.
Processed 11000 test interactions.
Processed 12000 test interactions.
Processed 13000 test interactions.
Processed 14000 test interactions.
Processed 15000 test interactions.
Processed 16000 test interactions.
Processed 17000 test interactions.
Processed 18000 test interactions.
Processed 19000 test interactions.
Processed 20000 test interactions.
Processed 21000 test interactions.
Processed 22000 test interactions.
Processed 23000 test interactions.
Processed 24000 test interactions.
Processed 25000 test interactions.
Processed 26000 test interactions.
Processed 27000 test interactions.
Processed 28000 test interactions.
Processed 29000 test interact