In [43]:
import tensorflow as tf
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

data = pd.read_csv('100k_a.csv', names=['user_id', 'stream_id', 'streamer_username', 'time_start', 'time_stop'])
# Create train and test splits temporally sorted by time_start
data['streamer_id'], uniques = pd.factorize(data['streamer_username'])
# start indexing at 0 instead of 1
data['user_id'] = data['user_id'] - 1
data = data.sort_values('time_start').reset_index(drop=True)
data

Unnamed: 0,user_id,stream_id,streamer_username,time_start,time_stop,streamer_id
0,11586,33827617344,miltontpike1,0,5,1866
1,13762,33827755632,rekinss,0,1,6845
2,13762,33827475024,airon29,0,1,18105
3,13762,33827351664,tonytubo,0,1,4949
4,13762,33827169440,eliasmerk,0,1,47618
...,...,...,...,...,...,...
3051728,8975,34415693328,purple_hs,6147,6148,727
3051729,29709,34414041536,forsen,6147,6148,202
3051730,41485,34416038384,rekkles,6147,6148,2524
3051731,84280,34413422016,dlxowns45,6147,6148,2190


In [80]:
split_point = int(len(data) * 0.8)
# split_point = 100000 # for reproducibility
shuffled_data = data.sample(frac=1, random_state=42).reset_index()

train_data = data.iloc[:split_point]
test_data = data.iloc[split_point:]
len(train_data), len(test_data)

(2441386, 610347)

In [45]:
itemIDs = train_data['streamer_id'].unique().tolist()
userIDs = train_data['user_id'].unique().tolist()
len(userIDs)

39708

In [71]:
num_users = int(data['user_id'].max()) + 1
num_items = int(data['streamer_id'].max()) + 1

class MFModel(tf.keras.Model):
    def __init__(self, K, lamb):
        super(MFModel, self).__init__()
        # Initialize variables
        self.betaI = tf.Variable(tf.random.normal([num_items],stddev=0.001))
        self.gammaU = tf.Variable(tf.random.normal([num_users, K],stddev=0.001))
        self.gammaI = tf.Variable(tf.random.normal([num_items, K],stddev=0.001))
        # Regularization coefficient
        self.lamb = lamb

    # Prediction for a single instance
    def predict(self, u, i):
        p = self.betaI[i] + tf.tensordot(self.gammaU[u], self.gammaI[i], 1)
        return p
    
    def recommend(self, u, N=10):
        u = tf.convert_to_tensor(u, dtype=tf.int64)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        
        # Compute dot product: (Items x K) . (K x 1) -> (Items x 1)
        interaction_scores = tf.matmul(self.gammaI, tf.expand_dims(gamma_u, axis=-1))
        
        # Squeeze to (Items,) so it matches betaI shape
        interaction_scores = tf.squeeze(interaction_scores, axis=-1)
        
        # Now shapes match: (Items,) + (Items,)
        scores = self.betaI + interaction_scores
        
        top_N = tf.math.top_k(scores, k=N)
        return top_N.indices.numpy(), top_N.values.numpy()

    # Regularizer
    def reg(self):
        return self.lamb * (tf.nn.l2_loss(self.betaI) +\
                            tf.nn.l2_loss(self.gammaU) +\
                            tf.nn.l2_loss(self.gammaI))
    
    def score(self, sampleU, sampleI):
        u = tf.convert_to_tensor(sampleU, dtype=tf.int64)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int64)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
        x_ui = beta_i + tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)
        return x_ui

    def call(self, sampleU, sampleI, sampleJ):
        x_ui = self.score(sampleU, sampleI)
        x_uj = self.score(sampleU, sampleJ)
        return -tf.reduce_mean(tf.math.log(tf.math.sigmoid(x_ui - x_uj)))


# Negative sampling function with controlled repetition
def sample_negative(u, i, user_consumed_train, all_items, Prepeat=0.5):
    if np.random.rand() < Prepeat:
        candidates = list(user_consumed_train[u] - {i})
        # If this is the only streamer the user has consumed, fall back to random sampling
        if not candidates:
            candidates = list(all_items - {i})
    else:
        candidates = list(all_items - {i})
    return np.random.choice(candidates)

In [53]:
# Pairs of (user_id, streamer_id) in the training data
trainInteractions = list(zip(train_data['user_id'], train_data['streamer_id']))
# For each user id, this gets the set of consumed item ids (streamers they watched)
user_consumed_items = train_data.groupby('user_id')['streamer_id'].apply(set).to_dict()
len(trainInteractions)

100000

In [None]:
# Train the model
import random
model = MFModel(20, 0.0001)
optimizer = tf.keras.optimizers.Adam(0.01)
loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)

@tf.function
def trainingStep(model, interactions):
    Nsamples = 50000
    with tf.GradientTape() as tape:
        sampleU, sampleI, sampleJ = [], [], []
        for c in range(Nsamples):
            u,i = random.choice(interactions) # positive sample
            j = sample_negative(u, i, user_consumed_items, set(itemIDs), Prepeat=0.5)
            sampleU.append(u)
            sampleI.append(i)
            sampleJ.append(j)

        loss = model.call(sampleU,sampleI,sampleJ)
        loss += model.reg()
    
    gradients = tape.gradient(loss, [model.betaI, model.gammaU, model.gammaI])
    optimizer.apply_gradients(zip(gradients, [model.betaI, model.gammaU, model.gammaI]))
    return loss

for epoch in range(10):
    obj = trainingStep(model, trainInteractions)
    print(f"Epoch {epoch+1}: Objective={obj}")
    

Epoch 1: Objective=0.7201906442642212
Epoch 2: Objective=1.5532201528549194
Epoch 3: Objective=0.7863692045211792
Epoch 4: Objective=1.155800223350525
Epoch 5: Objective=1.1351368427276611
Epoch 6: Objective=0.8244389295578003
Epoch 7: Objective=0.7779063582420349
Epoch 8: Objective=0.9378997087478638
Epoch 9: Objective=0.9549134969711304
Epoch 10: Objective=0.8131558895111084


In [76]:
# Train the model with batch
import random
model = MFModel(20, 0.0001)
optimizer = tf.keras.optimizers.Adam(0.1)
loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)
interactionsArr = np.array(trainInteractions)

def samplePositiveBatch(interactionsArr, batch_size):
    indices = np.random.choice(len(interactionsArr), batch_size)
    batch_pos = interactionsArr[indices]
    sampleU = batch_pos[:,0]
    sampleI = batch_pos[:,1]

    return sampleU, sampleI

def sampleNegativeBatch(sampleU, sampleI, user_consumed_train, all_items, Prepeat=0.5):
    batch_size = len(sampleU)
    sampleJ = np.zeros(batch_size, dtype=np.int64)

    for k in range(batch_size):
        u = sampleU[k]
        i = sampleI[k]
        if np.random.rand() < Prepeat:
            candidates = list(user_consumed_train[u] - {i})
            # If this is the only streamer the user has consumed, fall back to random sampling
            if not candidates:
                candidates = list(all_items - {i})
        else:
            candidates = list(all_items - {i})
        sampleJ[k] = np.random.choice(candidates)

    return sampleJ

@tf.function
def trainStepBatch(modela, sampleU, sampleI, sampleJ):
    with tf.GradientTape() as tape:
        loss = modela.call(sampleU, sampleI, sampleJ)
        loss += modela.reg()
    
    gradients = tape.gradient(loss, [modela.betaI, modela.gammaU, modela.gammaI])
    optimizer.apply_gradients(zip(gradients, [modela.betaI, modela.gammaU, modela.gammaI]))
    return loss

for epoch in range(10):
    batch_size = 50000
    sampleU, sampleI = samplePositiveBatch(interactionsArr, batch_size)
    sampleJ = sampleNegativeBatch(sampleU, sampleI, user_consumed_items, set(itemIDs), Prepeat=0.5)
    loss = trainStepBatch(model, sampleU, sampleI, sampleJ)
    print(f"Epoch {epoch+1}: Objective={loss.numpy()}")

Epoch 1: Objective=0.6934216022491455
Epoch 2: Objective=0.6776120662689209
Epoch 3: Objective=0.6664932370185852
Epoch 4: Objective=0.6596283316612244
Epoch 5: Objective=0.6521666049957275
Epoch 6: Objective=0.6462004780769348
Epoch 7: Objective=0.6402756571769714
Epoch 8: Objective=0.6349462270736694
Epoch 9: Objective=0.6291728615760803
Epoch 10: Objective=0.6262641549110413


In [77]:
### SAVE THE MODEL ###\
# After training, save embeddings
np.save('betaI.npy', model.betaI.numpy())
np.save('gammaU.npy', model.gammaU.numpy())
np.save('gammaI.npy', model.gammaI.numpy())

In [78]:
# In a new Python file or cell, recreate model with same sizes
load_model = MFModel(K=20, lamb=0.01)  # Same K and lamb, but lamb not needed for inference
# Assign saved embeddings
load_model.betaI.assign(np.load('betaI.npy'))
load_model.gammaU.assign(np.load('gammaU.npy'))
load_model.gammaI.assign(np.load('gammaI.npy'))
# Now use load_model.score() or load_model.predict() for inference

<tf.Variable 'UnreadVariable' shape=(162625, 20) dtype=float32, numpy=
array([[ 2.1511354e-03,  1.2093936e-03, -2.5088901e-03, ...,
        -4.5883367e-03, -4.5958390e-03,  4.6573300e-03],
       [ 7.4074338e-03, -2.3072297e-03, -3.6806138e-03, ...,
         2.2531627e-04, -3.7314799e-03,  1.8882528e-03],
       [-2.3223944e-03,  9.5738182e-03,  9.5977564e-04, ...,
        -8.7231342e-03, -5.4972470e-03,  8.9788400e-03],
       ...,
       [ 2.2710092e-05, -5.5188179e-04, -8.6150470e-04, ...,
        -8.6693431e-04,  5.2702997e-04,  9.5913078e-05],
       [ 1.1642183e-03,  3.2924878e-04, -1.8543992e-04, ...,
        -1.4152072e-04, -1.2926928e-04,  6.7921262e-04],
       [ 1.4613352e-03, -2.7657047e-04,  5.7279586e-04, ...,
        -5.8601319e-04,  3.3536420e-04, -5.8109596e-05]],
      shape=(162625, 20), dtype=float32)>

In [79]:
# Evaluating the model with hit@1
test_pairs = list(zip(test_data['user_id'], test_data['streamer_id']))
test_pairs = test_pairs[:100000]  # limit to first 1000 for faster evaluation

hit1Novel = 0
hit1NovelTotal = 0
hit1Repeat = 0
hit1RepeatTotal = 0
for uid, iid in test_pairs:
    topItem, score = load_model.recommend(uid, N=1)
    print('Top recommended item for user', uid, 'is', topItem)

    if (uid, iid) in trainInteractions:
      if topItem[0] == iid:
        hit1Repeat += 1
      hit1RepeatTotal += 1
    else:
      if topItem[0] == iid:
        hit1Novel += 1
      hit1NovelTotal += 1

print('Hit@1 prediction accuracy for novel interactions:', hit1Novel / hit1NovelTotal)
print('Hit@1 prediction accuracy for repeat interactions:', hit1Repeat / hit1RepeatTotal)

Top recommended item for user 56177 is [281]
Top recommended item for user 14957 is [52]
Top recommended item for user 13148 is [281]
Top recommended item for user 39285 is [281]
Top recommended item for user 95847 is [281]
Top recommended item for user 44957 is [281]
Top recommended item for user 6420 is [281]
Top recommended item for user 2944 is [281]
Top recommended item for user 90918 is [281]
Top recommended item for user 61267 is [2616]
Top recommended item for user 85636 is [2616]
Top recommended item for user 13148 is [281]
Top recommended item for user 32667 is [281]
Top recommended item for user 49936 is [281]
Top recommended item for user 12886 is [2616]
Top recommended item for user 23023 is [281]
Top recommended item for user 26733 is [281]
Top recommended item for user 8081 is [281]
Top recommended item for user 80744 is [281]
Top recommended item for user 93550 is [281]
Top recommended item for user 4740 is [281]
Top recommended item for user 28136 is [281]
Top recommen