In [11]:
pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [464]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy as np
import string
import random
import string
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
import tensorflow as tf

In [465]:
def parse(path):
    g = gzip.open(path, 'r')
    for l in g:
        yield eval(l)

In [466]:
userIDs = {}
itemIDs = {}
interactions = []

for d in parse("/Users/zhiqiaogong/Projects/JupyterNotebook/cse258/hw3/train.json.gz"):
    u = d['userID']
    i = d['gameID']
    r = d['hours_transformed']
    if not u in userIDs: userIDs[u] = len(userIDs)
    if not i in itemIDs: itemIDs[i] = len(itemIDs)
    interactions.append((u,i,r))

In [467]:
random.shuffle(interactions)
len(interactions)

175000

In [468]:
nTrain = int(len(interactions) * 0.9)
nTest = len(interactions) - nTrain
interactionsTrain = interactions[:nTrain]
interactionsTest = interactions[nTrain:]

In [469]:
itemsPerUser = defaultdict(list)
usersPerItem = defaultdict(list)
for u,i,r in interactionsTrain:
    itemsPerUser[u].append(i)
    usersPerItem[i].append(u)

In [470]:
mu = sum([r for _,_,r in interactionsTrain]) / len(interactionsTrain)

In [619]:
K=1
lamb_beta=0.00001
lamb_gamma=0.0005
learning_rate=0.1

# optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=learning_rate, decay = 0.001)
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.1,
    decay_steps=50,
    decay_rate=0.9)

optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

class LatentFactorModel(tf.keras.Model):
    def __init__(self, mu, K, lamb_beta, lamb_gamma):
        super(LatentFactorModel, self).__init__()
        # Initialize to average
        self.alpha = tf.Variable(mu)
        # Initialize to small random values
        # self.betaU = tf.Variable(tf.random.normal([len(userIDs)],stddev=0.001))
        # self.betaI = tf.Variable(tf.random.normal([len(itemIDs)],stddev=0.001))
        # Average rating per user
        user_avg = {user: np.mean([r for _, u, r in interactions if u == user]) for user in userIDs}
        # Average rating per item
        item_avg = {item: np.mean([r for i, _, r in interactions if i == item]) for item in itemIDs}

        # Initialize betaU and betaI with average ratings
        self.betaU = tf.Variable([np.mean([r for _, u, r in interactions if u == user]) if user in userIDs else mu for user in range(len(userIDs))])
        self.betaI = tf.Variable([np.mean([r for i, _, r in interactions if i == item]) if item in itemIDs else mu for item in range(len(itemIDs))])


        self.gammaU = tf.Variable(tf.random.normal([len(userIDs), K], stddev=0.001))  # Smaller stddev
        self.gammaI = tf.Variable(tf.random.normal([len(itemIDs), K], stddev=0.001))
        self.lamb_beta = lamb_beta
        self.lamb_gamma = lamb_gamma

    # Prediction for a single instance (useful for evaluation)
    def predict(self, u, i):
        p = self.alpha + self.betaU[u] + self.betaI[i] +\
            tf.tensordot(self.gammaU[u], self.gammaI[i], 1)
        return p

    # Regularizer
    def reg_beta(self):
        return self.lamb_beta * (tf.reduce_sum(self.betaU**2) + tf.reduce_sum(self.betaI**2))
    def reg_gamma(self):
        return self.lamb_gamma * (tf.reduce_sum(self.gammaU**2) + tf.reduce_sum(self.gammaI**2))

    # def reg(self):
    #     return self.lamb * (tf.reduce_sum(self.betaU**2) +\
    #                         tf.reduce_sum(self.betaI**2) +\
    #                         tf.reduce_sum(self.gammaU**2) +\
    #                         tf.reduce_sum(self.gammaI**2))
    
    # Prediction for a sample of instances
    def predictSample(self, sampleU, sampleI):
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        beta_u = tf.nn.embedding_lookup(self.betaU, u)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
        pred = self.alpha + beta_u + beta_i +\
               tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)
        return pred
    
    # Loss
    # def call(self, sampleU, sampleI, sampleR):
    #     pred = self.predictSample(sampleU, sampleI)
    #     r = tf.convert_to_tensor(sampleR, dtype=tf.float32)
    #     return tf.nn.l2_loss(pred - r) / len(sampleR)
    def call(self, sampleU, sampleI, sampleR):
        pred = self.predictSample(sampleU, sampleI)
        r = tf.convert_to_tensor(sampleR, dtype=tf.float32)
        return tf.nn.l2_loss(pred - r) / len(sampleR) + self.reg_beta() + self.reg_gamma()

###########
modelLFM = LatentFactorModel(mu, K, lamb_beta, lamb_gamma)
def trainingStep(model, train_data, val_data, optimizer):
    def map_ids(data, userIDs, itemIDs):
        return [(userIDs[u], itemIDs[i], r) for u, i, r in data]

    # Map IDs for training data
    train_data_mapped = map_ids(train_data, userIDs, itemIDs)
    sampleU_train, sampleI_train, sampleR_train = zip(*train_data_mapped)
    sampleU_train = tf.convert_to_tensor(sampleU_train, dtype=tf.int32)
    sampleI_train = tf.convert_to_tensor(sampleI_train, dtype=tf.int32)
    sampleR_train = tf.convert_to_tensor(sampleR_train, dtype=tf.float32)


    with tf.GradientTape() as tape:
        train_loss = model(sampleU_train, sampleI_train, sampleR_train)
        # train_loss += model.reg()
    # gradients = tape.gradient(train_loss, model.trainable_variables)
    # optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    gradients = tape.gradient(train_loss, model.trainable_variables)
    clipped_gradients = [tf.clip_by_value(grad, -1.0, 1.0) for grad in gradients]
    optimizer.apply_gradients(zip(clipped_gradients, model.trainable_variables))

    train_loss = train_loss.numpy()

    val_data_mapped = map_ids(val_data, userIDs, itemIDs)
    sampleU_val, sampleI_val, sampleR_val = zip(*val_data_mapped)
    sampleU_val = tf.convert_to_tensor(sampleU_val, dtype=tf.int32)
    sampleI_val = tf.convert_to_tensor(sampleI_val, dtype=tf.int32)
    sampleR_val = tf.convert_to_tensor(sampleR_val, dtype=tf.float32)

    # val_loss = model(sampleU_val, sampleI_val, sampleR_val).numpy() + model.reg().numpy()
    val_loss = model(sampleU_val, sampleI_val, sampleR_val)
    
    return train_loss, val_loss



In [620]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping

In [621]:

modelLFM = LatentFactorModel(mu, K, lamb_beta, lamb_gamma)


train_data, val_data = train_test_split(interactionsTrain, test_size=0.1, random_state=42)



# EarlyStopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=40, restore_best_weights=True)

# Variables to keep track of best loss and patience counter
best_val_loss = float('inf')
patience_counter = 0
pvalloss = float('inf')
for epoch in range(2000):
    train_loss, val_loss = trainingStep(modelLFM, train_data, val_data, optimizer)

    if val_loss>pvalloss:
        break
    # # Early stopping logic
    # if val_loss < best_val_loss:
    #     best_val_loss = val_loss
    #     patience_counter = 0
    # else:
    #     patience_counter += 1
    if epoch%10 == 0:
        print(f"Epoch {epoch+1}, Training Loss: {train_loss}, Validation Loss: {val_loss}")

    # if patience_counter >= early_stopping.patience:
    #     print("Early stopping triggered")
    #     break
        
    pvalloss = val_loss

# batch_sizes = [100, 500, 1000]

# for batch_size in batch_sizes:
#     print(f"Training with batch size: {batch_size}")

#     # Reset the model for each batch size
#     modelLFM = LatentFactorModel(mu, K, lam)
#     optimizer = tf.keras.optimizers.Adam(learning_rate)

#     best_val_loss = float('inf')
#     patience_counter = 0

#     for epoch in range(100):
#         # Modify trainingStep to use a portion of data based on the current batch size
#         train_loss, val_loss = trainingStep(modelLFM, train_data, val_data, userIDs, itemIDs, optimizer, batch_size)

#         # Early stopping logic
#         if val_loss < best_val_loss:
#             best_val_loss = val_loss
#             patience_counter = 0
#         else:
#             patience_counter += 1

#         print(f"Epoch {epoch+1}, Training Loss: {train_loss}, Validation Loss: {val_loss}")

#         if patience_counter >= early_stopping.patience:
#             print("Early stopping triggered")
#             break

#     # Evaluate the model's performance for the current batch size
#     mse_test = calculate_mse(modelLFM, interactionsTest)
#     print(f"Test MSE for batch size {batch_size}: {mse_test}")

Epoch 1, Training Loss: 31.534032821655273, Validation Loss: 29.19029998779297
Epoch 11, Training Loss: 13.331841468811035, Validation Loss: 11.989518165588379
Epoch 21, Training Loss: 4.904242515563965, Validation Loss: 4.509769439697266
Epoch 31, Training Loss: 2.559687614440918, Validation Loss: 2.5301759243011475
Epoch 41, Training Loss: 2.214419364929199, Validation Loss: 2.2481439113616943
Epoch 51, Training Loss: 1.7829275131225586, Validation Loss: 1.8511751890182495
Epoch 61, Training Loss: 1.643142580986023, Validation Loss: 1.7466673851013184
Epoch 71, Training Loss: 1.586799144744873, Validation Loss: 1.7022171020507812
Epoch 81, Training Loss: 1.547845482826233, Validation Loss: 1.6771260499954224
Epoch 91, Training Loss: 1.531005859375, Validation Loss: 1.6653110980987549
Epoch 101, Training Loss: 1.517656922340393, Validation Loss: 1.6539571285247803
Epoch 111, Training Loss: 1.5092713832855225, Validation Loss: 1.6467496156692505
Epoch 121, Training Loss: 1.502494931221

In [622]:
u,i,r = interactionsTest[0]
print(u,i,r,modelLFM.predict(userIDs[u], itemIDs[i]).numpy())
def calculate_mse(model, interactions):
    """
    Calculate the mean squared error for the given interactions.

    Args:
    model: The LatentFactorModel instance.
    interactions: A list of tuples (user, item, rating).

    Returns:
    float: The mean squared error.
    """
    total_squared_error = 0
    count = 0

    for u, i, r in interactions:
        # Convert user and item to the indices used in the model
        u_index = userIDs[u]
        i_index = itemIDs[i]

        # Predict the rating
        predicted_rating = model.predict(u_index, i_index)
        
        # Compute squared error
        squared_error = tf.square(predicted_rating - r)
        total_squared_error += squared_error
        count += 1

    # Compute mean squared error
    mse = total_squared_error / count
    return mse.numpy()  # Convert to a regular Python number

# Example usage
mse_test = calculate_mse(modelLFM, interactionsTest)
print(f"Test MSE: {mse_test}")

u42222023 g64596037 5.815063017192867 3.181687
Test MSE: 3.0494325160980225


In [488]:
#3.32 0.001 75, 0.00001

In [None]:
#3.22 0.1 75, 0.0001
#3.224193811416626 0.1 65, 0.0001
#3.224165439605713 0.1 55, 0.0001

In [None]:
#3.2026658058166504 
# K=1
# lamb_beta=0.0001
# lamb_gamma=0.0001
# learning_rate=0.1

In [None]:
# 3.17633056640625
# K=1
# lamb_beta=0.0001
# lamb_gamma=0.001
# learning_rate=0.1

In [None]:
# 3.1763246059417725
# K=1
# lamb_beta=0.0001
# lamb_gamma=0.0005
# learning_rate=0.1

In [None]:
# 3.081282377243042
# K=1
# lamb_beta=0.00005
# lamb_gamma=0.0005
# learning_rate=0.1

In [None]:
# 3.049272060394287
# K=1
# lamb_beta=0.00001
# lamb_gamma=0.0005
# learning_rate=0.1

In [565]:
predictions = open("/Users/zhiqiaogong/Projects/JupyterNotebook/cse258/assignment1/test/predictions_Hours.csv", 'w')
for l in open("/Users/zhiqiaogong/Projects/JupyterNotebook/cse258/hw3/pairs_Hours.csv"):
    if l.startswith("userID"):
        predictions.write(l)
        continue
    u,g = l.strip().split(',')
    # bu = 0
    # bi = 0
    # if u in betaU:
    #     bu = betaU[u]
    # if g in betaI:
    #     bi = betaI[g]
    p = modelLFM.predict(userIDs[u], itemIDs[g]).numpy()
    _ = predictions.write(u + ',' + g + ',' + str(p) + '\n')

predictions.close()

In [261]:
for i in range(500):
    obj = trainingStep(modelLFM, interactionsTrain)
    if (i % 10 == 9): print("iteration " + str(i+1) + ", objective = " + str(obj))


TypeError: trainingStep() missing 2 required positional arguments: 'val_data' and 'optimizer'

In [15]:
def assertFloat(x):
    assert type(float(x)) == float

def assertFloatList(items, N):
    assert len(items) == N
    assert [type(float(x)) for x in items] == [float]*N

In [16]:
def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

In [17]:
def readJSON(path):
    f = gzip.open(path, 'rt')
    f.readline()
    for l in f:
        d = eval(l)
        u = d['userID']
        g = d['gameID']
        yield u,g,d

In [18]:
answers = {}

In [19]:
# Some data structures that will be useful

In [20]:
allHours = []
for l in readJSON("/Users/zhiqiaogong/Projects/JupyterNotebook/cse258/hw3/train.json.gz"):
    allHours.append(l)

In [21]:
hoursTrain = allHours[:165000]
hoursValid = allHours[165000:]
hoursPerUser = defaultdict(list)
hoursPerItem = defaultdict(list)
for u,g,d in hoursTrain:
    r = d['hours_transformed']
    hoursPerUser[u].append((g,r))
    hoursPerItem[g].append((u,r))

In [22]:
##################################################
# Play prediction                                #
##################################################

In [23]:
#bpr

In [24]:
userSet = set()
gameSet = set()
playedSet = set()

for u,g,d in allHours:
    userSet.add(u)
    gameSet.add(g)
    playedSet.add((u,g))

lUserSet = list(userSet)
lGameSet = list(gameSet)

notPlayedValid = set()
for u,g,d in hoursValid:
    g = random.choice(lGameSet)
    while (u,g) in playedSet or (u,g) in notPlayedValid:
        g = random.choice(lGameSet)
    notPlayedValid.add((u,g))

playedValid = set()
for u,g,r in hoursValid:
    playedValid.add((u,g))

In [42]:
user_ids = {u: i for i, u in enumerate(userSet)}
game_ids = {g: i for i, g in enumerate(gameSet)}

num_users = len(user_ids)
num_games = len(game_ids)

# Initialize the user-item matrix with zeros
user_item_matrix = np.zeros((num_users, num_games))

for u, g, _ in hoursTrain:
    user_item_matrix[user_ids[u], game_ids[g]] = 1

# BPR Model
class BPR:
    def __init__(self, num_users, num_items, num_factors=10):
        self.user_factors = np.random.normal(0, 0.1, (num_users, num_factors))
        self.item_factors = np.random.normal(0, 0.1, (num_items, num_factors))
    
    def predict(self, user, item):
        return np.dot(self.user_factors[user], self.item_factors[item])
    
    def train(self, matrix, epochs=100, learning_rate=0.01, lambda_reg=0.01):
         # Initialize Adam optimizer parameters
        beta1, beta2, epsilon = 0.9, 0.999, 1e-8
        m_user, v_user = np.zeros(self.user_factors.shape), np.zeros(self.user_factors.shape)
        m_item, v_item = np.zeros(self.item_factors.shape), np.zeros(self.item_factors.shape)
        t = 0
        for _ in range(epochs):
            t += 1
            user = np.random.randint(num_users)
            pos_items = np.where(matrix[user] == 1)[0]
            neg_items = self.get_neg_item_candidates(user, matrix, pos_items)

            if not neg_items:
                continue

            pos_item = np.random.choice(pos_items)
            neg_item = np.random.choice(neg_items)

            
            
            # Calculate error and gradients
            x_uij = self.predict(user, pos_item) - self.predict(user, neg_item)
            exp_x = np.exp(-x_uij)
            loss = exp_x / (1 + exp_x)

            user_gradient = (self.item_factors[pos_item] - self.item_factors[neg_item]) * loss - lambda_reg * self.user_factors[user]
            item_pos_gradient = self.user_factors[user] * loss - lambda_reg * self.item_factors[pos_item]
            item_neg_gradient = -self.user_factors[user] * loss - lambda_reg * self.item_factors[neg_item]

            # Update user and item factors using Adam optimizer
            m_user = beta1 * m_user + (1 - beta1) * user_gradient
            v_user = beta2 * v_user + (1 - beta2) * (user_gradient ** 2)
            m_item[pos_item] = beta1 * m_item[pos_item] + (1 - beta1) * item_pos_gradient
            v_item[pos_item] = beta2 * v_item[pos_item] + (1 - beta2) * (item_pos_gradient ** 2)
            m_item[neg_item] = beta1 * m_item[neg_item] + (1 - beta1) * item_neg_gradient
            v_item[neg_item] = beta2 * v_item[neg_item] + (1 - beta2) * (item_neg_gradient ** 2)

            m_user_corr = m_user / (1 - beta1 ** t)
            v_user_corr = v_user / (1 - beta2 ** t)
            m_item_corr = m_item / (1 - beta1 ** t)
            v_item_corr = v_item / (1 - beta2 ** t)

            self.user_factors -= learning_rate * m_user_corr / (np.sqrt(v_user_corr) + epsilon)
            self.item_factors[pos_item] -= learning_rate * m_item_corr[pos_item] / (np.sqrt(v_item_corr[pos_item]) + epsilon)
            self.item_factors[neg_item] -= learning_rate * m_item_corr[neg_item] / (np.sqrt(v_item_corr[neg_item]) + epsilon)

    def get_neg_item_candidates(self, user, matrix, pos_items):
        # Implement a logic to select negative item candidates
        # Example: Choose items not interacted with by the user and are popular among other users
        # Return a list of item indices
        all_items = set(range(matrix.shape[1]))
        neg_candidates = list(all_items - set(pos_items))
        return neg_candidates

# Train the BPR model
bpr_model = BPR(num_users, num_games)
bpr_model.train(user_item_matrix)

# Example prediction (You can replace these with actual user and game IDs from your data)
user_id = 0  # example user index
game_id = 10  # example game index
prediction = bpr_model.predict(user_id, game_id)
print("Prediction score:", prediction)

Prediction score: -0.017762354532299417


In [43]:
# Function to predict for a user-item pair
def predict_bpr(bpr_model, user_id, item_id):
    return bpr_model.predict(user_ids[user_id], game_ids[item_id])

# Calculate predictions for the validation set and compute MSE
def calculate_mse(bpr_model, played_valid, not_played_valid):
    mse = 0
    count = 0

    # For pairs in playedValid, the actual interaction is 1
    for (u, g) in played_valid:
        predicted_score = predict_bpr(bpr_model, u, g)
        mse += (predicted_score - 1) ** 2  # actual interaction is 1
        count += 1

    # For pairs in notPlayedValid, the actual interaction is 0
    for (u, g) in not_played_valid:
        predicted_score = predict_bpr(bpr_model, u, g)
        mse += (predicted_score - 0) ** 2  # actual interaction is 0
        count += 1

    return mse / count if count > 0 else 0

# Calculate MSE on the validation set
mse_validation = calculate_mse(bpr_model, playedValid, notPlayedValid)
print("MSE on Validation Set:", mse_validation)


MSE on Validation Set: 0.5037769518840333


In [44]:
def calculate_accuracy(bpr_model, played_valid, not_played_valid, threshold=0.5):
    correct_predictions = 0
    total_predictions = len(played_valid) + len(not_played_valid)

    # Check predictions for playedValid (should be predicted as played, i.e., score > threshold)
    for (u, g) in played_valid:
        predicted_score = predict_bpr(bpr_model, u, g)
        if predicted_score > threshold:
            correct_predictions += 1

    # Check predictions for notPlayedValid (should be predicted as not played, i.e., score <= threshold)
    for (u, g) in not_played_valid:
        predicted_score = predict_bpr(bpr_model, u, g)
        if predicted_score <= threshold:
            correct_predictions += 1

    accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
    return accuracy

# Calculate Accuracy on the validation set
accuracy_validation = calculate_accuracy(bpr_model, playedValid, notPlayedValid)
print("Accuracy on Validation Set:", accuracy_validation)

Accuracy on Validation Set: 0.5


In [45]:
# List of hyperparameters to try
factor_options = [10, 20, 30]  # Number of latent factors
learning_rate_options = [0.005, 0.01, 0.05]
regularization_options = [0.001, 0.01, 0.1]

best_accuracy = 0
best_params = {}

for factors in factor_options:
    for lr in learning_rate_options:
        for reg in regularization_options:
            # Initialize and train the BPR model
            bpr_model = BPR(num_users, num_games, num_factors=factors)
            bpr_model.train(user_item_matrix, learning_rate=lr, lambda_reg=reg)

            print(current_accuracy)
            # Evaluate the model
            current_accuracy = calculate_accuracy(bpr_model, playedValid, notPlayedValid)
            
            # Update best parameters if current model is better
            if current_accuracy > best_accuracy:
                best_accuracy = current_accuracy
                best_params = {'factors': factors, 'learning_rate': lr, 'regularization': reg}

print("Best Accuracy:", best_accuracy)
print("Best Hyperparameters:", best_params)


0.47554755475547555
0.5
0.5
0.5
0.5
0.5
0.5
0.49884988498849886
0.4986998699869987
0.4881988198819882
0.5
0.5
0.5
0.5
0.5
0.5
0.4936993699369937
0.49064906490649063
0.46744674467446745
0.5
0.5
0.5
0.5
0.5
0.5
0.49004900490049
0.49744974497449745
Best Accuracy: 0.5001500150015001
Best Hyperparameters: {'factors': 30, 'learning_rate': 0.05, 'regularization': 0.1}


In [375]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    if denom > 0:
        return numer/denom
    return 0

In [376]:
userSet = set()
gameSet = set()
playedSet = set()

for u,g,d in allHours:
    userSet.add(u)
    gameSet.add(g)
    playedSet.add((u,g))

lUserSet = list(userSet)
lGameSet = list(gameSet)

notPlayedValid = set()
for u,g,d in hoursValid:
    g = random.choice(lGameSet)
    while (u,g) in playedSet or (u,g) in notPlayedValid:
        g = random.choice(lGameSet)
    notPlayedValid.add((u,g))

playedValid = set()
for u,g,r in hoursValid:
    playedValid.add((u,g))

In [377]:
popThreshold = 2/3
simPlayedThreshold = 0.0235

In [378]:
thpop = [i/30 for i in range(15,30)]
mses = []
for t in thpop:
    mses.append(popTh(t))

TypeError: popTh() missing 1 required positional argument: 'jth'

In [None]:
thjaccards = [1*i/10000 for i in range(230,350)]
mses = []
for t in thjaccards:
    mses.append(popJaccardPredict(2/3,t))

In [379]:
def popJaccardPredict(popth,jth):
    gameCount = defaultdict(int)
    totalPlayed = 0
    
    for u,g,_ in hoursTrain:
        gameCount[g] += 1
        totalPlayed += 1
    
    mostPopular = [(gameCount[x], x) for x in gameCount]
    mostPopular.sort()
    mostPopular.reverse()
    
    return1 = set()
    count = 0
    for ic, i in mostPopular:
        count += ic
        return1.add(i)
        if count > popth * totalPlayed: break
    
    predictions = 0
    for (label,sample) in [(1, playedValid), (0, notPlayedValid)]:
        for (u,g) in sample:
            maxJaccard = 0
            users = set(hoursPerItem[g])
            for g2,_ in hoursPerUser[u]:
                sim = Jaccard(users,set(hoursPerItem[g2]))
                if sim > maxJaccard:
                    maxJaccard = sim
            if maxJaccard > jth or g in return1:
                pred = 1
            else:
                pred = 0
            if pred == label:
                predictions += 1
    print(str(predictions / (len(playedValid) + len(notPlayedValid)))+" "+str(jth))
    return predictions / (len(playedValid) + len(notPlayedValid))

In [380]:
popJaccardPredict(popThreshold, simPlayedThreshold)

0.7025702570257025 0.0235


0.7025702570257025

In [381]:
predictions = open("predictions_Played.csv", 'w')
for l in open("/Users/zhiqiaogong/Projects/JupyterNotebook/cse258/hw3/pairs_Played.csv"):
    if l.startswith("userID"):
        predictions.write(l)
        continue
    u,g = l.strip().split(',')

    return1 = set()
    count = 0
    for ic, i in mostPopular:
        count += ic
        return1.add(i)
        if count > popThreshold * totalPlayed: break
    
    maxJaccard = 0
    users = set(hoursPerItem[g])
    for g2,_ in hoursPerUser[u]:
        sim = Jaccard(users,set(hoursPerItem[g2]))
        if sim > maxJaccard:
            maxJaccard = sim
    if maxJaccard > simPlayedThreshold or g in return1:
        pred = 1
    else:
        pred = 0
    _ = predictions.write(u + ',' + g + ',' + str(pred) + '\n')

predictions.close()

In [31]:
##################################################
# Hours played prediction                        #
##################################################

In [382]:
trainHours = [r[2]['hours_transformed'] for r in hoursTrain]
globalAverage = sum(trainHours) * 1.0 / len(trainHours)

In [383]:
def UsersGames(data):
    usergames = defaultdict(set)
    gameusers = defaultdict(set)
    for u, g, d in data:
        usergames[u].add(g)
        gameusers[g].add(u)
    return usergames, gameusers

In [384]:
usergames_train,gameusers_train = UsersGames(hoursTrain)

In [234]:
from random import random
from math import sqrt

# Initialize parameters
num_features = 10  # This is an example, can be tuned

# Latent feature matrices
gamma_u = {u: [random() / sqrt(num_features) for _ in range(num_features)] for u in hoursPerUser}
gamma_i = {g: [random() / sqrt(num_features) for _ in range(num_features)] for g in hoursPerItem}

def predict(u, g):
    dot_product = sum(gamma_u[u][k] * gamma_i[g][k] for k in range(num_features))
    return alpha + betaU.get(u, 0) + betaI.get(g, 0) + dot_product



KeyboardInterrupt: 

In [385]:
betaU = {}
betaI = {}
for u in hoursPerUser:
    betaU[u] = 0

for g in hoursPerItem:
    betaI[g] = 0

In [386]:
alpha = globalAverage # Could initialize anywhere, this is a guess

In [387]:
lambda_ = 1
iter = 100
tolerance = 1e-5 
pmse = 0
converged = False

In [388]:
usergameHours = {}
for u, g, d in hoursTrain:
    usergameHours[(u,g)] = d['hours_transformed']

In [249]:
from random import random
from math import sqrt

# Initialize parameters
num_features = 10  # This is an example, can be tuned

# Latent feature matrices
gamma_u = {u: [random() / sqrt(num_features) for _ in range(num_features)] for u in hoursPerUser}
gamma_i = {g: [random() / sqrt(num_features) for _ in range(num_features)] for g in hoursPerItem}

def predict(u, g):
    dot_product = sum(gamma_u[u][k] * gamma_i[g][k] for k in range(num_features))
    return alpha + betaU.get(u, 0) + betaI.get(g, 0) + dot_product

def train(lamb, iterations, learning_rate, tolerance, early_stopping_rounds):
    global alpha, betaU, betaI, gamma_u, gamma_i
    
    last_loss = float('inf')
    best_loss = float('inf')
    loss_increasing_rounds = 0
    learning_rate_schedule = learning_rate
    
    for iteration in range(iterations):
        total_loss = 0
        learning_rate_schedule *= (0.99 ** iteration)  # Gradually decrease learning rate
        
        for (u, g), r in usergameHours.items():
            prediction = predict(u, g)
            error = r - prediction

            # Update biases
            betaU[u] = betaU.get(u, 0) + learning_rate_schedule * (error - lamb * betaU.get(u, 0))
            betaI[g] = betaI.get(g, 0) + learning_rate_schedule * (error - lamb * betaI.get(g, 0))

            # Update latent factors
            for k in range(num_features):
                pu = gamma_u[u][k]
                qi = gamma_i[g][k]

                gamma_u[u][k] = pu + learning_rate_schedule * (error * qi - lamb * pu)
                gamma_i[g][k] = qi + learning_rate_schedule * (error * pu - lamb * qi)
            
            total_loss += error ** 2

        # Regularization term for biases and latent factors
        for u in betaU:
            total_loss += lamb * betaU[u] ** 2
        for g in betaI:
            total_loss += lamb * betaI[g] ** 2
        for u in gamma_u:
            for k in range(num_features):
                total_loss += lamb * gamma_u[u][k] ** 2
        for g in gamma_i:
            for k in range(num_features):
                total_loss += lamb * gamma_i[g][k] ** 2

        print(total_loss
        # Early stopping condition
        if total_loss > last_loss:
            loss_increasing_rounds += 1
            if loss_increasing_rounds >= early_stopping_rounds:
                print(f'Stopping early at iteration {iteration}')
                break
        else:
            loss_increasing_rounds = 0
            if total_loss < best_loss:
                best_loss = total_loss
                # Save the best parameters if necessary
        
        if abs(last_loss - total_loss) < tolerance:
            print(f'Converged at iteration {iteration}')
            break
        
        last_loss = total_loss
    
    return betaU, betaI, gamma_u, gamma_i, best_loss



# Initialize the biases
betaU = {u: 0 for u in hoursPerUser}
betaI = {g: 0 for g in hoursPerItem}
alpha = globalAverage  # This is the global average we calculated earlier

# Run training
train(lamb=0.1, iterations=1000, learning_rate=0.005, tolerance=1e-4, early_stopping_rounds=5)

predictions = [predict(u,g) for u,g,d in hoursValid]
y = [d['hours_transformed'] for u,g,d in hoursValid]

mse = mean_squared_error(predictions, y)
mse

TypeError: train() missing 1 required positional argument: 'validation_data'

In [393]:
from random import random
from math import sqrt

# Initialize parameters
num_features = 25  # This is an example, can be tuned

# Latent feature matrices
gamma_u = {u: [random() / sqrt(num_features) for _ in range(num_features)] for u in hoursPerUser}
gamma_i = {g: [random() / sqrt(num_features) for _ in range(num_features)] for g in hoursPerItem}

# Predict function
def predict(u, g):
    dot_product = sum(gamma_u[u][k] * gamma_i[g][k] for k in range(num_features))
    return alpha + betaU.get(u, 0) + betaI.get(g, 0) + dot_product

# Function to calculate the loss on validation set
def calculate_validation_loss(validation_data, lamb):
    validation_loss = 0
    for u, g, r in validation_data:
        prediction = predict(u, g)
        error = r - prediction
        validation_loss += error ** 2
    
    # Regularization term
    for u in betaU:
        validation_loss += lamb * betaU[u] ** 2
    for g in betaI:
        validation_loss += lamb * betaI[g] ** 2
    for u in gamma_u:
        for k in range(num_features):
            validation_loss += lamb * gamma_u[u][k] ** 2
    for g in gamma_i:
        for k in range(num_features):
            validation_loss += lamb * gamma_i[g][k] ** 2
    
    return validation_loss / len(validation_data)

# Training function
def train(lamb, iterations, initial_learning_rate, tolerance, early_stopping_rounds, validation_data):
    global alpha, betaU, betaI, gamma_u, gamma_i

    learning_rate = initial_learning_rate
    best_loss = float('inf')
    no_improvement_count = 0
    best_betaU, best_betaI, best_gamma_u, best_gamma_i = {}, {}, {}, {}
    
    for iteration in range(iterations):
        total_loss = 0

        learning_rate = learning_rate / (1 + decay_rate * iteration)

        for (u, g), r in usergameHours.items():
            prediction = predict(u, g)
            error = r - prediction

            


            # Update biases
            betaU[u] = betaU.get(u, 0) + learning_rate * (error - lamb * betaU.get(u, 0))
            betaI[g] = betaI.get(g, 0) + learning_rate * (error - lamb * betaI.get(g, 0))

            # Update latent factors
            for k in range(num_features):
                pu = gamma_u[u][k]
                qi = gamma_i[g][k]

                gamma_u[u][k] = pu + learning_rate * (error * qi - lamb * pu)
                gamma_i[g][k] = qi + learning_rate * (error * pu - lamb * qi)
            
            total_loss += error ** 2

        # Regularization term for biases and latent factors
        for u in betaU:
            total_loss += lamb * betaU[u] ** 2
        for g in betaI:
            total_loss += lamb * betaI[g] ** 2
        for u in gamma_u:
            for k in range(num_features):
                total_loss += lamb * gamma_u[u][k] ** 2
        for g in gamma_i:
            for k in range(num_features):
                total_loss += lamb * gamma_i[g][k] ** 2
        
        validation_loss = calculate_validation_loss(validation_data, lamb)

        print(best_loss)
        # Check if validation loss improved
        if validation_loss < best_loss:
            best_loss = validation_loss
            best_betaU, best_betaI, best_gamma_u, best_gamma_i = betaU.copy(), betaI.copy(), gamma_u.copy(), gamma_i.copy()
            no_improvement_count = 0
        else:
            no_improvement_count += 1

        if no_improvement_count >= early_stopping_rounds:
            print(f'Stopping early at iteration {iteration}')
            break

        if abs(total_loss - validation_loss) < tolerance:
            print(f'Converged at iteration {iteration}')
            break
    
    return best_betaU, best_betaI, best_gamma_u, best_gamma_i, best_loss

# Initialize the biases
alpha = globalAverage
betaU = {u: random() * 0.1 for u in hoursPerUser}
betaI = {g: random() * 0.1 for g in hoursPerItem}

# Validation data
validation_data = [(u, g, d['hours_transformed']) for u, g, d in hoursValid]

decay_rate = 0.001
# Run training with early stopping
best_betaU, best_betaI, best_gamma_u, best_gamma_i, best_loss = train(
    lamb=0.01, 
    iterations=1000, 
    initial_learning_rate=0.003, 
    tolerance=1e-5, 
    early_stopping_rounds=5, 
    validation_data=validation_data
)

print(f'Best validation loss: {best_loss}')
#0.001, 0.003 3.0531

inf
3.9713339312923095
3.601151212561402
3.413654011341416
3.2999124668385167
3.2248003036521764
3.172824859148101
3.1359821046422702
3.1096606559557083
3.0909434376466884
3.07782572498686
3.0688426424422675
3.062890994314282
3.059138911876705
3.0569703485771416
3.05594255988193
3.055750656556877
3.055750656556877
3.055750656556877
3.055750656556877
3.055750656556877
Stopping early at iteration 20
Best validation loss: 3.055750656556877


In [356]:
predictions = [predict(u,g) for u,g,d in hoursValid]
y = [d['hours_transformed'] for u,g,d in hoursValid]

mse = mean_squared_error(predictions, y)
mse

3.0589051556535334

In [353]:
def train_and_evaluate(num_features, lamb, initial_learning_rate, iterations, tolerance, early_stopping_rounds, train_data, validation_data):
    global alpha, betaU, betaI, gamma_u, gamma_i
    
    # Initialize the latent factors with the new number of features
    alpha = sum([r for _, _, r in train_data]) / len(train_data)
    betaU = {u: random() * 0.1 for u, _, _ in train_data}
    betaI = {g: random() * 0.1 for _, g, _ in train_data}
    gamma_u = {u: [random() / sqrt(num_features) for _ in range(num_features)] for u, _, _ in train_data}
    gamma_i = {g: [random() / sqrt(num_features) for _ in range(num_features)] for _, g, _ in train_data}
    
    # Train the model
    best_betaU, best_betaI, best_gamma_u, best_gamma_i, best_loss = train(
        lamb=lamb, 
        iterations=iterations, 
        initial_learning_rate=initial_learning_rate, 
        tolerance=tolerance, 
        early_stopping_rounds=early_stopping_rounds, 
        validation_data=validation_data
    )
    
    # Calculate the loss on the validation set
    validation_loss = calculate_validation_loss(validation_data, lamb)
    
    return validation_loss

# Define a range of feature counts to try
feature_counts = [5, 10, 15, 20, 25, 30]
validation_losses = []

# Iterate over the range of feature counts
for num_features in feature_counts:
    print(f"Evaluating model with {num_features} features...")
    loss = train_and_evaluate(
        num_features=num_features,
        lamb=0.01,
        initial_learning_rate=0.003,
        iterations=1000,
        tolerance=1e-5,
        early_stopping_rounds=20,
        train_data=[(u, g, d['hours_transformed']) for u, g, d in hoursTrain],  # Replace with your actual training data
        validation_data=[(u, g, d['hours_transformed']) for u, g, d in hoursValid]  # Replace with your actual validation data
    )
    validation_losses.append((num_features, loss))

# Select the number of features with the lowest validation loss
best_num_features, best_loss = min(validation_losses, key=lambda x: x[1])
print(f'Best number of features: {best_num_features} with validation loss: {best_loss}')


Evaluating model with 5 features...
inf
3.978091998286789
3.605851378010019
3.4174208409636275
3.3029957163115733
3.2272537247332393
3.174652490353369
3.137182288336349
3.1102485740072567
3.0909688015816608
3.077386097561197
3.0680902071894414
3.0620322598926513
3.058429002824698
3.0567043229222737
3.0564453875369932
3.0564453875369932
3.0564453875369932
3.0564453875369932
3.0564453875369932
3.0564453875369932
3.0564453875369932
3.0564453875369932
3.0564453875369932
3.0564453875369932
3.0564453875369932
3.0564453875369932
3.0564453875369932
3.0564453875369932
3.0564453875369932
3.0564453875369932
3.0564453875369932
3.0564453875369932
3.0564453875369932
3.0564453875369932
Stopping early at iteration 34
Evaluating model with 10 features...
inf
3.965387070953643
3.5962943835478676
3.4095383720787695
3.29621958975174
3.221318244299703
3.1694227628603637
3.1325842474136714
3.10622785067199
3.087460639242577
3.074295558423495
3.0652838077643585
3.0593389032149605
3.0556455651366883
3.0536017

In [395]:
#cross valid
from sklearn.model_selection import KFold
import numpy as np
import random

def cross_validate(num_features, lamb, learning_rate, iterations, tolerance, early_stopping_rounds, train_data):
    kf = KFold(n_splits=5)  # 5-fold cross-validation
    fold_validation_losses = []

    for train_index, test_index in kf.split(train_data):
        train_subset = [train_data[i] for i in train_index]
        validation_subset = [train_data[i] for i in test_index]

        # Initialize the latent factors with the new number of features
        alpha = sum([r for _, _, r in train_subset]) / len(train_subset)
        betaU = {u: random.random() * 0.1 for u, _, _ in train_subset}
        betaI = {g: random.random() * 0.1 for _, g, _ in train_subset}
        gamma_u = {u: [random.random() / sqrt(num_features) for _ in range(num_features)] for u, _, _ in train_subset}
        gamma_i = {g: [random.random() / sqrt(num_features) for _ in range(num_features)] for _, g, _ in train_subset}
    
        # Train the model
        best_betaU, best_betaI, best_gamma_u, best_gamma_i, best_loss = train(
            lamb=lamb, 
            iterations=iterations, 
            initial_learning_rate=learning_rate, 
            tolerance=tolerance, 
            early_stopping_rounds=early_stopping_rounds, 
            validation_data=validation_subset
        )
    
        # Calculate the loss on the validation subset
        validation_loss = calculate_validation_loss(validation_subset, lamb)
        fold_validation_losses.append(validation_loss)

    average_validation_loss = np.mean(fold_validation_losses)
    return average_validation_loss

# Define a range of feature counts to try
feature_counts = [25]
average_validation_losses = []

# Perform cross-validation on the training set
for num_features in feature_counts:
    print(f"Evaluating model with {num_features} features...")
    avg_loss = cross_validate(
        num_features=num_features,
        lamb=0.01,
        learning_rate=0.003,
        iterations=1000,
        tolerance=1e-5,
        early_stopping_rounds=5,
        train_data=[(u, g, d['hours_transformed']) for u, g, d in hoursTrain]  # Replace with your actual training data
    )
    average_validation_losses.append((num_features, avg_loss))

# Select the number of features with the lowest average validation loss
best_num_features, best_avg_loss = min(average_validation_losses, key=lambda x: x[1])
print(f'Best number of features: {best_num_features} with average validation loss: {best_avg_loss}')

# Retrain the model with the best number of features on the entire training set
# and evaluate on the separate validation set
final_validation_loss = train_and_evaluate(
    num_features=best_num_features,
    lamb=0.01,
    learning_rate=0.003,
    iterations=1000,
    tolerance=1e-5,
    early_stopping_rounds=5,
    train_data=[(u, g, d['hours_transformed']) for u, g, d in hoursTrain],  # Replace with your entire training data
    validation_data=[(u, g, d['hours_transformed']) for u, g, d in hoursValid]  # Replace with your actual validation data
)

print(f'Final validation loss with {best_num_features} features: {final_validation_loss}')

Evaluating model with 25 features...
inf
2.3249410671631177
2.271673289984262
2.2166251336991953
2.160643738744893
2.1044541556112595
2.048630446623668
1.9935906706034774
1.9396190511434726
1.8869013269483634
1.835558995494893
1.7856750082482788
1.7373099618913364
1.6905108373525686
1.6453147889321118
1.6017500633160306
1.5598356058454446
1.5195804081815913
1.4809831815612433
1.4440325628964863
1.408707821996579
1.374979929267039
1.3428128226424383
1.3121647361098248
1.2829894897338912
1.25523767722717
1.228857716251128
1.2037967479761569
1.1800013869179191
1.157418331081469
1.1359948473167332
1.1156791487084692
1.096420680793193
1.078170332197211
1.0608805835170227
1.044505606306609
1.029001322142164
1.0143254300243574
1.0004374089053292
0.9872985008862761
0.974871679611023
0.9631216075351736
0.9520145850645465
0.941518493986275
0.9316027371504494
0.9222381759733733
0.9133970670135427
0.9050529986051903
0.897180828316379
0.8897566218152047
0.8827575935799403
0.8761620497654268
0.86994

TypeError: train_and_evaluate() got an unexpected keyword argument 'validation_data'

In [396]:
predictions = [predict(u,g) for u,g,d in hoursValid]
y = [d['hours_transformed'] for u,g,d in hoursValid]

mse = mean_squared_error(predictions, y)
mse

5.10839751996922

In [398]:
def predictcross(u, g, alpha, betaU, betaI, gamma_u, gamma_i):
    """Predicts the hours played using the model parameters."""
    user_factor = gamma_u.get(u, [0]*len(gamma_u[list(gamma_u.keys())[0]]))
    item_factor = gamma_i.get(g, [0]*len(gamma_i[list(gamma_i.keys())[0]]))
    dot_product = sum(user_factor[k] * item_factor[k] for k in range(len(user_factor)))
    return alpha + betaU.get(u, 0) + betaI.get(g, 0) + dot_product

def calculate_mse(validation_data, alpha, betaU, betaI, gamma_u, gamma_i):
    """Calculates the mean squared error on the validation set."""
    errors = []
    for u, g, actual_hours in validation_data:
        predicted_hours = predictcross(u, g, alpha, betaU, betaI, gamma_u, gamma_i)
        errors.append((predicted_hours - actual_hours) ** 2)
    mse = sum(errors) / len(errors)
    return mse


mse = calculate_mse([(u, g, d['hours_transformed']) for u, g, d in hoursValid], alpha, betaU, betaI, gamma_u, gamma_i)
print("MSE on validation set:", mse)

MSE on validation set: 5.108397519969221


In [311]:
hoursTrain[0]

('u70666506',
 'g49368897',
 {'userID': 'u70666506',
  'early_access': False,
  'hours': 63.5,
  'hours_transformed': 6.011227255423254,
  'found_funny': 1,
  'text': 'If you want to sit in queue for 10-20min and have 140 ping then this game is perfect for you :)',
  'gameID': 'g49368897',
  'user_id': '76561198030408772',
  'date': '2017-05-20'})

In [229]:
betaU, betaI, mse2 = iterate(2)
mse2

2.999045867911261

In [248]:
predictions = open("predictions_Hours.csv", 'w')
for l in open("/Users/zhiqiaogong/Projects/JupyterNotebook/cse258/hw3/pairs_Hours.csv"):
    if l.startswith("userID"):
        predictions.write(l)
        continue
    u,g = l.strip().split(',')
    
    re = predict(u,g)
    
    _ = predictions.write(u + ',' + g + ',' + str(re) + '\n')

predictions.close()

In [36]:
alpha = globalAverage # Could initialize anywhere, this is a guess

In [37]:
def iterate(lamb):
    newAlpha = 0
    for u,g,d in hoursTrain:
        r = d['hours_transformed']
        newAlpha += r - (betaU[u] + betaI[g])
    alpha = newAlpha / len(hoursTrain)
    for u in hoursPerUser:
        newBetaU = 0
        for g,r in hoursPerUser[u]:
            newBetaU += r - (alpha + betaI[g])
        betaU[u] = newBetaU / (lamb + len(hoursPerUser[u]))
    for g in hoursPerItem:
        newBetaI = 0
        for u,r in hoursPerItem[g]:
            newBetaI += r - (alpha + betaU[u])
        betaI[g] = newBetaI / (lamb + len(hoursPerItem[g]))
    mse = 0
    for u,g,d in hoursTrain:
        r = d['hours_transformed']
        prediction = alpha + betaU[u] + betaI[g]
        mse += (r - prediction)**2
    regularizer = 0
    for u in betaU:
        regularizer += betaU[u]**2
    for g in betaI:
        regularizer += betaI[g]**2
    mse /= len(hoursTrain)
    return mse, mse + lamb*regularizer

In [38]:
mse,objective = iterate(1)
newMSE,newObjective = iterate(1)
iterations = 2

In [39]:
while iterations < 10 or objective - newObjective > 0.01:
    mse, objective = newMSE, newObjective
    newMSE, newObjective = iterate(1)
    iterations += 1
    print("Objective after "
        + str(iterations) + " iterations = " + str(newObjective))
    print("MSE after "
        + str(iterations) + " iterations = " + str(newMSE))

Objective after 3 iterations = 6916.291258826528
MSE after 3 iterations = 2.756414053005335
Objective after 4 iterations = 6935.23715550776
MSE after 4 iterations = 2.755604333875777
Objective after 5 iterations = 6924.6062017768245
MSE after 5 iterations = 2.755486661616215
Objective after 6 iterations = 6905.833993834695
MSE after 6 iterations = 2.755457716152593
Objective after 7 iterations = 6885.30918519728
MSE after 7 iterations = 2.7554456693587497
Objective after 8 iterations = 6864.742698779786
MSE after 8 iterations = 2.7554377123048535
Objective after 9 iterations = 6844.576219662253
MSE after 9 iterations = 2.755430906995692
Objective after 10 iterations = 6824.91582215507
MSE after 10 iterations = 2.7554245073468486
Objective after 11 iterations = 6805.779255606049
MSE after 11 iterations = 2.7554183159504917
Objective after 12 iterations = 6787.161156344411
MSE after 12 iterations = 2.755412278370264
Objective after 13 iterations = 6769.050270855048
MSE after 13 iteration

Objective after 90 iterations = 6209.249427881521
MSE after 90 iterations = 2.755187092074258
Objective after 91 iterations = 6207.318844162245
MSE after 91 iterations = 2.7551859380875623
Objective after 92 iterations = 6205.444365248543
MSE after 92 iterations = 2.755184806169715
Objective after 93 iterations = 6203.6243950805465
MSE after 93 iterations = 2.7551836958510476
Objective after 94 iterations = 6201.857382185177
MSE after 94 iterations = 2.755182606672543
Objective after 95 iterations = 6200.141818460075
MSE after 95 iterations = 2.7551815381859104
Objective after 96 iterations = 6198.4762379894
MSE after 96 iterations = 2.755180489953302
Objective after 97 iterations = 6196.859215890833
MSE after 97 iterations = 2.7551794615468492
Objective after 98 iterations = 6195.289367192376
MSE after 98 iterations = 2.7551784525487086
Objective after 99 iterations = 6193.765345739354
MSE after 99 iterations = 2.7551774625507677
Objective after 100 iterations = 6192.28584312967
MSE a

Objective after 176 iterations = 6148.8409431136315
MSE after 176 iterations = 2.7551367224421464
Objective after 177 iterations = 6148.702537764905
MSE after 177 iterations = 2.755136470534834
Objective after 178 iterations = 6148.568557839423
MSE after 178 iterations = 2.7551362227094485
Objective after 179 iterations = 6148.438870841919
MSE after 179 iterations = 2.755135978893863
Objective after 180 iterations = 6148.3133481268615
MSE after 180 iterations = 2.755135739017546
Objective after 181 iterations = 6148.191864788228
MSE after 181 iterations = 2.7551355030114055
Objective after 182 iterations = 6148.074299552734
MSE after 182 iterations = 2.755135270807567
Objective after 183 iterations = 6147.960534675639
MSE after 183 iterations = 2.7551350423393868
Objective after 184 iterations = 6147.850455839888
MSE after 184 iterations = 2.755134817541883
Objective after 185 iterations = 6147.743952057975
MSE after 185 iterations = 2.755134596351097
Objective after 186 iterations = 6

In [40]:
validMSE = 0
for u,g,d in hoursValid:
    r = d['hours_transformed']
    bu = 0
    bi = 0
    if u in betaU:
        bu = betaU[u]
    if g in betaI:
        bi = betaI[g]
    prediction = alpha + bu + bi
    validMSE += (r - prediction)**2

validMSE /= len(hoursValid)
print("Validation MSE = " + str(validMSE))

Validation MSE = 3.3620657269506733


In [41]:
answers['Q6'] = validMSE

In [42]:
assertFloat(answers['Q6'])

In [43]:
### Question 7

In [44]:
betaUs = [(betaU[u], u) for u in betaU]
betaIs = [(betaI[i], i) for i in betaI]
betaUs.sort()
betaIs.sort()

print("Maximum betaU = " + str(betaUs[-1][1]) + ' (' + str(betaUs[-1][0]) + ')')
print("Maximum betaI = " + str(betaIs[-1][1]) + ' (' + str(betaIs[-1][0]) + ')')
print("Minimum betaU = " + str(betaUs[0][1]) + ' (' + str(betaUs[0][0]) + ')')
print("Minimum betaI = " + str(betaIs[0][1]) + ' (' + str(betaIs[0][0]) + ')')

Maximum betaU = u60898505 (5.828316739259239)
Maximum betaI = g17604638 (5.495973739724736)
Minimum betaU = u13037838 (-3.0057870148761894)
Minimum betaI = g84397720 (-2.809328679823356)


In [45]:
answers['Q7'] = [betaUs[-1][0], betaUs[0][0], betaIs[-1][0], betaIs[0][0]]

In [46]:
answers['Q7']

[5.828316739259239, -3.0057870148761894, 5.495973739724736, -2.809328679823356]

In [47]:
assertFloatList(answers['Q7'], 4)

In [48]:
### Question 8

In [49]:
# Better lambda...

iterations = 1
while iterations < 10 or objective - newObjective > 0.01:
    mse, objective = newMSE, newObjective
    newMSE, newObjective = iterate(5)
    iterations += 1
    print("Objective after " + str(iterations) + " iterations = " + str(newObjective))
    print("MSE after " + str(iterations) + " iterations = " + str(newMSE))

Objective after 2 iterations = 23723.40581939076
MSE after 2 iterations = 2.7788624145856393
Objective after 3 iterations = 23510.585432916247
MSE after 3 iterations = 2.77950918417825
Objective after 4 iterations = 23487.108448891875
MSE after 4 iterations = 2.779632986564118
Objective after 5 iterations = 23482.603926859705
MSE after 5 iterations = 2.7796579991997605
Objective after 6 iterations = 23481.074962137227
MSE after 6 iterations = 2.7796657598050394
Objective after 7 iterations = 23480.130273496496
MSE after 7 iterations = 2.7796701526871908
Objective after 8 iterations = 23479.338493177987
MSE after 8 iterations = 2.7796737556337834
Objective after 9 iterations = 23478.61426569989
MSE after 9 iterations = 2.7796770737497773
Objective after 10 iterations = 23477.938819973006
MSE after 10 iterations = 2.7796802099553215
Objective after 11 iterations = 23477.30667015269
MSE after 11 iterations = 2.7796831871553858
Objective after 12 iterations = 23476.715002575234
MSE after 1

In [50]:
alpha_ = alpha

In [51]:
validMSE = 0
for u,g,d in hoursValid:
    r = d['hours_transformed']
    bu = 0
    bi = 0
    if u in betaU:
        bu = betaU[u]
    if g in betaI:
        bi = betaI[g]
    prediction = alpha + bu + bi
    validMSE += (r - prediction)**2

validMSE /= len(hoursValid)
print("Validation MSE = " + str(validMSE))

Validation MSE = 3.3246506094357864


In [52]:
answers['Q8'] = (5.0, validMSE)

In [53]:
assertFloatList(answers['Q8'], 2)

In [482]:
predictions = open("predictions_Hours_tf.csv", 'w')
for l in open("/Users/zhiqiaogong/Projects/JupyterNotebook/cse258/hw3/pairs_Hours.csv"):
    if l.startswith("userID"):
        predictions.write(l)
        continue
    u,g = l.strip().split(',')
    # bu = 0
    # bi = 0
    # if u in betaU:
    #     bu = betaU[u]
    # if g in betaI:
    #     bi = betaI[g]
    p = modelLFM.predict(userIDs[u], itemIDs[g]).numpy()
    _ = predictions.write(u + ',' + g + ',' + str(p) + '\n')

predictions.close()

In [55]:
f = open("answers_hw3.txt", 'w')
f.write(str(answers) + '\n')
f.close()

In [1]:
import gzip
import random
import scipy
import tensorflow as tf
from collections import defaultdict
from implicit import bpr
from surprise import SVD, Reader, Dataset
from surprise.model_selection import train_test_split

ModuleNotFoundError: No module named 'implicit'

In [2]:
pip install --upgrade tensorflow

Collecting tensorflow
  Using cached tensorflow-2.15.0-cp311-cp311-macosx_12_0_arm64.whl.metadata (3.6 kB)
Collecting tensorflow-macos==2.15.0 (from tensorflow)
  Using cached tensorflow_macos-2.15.0-cp311-cp311-macosx_12_0_arm64.whl.metadata (4.2 kB)
Collecting absl-py>=1.0.0 (from tensorflow-macos==2.15.0->tensorflow)
  Using cached absl_py-2.0.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow-macos==2.15.0->tensorflow)
  Using cached astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Collecting flatbuffers>=23.5.26 (from tensorflow-macos==2.15.0->tensorflow)
  Using cached flatbuffers-23.5.26-py2.py3-none-any.whl.metadata (850 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow-macos==2.15.0->tensorflow)
  Using cached gast-0.5.4-py3-none-any.whl (19 kB)
Collecting google-pasta>=0.1.1 (from tensorflow-macos==2.15.0->tensorflow)
  Using cached google_pasta-0.2.0-py3-none-any.whl (57 kB)
Collecting h5py>=2.9.0 (from tensorflow-macos==2.15

In [2]:
pip install implicit

Collecting implicit
  Downloading implicit-0.7.2-cp311-cp311-macosx_11_0_arm64.whl.metadata (6.1 kB)
Collecting tqdm>=4.27 (from implicit)
  Downloading tqdm-4.66.1-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Downloading implicit-0.7.2-cp311-cp311-macosx_11_0_arm64.whl (761 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m761.6/761.6 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hDownloading tqdm-4.66.1-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.3/78.3 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tqdm, implicit
Successfully installed implicit-0.7.2 tqdm-4.66.1
Note: you may need to restart the kernel to use updated packages.


In [13]:
pip install --use-pep517 surprise

Collecting surprise
  Using cached surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise (from surprise)
  Using cached scikit-surprise-1.1.3.tar.gz (771 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mGetting requirements to build wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[59 lines of output][0m
  [31m   [0m <string>:65: _DeprecatedInstaller: setuptools.installer and fetch_build_eggs are deprecated.
  [31m   [0m !!
  [31m   [0m 
  [31m   [0m         ********************************************************************************
  [31m   [0m         Requirements should be satisfied by a PEP 517 installer.
  [31m   [0m         If you are using pip, you can try `pip install --use-pep517`.
  [31m   [0m         ***********************************************************

In [6]:
pip install wheel setuptools pip --upgrade

Collecting setuptools
  Using cached setuptools-68.2.2-py3-none-any.whl.metadata (6.3 kB)
Using cached setuptools-68.2.2-py3-none-any.whl (807 kB)
Installing collected packages: setuptools
  Attempting uninstall: setuptools
    Found existing installation: setuptools 65.5.0
    Uninstalling setuptools-65.5.0:
      Successfully uninstalled setuptools-65.5.0
Successfully installed setuptools-68.2.2
Note: you may need to restart the kernel to use updated packages.
