In [3]:
import pandas as pd
import numpy as np
import csv

# Load each header-less, semicolon-separated table and name its columns right away,
# so every table's schema sits next to the call that reads it.

# Movie catalogue
movies = pd.read_csv('movies.csv', sep=';', encoding="utf-8", header=None)
movies.columns = ['movieId', 'movieYear', 'movieTitle']

# (user, movie) pairs for which a rating has to be predicted
predictions = pd.read_csv('predictions.csv', sep=';', header=None)
predictions.columns = ['userId', 'movieId']

# User attributes
users = pd.read_csv('users.csv', sep=';', header=None)
users.columns = ['userId', 'userGender', 'userAge', 'userProfession']

# Known ratings
ratings = pd.read_csv('ratings.csv', sep=';', header=None)
ratings.columns = ['userId', 'movieId', 'rating']


In [4]:
def number_rounder(number: float) -> int:
    """Clamp a predicted rating to the 1-5 scale and round it to the nearest integer.

    Halves are rounded up (2.5 -> 3, 3.5 -> 4). The built-in ``round`` rounds
    halves to the nearest even integer, which would map 2.5 to 2 but 3.5 to 4.

    Args:
        number: Raw (usually fractional) predicted rating.

    Returns:
        An integer between 1 and 5 inclusive.

    Raises:
        ValueError: If ``number`` is NaN (it cannot be converted to an integer).
    """
    if number < 1:
        return 1
    if number > 5:
        return 5
    # number is within [1, 5] here, so truncating after adding 0.5 is round-half-up.
    return int(number + 0.5)


In [21]:
# Train/validation split (80% / 20%).
# NOTE: training below is run on the full ratings matrix and only the training loss is
# monitored, so this split is kept for clarity rather than being required.
ratings_train = ratings.sample(frac=0.8, random_state=42)
# Shuffle the held-out rows with a fixed seed too, so the validation order is reproducible.
ratings_validation = ratings.drop(ratings_train.index).sample(frac=1.0, random_state=42)


Unnamed: 0,userId,movieId,rating
241471,5896,1474,1
817851,880,1509,5
853538,2391,2100,2
30530,4227,786,1
876595,1778,1353,3


In [5]:
# Build the dense user x movie rating matrix from the ratings table.
# The matrix is allocated at its full (users x movies) size up front and then filled by
# position, so users or movies without any rating still get an (all-zero) row/column.
# (An earlier pandas-based attempt reportedly lost about 10 movies.)

def create_np_matrix(number_of_users, number_of_movies, appropriate_ratings_DF):
    """Create a dense rating matrix from a table of ratings.

    Args:
        number_of_users: Number of rows of the resulting matrix.
        number_of_movies: Number of columns of the resulting matrix.
        appropriate_ratings_DF: DataFrame with the columns 'userId', 'movieId' and
            'rating'. Ids are 1-based: id ``n`` is stored at index ``n - 1``.

    Returns:
        A float array of shape (number_of_users, number_of_movies) that holds the
        integer-truncated rating at [userId - 1, movieId - 1] and 0 everywhere else.
        If a (userId, movieId) pair occurs more than once, the last occurrence wins.

    Raises:
        IndexError: If a userId or movieId lies outside 1..number_of_users or
            1..number_of_movies respectively.
    """
    np_matrix = np.zeros((number_of_users, number_of_movies))

    # NumPy does not guarantee which value survives when an index pair is assigned more
    # than once, so reduce duplicates to their last occurrence explicitly.
    unique_ratings = appropriate_ratings_DF.drop_duplicates(
        subset=['userId', 'movieId'], keep='last'
    )

    user_ids = unique_ratings['userId'].to_numpy().astype(int)
    movie_ids = unique_ratings['movieId'].to_numpy().astype(int)
    rating_values = unique_ratings['rating'].to_numpy().astype(int)

    # Reject ids that would either overflow the matrix or (for ids < 1) silently wrap
    # around to the end of the matrix through negative indexing.
    users_out_of_range = ((user_ids < 1) | (user_ids > number_of_users)).any()
    movies_out_of_range = ((movie_ids < 1) | (movie_ids > number_of_movies)).any()
    if users_out_of_range or movies_out_of_range:
        raise IndexError(
            "userId / movieId values must lie within 1..number_of_users / 1..number_of_movies"
        )

    # Single vectorised assignment instead of a Python-level loop over all rows.
    np_matrix[user_ids - 1, movie_ids - 1] = rating_values

    return np_matrix

# Full rating matrix: one row per entry in `users`, one column per entry in `movies`.
my_matrix = create_np_matrix(
    number_of_users=len(users),
    number_of_movies=len(movies),
    appropriate_ratings_DF=ratings,
)


    


In [14]:
def mf_sgd(matrix, K=5, epochs=11, alpha=0.0001, beta=0.02, error_threshold=0.905, seed=None):
    """Factorise a user-item rating matrix into two low-rank matrices using SGD.

    Entries greater than zero are treated as observed ratings; all other entries are
    treated as missing and are ignored during training.

    Args:
        matrix: 2-D user-item training matrix (users as rows, movies as columns).
        K: Number of latent factors.
        epochs: Maximum number of passes over the observed ratings.
        alpha: Learning rate for SGD.
        beta: Regularization strength for SGD.
        error_threshold: Training stops early once the training error of an epoch
            drops below this value.
        seed: Optional seed for the random initialisation of the factor matrices.
            With the default ``None`` NumPy's global random state is used.

    Returns:
        A tuple ``(P, Q)`` where ``P`` has shape (num_users, K) and ``Q`` has shape
        (num_movies, K); ``np.dot(P, Q.T)`` is the reconstructed rating matrix.
    """
    matrix = np.asarray(matrix)
    num_users, num_movies = matrix.shape

    # Positions and values of the observed ratings.
    observed = np.argwhere(matrix > 0)
    rows = observed[:, 0]
    cols = observed[:, 1]
    observed_ratings = matrix[rows, cols]
    num_observed = len(observed)

    # Initialise the decomposed matrices with random numbers in [0, 1).
    rng = np.random if seed is None else np.random.RandomState(seed)
    P = rng.rand(num_users, K)
    Q = rng.rand(num_movies, K)

    if num_observed == 0:
        # Nothing to learn from; training would leave the factors untouched anyway.
        print("No observed ratings (entries > 0) in the matrix; returning the random initialisation")
        return P, Q

    for epoch in range(epochs):
        # One SGD step per observed rating.
        for i, j, rating in zip(rows, cols, observed_ratings):
            # Snapshot the user factors so both updates use the values from before the
            # step (a simultaneous gradient update for P[i] and Q[j]).
            p_before = P[i].copy()
            # The prediction is the dot product of the user and movie embeddings.
            cur_error = rating - np.dot(p_before, Q[j])

            P[i] += alpha * (2 * cur_error * Q[j] - 2 * beta * p_before)
            Q[j] += alpha * (2 * cur_error * p_before - 2 * beta * Q[j])

        # Training error of this epoch: squared residuals plus the regularization
        # penalty of the factors involved, averaged over the observed ratings.
        # Because of the penalty this is not a pure RMSE when beta > 0.
        # Validation loss is not tracked; the function only sees the training matrix.
        user_factors = P[rows]
        movie_factors = Q[cols]
        residuals = observed_ratings - np.sum(user_factors * movie_factors, axis=1)
        penalty = (beta / 2) * (np.sum(user_factors ** 2) + np.sum(movie_factors ** 2))
        acc_error = np.sum(residuals ** 2) + penalty

        rmse = np.sqrt(acc_error / num_observed)
        if (epoch % 5 == 0):
            print("EPOCH #" + str(epoch) + " Training Error (not validation): " + str(rmse))

        if rmse < error_threshold:
            break

    return P, Q






In [15]:
# Seed NumPy's global random state so the random initialisation of the factor matrices,
# and with it the resulting predictions, is reproducible across kernel restarts.
np.random.seed(42)
p_arr, q_arr = mf_sgd(my_matrix, error_threshold=0.88)

EPOCH #0 Training Error (not validation): 2.073213471048759
EPOCH #5 Training Error (not validation): 1.2324482370698204


KeyboardInterrupt: 

In [13]:
# Reconstruct the full user x movie prediction matrix from the learned factors.
final_result = p_arr.dot(q_arr.T)

[[3.34127429 2.7637479  2.55651208 ... 2.40075143 1.96865865 2.99586454]
 [3.6730696  2.76892708 2.78450855 ... 2.40077239 2.15570131 3.20970408]
 [3.42669967 2.59752878 2.38853633 ... 2.56925243 1.97280817 2.92722212]
 ...
 [2.74582391 1.80652757 2.01862944 ... 1.79087162 1.34931832 2.19197826]
 [3.83990581 2.52644848 2.38239603 ... 2.54302212 1.81380701 2.831786  ]
 [4.11389816 3.26033626 3.08530657 ... 2.8052905  2.4673001  3.64557349]]


1.6587257135079283

In [65]:
# One [row number, rounded predicted rating] pair per requested (user, movie) pair.
# Row numbers start at 1; user and movie ids are 1-based, hence the "- 1" when indexing.
csv_data = [
    [row_number + 1, number_rounder(final_result[user_id - 1, movie_id - 1])]
    for row_number, user_id, movie_id
    in predictions[['userId', 'movieId']].itertuples(index=True, name=None)
]

# Write to File
with open('guess_no_lib.csv', 'w', newline='') as out_file:
    csv.writer(out_file).writerows(csv_data)

In [66]:
# np.save('temp_result',final_result)
# test = np.load('temp_result.npy')