In [1]:
import pandas as pd
import numpy as np
import scipy.stats
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF 
from scipy.sparse.linalg import svds
import csv
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate

# Load the four semicolon-separated, header-less tables. Each frame gets its
# column names assigned immediately after it is read, so a table is fully
# usable as soon as its two lines have run.
movies = pd.read_csv('movies.csv', sep=';', encoding="utf-8", header=None)
movies.columns = ['movieId', 'movieYear', 'movieTitle']

# (userId, movieId) pairs for which a rating has to be predicted.
predictions = pd.read_csv('predictions.csv', sep=';', header=None)
predictions.columns = ['userId', 'movieId']

users = pd.read_csv('users.csv', sep=';', header=None)
users.columns = ['userId', 'userGender', 'userAge', 'userProfession']

# Known ratings: one (userId, movieId, rating) triple per row.
ratings = pd.read_csv('ratings.csv', sep=';', header=None)
ratings.columns = ['userId', 'movieId', 'rating']


In [2]:
def number_rounder(number: float) -> int:
    """Convert a (possibly fractional) predicted rating to an integer in [1, 5].

    Values below 1 are clamped to 1 and values above 5 are clamped to 5.
    Everything in between is rounded to the nearest integer, with exact .5
    ties rounded up (2.5 -> 3, 4.5 -> 5). The built-in ``round`` is avoided
    on purpose: it rounds ties to the nearest even integer, which would turn
    2.5 into 2 but 3.5 into 4.

    Args:
        number: Predicted rating; any real number (int or float).

    Returns:
        The rating as an ``int`` between 1 and 5 inclusive.

    Raises:
        ValueError: If ``number`` is NaN (it cannot be converted to an int).
    """
    if number < 1:
        return 1
    if number > 5:
        return 5
    # number is now within [1, 5] and therefore positive, so adding 0.5 and
    # truncating is the same as rounding half up.
    return int(number + 0.5)


In [3]:
"""
OLD CODE! 
"""
# Join every rating with its movie metadata. The outer join also keeps movies
# that have no rating at all (their userId / rating are NaN in `df`).
df = ratings.merge(movies, on='movieId', how='outer')

# User x movie table: one row per userId, one column per movieId, ratings as
# the cell values (NaN where a user has not rated a movie).
matrix = pd.pivot_table(df, index='userId', columns='movieId', values='rating')

# Dense NumPy copy of the table with every missing rating replaced by 0.
matrix_without_nan_np = matrix.fillna(0).to_numpy()

# Centre each user's ratings on that user's own mean so that users who rate
# on different scales become comparable.
normalized_matrix = matrix.sub(matrix.mean(axis=1), axis=0)

# User-to-user cosine similarity on the centred ratings. cosine_similarity
# cannot handle NaNs, so unrated cells are filled with 0 first.
user_similarity_matrix = cosine_similarity(normalized_matrix.fillna(0))

In [4]:
# Build a dense users x movies rating matrix directly from the ratings table.
# The pivot_table version above ends up with fewer movie columns than `movies`
# has rows -- presumably because it only creates a column for movies that were
# rated at least once (TODO confirm) -- so the array is sized from `users` and
# `movies` instead. Cells without a rating stay 0.
#
# Assumes userId / movieId are 1-based and contiguous, so (id - 1) is a valid
# row / column position, and that each (user, movie) pair occurs at most once
# in `ratings` (NumPy does not guarantee which value wins for repeated indices).
my_matrix = np.zeros((len(users), len(movies)))

# One vectorised assignment instead of a Python-level iterrows() loop: it is
# far faster, and the ids are explicitly integers, so indexing keeps working
# even if another column (e.g. the rating) is stored as float.
user_rows = ratings['userId'].to_numpy(dtype=np.intp) - 1
movie_cols = ratings['movieId'].to_numpy(dtype=np.intp) - 1
# astype(int) truncates fractional ratings, matching the earlier int(...) cast.
my_matrix[user_rows, movie_cols] = ratings['rating'].to_numpy().astype(int)






In [5]:
# Alternate approach: let the Surprise library handle the factorisation.
# The ratings frame is wrapped in a Surprise Dataset; the columns are passed
# in (user, item, rating) order and the Reader declares the 1-5 rating scale.
rating_reader = Reader(rating_scale=(1, 5))
surprise_data = Dataset.load_from_df(
    ratings[['userId', 'movieId', 'rating']],
    rating_reader,
)


In [6]:
# Estimate the out-of-sample error of a 10-epoch SVD model with 10-fold
# cross-validation; RMSE and MAE per fold are printed and kept in `results`.
svd = SVD(n_epochs=10)
results = cross_validate(svd, surprise_data, measures=['RMSE', 'MAE'], cv=10, verbose=True)

# cross_validate only ever trains on the training part of each fold, so
# whatever state it leaves in `svd` has seen at most a subset of the ratings.
# Refit on every available rating before `svd` is used to produce the final
# predictions. The trailing ';' keeps the notebook from echoing the return value.
svd.fit(surprise_data.build_full_trainset());

Evaluating RMSE, MAE of algorithm SVD on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.8901  0.8923  0.8852  0.8896  0.8887  0.8889  0.8864  0.8910  0.8910  0.8919  0.8895  0.0022  
MAE (testset)     0.7045  0.7038  0.6988  0.7028  0.7030  0.7025  0.7001  0.7044  0.7045  0.7053  0.7030  0.0020  
Fit time          5.01    4.93    4.71    4.61    4.62    4.70    4.65    4.62    4.57    4.64    4.71    0.14    
Test time         0.85    0.76    0.74    0.72    0.75    0.73    0.55    0.72    0.71    0.79    0.73    0.07    


In [7]:
# Score every (userId, movieId) pair listed in `predictions` with the fitted
# SVD model and write the rounded ratings to guess.csv.

# Each entry is [userId, movieId, rating]. The real rating is unknown here, so
# a dummy 1 fills that slot; only the estimate (.est) is read back below.
to_predict = [
    [pair['userId'], pair['movieId'], 1]
    for _, pair in predictions.iterrows()
]

test_answers = svd.test(np.array(to_predict))
pred_ratings = [answer.est for answer in test_answers]

# One output row per prediction: a 1-based row number followed by the estimate
# clamped to 1-5 and rounded to an integer by number_rounder.
csv_data = [
    [row_number, number_rounder(estimate)]
    for row_number, estimate in enumerate(pred_ratings, start=1)
]

# newline='' leaves line-ending handling to the csv module.
with open('guess.csv', 'w', newline='') as out_file:
    csv.writer(out_file).writerows(csv_data)



