In [269]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics.pairwise import cosine_similarity

In [270]:
df = pd.read_table("train.dta", sep=" ", names = ["User", "Movie", "Date", "Rating"], index_col = False)

In [271]:
df.head()

Unnamed: 0,User,Movie,Date,Rating
0,1,79,1123,2
1,1,144,1349,3
2,1,149,1206,4
3,1,191,1845,4
4,1,211,918,5


In [272]:
moviesRated = dict()   # Keys are users, value for each key is dictionary mapping movie to rating
rated = dict()
onlyUserMovie = dict()    # Keys are users, value for each key is set of movies that user has rated
validNeighbors = dict()
userRatings = dict()
movieRatings = dict()

In [275]:
numUsers = 458293 

In [None]:
for index, user in enumerate(df["User"]):
    if user not in userRatings:
        userRatings[user] = [df["Rating"][index]]
    else:
        userRatings[user].append(df["Rating"][index])
    
    if df["Movie"][index] not in movieRatings:
        movieRatings[df["Movie"][index]] =[ df["Rating"][index]]
    else:  
        movieRatings[df["Movie"][index]].append(df["Rating"][index])
        
    if index > 0 and df["User"][index] != df["User"][index-1]:
        moviesRated[df["User"][index-1]] = rated
        rated = dict()
    rated[df["Movie"][index]] = df["Rating"][index]
    if user in onlyUserMovie:
        onlyUserMovie[user].add(df["Movie"][index])
    else:
        onlyUserMovie[user] = {df["Movie"][index]}

for i in range(1,numUsers + 1):
    validNeighbors[i] = set()
        

In [167]:
moviesRated[df["User"][len(df["User"]) - 1]] = rated

In [172]:
for i in range(1,numUsers):   # Our UserIDs range from 1 to 7
    for j in range(i+1, numUsers + 1):
        if len(onlyUserMovie[i] & onlyUserMovie[j]) >= 0.25 * min(len(onlyUserMovie[i]), len(onlyUserMovie[j])):
            validNeighbors[i].add(j)
            validNeighbors[j].add(i)

In [253]:
def pruneValidNeighbors(movieID, user, validNeighbors, userMovieDict):
    realValidNeighbors = set()
    for i in validNeighbors[user]:
        if movieID in userMovieDict[i]:
            realValidNeighbors.add(i)
    return realValidNeighbors

In [254]:
def getRating(movieID, user, validNeighbors, userMovieDict, userRatings, movieRatings, moviesRated):
    # We return average of averages if there are no validneighbors
    if len(validNeighbors) == 0:
        return (sum(userRatings[user])/ len(userRatings[user]) + sum(movieRatings[movieID])/ len(movieRatings[movieID])) / 2
        
    
    numerator = 0
    denominator = 0
    for neighbor in validNeighbors:
        intersection = userMovieDict[user] & userMovieDict[neighbor]
        userRatings = []
        neighborRatings = []
        for m in intersection:
            userRatings.append(moviesRated[user][m])
            neighborRatings.append(moviesRated[neighbor][m])
        corr = (sc.stats.pearsonr(userRatings, neighborRatings)[0] + 1) / 2   # To scale to [0,1]... maybe use cosine later
        numerator += corr * moviesRated[neighbor][m]
        denominator += corr
    del intersection
    del userRatings
    del neighborRatings
    del validNeighbors
    return (numerator / denominator)

In [264]:
def calculateRMSE(y_predicted, y_actual):
    rms = sqrt(mean_squared_error(y_actual, y_predicted))
    return rms

In [267]:
predictions = []
for index, user in enumerate(df["User"]):
    realNeighbors = pruneValidNeighbors(df["Movie"][index], user, validNeighbors, onlyUserMovie)
    predictions.append(getRating(df["Movie"][index], user, realNeighbors, onlyUserMovie, userRatings, movieRatings, moviesRated))

[5, 5, 5, 5, 3, 4, 5, 2, 3, 4, 5, 4, 4, 5, 5, 4, 4, 4, 3, 5, 5, 5, 3, 5, 4, 4, 4, 4, 4, 5, 5, 5, 4, 4, 4, 5, 5, 4, 5, 5]
[4, 3, 5, 3, 3, 4, 5, 2, 3, 3, 4, 3, 1, 4, 1, 2, 3, 4, 3, 2, 4, 4, 3, 3, 4, 3, 3, 3, 4, 3, 4, 3, 4, 4, 2, 4, 3, 3, 2, 2]
[5, 5, 5, 5, 4, 5, 4, 5, 5, 5, 5, 5, 3, 4, 5, 5, 2, 5, 4, 5, 5, 5, 5, 5, 5, 4, 4, 4, 5, 5, 5, 5, 5, 5, 4, 4, 4, 5, 4, 5, 4, 5, 5, 5, 4, 5, 4, 4, 4, 4, 5, 3]
[4, 4, 3, 5, 4, 5, 3, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 2, 3, 3, 3, 3, 3, 4, 4, 4, 5, 4, 3, 4, 4, 4, 3, 4, 4, 5, 3, 2, 4, 3, 5, 4, 3, 5, 3, 4, 4, 3]
[5, 5, 5, 5, 4, 5, 4, 5, 5, 5, 5, 5, 3, 4, 5, 5, 2, 5, 4, 5, 5, 5, 5, 5, 5, 4, 4, 4, 5, 5, 5, 5, 5, 5, 4, 4, 4, 5, 4, 5, 4, 5, 5, 5, 4, 5, 4, 4, 4, 4, 5, 3]
[4, 4, 3, 5, 4, 5, 3, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 2, 3, 3, 3, 3, 3, 4, 4, 4, 5, 4, 3, 4, 4, 4, 3, 4, 4, 5, 3, 2, 4, 3, 5, 4, 3, 5, 3, 4, 4, 3]
[5, 5, 5, 5, 3, 4, 5, 2, 3, 4, 5, 4, 4, 5, 5, 4, 4, 4, 3, 5, 5, 5, 3, 5, 4, 4, 4, 4, 4, 5, 5, 5, 4, 4, 4, 5, 5, 4, 5, 5]
[4, 3, 5,

In [268]:
calculateRMSE(predictions, df["Rating"])

0.9744485044702403