In [114]:
import matplotlib.pyplot as plt
import csv
import pandas as pd
import operator
from statistics import mean,pstdev
import numpy as np
import random
import copy

In [115]:
path="ml-latest-small/"
ratings_path=path+"ratings.csv"
movies_path=path+"movies.csv"

In [116]:
ratings_df=pd.read_csv(ratings_path)

In [117]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [118]:
#as the movie IDs are not in particular order and not withing the range of number of movies present in the dataset
#we'll create a mapping from indices to movieides
movie_indices=[]
inverse_movie_map={}

In [119]:
movies_df=pd.read_csv(movies_path)

In [120]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [121]:
for i in range(len(movies_df)):
    movie_indices.append(movies_df['movieId'][i])
    inverse_movie_map[movies_df['movieId'][i]]=i

In [122]:
number_of_items=len(movie_indices)

In [123]:
number_of_items

9742

In [124]:
number_of_users={}
for i in range(len(ratings_df)):
    number_of_users[ratings_df['userId'][i]]=1
    

In [125]:
number_of_users=len(number_of_users)

In [126]:
len(ratings_df)

100836

In [127]:
train_nos=random.sample(range(100836), int(0.7*100836))
test_nos=list(range(0,len(ratings_df)))
test_nos=list(set(test_nos)-set(train_nos))

In [128]:
ratings_matrix_train=np.zeros(shape=(number_of_users,number_of_items))

In [129]:
ratings_matrix_train.fill(np.nan)

In [130]:
ratings_matrix_train

array([[nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan]])

In [131]:
for i in train_nos:

    user_id=ratings_df['userId'][i]-1
    movie_id=inverse_movie_map[ratings_df['movieId'][i]]
    ratings_matrix_train[user_id][movie_id]=ratings_df['rating'][i]


In [132]:
ratings_matrix_train

array([[nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [2.5, 2. , 2. , ..., nan, nan, nan],
       [3. , nan, nan, ..., nan, nan, nan],
       [5. , nan, nan, ..., nan, nan, nan]])

In [133]:
len(train_nos)

70585

In [141]:
def mean_with_nan(list_a):
    count=0
    sum=0.0
    for i in list_a:
        if not np.isnan(i):
            #print(i)
            count+=1
            sum+=i
    #print(sum) 
    if(count==0):
        return 0
    return sum/count        
    

In [142]:
def pearson_coff(a,b):
    mean_a=mean_with_nan(a)
    mean_b=mean_with_nan(b)
    #print(mean_a)
    #print(mean_b)
    num=0.0
    den_left=0.0
    den_right=0.0
    for i in range(len(a)):
        if (not np.isnan(a[i])) and (not np.isnan(b[i])):
            num+=(a[i]-mean_a)*(b[i]-mean_b)
            den_left+=(a[i]-mean_a)*(a[i]-mean_a)
            den_right+=(b[i]-mean_b)*(b[i]-mean_b)
    den_left=np.sqrt(den_left)
    #print(den_left)
    den_right=np.sqrt(den_right)
    #print(den_right)
    if (den_right==0 or den_left==0):
        return -1
    return num/((den_left)*(den_right))

In [143]:
user_sim_matrix_pearson=np.zeros(shape=(number_of_users,number_of_users))
user_sim_matrix_pearson.fill(np.nan)
for i in range(0,number_of_users):
    for j in range(i,number_of_users):
        user_sim_matrix_pearson[i][j]=pearson_coff(ratings_matrix_train[i],ratings_matrix_train[j])
        user_sim_matrix_pearson[j][i]=user_sim_matrix_pearson[i][j]

In [144]:
def cosine_sim(a,b):
    mean_a=mean_with_nan(a)
    mean_b=mean_with_nan(b)
    #print(mean_a)
    #print(mean_b)
    num=0.0
    den_left=0.0
    den_right=0.0
    for i in range(len(a)):
        if (not np.isnan(a[i])) and (not np.isnan(b[i])):
            num+=(a[i])*(b[i])
            den_left+=(a[i])*(a[i])
            den_right+=(b[i])*(b[i])
    den_left=np.sqrt(den_left)
    #print(den_left)
    den_right=np.sqrt(den_right)
    #print(den_right)
    if (den_right==0 or den_left==0):
        return -1
    return num/((den_left)*(den_right))

In [145]:
user_sim_matrix_cosine=np.zeros(shape=(number_of_users,number_of_users))
user_sim_matrix_cosine.fill(np.nan)
for i in range(0,number_of_users):
    for j in range(i,number_of_users):
        user_sim_matrix_cosine[i][j]=cosine_sim(ratings_matrix_train[i],ratings_matrix_train[j])
        user_sim_matrix_cosine[j][i]=user_sim_matrix_cosine[i][j]

In [146]:
ratings_matrix_train_mean_centred=ratings_matrix_train.copy()
user_means=[]
for i in range(0,number_of_users):
    user_means.append(mean_with_nan(ratings_matrix_train[i]))
    for j in range(0,number_of_items):
        if(not np.isnan(ratings_matrix_train_mean_centred[i][j])):
            ratings_matrix_train_mean_centred[i][j]-=user_means[i]
        

In [156]:
np.save(path+'user_sim_matrix_cosine',user_sim_matrix_cosine)
np.save(path+'user_sim_matrix_pearson',user_sim_matrix_pearson)
np.save(path+'ratings_matrix_train',ratings_matrix_train)
np.save(path+'train_nos',train_nos)
np.save(path+'test_nos',test_nos)
np.save(path+'ratings_matrix_train_mean_centred',ratings_matrix_train_mean_centred)

In [151]:
correct_predictions=[]
for i in test_nos:
    correct_predictions.append(ratings_df['rating'][i])

In [152]:
def mean_absolute_error(a,b):
    error_sum=0.0
    for i in range(len(a)):
        error_sum+=abs(a[i]-b[i])
    error_sum/=len(a)
    return error_sum        

In [153]:
def rmse_error(a,b):
    error_sum=0.0
    for i in range(len(a)):
        error_sum+=(a[i]-b[i])*(a[i]-b[i])
    error_sum/=len(a)
    return np.sqrt(error_sum)       

In [155]:
k=20
pearson_raw_result=[]
for i in test_nos:
    movie_no=inverse_movie_map[ratings_df['movieId'][i]]
    user_no=ratings_df['userId'][i]-1
    top_k_similar_users_same_item_rated=[]
    for j in range(0,number_of_users):
        if(j!=user_no):
            if(not np.isnan(ratings_matrix_train[j][movie_no])):
                top_k_similar_users_same_item_rated.append((user_sim_matrix_pearson[user_no][j],ratings_matrix_train[j][movie_no]))
    top_k_similar_users_same_item_rated=sorted(top_k_similar_users_same_item_rated,key=lambda x: x[0],reverse=True)  
    num=0.0
    den=0.0
    for p in range(0,k):
        num+=top_k_similar_users_same_item_rated[p][0]*top_k_similar_users_same_item_rated[p][1]
        den+=abs(top_k_similar_users_same_item_rated[p][0])
    pearson_raw_result.append(num/den)    

IndexError: list index out of range

In [161]:
item_sim_matrix_pearson=np.zeros(shape=(number_of_items,number_of_items))
item_sim_matrix_pearson.fill(np.nan)
for i in range(0,number_of_items):
    if(i%50==0):
        print(i)
    for j in range(i,number_of_items):
        item_sim_matrix_pearson[i][j]=pearson_coff(ratings_matrix_train[:,i],ratings_matrix_train[:,j])
        item_sim_matrix_pearson[j][i]=item_sim_matrix_pearson[i][j]

MemoryError: 

In [160]:
np.save(path+'item_sim_matrix_pearson',item_sim_matrix_pearson)

In [None]:
item_sim_matrix_cosine=np.zeros(shape=(number_of_items,number_of_items))
item_sim_matrix_cosine.fill(np.nan)
for i in range(0,number_of_items):
    for j in range(i,number_of_items):
        item_sim_matrix_cosine[i][j]=cosine_sim(ratings_matrix_train[:,i],ratings_matrix_train[:,j])
        item_sim_matrix_cosine[j][i]=item_sim_matrix_cosine[i][j]

In [148]:
np.save(path+'item_sim_matrix_cosine',item_sim_matrix_cosine)