# Recommendation systems based on neighborhood-based approaches

In [None]:
import matplotlib.pyplot as plt
import csv
import pandas as pd
import operator
from statistics import mean,pstdev
import numpy as np
import random
import copy

In [2]:
# Directory holding the input CSV files, relative to this notebook.
path = "../ml-latest-small/"
# Both input files live directly inside that directory.
ratings_path, movies_path = (path + file_name for file_name in ("ratings.csv", "movies.csv"))

In [4]:
# Load the ratings table from ratings_path and preview its first rows.
ratings_df=pd.read_csv(ratings_path)
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


The ratings matrix is sparse and the movieIds are not contiguous, so we map every movieId to a compact column index; this saves space when the matrix is generated.

In [5]:
# movie_indices: list that a later cell fills with the movieId of every row of movies_df.
# inverse_movie_map: dict that a later cell fills with movieId -> row position.
movie_indices=[] 
inverse_movie_map={}
# Load the movies table from movies_path and preview its first rows.
movies_df=pd.read_csv(movies_path)
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


The two steps given below can be optimised by converting the lists into sets and taking their sizes.

In [6]:
# Rebuild both lookups from scratch so that re-running this cell is idempotent:
# appending to the list created in an earlier cell would duplicate every entry
# and double number_of_items on a second run.
movie_indices=movies_df['movieId'].tolist()
# movieId -> compact column index used by the ratings matrix.
inverse_movie_map={movie_id:index for index,movie_id in enumerate(movie_indices)}
number_of_items=len(movie_indices)

In [7]:
# Count the distinct users in one vectorised call instead of rebinding the same
# name from a dict to an int after a row-by-row Python loop.
# NOTE(review): later cells use userId-1 as a matrix row index, which presumes the
# userIds are exactly 1..number_of_users - confirm for other datasets.
number_of_users=ratings_df['userId'].nunique()

In [10]:
# 70/30 split over the row positions of ratings_df.
# The row count is taken from the frame itself (it used to be hard-coded), and a
# dedicated seeded generator makes the split reproducible across kernel restarts
# without touching the global random state.
SPLIT_SEED=42
split_rng=random.Random(SPLIT_SEED)
number_of_ratings=len(ratings_df)
train_nos=split_rng.sample(range(number_of_ratings), int(0.7*number_of_ratings))
# Sorted so that the order of the test rows (and of every prediction list built
# from it) is deterministic.
test_nos=sorted(set(range(number_of_ratings))-set(train_nos))

We create our ratings matrix, note that we only put the values from the train set

In [11]:
# Dense user x item matrix; NaN marks "no rating in the training split".
ratings_matrix_train=np.full((number_of_users,number_of_items),np.nan)
for train_idx in train_nos:
    # Rows are userId-1; columns come from the movieId -> index map.
    row=ratings_df['userId'][train_idx]-1
    col=inverse_movie_map[ratings_df['movieId'][train_idx]]
    ratings_matrix_train[row][col]=ratings_df['rating'][train_idx]


array([4, 3])

In [None]:
def mean_with_nan(list_a):
    """Return the arithmetic mean of the non-NaN entries of ``list_a``.

    NaN entries are skipped. Returns 0 when there is no non-NaN entry
    (which includes an empty input).
    """
    total=0.0
    n_valid=0
    for value in list_a:
        if np.isnan(value):
            continue
        total+=value
        n_valid+=1
    return total/n_valid if n_valid else 0
    

In [None]:
def pearson_coff(a,b):
    """Pearson-style correlation between two rating vectors containing NaNs.

    Each vector is centred on the mean of *all* its non-NaN entries (via
    ``mean_with_nan``); the sums then run only over positions where both
    vectors have a value.

    Returns -1 when either centred vector has zero norm over the shared
    positions (this includes the case of no shared positions).
    """
    mean_a,mean_b=mean_with_nan(a),mean_with_nan(b)
    cross=0.0
    sq_a=0.0
    sq_b=0.0
    for idx in range(len(a)):
        # Skip positions that are missing in either vector.
        if np.isnan(a[idx]) or np.isnan(b[idx]):
            continue
        cross+=(a[idx]-mean_a)*(b[idx]-mean_b)
        sq_a+=(a[idx]-mean_a)*(a[idx]-mean_a)
        sq_b+=(b[idx]-mean_b)*(b[idx]-mean_b)
    norm_a=np.sqrt(sq_a)
    norm_b=np.sqrt(sq_b)
    if norm_a==0 or norm_b==0:
        return -1
    return cross/(norm_a*norm_b)

Creating User Similarity Matrix based on pearson coefficient

In [None]:
# Symmetric user-user Pearson similarity; only the upper triangle is computed
# and each value is mirrored into the lower triangle.
user_sim_matrix_pearson=np.full((number_of_users,number_of_users),np.nan)
for first in range(number_of_users):
    for second in range(first,number_of_users):
        user_sim_matrix_pearson[first][second]=pearson_coff(ratings_matrix_train[first],ratings_matrix_train[second])
        user_sim_matrix_pearson[second][first]=user_sim_matrix_pearson[first][second]

In [None]:
def cosine_sim(a,b):
    """Cosine similarity between two rating vectors containing NaNs.

    Only positions where both vectors hold a value contribute to the dot
    product and to both norms.

    Parameters:
        a, b: equal-length sequences or arrays of floats; NaN marks a
            missing rating.

    Returns:
        The cosine of the angle between the two vectors restricted to their
        shared positions, or -1 when either restricted vector has zero norm
        (this includes the case of no shared positions).
    """
    # The two means the previous version computed here were never used; they
    # cost a full pass over each vector on every call.
    a=np.asarray(a,dtype=float)
    b=np.asarray(b,dtype=float)
    both_rated=~(np.isnan(a)|np.isnan(b))
    a_common=a[both_rated]
    b_common=b[both_rated]
    den_left=np.sqrt(np.dot(a_common,a_common))
    den_right=np.sqrt(np.dot(b_common,b_common))
    if den_left==0 or den_right==0:
        return -1
    return np.dot(a_common,b_common)/(den_left*den_right)

Creating User Similarity Matrix based on cosine similarity

In [None]:
# Symmetric user-user cosine similarity; only the upper triangle is computed
# and each value is mirrored into the lower triangle.
user_sim_matrix_cosine=np.full((number_of_users,number_of_users),np.nan)
for first in range(number_of_users):
    for second in range(first,number_of_users):
        user_sim_matrix_cosine[first][second]=cosine_sim(ratings_matrix_train[first],ratings_matrix_train[second])
        user_sim_matrix_cosine[second][first]=user_sim_matrix_cosine[first][second]

In [None]:
# np.save(path+'user_sim_matrix_cosine',user_sim_matrix_cosine)
# np.save(path+'user_sim_matrix_pearson',user_sim_matrix_pearson)
# np.save(path+'ratings_matrix_train',ratings_matrix_train)
# np.save(path+'train_nos',train_nos)
# np.save(path+'test_nos',test_nos)
#np.save(path+'ratings_matrix_train_mean_centred',ratings_matrix_train_mean_centred)

In [None]:
# Ground-truth ratings of the held-out rows, in the same order as test_nos.
correct_predictions=[ratings_df['rating'][test_idx] for test_idx in test_nos]

In [None]:
def stdv_with_nan(list_a):
    """Population standard deviation of the non-NaN entries of ``list_a``.

    Returns 0.00001 instead of 0 (fewer than two usable values, or all
    values equal) so that callers can safely divide by the result.
    """
    # Collect the values themselves. The previous version appended the global
    # ``k`` here, so the result was always 0.00001 (or a NameError when ``k``
    # did not exist yet).
    observed=[value for value in list_a if not np.isnan(value)]
    if len(observed)<2:
        return 0.00001
    spread=pstdev(observed)
    return spread if spread!=0 else 0.00001

In [None]:
def mean_absolute_error(a,b):
    """Mean of ``|a[i]-b[i]|`` over every index of ``a``."""
    total=0.0
    for idx,value in enumerate(a):
        total+=abs(value-b[idx])
    return total/len(a)

In [None]:
def rmse_error(a,b):
    """Root of the mean of ``(a[i]-b[i])**2`` over every index of ``a``."""
    squared_total=0.0
    for idx,value in enumerate(a):
        squared_total+=(value-b[idx])*(value-b[idx])
    return np.sqrt(squared_total/len(a))

Finding predictions without using mean centring

In [None]:
k=10
pearson_raw_result=[]
for test_idx in test_nos:
    movie_no=inverse_movie_map[ratings_df['movieId'][test_idx]]
    user_no=ratings_df['userId'][test_idx]-1

    # (similarity, raw rating) for every other user who rated this movie in training.
    neighbours=[
        (user_sim_matrix_pearson[user_no][other],ratings_matrix_train[other][movie_no])
        for other in range(number_of_users)
        if other!=user_no and not np.isnan(ratings_matrix_train[other][movie_no])
    ]
    neighbours.sort(key=lambda pair: pair[0],reverse=True)
    if not neighbours:
        # Nobody else rated the movie: fall back to the user's own mean rating.
        pearson_raw_result.append(mean_with_nan(ratings_matrix_train[user_no]))
        continue
    # Similarity-weighted average over at most the k most similar neighbours.
    num=0.0
    den=0.0
    for similarity,rating in neighbours[:k]:
        num+=similarity*rating
        den+=abs(similarity)
    pearson_raw_result.append(num/den)


In [None]:
k=10
cosine_raw_result=[]
for test_idx in test_nos:
    movie_no=inverse_movie_map[ratings_df['movieId'][test_idx]]
    user_no=ratings_df['userId'][test_idx]-1

    # (similarity, raw rating) for every other user who rated this movie in training.
    neighbours=[
        (user_sim_matrix_cosine[user_no][other],ratings_matrix_train[other][movie_no])
        for other in range(number_of_users)
        if other!=user_no and not np.isnan(ratings_matrix_train[other][movie_no])
    ]
    neighbours.sort(key=lambda pair: pair[0],reverse=True)
    if not neighbours:
        # Nobody else rated the movie: fall back to the user's own mean rating.
        cosine_raw_result.append(mean_with_nan(ratings_matrix_train[user_no]))
        continue
    # Similarity-weighted average over at most the k most similar neighbours.
    num=0.0
    den=0.0
    for similarity,rating in neighbours[:k]:
        num+=similarity*rating
        den+=abs(similarity)
    cosine_raw_result.append(num/den)


Finding predictions using mean centring

In [None]:
k=10
# Mean training rating of every user (0 for a user without training ratings).
# No earlier cell defines this list, so a fresh "Restart & Run All" used to stop
# here with a NameError; the following mean-centred cells reuse it.
mean_of_users=[mean_with_nan(user_ratings) for user_ratings in ratings_matrix_train]
cosine_result=[]
for test_idx in test_nos:
    movie_no=inverse_movie_map[ratings_df['movieId'][test_idx]]
    user_no=ratings_df['userId'][test_idx]-1

    # (similarity, mean-centred rating) for every other user who rated this movie.
    neighbours=[
        (user_sim_matrix_cosine[user_no][other],ratings_matrix_train[other][movie_no]-mean_of_users[other])
        for other in range(number_of_users)
        if other!=user_no and not np.isnan(ratings_matrix_train[other][movie_no])
    ]
    neighbours.sort(key=lambda pair: pair[0],reverse=True)
    if not neighbours:
        # Nobody else rated the movie: fall back to the user's own mean rating.
        cosine_result.append(mean_with_nan(ratings_matrix_train[user_no]))
        continue
    # User mean plus the similarity-weighted average deviation of at most the
    # k most similar neighbours.
    num=0.0
    den=0.0
    for similarity,centred_rating in neighbours[:k]:
        num+=similarity*centred_rating
        den+=abs(similarity)
    cosine_result.append(mean_of_users[user_no]+(num/den))


In [None]:
k=10
pearson_result=[]
for test_idx in test_nos:
    movie_no=inverse_movie_map[ratings_df['movieId'][test_idx]]
    user_no=ratings_df['userId'][test_idx]-1

    # (similarity, mean-centred rating) for every other user who rated this movie.
    neighbours=[
        (user_sim_matrix_pearson[user_no][other],ratings_matrix_train[other][movie_no]-mean_of_users[other])
        for other in range(number_of_users)
        if other!=user_no and not np.isnan(ratings_matrix_train[other][movie_no])
    ]
    neighbours.sort(key=lambda pair: pair[0],reverse=True)
    if not neighbours:
        # Nobody else rated the movie: fall back to the user's own mean rating.
        pearson_result.append(mean_with_nan(ratings_matrix_train[user_no]))
        continue
    # User mean plus the similarity-weighted average deviation of at most the
    # k most similar neighbours.
    num=0.0
    den=0.0
    for similarity,centred_rating in neighbours[:k]:
        num+=similarity*centred_rating
        den+=abs(similarity)
    pearson_result.append(mean_of_users[user_no]+(num/den))


In [None]:
# RMSE of the four user-based predictors, one line each:
# mean-centred Pearson, mean-centred cosine, raw Pearson, raw cosine.
for predictions in (pearson_result,cosine_result,pearson_raw_result,cosine_raw_result):
    print(rmse_error(correct_predictions,predictions))

In [None]:
# MAE of the four user-based predictors, one line each:
# mean-centred Pearson, mean-centred cosine, raw Pearson, raw cosine.
for predictions in (pearson_result,cosine_result,pearson_raw_result,cosine_raw_result):
    print(mean_absolute_error(correct_predictions,predictions))

Using z-scores for predictions, i.e., accounting for each user's standard deviation as well

In [None]:
k=10
# Standard deviation of every user's training ratings, computed once up front.
# It used to be recomputed for every neighbour of every test rating, although it
# only depends on the user.
std_of_users=[stdv_with_nan(user_ratings) for user_ratings in ratings_matrix_train]
pearson_std_result=[]
for test_idx in test_nos:
    movie_no=inverse_movie_map[ratings_df['movieId'][test_idx]]
    user_no=ratings_df['userId'][test_idx]-1

    # (similarity, z-score of the rating) for every other user who rated this movie.
    neighbours=[
        (user_sim_matrix_pearson[user_no][other],(ratings_matrix_train[other][movie_no]-mean_of_users[other])/std_of_users[other])
        for other in range(number_of_users)
        if other!=user_no and not np.isnan(ratings_matrix_train[other][movie_no])
    ]
    neighbours.sort(key=lambda pair: pair[0],reverse=True)
    if not neighbours:
        # Nobody else rated the movie: fall back to the user's own mean rating.
        pearson_std_result.append(mean_with_nan(ratings_matrix_train[user_no]))
        continue
    # Weighted average z-score of at most the k most similar neighbours, mapped
    # back to the target user's own scale (mean + std * z).
    num=0.0
    den=0.0
    for similarity,z_score in neighbours[:k]:
        num+=similarity*z_score
        den+=abs(similarity)
    pearson_std_result.append(mean_of_users[user_no]+std_of_users[user_no]*(num/den))


In [None]:
# RMSE, then MAE, of the z-score based predictions.
for metric in (rmse_error,mean_absolute_error):
    print(metric(correct_predictions,pearson_std_result))

Addressing the long-tail issue using inverse user frequency

In [None]:
# Inverse user frequency of every item: log(number_of_users / number of users
# who rated the item in the training split).
# The raters are counted with one vectorised pass over the matrix instead of a
# users x items Python loop.
ratings_per_item=np.count_nonzero(~np.isnan(ratings_matrix_train),axis=0)
# The small offset keeps the ratio finite for items nobody rated in training.
weights=[np.log(number_of_users/(rater_count+0.000005)) for rater_count in ratings_per_item]

In [None]:
# For every user row: the item columns that hold a training rating.
user_dict={
    user:[item for item in range(number_of_items) if not np.isnan(ratings_matrix_train[user][item])]
    for user in range(number_of_users)
}

In [None]:
def sim(a,b):
    """Item-weighted Pearson-style similarity between users ``a`` and ``b``.

    ``a`` and ``b`` are row indices into ``ratings_matrix_train``. Every item
    rated by both users contributes its mean-centred product scaled by
    ``weights[item]``; the same weight scales both squared-deviation sums.

    Returns -1 when either weighted norm is zero (this includes the case of
    no co-rated item).
    """
    mean_a,mean_b=mean_of_users[a],mean_of_users[b]
    cross=0.0
    sq_a=0.0
    sq_b=0.0
    # Only items rated by ``a`` can be co-rated, so walk a's item list.
    for item in user_dict[a]:
        if np.isnan(ratings_matrix_train[b][item]):
            continue
        rating_a=ratings_matrix_train[a][item]
        rating_b=ratings_matrix_train[b][item]
        cross+=(rating_a-mean_a)*(rating_b-mean_b)*weights[item]
        sq_a+=(rating_a-mean_a)*(rating_a-mean_a)*weights[item]
        sq_b+=(rating_b-mean_b)*(rating_b-mean_b)*weights[item]
    norm_a=np.sqrt(sq_a)
    norm_b=np.sqrt(sq_b)
    if norm_a==0 or norm_b==0:
        return -1
    return cross/(norm_a*norm_b)

In [None]:
# Symmetric user-user matrix of the item-weighted similarity; only the upper
# triangle is computed and each value is mirrored.
sim_matrix_tail=np.full((number_of_users,number_of_users),np.nan)
for first in range(number_of_users):
    # Progress marker every 50 rows.
    if not first%50:
        print(first)
    for second in range(first,number_of_users):
        sim_matrix_tail[first][second]=sim(first,second)
        sim_matrix_tail[second][first]=sim_matrix_tail[first][second]

In [None]:
k=10
tail_result=[]
for test_idx in test_nos:
    movie_no=inverse_movie_map[ratings_df['movieId'][test_idx]]
    user_no=ratings_df['userId'][test_idx]-1

    # (similarity, mean-centred rating) for every other user who rated this movie.
    neighbours=[
        (sim_matrix_tail[user_no][other],ratings_matrix_train[other][movie_no]-mean_of_users[other])
        for other in range(number_of_users)
        if other!=user_no and not np.isnan(ratings_matrix_train[other][movie_no])
    ]
    neighbours.sort(key=lambda pair: pair[0],reverse=True)
    if not neighbours:
        # Nobody else rated the movie: fall back to the user's own mean rating.
        tail_result.append(mean_with_nan(ratings_matrix_train[user_no]))
        continue
    # User mean plus the similarity-weighted average deviation of at most the
    # k most similar neighbours.
    num=0.0
    den=0.0
    for similarity,centred_rating in neighbours[:k]:
        num+=similarity*centred_rating
        den+=abs(similarity)
    tail_result.append(mean_of_users[user_no]+(num/den))


In [None]:
# Symmetric item-item Pearson similarity, computed on the columns of the
# training matrix; only the upper triangle is computed and mirrored.
item_sim_matrix_pearson=np.full((number_of_items,number_of_items),np.nan)
for first in range(number_of_items):
    # Progress marker every 50 rows.
    if not first%50:
        print(first)
    for second in range(first,number_of_items):
        item_sim_matrix_pearson[first][second]=pearson_coff(ratings_matrix_train[:,first],ratings_matrix_train[:,second])
        item_sim_matrix_pearson[second][first]=item_sim_matrix_pearson[first][second]

In [None]:
# Symmetric item-item cosine similarity, computed on the columns of the
# training matrix; only the upper triangle is computed and mirrored.
item_sim_matrix_cosine=np.full((number_of_items,number_of_items),np.nan)
for first in range(number_of_items):
    # Progress marker every 50 rows.
    if not first%50:
        print(first)
    for second in range(first,number_of_items):
        item_sim_matrix_cosine[first][second]=cosine_sim(ratings_matrix_train[:,first],ratings_matrix_train[:,second])
        item_sim_matrix_cosine[second][first]=item_sim_matrix_cosine[first][second]

In [None]:
k=10
# Mean training rating of every item (0 for an item without training ratings).
# No earlier cell defines this list, so a fresh "Restart & Run All" used to stop
# here with a NameError; the Pearson item-based cell reuses it.
mean_of_items=[mean_with_nan(ratings_matrix_train[:,item]) for item in range(number_of_items)]
cosine_item_result=[]
for test_idx in test_nos:
    movie_no=inverse_movie_map[ratings_df['movieId'][test_idx]]
    user_no=ratings_df['userId'][test_idx]-1

    # (similarity, mean-centred rating) for every other item this user rated.
    neighbours=[
        (item_sim_matrix_cosine[movie_no][other],ratings_matrix_train[user_no][other]-mean_of_items[other])
        for other in range(number_of_items)
        if other!=movie_no and not np.isnan(ratings_matrix_train[user_no][other])
    ]
    neighbours.sort(key=lambda pair: pair[0],reverse=True)
    if not neighbours:
        # The user rated nothing else: fall back to the item's own mean rating.
        cosine_item_result.append(mean_with_nan(ratings_matrix_train[:,movie_no]))
        continue
    # Item mean plus the similarity-weighted average deviation of at most the
    # k most similar items. A single code path now serves both "k or more" and
    # "fewer than k" neighbours; the latter used to read the undefined name
    # item_no instead of movie_no.
    num=0.0
    den=0.0
    for similarity,centred_rating in neighbours[:k]:
        num+=similarity*centred_rating
        den+=abs(similarity)
    cosine_item_result.append(mean_of_items[movie_no]+(num/den))


In [None]:
k=10
pearson_item_result=[]
for test_idx in test_nos:
    movie_no=inverse_movie_map[ratings_df['movieId'][test_idx]]
    user_no=ratings_df['userId'][test_idx]-1

    # (similarity, mean-centred rating) for every other item this user rated.
    neighbours=[
        (item_sim_matrix_pearson[movie_no][other],ratings_matrix_train[user_no][other]-mean_of_items[other])
        for other in range(number_of_items)
        if other!=movie_no and not np.isnan(ratings_matrix_train[user_no][other])
    ]
    neighbours.sort(key=lambda pair: pair[0],reverse=True)
    if not neighbours:
        # The user rated nothing else: fall back to the item's own mean rating.
        pearson_item_result.append(mean_with_nan(ratings_matrix_train[:,movie_no]))
        continue
    # Item mean plus the similarity-weighted average deviation of at most the
    # k most similar items. A single code path now serves both "k or more" and
    # "fewer than k" neighbours; the latter used to read the undefined name
    # item_no instead of movie_no.
    num=0.0
    den=0.0
    for similarity,centred_rating in neighbours[:k]:
        num+=similarity*centred_rating
        den+=abs(similarity)
    pearson_item_result.append(mean_of_items[movie_no]+(num/den))


In [None]:
# RMSE (Pearson, cosine) followed by MAE (Pearson, cosine) of the item-based predictors.
for metric in (rmse_error,mean_absolute_error):
    for predictions in (pearson_item_result,cosine_item_result):
        print(metric(correct_predictions,predictions))

Trying a fusion of both approaches (user-based and item-based)!

In [None]:
def mean_list(a,b):
    """Return the element-wise average of ``a`` and ``b`` over the indices of ``a``."""
    return [(a[idx]+b[idx])/2 for idx in range(len(a))]

In [None]:
def mean_list_1(a,b,error_a,error_b):
    """Element-wise weighted average of ``a`` and ``b`` over the indices of ``a``.

    Each list is weighted by the *other* list's error, so the list with the
    smaller error contributes more to the result.
    """
    return [
        (a[idx]*error_b+b[idx]*error_a)/(error_a+error_b)
        for idx in range(len(a))
    ]

In [None]:
# Unweighted fusion: element-wise average of the user-based and item-based
# predictions, once for the Pearson pair and once for the cosine pair.
hybrid_pearson=mean_list(pearson_result,pearson_item_result)
hybrid_cosine=mean_list(cosine_result,cosine_item_result)

In [None]:
# Errors of the two predictors being fused. Neither name was defined anywhere in
# the notebook, so this cell used to fail with a NameError.
# NOTE(review): MAE is used as the weighting error because the fused result is
# scored with MAE below - confirm this matches the intended weighting.
cosine_error=mean_absolute_error(correct_predictions,cosine_result)
cosine_item_error=mean_absolute_error(correct_predictions,cosine_item_result)
# MAE of the error-weighted fusion of the user-based and item-based cosine predictions.
print(mean_absolute_error(correct_predictions,mean_list_1(cosine_result,cosine_item_result,cosine_error,cosine_item_error)))

Using K Means to determine neighborhood

In [None]:
def manhattan_distance(a,b):
    """Mean absolute difference over the positions where both vectors have a value.

    Returns 99 when the two vectors share no non-NaN position.
    """
    total=0.0
    shared=0
    for idx,value_a in enumerate(a):
        # Skip positions that are missing in either vector.
        if np.isnan(value_a) or np.isnan(b[idx]):
            continue
        total+=abs(value_a-b[idx])
        shared+=1
    return total/shared if shared else 99

In [None]:
# Symmetric user-user matrix of the NaN-aware Manhattan distance; only the
# upper triangle is computed and each value is mirrored.
dist_matrix=np.full((number_of_users,number_of_users),np.nan)
for first in range(number_of_users):
    # Progress marker every 50 rows.
    if not first%50:
        print(first)
    dist_matrix[first][first]=0
    # The range starts at ``first``, so the diagonal entry set above is
    # overwritten with the computed self-distance.
    for second in range(first,number_of_users):
        dist_matrix[first][second]=manhattan_distance(ratings_matrix_train[first],ratings_matrix_train[second])
        dist_matrix[second][first]=dist_matrix[first][second]
        

In [None]:
def mean_sim(list_a):
    """Return the plain average of ``list_a``, or -1 for an empty list."""
    total=0.0
    for value in list_a:
        total+=value
    if not len(list_a):
        return -1
    return total/len(list_a)

In [None]:
# Push every self-similarity marginally above 1 so that a user chosen as a
# centroid is always assigned to its own cluster.
np.fill_diagonal(user_sim_matrix_pearson,1.000000002)

In [None]:
# Number of clusters.
k=20
# centroid user index -> list of member user indices; users 20..39 seed the clusters.
centroids={20+offset:[] for offset in range(k)}

Main K-Means Algorithm

In [None]:
# Medoid-style clustering on the Pearson similarity matrix, run for 13 rounds.
# ``centroids`` maps a centroid user index to the list of its member users.
for iteration in range(13):
    # --- Assignment step: attach every user to its most similar centroid. ---
    for key in centroids:
        centroids[key]=[]
    for i in range(number_of_users):
        max_sim=-99999
        max_cent=-1
        for key in centroids:
            if(max_sim<user_sim_matrix_pearson[key][i]):
                max_sim=user_sim_matrix_pearson[key][i]
                max_cent=key
        centroids[max_cent].append(i)

    # --- Update step: the new centroid of a cluster is the member with the
    # highest average similarity to the other members. ---
    centroids1={}
    for key in centroids:
        members=centroids[key]
        max_sim=-9999999
        # An empty cluster keeps its current centroid.
        new_centroid=key
        for candidate in members:
            # Index the similarity matrix with user ids. The previous version
            # used positions inside the member list, i.e. it compared
            # unrelated users, and its inner loop variable overwrote the
            # global k.
            sims_to_others=[
                user_sim_matrix_pearson[candidate][other]
                for other in members
                if other!=candidate
            ]
            cur_sim=mean_sim(sims_to_others)
            if(max_sim<cur_sim):
                max_sim=cur_sim
                new_centroid=candidate
        centroids1[new_centroid]=[]
    centroids=centroids1

    # --- Re-assignment against the updated centroids. ---
    for key in centroids:
        centroids[key]=[]
    for i in range(number_of_users):
        max_sim=-1.1
        max_cent=-1
        for key in centroids:
            if(max_sim<user_sim_matrix_pearson[key][i]):
                max_sim=user_sim_matrix_pearson[key][i]
                max_cent=key
        centroids[max_cent].append(i)


In [None]:
# user index -> centroid of the cluster the user belongs to.
cluster_dict={}
for centroid,members in centroids.items():
    # The centroid maps to itself first; its members are then written on top.
    cluster_dict[centroid]=centroid
    cluster_dict.update(dict.fromkeys(members,centroid))

Using only entries present in cluster for prediction (basic KMeans)

In [None]:
k=10
pearson_kmeans_result=[]
for test_idx in test_nos:
    movie_no=inverse_movie_map[ratings_df['movieId'][test_idx]]
    user_no=ratings_df['userId'][test_idx]-1

    # Candidate neighbours are restricted to the members of the user's own cluster.
    cluster_members=centroids[cluster_dict[user_no]]
    neighbours=[
        (user_sim_matrix_pearson[user_no][other],ratings_matrix_train[other][movie_no]-mean_of_users[other])
        for other in cluster_members
        if other!=user_no and not np.isnan(ratings_matrix_train[other][movie_no])
    ]
    neighbours.sort(key=lambda pair: pair[0],reverse=True)
    if not neighbours:
        # No cluster member rated the movie: fall back to the user's own mean rating.
        pearson_kmeans_result.append(mean_with_nan(ratings_matrix_train[user_no]))
        continue
    # User mean plus the similarity-weighted average deviation of at most the
    # k most similar cluster members; the tiny initial denominator avoids a
    # division by zero.
    num=0.0
    den=0.00000009
    for similarity,centred_rating in neighbours[:k]:
        num+=similarity*centred_rating
        den+=abs(similarity)
    pearson_kmeans_result.append(mean_of_users[user_no]+(num/den))


In [None]:
# RMSE, then MAE, of the cluster-restricted predictions.
for metric in (rmse_error,mean_absolute_error):
    print(metric(correct_predictions,pearson_kmeans_result))

Extending Normal KMeans for data smoothing

In [None]:
# 1.0 where the training matrix has no rating, 0.0 where it holds one.
check_matrix=np.isnan(ratings_matrix_train).astype(float)

In [None]:
# Fill every missing entry of ratings_matrix_train in place with a smoothed
# estimate: the user's mean plus the average mean-centred rating that members
# of the user's cluster gave to the item.
# NOTE(review): mean_of_users is not assigned in any cell visible before this
# one in the original notebook - confirm where it is defined.
for i in range(number_of_users):
    for j in range(number_of_items):
        if(np.isnan(ratings_matrix_train[i][j])):
            Ru_bar=mean_of_users[i]
            this_key=cluster_dict[i]
            list_a=centroids[this_key]
            num=0.0
            den=0
            # Only entries flagged 0 in check_matrix (not NaN when check_matrix
            # was built) contribute, so values written by this loop are never
            # read back. Note that this loop variable overwrites the global k.
            for k in list_a:
                if(check_matrix[k][j]==0):
                    num+=ratings_matrix_train[k][j]-mean_of_users[k]
                    den+=1
            # No cluster member rated the item: the estimate is the user mean alone.
            to_add=0
            if(den>0):
                to_add=num/den
            ratings_matrix_train[i][j]=Ru_bar+to_add    

In [None]:
# Replace the in-memory matrix with the array stored in "fake_ratings.npy".
# NOTE(review): no visible cell writes this file - presumably a saved copy of
# the smoothed matrix; confirm before relying on it.
ratings_matrix_train=np.load(path+"fake_ratings.npy")

In [None]:
# Weight given to entries flagged 0 in check_matrix; entries flagged otherwise get 1-weight.
weight=0.6
def new_sim(a,b,mean_a,mean_b,a_no):
    """Weighted Pearson-style similarity between rating vectors ``a`` and ``b``.

    ``mean_a`` and ``mean_b`` are the values the vectors are centred on and
    ``a_no`` is the row of ``check_matrix`` that belongs to ``a``. For every
    position where both vectors have a value, the weight is ``weight`` when
    ``check_matrix[a_no][position]`` is 0 and ``1-weight`` otherwise. The
    weight scales the cross term and (squared) the norm of ``a``; the norm of
    ``b`` is unweighted, so the measure is not symmetric.

    Returns -1 when either norm is zero.
    """
    cross=0.0
    sq_a=0.0
    sq_b=0.0
    for idx in range(len(a)):
        # Skip positions that are missing in either vector.
        if np.isnan(a[idx]) or np.isnan(b[idx]):
            continue
        w=weight if check_matrix[a_no][idx]==0 else 1-weight
        cross+=(a[idx]-mean_a)*(b[idx]-mean_b)*w
        sq_a+=(a[idx]-mean_a)*(a[idx]-mean_a)*w*w
        sq_b+=(b[idx]-mean_b)*(b[idx]-mean_b)
    norm_a=np.sqrt(sq_a)
    norm_b=np.sqrt(sq_b)
    if norm_a==0 or norm_b==0:
        return -1
    return cross/(norm_a*norm_b)

In [None]:
# Full (non-symmetric) user-user matrix of the weighted similarity; the
# diagonal keeps its initial value 1.
latest_sim_matrix=np.ones((number_of_users,number_of_users))
for first in range(number_of_users):
    # Progress marker every 50 rows.
    if not first%50:
        print(first)
    for second in range(number_of_users):
        if first==second:
            continue
        latest_sim_matrix[first][second]=new_sim(ratings_matrix_train[first],ratings_matrix_train[second],mean_of_users[first],mean_of_users[second],first)

In [None]:
# Persist the weighted similarity matrix next to the input data
# (np.save appends ".npy" to the given file name).
np.save(path+"latest_sim_matrix",latest_sim_matrix)

Predicting Ratings after data smoothing

In [None]:
k=10
second_kmeans_result=[]
for test_idx in test_nos:
    movie_no=inverse_movie_map[ratings_df['movieId'][test_idx]]
    user_no=ratings_df['userId'][test_idx]-1

    # Candidate neighbours are restricted to the members of the user's own cluster.
    # Each similarity is scaled by ``weight`` when check_matrix flags the
    # neighbour's entry with 0 and by ``1-weight`` otherwise.
    cluster_members=centroids[cluster_dict[user_no]]
    neighbours=[
        (
            (weight if check_matrix[other][movie_no]==0 else 1-weight)*latest_sim_matrix[user_no][other],
            ratings_matrix_train[other][movie_no]-mean_of_users[other],
        )
        for other in cluster_members
        if other!=user_no and not np.isnan(ratings_matrix_train[other][movie_no])
    ]
    neighbours.sort(key=lambda pair: pair[0],reverse=True)
    if not neighbours:
        # No usable cluster member: fall back to the mean of the user's matrix row.
        second_kmeans_result.append(mean_with_nan(ratings_matrix_train[user_no]))
        continue
    # User mean plus the weighted average deviation of at most the k
    # highest-weighted cluster members; the tiny initial denominator avoids a
    # division by zero.
    num=0.0
    den=0.00000009
    for scaled_similarity,centred_rating in neighbours[:k]:
        num+=scaled_similarity*centred_rating
        den+=abs(scaled_similarity)
    second_kmeans_result.append(mean_of_users[user_no]+(num/den))

    

In [None]:
# RMSE, then MAE, of the predictions made after data smoothing.
for metric in (rmse_error,mean_absolute_error):
    print(metric(correct_predictions,second_kmeans_result))