In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
from sklearn.model_selection import train_test_split
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

movies = pd.read_csv("ml-1m/movies.csv",encoding="Latin1")
ratings = pd.read_csv("ml-1m/ratings.csv")
tags = pd.read_csv("ml-1m/tags.csv",encoding="Latin1")

In [2]:
#calculates average rating for each user 
avg_rating = ratings.groupby(by="userId",as_index=False)['rating'].mean()
#merge average rating column with the rest of the data
data = pd.merge(ratings,avg_rating,on='userId')

#get the sum of the ratings that each user gave to that item subtracting the average rating of that user
data['sum'] = data['rating_x'] - data['rating_y']
data

Unnamed: 0,userId,movieId,rating_x,timestamp,rating_y,sum
0,1,1,4.0,964982703,4.366379,-0.366379
1,1,3,4.0,964981247,4.366379,-0.366379
2,1,6,4.0,964982224,4.366379,-0.366379
3,1,47,5.0,964983815,4.366379,0.633621
4,1,50,5.0,964982931,4.366379,0.633621
...,...,...,...,...,...,...
100831,610,166534,4.0,1493848402,3.688556,0.311444
100832,610,168248,5.0,1493850091,3.688556,1.311444
100833,610,168250,5.0,1494273047,3.688556,1.311444
100834,610,168252,5.0,1493846352,3.688556,1.311444


In [3]:
#Data preprocessing and Cleaning
similarity_matrix = pd.pivot_table(data,values='sum',index='userId',columns='movieId')
similarity_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.366379,,-0.366379,,,-0.366379,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,0.363636,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,-1.157399,,,,,,-1.157399,,,,...,,,,,,,,,,
607,0.213904,,,,,,,,,,...,,,,,,,,,,
608,-0.634176,-1.134176,-1.134176,,,,,,,0.865824,...,,,,,,,,,,
609,-0.270270,,,,,,,,,0.729730,...,,,,,,,,,,


In [4]:
#replace NaN with movie average
movie_avg_matrix = similarity_matrix.fillna(similarity_matrix.mean(axis=0))
movie_avg_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.366379,-0.053158,-0.366379,-1.096045,-0.522626,-0.366379,-0.400728,-0.625024,-0.455446,-0.056326,...,-0.205224,-0.705224,0.294776,0.294776,-0.205224,0.294776,-0.205224,-0.205224,-0.205224,0.372024
2,0.312167,-0.053158,-0.234798,-1.096045,-0.522626,0.378461,-0.400728,-0.625024,-0.455446,-0.056326,...,-0.205224,-0.705224,0.294776,0.294776,-0.205224,0.294776,-0.205224,-0.205224,-0.205224,0.372024
3,0.312167,-0.053158,-0.234798,-1.096045,-0.522626,0.378461,-0.400728,-0.625024,-0.455446,-0.056326,...,-0.205224,-0.705224,0.294776,0.294776,-0.205224,0.294776,-0.205224,-0.205224,-0.205224,0.372024
4,0.312167,-0.053158,-0.234798,-1.096045,-0.522626,0.378461,-0.400728,-0.625024,-0.455446,-0.056326,...,-0.205224,-0.705224,0.294776,0.294776,-0.205224,0.294776,-0.205224,-0.205224,-0.205224,0.372024
5,0.363636,-0.053158,-0.234798,-1.096045,-0.522626,0.378461,-0.400728,-0.625024,-0.455446,-0.056326,...,-0.205224,-0.705224,0.294776,0.294776,-0.205224,0.294776,-0.205224,-0.205224,-0.205224,0.372024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,-1.157399,-0.053158,-0.234798,-1.096045,-0.522626,0.378461,-1.157399,-0.625024,-0.455446,-0.056326,...,-0.205224,-0.705224,0.294776,0.294776,-0.205224,0.294776,-0.205224,-0.205224,-0.205224,0.372024
607,0.213904,-0.053158,-0.234798,-1.096045,-0.522626,0.378461,-0.400728,-0.625024,-0.455446,-0.056326,...,-0.205224,-0.705224,0.294776,0.294776,-0.205224,0.294776,-0.205224,-0.205224,-0.205224,0.372024
608,-0.634176,-1.134176,-1.134176,-1.096045,-0.522626,0.378461,-0.400728,-0.625024,-0.455446,0.865824,...,-0.205224,-0.705224,0.294776,0.294776,-0.205224,0.294776,-0.205224,-0.205224,-0.205224,0.372024
609,-0.270270,-0.053158,-0.234798,-1.096045,-0.522626,0.378461,-0.400728,-0.625024,-0.455446,0.729730,...,-0.205224,-0.705224,0.294776,0.294776,-0.205224,0.294776,-0.205224,-0.205224,-0.205224,0.372024


In [5]:
#Function to detrmine similarities using cosine formula with a matrix as an input

def cosine_sim(matrix):
    similar = cosine_similarity(matrix)
    np.fill_diagonal(similar, 0 )
    similarities = pd.DataFrame(similar ,index=matrix.index)
    similarities.columns=matrix.index
    return similarities

In [6]:
# determines similarity between different users by movie average using cosine similarity formula
movie_similarities = cosine_sim(movie_avg_matrix)
movie_similarities.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.988283,0.978406,0.96422,0.986819,0.970456,0.971643,0.987468,0.986382,0.973397,...,0.987335,0.978916,0.917922,0.983978,0.978638,0.959693,0.97612,0.932806,0.98938,0.952774
2,0.988283,0.0,0.987141,0.971166,0.995793,0.979893,0.981852,0.995168,0.995108,0.981285,...,0.996067,0.988455,0.929086,0.993014,0.988206,0.968868,0.983619,0.940224,0.997957,0.963114
3,0.978406,0.987141,0.0,0.961237,0.985179,0.970773,0.971932,0.98514,0.985263,0.971464,...,0.986072,0.978562,0.921433,0.983193,0.978363,0.957067,0.974114,0.930653,0.988086,0.954265
4,0.96422,0.971166,0.961237,0.0,0.968638,0.955187,0.958876,0.97009,0.969158,0.959626,...,0.970625,0.964815,0.903118,0.967106,0.963962,0.942701,0.958891,0.911591,0.9721,0.935866
5,0.986819,0.995793,0.985179,0.968638,0.0,0.978368,0.980011,0.992905,0.993494,0.979161,...,0.994448,0.986028,0.928126,0.991066,0.98609,0.96723,0.982366,0.938353,0.996584,0.960854


In [7]:
#function to display similarities between users
def similar_movies_user(user1, user2):
    similiar = data[data.userId==user1].merge(data[data.userId==user2],on="movieId",how="inner")
    return similiar.merge(movies, on='movieId')

#returns comparred users that have similarily rated movies
df = similar_movies_user(30,122)
df = df.loc[:,['rating_x_x','rating_x_y','title']]
df.head()

Unnamed: 0,rating_x_x,rating_x_y,title
0,5.0,5.0,Star Wars: Episode IV - A New Hope (1977)
1,5.0,5.0,"Shawshank Redemption, The (1994)"
2,3.5,5.0,Terminator 2: Judgment Day (1991)
3,5.0,5.0,Star Wars: Episode V - The Empire Strikes Back...
4,3.0,5.0,Aliens (1986)


In [8]:
# returns k neighbours (similar users) for a user 
def get_neighbours(df, k):
    index_order = np.argsort(df.values, axis=1)[:, :k]
    df = df.apply(lambda x: pd.Series(x.sort_values(ascending=False).iloc[:k].index, index=['top{}'.format(i) for i in range(1, k+1)]), axis=1)
    return df

In [9]:
# top 25 neighbours for each user
similar_users_movie = get_neighbours(movie_similarities,30)
similar_users_movie.head()

Unnamed: 0_level_0,top1,top2,top3,top4,top5,top6,top7,top8,top9,top10,...,top21,top22,top23,top24,top25,top26,top27,top28,top29,top30
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,49,54,72,53,515,550,189,133,26,513,...,145,60,595,574,556,92,435,163,439,340
2,49,189,515,53,25,145,54,26,87,194,...,65,163,251,150,299,440,519,507,245,521
3,515,49,25,53,496,54,442,72,26,87,...,92,595,556,65,463,251,180,150,163,439
4,581,54,189,49,25,515,300,53,472,251,...,26,289,463,609,214,206,507,574,120,556
5,145,49,515,53,609,26,189,550,35,87,...,293,142,72,92,556,439,478,163,595,299


In [10]:
Rating_avg = data.astype({"movieId": str})
Movie_user = Rating_avg.groupby(by = 'userId')['movieId'].apply(lambda x:','.join(x))

def get_score(user,movie):
    a = similar_users_movie[movie_similarities.index==user].values #get list of similar users
    b = a.squeeze().tolist() #make similar users a list
    c = movie_avg_matrix.loc[:,movie] # get all ratings for input movie
    d = c[c.index.isin(b)] #only gets ratings of similar users
    f = d[d.notnull()] #makes sure there is a similar user
    avg_user = avg_rating.loc[avg_rating['userId'] == user,'rating'].values[0] #get users average movie rating
    index = f.index.values.squeeze().tolist() #turns similar users into a list
    corr = movie_similarities.loc[user,index] #gets ratings for user and similar users
    fin = pd.concat([f, corr], axis=1) #merges ratings of similar users with correlation to user
    fin.columns = ['sum','correlation'] #change column name
    fin['score']=fin.apply(lambda x:x['sum'] * x['correlation'],axis=1) #multiply sum*correlation and put results into new column score
    nume = fin['score'].sum() #sum of score column
    deno = fin['correlation'].sum() #sum of correlation column
    final_score = avg_user + (nume/deno) #get final estimated score
    return final_score


#The function gets a list of similar users and their ratings for the input movie. then it multiplies the movie average rating sum by the correlation to the users
#found in the cosine similarty matrix, making the similar users have a weighted input with the more correlated users having a bigger weight. Then the final
#score is calculated by adding the users average rating with the sum of the weighted score divided by the sum of the correlation of the users.
score = get_score(610, 170875)
print("score (u,i) is",score)


score (u,i) is 2.6333347529270554


In [11]:
def recommend_movies(user):
    movies_seen = similarity_matrix.notna().any().tolist()
    a = similar_users_movie[similar_users_movie.index== user].values
    b = a.squeeze().tolist()
    d = Movie_user[Movie_user.index.isin(b)]
    l = ','.join(d.values)
    Movie_seen_by_similar_users = l.split(',')
    Movies_under_consideration = list(set(Movie_seen_by_similar_users)-set(list(map(str, movies_seen))))
    Movies_under_consideration = list(map(int, Movies_under_consideration))
    #print(Movies_under_consideration)
    score = []
    for item in Movies_under_consideration:
        #print(item)
        c = movie_avg_matrix.loc[:,item]
        d = c[c.index.isin(b)]
        f = d[d.notnull()]
        avg_user = avg_rating.loc[avg_rating['userId'] == user,'rating'].values[0]
        index = f.index.values.squeeze().tolist()
        corr = movie_similarities.loc[user,index]
        fin = pd.concat([f, corr], axis=1)
        fin.columns = ['sum','correlation']
        fin['score']=fin.apply(lambda x:x['sum'] * x['correlation'],axis=1)
        nume = fin['score'].sum()
        deno = fin['correlation'].sum()
        final_score = avg_user + (nume/deno)
        score.append(final_score)
    data = pd.DataFrame({'movieId':Movies_under_consideration,'score':score})
    top_5_recommendation = data.sort_values(by='score',ascending=False).head(5)
    Movie_Name = top_5_recommendation.merge(movies, how='inner', on='movieId')
    Movie_Names = Movie_Name.title.values.tolist()
    return Movie_Names
        

In [17]:
user = 6
predicted_movies = recommend_movies(user)
print(" ")
print("The Recommendations for User Id: {}".format(user))
print("   ")
for i in predicted_movies:
    print(i)

TypeError: Can only merge Series or DataFrame objects, a <class 'pandas.core.indexes.numeric.Int64Index'> was passed

In [13]:
from surprise import KNNBasic
from surprise.model_selection import cross_validate
from collections import defaultdict

#Alternate cosine simlarity function using surprise to compare against
select_users = ratings.groupby('userId').filter(lambda x: len(x) >= 50)
reader = Reader(rating_scale=(1, 5))
dataf = Dataset.load_from_df(select_users[['userId','movieId','rating']], reader)

sim_options = {'name': 'cosine',
               'user_based': True  # compute  similarities between Users
               }
algo = KNNBasic(sim_options=sim_options,verbose=False)
trainset, testset = train_test_split(dataf, test_size=0.20,random_state=0)
algo.fit(trainset)
predictions = algo.test(testset)
accuracy.rmse(predictions)


RMSE: 0.9642


0.9641868823505456

In [14]:
#calculate SVD and compare the Root Mean Square Error (RSME) against a collaborative filter model

#select users that have rated more than 50 movies; required to distinguish users
select_users = ratings.groupby('userId').filter(lambda x: len(x) >= 50)
reader = Reader(rating_scale=(1, 5))
dataf = Dataset.load_from_df(select_users[['userId','movieId','rating']], reader)

# Train SVD on 80% of known rates, remaining 20% used to test. 
train, test = train_test_split(dataf, test_size=0.20,random_state=0)
svd = SVD()
svd.fit(train)
pred = svd.test(test)

# check the accuracy using Root Mean Square Error
accuracy.rmse(pred)

RMSE: 0.8681


0.8681425348803701

In [15]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

#Create empty arrays to hold True and predicted values of user ratings
true_arr=[]
pred_arr=[]

train=data.sample(frac=0.8,random_state=0) #random state is a seed value
test=data.drop(train.index)
empty_matrix = pd.pivot_table(train,values='rating_x',index='userId',columns='movieId')

train, test = train_test_split(empty_matrix,test_size=0.2,random_state=0)


test = test.fillna(0)
users=test.index
movies=test.columns


similar_users_movie = get_neighbours(movie_similarities,2)

#iterate over the empty movie matrix and store true and predicted values into an array
for uid in users:
    for iid in movies:
        if(test.at[uid,iid] != 0):
            true_arr.append(test.at[uid,iid])
            pred_arr.append(get_score(uid,iid))

#use sklearn mean squared error function to get rmse score of algorithm
rmse = mean_squared_error(true_arr, pred_arr, squared=False)
print("rmse score with 2 users: ", rmse)


similar_users_movie = get_neighbours(movie_similarities,5)
true_arr=[]
pred_arr=[]
#iterate over the empty movie matrix and store true and predicted values into an array
for uid in users:
    for iid in movies:
        if(test.at[uid,iid] != 0):
            true_arr.append(test.at[uid,iid])
            pred_arr.append(get_score(uid,iid))

#use sklearn mean squared error function to get rmse score of algorithm
rmse = mean_squared_error(true_arr, pred_arr, squared=False)
print("rmse score with 5 users: ", rmse)

similar_users_movie = get_neighbours(movie_similarities,10)
true_arr=[]
pred_arr=[]
#iterate over the empty movie matrix and store true and predicted values into an array
for uid in users:
    for iid in movies:
        if(test.at[uid,iid] != 0):
            true_arr.append(test.at[uid,iid])
            pred_arr.append(get_score(uid,iid))

#use sklearn mean squared error function to get rmse score of algorithm
rmse = mean_squared_error(true_arr, pred_arr, squared=False)
print("rmse score with 10 users: ", rmse)


similar_users_movie = get_neighbours(movie_similarities,15)
true_arr=[]
pred_arr=[]
#iterate over the empty movie matrix and store true and predicted values into an array
for uid in users:
    for iid in movies:
        if(test.at[uid,iid] != 0):
            true_arr.append(test.at[uid,iid])
            pred_arr.append(get_score(uid,iid))

#use sklearn mean squared error function to get rmse score of algorithm
rmse = mean_squared_error(true_arr, pred_arr, squared=False)
print("rmse score with 15 users: ", rmse)


similar_users_movie = get_neighbours(movie_similarities,20)
true_arr=[]
pred_arr=[]
#iterate over the empty movie matrix and store true and predicted values into an array
for uid in users:
    for iid in movies:
        if(test.at[uid,iid] != 0):
            true_arr.append(test.at[uid,iid])
            pred_arr.append(get_score(uid,iid))

#use sklearn mean squared error function to get rmse score of algorithm
rmse = mean_squared_error(true_arr, pred_arr, squared=False)
print("rmse score with 20 users: ", rmse)

similar_users_movie = get_neighbours(movie_similarities,25)
true_arr=[]
pred_arr=[]
#iterate over the empty movie matrix and store true and predicted values into an array
for uid in users:
    for iid in movies:
        if(test.at[uid,iid] != 0):
            true_arr.append(test.at[uid,iid])
            pred_arr.append(get_score(uid,iid))

#use sklearn mean squared error function to get rmse score of algorithm
rmse = mean_squared_error(true_arr, pred_arr, squared=False)
print("rmse score with 25 users: ", rmse)

true_arr=[]
pred_arr=[]
similar_users_movie = get_neighbours(movie_similarities,30)

#iterate over the empty movie matrix and store true and predicted values into an array
for uid in users:
    for iid in movies:
        if(test.at[uid,iid] != 0):
            true_arr.append(test.at[uid,iid])
            pred_arr.append(get_score(uid,iid))

#use sklearn mean squared error function to get rmse score of algorithm
rmse = mean_squared_error(true_arr, pred_arr, squared=False)
print("rmse score with 30 users: ", rmse)

similar_users_movie = get_neighbours(movie_similarities,50)
true_arr=[]
pred_arr=[]
#iterate over the empty movie matrix and store true and predicted values into an array
for uid in users:
    for iid in movies:
        if(test.at[uid,iid] != 0):
            true_arr.append(test.at[uid,iid])
            pred_arr.append(get_score(uid,iid))

#use sklearn mean squared error function to get rmse score of algorithm
rmse = mean_squared_error(true_arr, pred_arr, squared=False)
print("rmse score with 50 users: ", rmse)


rmse score with 2 users:  0.8356868351443075


KeyboardInterrupt: 