In [386]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
import numpy as np
import time

# Data Source

https://www.kaggle.com/grouplens/movielens-20m-dataset

# Load Data

In [367]:
#read first 3 columns
data = pd.read_csv("rating.csv", header = 0, usecols = [0,1,2])
print(data.head)

<bound method NDFrame.head of           userId  movieId  rating
0              1        2     3.5
1              1       29     3.5
2              1       32     3.5
3              1       47     3.5
4              1       50     3.5
...          ...      ...     ...
20000258  138493    68954     4.5
20000259  138493    69526     4.5
20000260  138493    69644     3.0
20000261  138493    70286     5.0
20000262  138493    71619     2.5

[20000263 rows x 3 columns]>


# Get subset of data

In [381]:
# use subset of data : the most rated movie and users who rate the most

M = 1000  #select M movies rated most
N = 5000 #select N users rate most

#calculate ratings of each movie

sort_movie_ratings = data.groupby("movieId").agg(ratings = ('rating', 'count')).reset_index().sort_values("ratings", ascending= False)
print(sort_movie_ratings.head)
top_rated_movies = sort_movie_ratings["movieId"][:M]
print(top_rated_movies)


#calculate ratings of each users

sort_user_ratings = data.groupby("userId").agg(ratings = ('rating', 'count')).reset_index().sort_values("ratings", ascending= False)
print(sort_user_ratings.head)
top_rated_users = sort_user_ratings["userId"][:N]
print(top_rated_users)


# select subset of data

subset_data = data[data['movieId'].isin(top_rated_movies) & data['userId'].isin(top_rated_users)]
print(subset_data)

subset_data.to_csv('subset_ratings.csv')


<bound method NDFrame.head of        movieId  ratings
293        296    67310
352        356    66172
315        318    63366
587        593    63299
476        480    59715
...        ...      ...
23146   110807        1
23148   110811        1
15481    78984        1
23150   110818        1
26743   131262        1

[26744 rows x 2 columns]>
293        296
352        356
315        318
587        593
476        480
         ...  
1537      1589
1029      1049
898        915
10449    38061
379        383
Name: movieId, Length: 1000, dtype: int64
<bound method NDFrame.head of         userId  ratings
118204  118205     9254
8404      8405     7515
82417    82418     5646
121534  121535     5520
125793  125794     5491
...        ...      ...
89304    89305       20
110462  110463       20
96989    96990       20
134746  134747       20
6525      6526       20

[138493 rows x 2 columns]>
118204    118205
8404        8405
82417      82418
121534    121535
125793    125794
           ...  


# Re-index data 

In [382]:
#re-index userId and movieId 

subset_data = pd.read_csv('subset_ratings.csv',header = 0, index_col=0)
print("read subset data:")
print(subset_data.head)

#re-index userId
unique_user = set(subset_data['userId'])
user_index = {}
count = 0
for id in unique_user:
    user_index[id] = count
    count+=1
    
subset_data['new_user_idx'] = subset_data.apply(lambda row : user_index[row.userId], axis = 1)


#re-index userId
unique_movie = set(subset_data['movieId'])
movie_index = {}
count = 0
for id in unique_movie:
    movie_index[id] = count
    count+=1
    
subset_data['new_movie_idx'] = subset_data.apply(lambda row : movie_index[row.movieId], axis = 1)

print("read subset reindex data:")
print(subset_data.head)

subset_data.to_csv('subset_reindex_ratings.csv')


  mask |= (ar1 == a)


read subset data:
<bound method NDFrame.head of           userId  movieId  rating
5400          54        1     4.0
5401          54        2     3.0
5402          54        5     3.0
5403          54        6     3.0
5404          54       10     4.0
...          ...      ...     ...
19993363  138437    74458     3.5
19993369  138437    78499     4.0
19993372  138437    79132     4.5
19993375  138437    80463     4.0
19993378  138437    81591     4.0

[2353634 rows x 3 columns]>
read subset reindex data:
<bound method NDFrame.head of           userId  movieId  rating  new_user_idx  new_movie_idx
5400          54        1     4.0             5              0
5401          54        2     3.0             5              1
5402          54        5     3.0             5              3
5403          54        6     3.0             5              4
5404          54       10     4.0             5              8
...          ...      ...     ...           ...            ...
19993363  138437  

# Split data to training and testing data

In [383]:
subset_reindex_data = pd.read_csv('subset_reindex_ratings.csv',header = 0, index_col=0)
print("subset reindex data:")
print(subset_reindex_data.head)

#shuffle data
subset_reindex_data = shuffle(subset_reindex_data)

#split to 0.8 training and 0.2 testing

split = int(0.8*len(subset_reindex_data))
train = subset_reindex_data[:split].reset_index(drop=True)
test = subset_reindex_data[split:].reset_index(drop=True)

print("train data :")
print(train.head)

print("test data :")
print(test.head)

  mask |= (ar1 == a)


subset reindex data:
<bound method NDFrame.head of           userId  movieId  rating  new_user_idx  new_movie_idx
5400          54        1     4.0             5              0
5401          54        2     3.0             5              1
5402          54        5     3.0             5              3
5403          54        6     3.0             5              4
5404          54       10     4.0             5              8
...          ...      ...     ...           ...            ...
19993363  138437    74458     3.5          1355            431
19993369  138437    78499     4.0          1355            412
19993372  138437    79132     4.5          1355            738
19993375  138437    80463     4.0          1355            360
19993378  138437    81591     4.0          1355            884

[2353634 rows x 5 columns]>
train data :
<bound method NDFrame.head of          userId  movieId  rating  new_user_idx  new_movie_idx
0        116361     8949     4.0          2935            4

# Create dictionaries for lookup

In [384]:
#create dic for users who rated a movie
movie_user_dic = {}
user_movie_dic = {}
user_movie_rating_dic = {}

#training data
train_length = len(train)
for i in range(train_length):  
    user = train.loc[i,'new_user_idx']
    movie = train.loc[i,'new_movie_idx']
    rating = train.loc[i,'rating']
    
    
    #movie user lookup
    if movie in movie_user_dic:
        movie_user_dic[movie].append(user) 
    else:
        movie_user_dic[movie] = [user]
        
    #user movie lookup
    if user in user_movie_dic:
        user_movie_dic[user].append(movie) 
    else:
        user_movie_dic[user] = [movie]
        
    #user movie rating lookup
    user_movie_rating_dic[(user,movie)] = rating
    
#testing data
test_user_movie_rating_dic={}
test_length = len(test)

for i in range(test_length):  
    user = test.loc[i,'new_user_idx']
    movie = test.loc[i,'new_movie_idx']
    rating = test.loc[i,'rating']
        
    #user movie rating lookup
    test_user_movie_rating_dic[(user,movie)] = rating
    
          

# save json file to disk
with open('movie_user_dic.pickle', 'wb') as file:
    pickle.dump(movie_user_dic, file) 
    
with open('user_movie_dic.pickle', 'wb') as file:
    pickle.dump(user_movie_dic, file) 

with open('user_movie_rating_dic.pickle', 'wb') as file:
    pickle.dump(user_movie_rating_dic, file) 
    
with open('test_user_movie_rating_dic.pickle', 'wb') as file:
    pickle.dump(test_user_movie_rating_dic, file) 


# Calculate pearson coefficient between users

In [388]:
# read file from disk
with open('movie_user_dic.pickle', 'rb') as file:
    movie_user_dic = pickle.load(file) 
    
with open('user_movie_dic.pickle', 'rb') as file:
    user_movie_dic = pickle.load(file) 

with open('user_movie_rating_dic.pickle', 'rb') as file:
    user_movie_rating_dic = pickle.load(file) 
    
with open('test_user_movie_rating_dic.pickle', 'rb') as file:
    test_user_movie_rating_dic = pickle.load(file) 

#calculate similarity


def user_similarity(useri, userj, common_movies, dev_i, dev_j):
    
    i_common_devs = []
    j_common_devs = []
    
    #create rating arrays for both users
    for movie in common_movies:
        i_common_devs.append(dev_i[movie])
        j_common_devs.append(dev_j[movie])     
    #to np.array    
    i_devs_arr = np.array(i_common_devs)
    j_devs_arr = np.array(j_common_devs)
    
    
    #calculate pearson correlation coefficient
    numerator = np.dot(i_devs_arr , j_devs_arr)
    denominator1 = np.sqrt(np.dot(i_devs_arr, i_devs_arr))
    denominator2 = np.sqrt(np.dot(j_devs_arr, j_devs_arr))
    
    return round(numerator/(denominator1*denominator2),5)
    

In [389]:
averages = []
deviations = []
most_similar_neighbors = []
common_movie_threshold = 10
limit_neighbors = 25
N = np.max(list(user_movie_dic.keys())) + 1

start = time.time()

for i in range(N):
    if i % 200 ==0:
        print("i:",i)
    movies_i = user_movie_dic[i] 
    ratings_i = {movie:user_movie_rating_dic[(i, movie)] for movie in movies_i }
    avg_i = np.mean(list(ratings_i.values()))
    dev_i = {movie:(rating - avg_i) for movie, rating in ratings_i.items()}
    
    # save these for later use
    averages.append(avg_i)
    deviations.append(dev_i)
    
    i_all_neighbor = {}
    for j in range(N):
        if j != i:
            movies_j = user_movie_dic[j]
            common_movie = (set(movies_i) & set(movies_j))
            if len(common_movie) > common_movie_threshold:
                ratings_j = { movie:user_movie_rating_dic[(j, movie)] for movie in movies_j }
                avg_j = np.mean(list(ratings_j.values()))
                dev_j = { movie:(rating - avg_j) for movie, rating in ratings_j.items() }
                         
                similarity = user_similarity(i, j, common_movie, dev_i, dev_j)
                i_all_neighbor[j] = similarity
    # sort similarity            
    sorted_similairty = sorted(i_all_neighbor.items(), key=lambda kv: kv[1],reverse=True)
    if len(sorted_similairty ) > limit_neighbors:  
        most_similar_neighbors.append(sorted_similairty [:limit_neighbors])
        #print("sorted_similarity", sorted_similairty[:limit_neighbors])
    else:
        most_similar_neighbors.append(sorted_similairty)
        #print("sorted_similarity", sorted_similairty)

print(time.time() - start)        

#save the similarity results                
with open('most_similar_neighbors.pickle', 'wb') as file:
    pickle.dump(most_similar_neighbors, file)    

i: 0
i: 200
i: 400
i: 600
i: 800
i: 1000
i: 1200
i: 1400
i: 1600
i: 1800
i: 2000
i: 2200
i: 2400
i: 2600
i: 2800
i: 3000
i: 3200
i: 3400
i: 3600
i: 3800
i: 4000
i: 4200
i: 4400
i: 4600
i: 4800
112827.89938187599


# Make prediction on train and test data

In [390]:
with open('most_similar_neighbors.pickle', 'rb') as file:
    most_similar_neighbors = pickle.load(file)

def predict(userid, movieid):
    
    avg_rating = averages[userid]
    numerator = 0
    denominator = 0
    for neighbor, sim in  most_similar_neighbors[userid]:
        try:
            numerator += sim*deviations[neighbor][movieid]
            denominator += abs(sim)
        except KeyError: # ignore the error when nighbor did not rate this movie
            pass
            
    if denominator == 0: #if no nighbor rated this movie
        prediction = avg_rating
    else:
        prediction = numerator / denominator + avg_rating
        prediction = min(5, prediction) # max rating is 5
        prediction = max(0.5, prediction) # min rating is 0.5
    return prediction

train_predictions = []
train_actual_ratings = []

#predict training set
for (i, m), target in user_movie_rating_dic.items():
  # calculate the prediction for this movie
    prediction = predict(i, m)

    train_predictions.append(prediction)
    train_actual_ratings.append(target)

test_predictions = []
test_actual_ratings = []
# predict test set
for (i, m), target in test_user_movie_rating_dic.items():
  # calculate the prediction for this movie
    prediction = predict(i, m)

    test_predictions.append(prediction)
    test_actual_ratings.append(target)


# Calculate MSE results

In [399]:
def mse(actual_ratings, predictions):
    actual = np.array(actual_ratings)
    predict = np.array(predictions)
    return np.mean((actual-predict)**2)
    
    
print('train mse:', mse(train_actual_ratings, train_predictions))
print('test mse:', mse(test_actual_ratings, test_predictions))

train mse: 0.5199268299247752
test mse: 0.6469388871385485
