In [12]:
import math 
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

## Read Data

In [13]:
df = pd.read_table('ratings.dat', sep='::', header=None, names=['userId', 'movieId', 'rating','timestamp'])
df

  """Entry point for launching an IPython kernel.


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
5,1,1197,3,978302268
6,1,1287,5,978302039
7,1,2804,5,978300719
8,1,594,4,978302268
9,1,919,4,978301368


In [14]:
print("Unique UserID:",df.userId.unique().shape[0])
print("Unique MovieID:",df.movieId.unique().shape[0])

Unique UserID: 6040
Unique MovieID: 3706


## Build Utility Matrix

In [15]:
df_movie_features = df.pivot(
    index='userId',
    columns='movieId',
    values='rating'
).fillna(0)
df_movie_features

movieId,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,4.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0
10,5.0,5.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0


## Cosine Similarity

In [16]:
cos_sim = cosine_similarity(df_movie_features)
cos_df = pd.DataFrame(cos_sim)
cos_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6030,6031,6032,6033,6034,6035,6036,6037,6038,6039
0,1.000000,0.096382,0.120610,0.132455,0.090158,0.179222,0.059678,0.138241,0.226148,0.255288,...,0.170588,0.082006,0.069807,0.033663,0.114877,0.186329,0.135979,0.000000,0.174604,0.133590
1,0.096382,1.000000,0.151479,0.171176,0.114394,0.100865,0.305787,0.203337,0.190198,0.226861,...,0.112503,0.091222,0.268565,0.014286,0.183384,0.228241,0.206274,0.066118,0.066457,0.218276
2,0.120610,0.151479,1.000000,0.151227,0.062907,0.074603,0.138332,0.077656,0.126457,0.213655,...,0.092960,0.125864,0.161507,0.000000,0.097308,0.143264,0.107744,0.120234,0.094675,0.133144
3,0.132455,0.171176,0.151227,1.000000,0.045094,0.013529,0.130339,0.100856,0.093651,0.120738,...,0.163629,0.093041,0.382803,0.000000,0.082097,0.170583,0.127464,0.062907,0.064634,0.137968
4,0.090158,0.114394,0.062907,0.045094,1.000000,0.047449,0.126257,0.220817,0.261330,0.117052,...,0.100652,0.035732,0.061806,0.054151,0.179083,0.293365,0.172686,0.020459,0.027689,0.241437
5,0.179222,0.100865,0.074603,0.013529,0.047449,1.000000,0.049982,0.075234,0.111123,0.204938,...,0.037084,0.080180,0.023334,0.000000,0.053685,0.093583,0.065788,0.065711,0.167303,0.083436
6,0.059678,0.305787,0.138332,0.130339,0.126257,0.049982,1.000000,0.237550,0.162306,0.092559,...,0.112287,0.018864,0.349259,0.000000,0.116922,0.122441,0.111673,0.000000,0.014977,0.080680
7,0.138241,0.203337,0.077656,0.100856,0.220817,0.075234,0.237550,1.000000,0.291369,0.154112,...,0.080165,0.029838,0.133244,0.017670,0.275391,0.227400,0.144395,0.019242,0.044660,0.148123
8,0.226148,0.190198,0.126457,0.093651,0.261330,0.111123,0.162306,0.291369,1.000000,0.239249,...,0.175427,0.023395,0.101233,0.047537,0.234626,0.239607,0.225055,0.093470,0.046434,0.215819
9,0.255288,0.226861,0.213655,0.120738,0.117052,0.204938,0.092559,0.154112,0.239249,1.000000,...,0.179099,0.223739,0.161627,0.061829,0.314958,0.338072,0.246902,0.113789,0.296776,0.255793


## Predicting Ratings

In [17]:
def func(ele):
    return ele[1]

def predict(userId, movieId):
    dict_with_rating = {}
    lis = []
    for idx, rating in enumerate(df_movie_features.iloc[movieId - 1]):
        if(rating > 0):
            temp = (idx,rating)
            dict_with_rating[idx] = rating
            lis.append(temp)
    top_k = []
    for i,sim in enumerate(cos_df.iloc[userId]):
        for j,k in lis:
            if(j == i):
                top_k.append([i,sim])
    top_sorted_sim = sorted(top_k, key=func, reverse=True)
    top_5 = top_sorted_sim[:5]
    sum_ = 0
    sim_sum = 0
    for top in top_5:
        sum_ += dict_with_rating[top[0]]*top[1]
        sim_sum += top[1]
    return sum_ / sim_sum

## Spliting Data 

In [18]:
user_id = df.userId 
movie_id = df.movieId 
rating_s = df.rating

ui_train, ui_test, mi_train, mi_test, r_train, r_test = train_test_split(user_id, movie_id, rating_s, test_size=0.2, random_state = 2)

## RMSE Calculation

In [19]:
em_list = []
for j in range(2000):
    predicted_rats = predict(ui_test.iloc[j], mi_test.iloc[j])
    rating_Q = r_test.iloc[j]
    half_rmse = (predicted_rats - rating_Q)**2
    em_list.append(half_rmse)
rmse = math.sqrt(sum(em_list)/len(em_list))
print("RMSE:",rmse)

RMSE: 1.2612222838711367
