In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm #adding progress bars to your loops

In [18]:
ratings = pd.DataFrame([[1,2,8,9,3,3],[2,1,9,8,4,2],[2,2,6,8,2,3],[9,7,2,3,1,1],
                   [1,1,1,2,8,7],[2,2,3,2,8,8],[7,9,2,2,2,3],[9,8,2,3,1,3]])

users = np.array(["u0","u1","u2","u3","u4","u5","u6","u7"])
movies = np.array(["horror1","horror2","drama1","drama2","art1","art2"])

ratings.columns = movies
ratings.index = users

In [19]:
ratings

Unnamed: 0,horror1,horror2,drama1,drama2,art1,art2
u0,1,2,8,9,3,3
u1,2,1,9,8,4,2
u2,2,2,6,8,2,3
u3,9,7,2,3,1,1
u4,1,1,1,2,8,7
u5,2,2,3,2,8,8
u6,7,9,2,2,2,3
u7,9,8,2,3,1,3


In [20]:
mean_ratings = ratings.mean(axis=1).to_dict()
mean_ratings

{'u0': 4.333333333333333,
 'u1': 4.333333333333333,
 'u2': 3.8333333333333335,
 'u3': 3.8333333333333335,
 'u4': 3.3333333333333335,
 'u5': 4.166666666666667,
 'u6': 4.166666666666667,
 'u7': 4.333333333333333}

In [21]:
df = pd.DataFrame(ratings.values - ratings.mean(axis=1).values.reshape(-1,1)).round(2)

df.columns = ["horror1","horror2","drama1","drama2","art1","art2"]
df.index = ["u0","u1","u2","u3","u4","u5","u6","u7"]

df

Unnamed: 0,horror1,horror2,drama1,drama2,art1,art2
u0,-3.33,-2.33,3.67,4.67,-1.33,-1.33
u1,-2.33,-3.33,4.67,3.67,-0.33,-2.33
u2,-1.83,-1.83,2.17,4.17,-1.83,-0.83
u3,5.17,3.17,-1.83,-0.83,-2.83,-2.83
u4,-2.33,-2.33,-2.33,-1.33,4.67,3.67
u5,-2.17,-2.17,-1.17,-2.17,3.83,3.83
u6,2.83,4.83,-2.17,-2.17,-2.17,-1.17
u7,4.67,3.67,-2.33,-1.33,-3.33,-1.33


In [22]:
userid = 'u4'
movieid = 'drama1' 

df1 = df.copy()
df1.loc[userid, movieid]=None
df1

Unnamed: 0,horror1,horror2,drama1,drama2,art1,art2
u0,-3.33,-2.33,3.67,4.67,-1.33,-1.33
u1,-2.33,-3.33,4.67,3.67,-0.33,-2.33
u2,-1.83,-1.83,2.17,4.17,-1.83,-0.83
u3,5.17,3.17,-1.83,-0.83,-2.83,-2.83
u4,-2.33,-2.33,,-1.33,4.67,3.67
u5,-2.17,-2.17,-1.17,-2.17,3.83,3.83
u6,2.83,4.83,-2.17,-2.17,-2.17,-1.17
u7,4.67,3.67,-2.33,-1.33,-3.33,-1.33


In [23]:
df1 = df1.dropna(axis='columns')
df1

Unnamed: 0,horror1,horror2,drama2,art1,art2
u0,-3.33,-2.33,4.67,-1.33,-1.33
u1,-2.33,-3.33,3.67,-0.33,-2.33
u2,-1.83,-1.83,4.17,-1.83,-0.83
u3,5.17,3.17,-0.83,-2.83,-2.83
u4,-2.33,-2.33,-1.33,4.67,3.67
u5,-2.17,-2.17,-2.17,3.83,3.83
u6,2.83,4.83,-2.17,-2.17,-1.17
u7,4.67,3.67,-1.33,-3.33,-1.33


In [24]:
def find_cosine_similarity(v1,v2):
    return np.dot(v1,v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

In [25]:
def find_cosine_similarity_dict(userid, df1):

    v1 = df1.loc[userid].values
    csim = {}

    for user in df1.index:
        if (user!=userid):
            v2 = df1.loc[user].values
            csim[user] = find_cosine_similarity(v1,v2)

    return csim

In [26]:
csim = find_cosine_similarity_dict(userid, df1)
csim

{'u0': -0.09189915029391571,
 'u1': -0.043282038968379455,
 'u2': -0.23458368821186085,
 'u3': -0.8283767771229885,
 'u5': 0.9848779179101157,
 'u6': -0.6542393507439361,
 'u7': -0.7791402993491813}

In [27]:
def make_prediction(df, userid, movieid, csim):

    num=0
    den=0

    for user in df.index:

        if (user!=userid):
            r = df.loc[user,movieid]
            c_similarity = csim[user]

            num += r * c_similarity
            den += c_similarity

    prediction = mean_ratings[userid] + num/den #adding mean cz we subtracted it initially 
    return prediction

In [28]:
prediction = make_prediction(df, userid, movieid, csim)
prediction

1.7845614254193523

In [29]:
print('ACTUAL RATING: ', ratings.loc[userid, movieid])
print('PREDICTED RATING: ', prediction.round(2))

ACTUAL RATING:  1
PREDICTED RATING:  1.78


In [30]:
def iteration(userid, movieid, df):
    
    df1 = df.copy()
    df1.loc[userid, movieid]=None
    df1 = df1.dropna(axis='columns')

    csim = find_cosine_similarity_dict(userid, df1)

    prediction = make_prediction(df, userid, movieid, csim)
    actual = ratings.loc[userid, movieid]
    error = abs(actual-prediction)
    
    return error

In [31]:
N_ITERATIONS = 1000

errors=[]

for _ in tqdm(range(N_ITERATIONS)):
    u = np.random.choice(users)
    m = np.random.choice(movies)

    error = iteration(u, m, df)
    errors.append(error)
    
errors = np.array(errors)
mae = np.mean(errors).round(2)
rmse = np.sqrt(np.mean(errors**2)).round(2)

100%|██████████| 1000/1000 [00:03<00:00, 268.77it/s]


In [32]:
print('Mean Absolute Error (MAE): ', mae)
print('Root Mean Square Error (RMSE): ', rmse)

Mean Absolute Error (MAE):  33.82
Root Mean Square Error (RMSE):  77.11
