In [166]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import time

%matplotlib inline

In [156]:
from sklearn.model_selection import train_test_split
np.random.seed(1)
ratings=pd.read_csv('raw_data/ratings.csv')
ratings_train,ratings_test=train_test_split(ratings,test_size=0.1,random_state=42)

users_train=set(ratings_train.userId.unique())
items_train=set(ratings_train.movieId.unique())
ratings_test=ratings_test[lambda df:df.movieId.apply(lambda i: i in items_train)]
len(ratings_train),len(ratings_test)

(90003, 9674)

In [157]:
y_true=ratings_test.rating
x_test=ratings_test[['userId','movieId']]

In [33]:
def predict_rand(x):
    return [np.random.rand()*5 for i in range(len(x))]

In [158]:
from sklearn.metrics import mean_squared_error

y_pred=predict_rand(x_test)
mean_squared_error(y_true,y_pred)

4.3400636853013026

transform ratings to matrix

In [35]:
len(ratings_train.userId.unique()),len(ratings_train.movieId.unique())

(671, 8749)

In [58]:
ratings_matrix=ratings_train[['userId','movieId','rating']].pivot_table(index='movieId',columns='userId')
ratings_matrix.columns=ratings_matrix.columns.levels[1]
ratings_matrix.head()

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,3.0,,4.0,,...,,4.0,3.5,,,,,,4.0,5.0
2,,,,,,,,,,,...,5.0,,,3.0,,,,,,
3,,,,,4.0,,,,,,...,,,,3.0,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,3.0,,,,,,


In [59]:
ratings_matrix.loc[1,7]==ratings[(ratings.userId==7)&(ratings.movieId==1)].rating

495    True
Name: rating, dtype: bool

## uucf 

calc user-user similarity by ratings

choose neighbor k

average the k's ratings for i

In [93]:
user_sims=ratings_matrix.corr(min_periods=5)

In [180]:
def uucf_predict(user,item,k):
    pred=0.
    
    sim_users=user_sims[user].sort_values(ascending=False)[1:k+1]

    r_sum=0
    r_w=0
    for v in sim_users.index:
        w=sim_users[v]
        if not np.isnan(w):
            r_v=ratings_matrix.loc[item,v]
            if not np.isnan(r_v):
                r_sum+=r_v*w
                r_w+=np.abs(w)

    if r_w != 0:
        pred= r_sum/r_w
        
    return pred
    
[uucf_predict(1,1,k) for k in (3, 5, 10, 20)]

[3.0, 3.3002269432546862, 3.4469205833387306, 3.5458584388320844]

In [168]:

for k in (5,10,20,50,100):
    t=time()
    y_pred=np.zeros(len(x_test))
    for i in range(len(y_pred)):
        y_pred[i]=uucf_predict(x_test.iloc[i].userId,x_test.iloc[i].movieId,k)
    
    s=mean_squared_error(y_true,y_pred)
    print('time %.2f, score %.3f'%(time()-t,s))


time 5.76, score 11.462
time 6.77, score 10.123
time 8.86, score 8.280
time 14.72, score 5.171
time 24.48, score 2.890


## normalized version

formula 4.17

use z-score normalization

In [176]:
ratings_summ=ratings_matrix.describe().T
r_users_mean=ratings_summ['mean']
r_users_std=ratings_summ['std']


In [181]:
def uucf_predict_norm(user,item,k):
    pred=r_users_mean[user]
    
    sim_users=user_sims[user].sort_values(ascending=False)[1:k+1]

    r_sum=0
    r_w=0
    for v in sim_users.index:
        w=sim_users[v]
        if not np.isnan(w) and not np.isnan(ratings_matrix.loc[item,v]) and r_users_std[v] != 0.:
            r_v_norm=(ratings_matrix.loc[item,v]-r_users_mean[v])/r_users_std[v]
            r_sum+=r_v_norm*w
            r_w+=np.abs(w)

    if r_w != 0:
        pred+= r_sum*r_users_std[user]/r_w
        
    return pred
    
[uucf_predict_norm(1,1,k) for k in (3, 5, 10, 20)]

[2.5552373624936142,
 2.6890092279065501,
 2.5984345287525357,
 2.6931559379087138]

In [182]:

for k in (5,10,20,50,100):
    t=time()
    y_pred=np.zeros(len(x_test))
    for i in range(len(y_pred)):
        y_pred[i]=uucf_predict_norm(x_test.iloc[i].userId,x_test.iloc[i].movieId,k)
    
    s=mean_squared_error(y_true,y_pred)
    print('time %.2f, score %.3f'%(time()-t,s))


time 5.99, score 0.987
time 7.14, score 1.020
time 9.41, score 1.018
time 16.28, score 0.969
time 28.11, score 0.918
