In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import time
from sklearn.model_selection import GridSearchCV

%matplotlib inline

In [None]:
from sklearn.model_selection import train_test_split
# Seed NumPy's global RNG so the random baseline further down is repeatable.
np.random.seed(1)
ratings=pd.read_csv('raw_data/ratings.csv')
# Hold out 10% of the ratings for evaluation; random_state pins the split.
ratings_train,ratings_test=train_test_split(ratings,test_size=0.1,random_state=42)

users_train=set(ratings_train.userId.unique())
items_train=set(ratings_train.movieId.unique())
# Keep only test rows whose movie also occurs in the training split.
# `isin` builds the same boolean mask as a per-row `apply(lambda ...)`
# membership test, but vectorised.
ratings_test=ratings_test[ratings_test.movieId.isin(items_train)]
len(ratings_train),len(ratings_test)

In [None]:
# Split the held-out ratings into model inputs (user / movie id pairs)
# and the target ratings they are scored against.
x_test=ratings_test.loc[:,['userId','movieId']]
y_true=ratings_test['rating']

In [None]:
def predict_rand(x):
    """Return one random rating guess per row of ``x``.

    Each guess is drawn uniformly from [0, 5) using NumPy's global random
    state; the contents of ``x`` are ignored, only ``len(x)`` matters.

    Returns a plain Python list of floats with ``len(x)`` entries.
    """
    n_rows=len(x)
    # A single sized draw advances the global RNG exactly as n_rows
    # successive scalar draws would, so seeded results are unchanged.
    return (np.random.rand(n_rows)*5).tolist()

In [None]:
from sklearn.metrics import mean_squared_error

# Baseline: mean squared error of purely random predictions on the test rows.
y_pred=predict_rand(x_test)
mean_squared_error(y_true=y_true,y_pred=y_pred)

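## Item-item collaborative filtering (IICF)

Calculate item-item similarity from the ratings (correlation between the rating columns of two items).

Choose the k most similar neighbor items.

Predict the rating for item i as the similarity-weighted average of the user's ratings on those k neighbors.

The prediction follows formula 4.18.

Ratings are normalized with z-scores (per-item mean and standard deviation) before averaging.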

In [None]:
class IIcf:
    """Item-item collaborative-filtering rating predictor.

    ``fit`` pivots the ratings into a user x movie matrix, computes the
    pairwise correlation between item columns, and stores each item's mean
    and standard deviation. ``predict`` starts from the target item's mean
    rating and adds a similarity-weighted average of the user's z-scored
    ratings on the ``k`` most similar items, rescaled by the target item's
    standard deviation.

    ``get_params`` / ``set_params`` are provided so the object can be used
    with scikit-learn style parameter searches.
    """

    # Minimum number of users that must have rated both items before their
    # correlation is used; pairs below this get a NaN similarity.
    MIN_CO_RATINGS=5

    def __init__(self,k=3):
        """Store ``k``, the number of most-similar items considered per prediction."""
        self.k=k

    def get_params(self,deep=True):
        """Return the tunable parameters as a dict (``deep`` is accepted but unused)."""
        return {'k':self.k}

    def set_params(self,**params):
        """Update tunable parameters and return ``self``.

        Only ``k`` is recognised. Calling with no arguments is a no-op.

        Raises:
            ValueError: if a parameter name other than ``k`` is passed.
        """
        unknown=set(params)-{'k'}
        if unknown:
            raise ValueError('Invalid parameter(s) %s for estimator IIcf'%sorted(unknown))
        if 'k' in params:
            self.k=params['k']
        # Returning self allows chaining, e.g. ``est.set_params(k=5).fit(...)``.
        return self

    def fit(self,X,y=None):
        """Build the rating matrix, item similarities and per-item statistics.

        Args:
            X: DataFrame with ``userId``, ``movieId`` and ``rating`` columns.
            y: ignored; ratings are read from ``X['rating']``.

        Returns:
            self
        """
        # Passing values='rating' yields flat movieId columns directly.
        self.ratings_matrix=X.pivot_table(index='userId',columns='movieId',values='rating')

        self.item_sims=self.ratings_matrix.corr(min_periods=self.MIN_CO_RATINGS)
        # Column-wise mean / sample std; missing ratings are skipped.
        self.r_items_mean=self.ratings_matrix.mean()
        self.r_items_std=self.ratings_matrix.std()

        self.users_train=set(self.ratings_matrix.index)
        self.items_train=set(self.ratings_matrix.columns)

        return self

    def _top_neighbours(self,item):
        """Return the ``k`` highest similarities to ``item``, excluding ``item`` itself."""
        # Drop the item by label rather than by sorted position: another item
        # can tie with the self-correlation, so position 0 is not reliably self.
        sims=self.item_sims[item].drop(labels=[item],errors='ignore').dropna()
        return sims.nlargest(self.k)

    def predict(self,X):
        """Predict a rating for every (userId, movieId) row of ``X``.

        Args:
            X: DataFrame with ``userId`` and ``movieId`` columns. Index labels
                are not used, so repeated labels are fine.

        Returns:
            list with one prediction per row, in row order. Rows whose user
            or movie was not seen during ``fit`` get ``0``.
        """
        y_pred=[]
        # Neighbour lists depend only on the item (and k), so compute each once per call.
        neighbour_cache={}

        for user,item in zip(X['userId'],X['movieId']):
            pred=0
            if item in self.items_train and user in self.users_train:
                pred=self.r_items_mean.at[item]

                if item not in neighbour_cache:
                    neighbour_cache[item]=self._top_neighbours(item)

                r_sum=0
                r_w=0
                # Neighbours are the top-k over all items; those the user has
                # not rated (or with an unusable std) contribute nothing.
                for j,w in neighbour_cache[item].items():
                    r_uj=self.ratings_matrix.at[user,j]
                    std_j=self.r_items_std.at[j]
                    if np.isnan(r_uj) or np.isnan(std_j) or std_j==0.:
                        continue
                    r_sum+=w*(r_uj-self.r_items_mean.at[j])/std_j
                    r_w+=np.abs(w)

                std_item=self.r_items_std.at[item]
                # Skip the adjustment when there is no usable neighbour or the
                # target item's std is undefined, so NaN never leaks into pred.
                if r_w!=0 and not np.isnan(std_item):
                    pred+=r_sum*std_item/r_w

            y_pred.append(pred)

        return y_pred
    


In [None]:
# iicf=IIcf(1)
# iicf.fit(ratings_train,ratings_train.rating)
# for k in (5,10,20,50,100):
#     t=time()
#     iicf.k=k
#     s=mean_squared_error(y_true,iicf.predict(x_test))
#     print('time %.2f, error %.3f'%(time()-t,s))


In [None]:
import cf
# Time a 5-fold cross-validated grid search over k for the item-item CF model.
t=time()
param_grid={'k':[5]}
gs=GridSearchCV(estimator=cf.IIcf(),param_grid=param_grid,scoring='neg_mean_squared_error',n_jobs=4,verbose=1,cv=5)
gs.fit(ratings_train,ratings_train.rating)
print('time cost %.2f'%(time()-t))
# `grid_scores_` was deprecated in scikit-learn 0.18 and removed in 0.20;
# `cv_results_` is the supported replacement.
gs.cv_results_,gs.best_estimator_,gs.best_score_

In [None]:
# `grid_scores_` no longer exists in scikit-learn >= 0.20; show the
# per-candidate cross-validation results from `cv_results_` as a table.
pd.DataFrame(gs.cv_results_)