In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import time
from sklearn.metrics import mean_squared_error
%matplotlib inline

from util import split_train_test

# Seed NumPy's global RNG so the random initialisation used later is repeatable.
np.random.seed(1)
# Load the raw ratings table from disk.
ratings=pd.read_csv('raw_data/ratings.csv')


# Split into train/test features and targets using the project helper in util.py.
# NOTE(review): the exact columns of X_* / y_* are defined by split_train_test — confirm there.
X_train,y_train,X_test,y_test=split_train_test(ratings)

## SVD++

Include implicit feedback (the fact that a user rated an item, regardless of the rating value) in the model.

### Include rated feedback

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator
from copy import deepcopy
from sklearn.metrics import mean_squared_error

class SvdPlus(BaseEstimator):
    """SVD++-style rating predictor trained with SGD, one latent factor at a time.

    A rating for user ``u`` and movie ``i`` is predicted as::

        g_mean + b_users[u] + b_items[i] + dot(f_users[u] + y_u, f_items[i])

    where ``y_u = y_items_rated[u] / sqrt(number of distinct training movies)``
    is a per-user scalar that is learned together with the biases and factors.

    Parameters
    ----------
    factors : int
        Number of latent factors. Factors are trained sequentially.
    epsilon : float
        Training of the current factor stops once the cost decreases by less
        than this amount between two consecutive iterations.
    C1 : float
        L2 regularisation strength for the user and item biases.
    C2 : float
        L2 regularisation strength for the latent factors and the implicit
        feedback term.
    learning_rate : float
        Initial SGD step size (restored at the start of every factor).
    max_iter : int
        Maximum number of passes over the training data per factor.
    lr_decay : float
        Multiplicative decay applied to the step size after every iteration.
        The default ``1.0`` keeps the step size constant.

    Notes
    -----
    State is initialised only on the first call to ``fit``; later calls
    continue training from the current state (guarded by ``init_fit``).
    """

    # Constructor hyper-parameters, in the order they are reported by get_params.
    _PARAM_NAMES = ('factors','epsilon','C1','C2','learning_rate','max_iter','lr_decay')

    def __init__(self,factors=3,epsilon=100,C1=0.01,C2=0.01,learning_rate=0.01,max_iter=100,lr_decay=1.0):
        self.factors=factors
        self.epsilon=epsilon
        self.C1=C1
        self.C2=C2
        self.learning_rate=learning_rate
        self.max_iter=max_iter
        self.lr_decay=lr_decay

        # Set to True once the model state has been initialised by init_fitting.
        self.init_fit=False

    def init_fitting(self,X,y):
        """Print the hyper-parameters and, on the first call only, initialise model state.

        X must provide ``userId`` and ``movieId`` columns; ``y`` holds the ratings.
        """
        print(str(self.get_params()))

        if not self.init_fit:
            users_train=set(X.userId.unique())
            items_train=set(X.movieId.unique())

            # Per-user count of non-missing cells in the user x movie pivot table.
            # NOTE(review): with no explicit `values`, every remaining column of X is
            # counted, so the scale depends on how many extra columns X carries — confirm.
            items_rated = X.pivot_table(index='userId',columns='movieId',aggfunc='count').fillna(0)
            self.y_items_cnt=len(items_train)**0.5
            self.y_items_rated = items_rated.sum(axis=1)

            # Every latent vector starts at 0.1 in each component.
            self.f_users={u: np.ones(self.factors)/10 for u in users_train}
            self.f_items={i: np.ones(self.factors)/10 for i in items_train}

            # Biases start from standard-normal noise drawn from NumPy's global RNG.
            # list(...) keeps the set's iteration order while giving pandas an ordered index.
            self.b_users=pd.Series(np.random.randn(len(users_train)),index=list(users_train))
            self.b_items=pd.Series(np.random.randn(len(items_train)),index=list(items_train))

            self.g_mean=np.mean(y)

            self.init_fit = True

    def _regularization_cost(self):
        """Return the L2 penalty part of the training cost for the current state."""
        cost=self.C1*(np.sum(self.b_users**2)+np.sum(self.b_items**2))
        cost+=self.C2*np.sum((self.y_items_rated/self.y_items_cnt)**2)
        # Latent factors are penalised with C2, matching the SGD updates in fit().
        for table in (self.f_users,self.f_items):
            for vec in table.values():
                cost+=self.C2*np.sum(vec**2)
        return cost

    def fit(self,X,y):
        """Train the model by SGD and return ``self``.

        ``y`` is indexed with the index labels of ``X`` (``y[idx]``), so both
        must share the same index.
        """
        self.init_fitting(X,y)

        # Read the id columns once instead of one .loc lookup per rating per epoch.
        users=X['userId'].values
        items=X['movieId'].values
        row_labels=X.index

        early_stop=False
        for f in range(self.factors):
            print('train factors %d'%(f+1))

            learning_rate = self.learning_rate
            last_cost=np.inf
            for it in range(self.max_iter):
                cost=self._regularization_cost()

                for idx,u,i in zip(row_labels,users,items):
                    y_ri = self.y_items_rated[u]/self.y_items_cnt
                    r_pred=np.dot(self.f_users[u]+y_ri , self.f_items[i])
                    e_ui=y[idx] - self.g_mean - self.b_users[u] - self.b_items[i] - r_pred
                    cost+=(e_ui)**2

                    self.b_users[u] += learning_rate*(e_ui - self.C1*self.b_users[u])
                    self.b_items[i] += learning_rate*(e_ui - self.C1*self.b_items[i])

                    # Keep the pre-update values: each gradient uses the other
                    # side's value from before this step.
                    f_uf = self.f_users[u][f]
                    self.f_users[u][f] += learning_rate*(e_ui*self.f_items[i][f] - self.C2*self.f_users[u][f])

                    f_if = self.f_items[i][f]
                    self.f_items[i][f] += learning_rate*(e_ui*(f_uf+y_ri) - self.C2*self.f_items[i][f])

                    self.y_items_rated[u] += learning_rate*(e_ui*f_if/self.y_items_cnt - \
                                                            self.C2*self.y_items_rated[u])

                print('iter %d, cost %.2f'%(it+1,cost))

                # Stop this factor on NaN, on convergence (improvement below epsilon),
                # or when the cost went up.
                if np.isnan(cost) or (last_cost > cost and last_cost-cost < self.epsilon) or last_cost<cost:
                    # A cost increase within the first two iterations also aborts
                    # training of all remaining factors.
                    early_stop = it < 2 and last_cost < cost
                    break

                last_cost = cost
                learning_rate*=self.lr_decay

            if early_stop:
                break

        return self

    def predict(self,X):
        """Return predicted ratings for the ``userId``/``movieId`` pairs in ``X``.

        Users or movies not seen during training fall back to the global mean
        plus whichever bias is known; the factor term is added only when both
        the user and the movie are known.
        """
        users=X['userId'].values
        items=X['movieId'].values
        y_pred=np.full(len(X),self.g_mean,dtype=float)

        for k,(u,m) in enumerate(zip(users,items)):
            known_user = u in self.f_users
            known_item = m in self.f_items
            if known_user:
                y_pred[k]+=self.b_users[u]
            if known_item:
                y_pred[k]+=self.b_items[m]
            if known_user and known_item:
                y_ri = self.y_items_rated[u]/self.y_items_cnt
                y_pred[k]+=np.dot(self.f_users[u]+y_ri,self.f_items[m])

        return y_pred

    def get_params(self,deep=True):
        """Return the constructor hyper-parameters as a dict (``deep`` is accepted but unused)."""
        return {name: getattr(self,name) for name in self._PARAM_NAMES}

    def set_params(self,**params):
        """Set any subset of the hyper-parameters and return ``self``.

        Raises
        ------
        ValueError
            If a name is not one of the constructor hyper-parameters.
        """
        for name,value in params.items():
            if name not in self._PARAM_NAMES:
                raise ValueError('Invalid parameter %r for SvdPlus; valid parameters are %r'
                                 % (name, self._PARAM_NAMES))
            setattr(self,name,value)
        return self

# Train the model on the training split and measure wall-clock training time.
t=time()
p_svd=SvdPlus(factors=10,learning_rate=0.005,C1=0.01,C2=0.01,max_iter=100)
p_svd.fit(X_train,y_train)
# time() returns seconds, so this prints the elapsed training time in whole seconds.
print('time cost %d'%int(time()-t))
# Cell output: mean squared error of the fitted model on the held-out test split.
'error %.3f'%mean_squared_error(y_test,p_svd.predict(X_test))

In [None]:
# Re-display the test-set mean squared error of the already-fitted p_svd (no retraining).
'error %.3f'%mean_squared_error(y_test,p_svd.predict(X_test))

In [None]:
import svd

# Hyper-parameter grid for the grid search that is currently commented out below.
param_grid = {
    'factors': [5, 10, 20, 50],
    # First four points of a 9-point log-spaced grid from 1e-4 to 1e4,
    # i.e. 1e-4, 1e-3, 1e-2 and 1e-1.
    'alpha': np.logspace(-4, 4, 9)[:4],
    'learning_rate': np.logspace(-4, 4, 9)[:4],
    'max_iter': [20, 50, 100],
    'epsilon': [100.],
}

# gs=GridSearchCV(estimator=svd.BiasedSvd(),param_grid=param_grid,scoring='neg_mean_squared_error',n_jobs=4,verbose=1,cv=5)
# gs.fit(ratings_train,ratings_train.rating)
# gs.grid_scores_,gs.best_estimator_,gs.best_score_