In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import time
from sklearn.metrics import mean_squared_error
%matplotlib inline

from util import split_train_test

# Unpack the four objects returned by the project helper `split_train_test`
# (imported from `util`, not shown here) into train/test features and targets.
# NOTE(review): the estimator code below reads `userId` / `movieId` columns from
# its X argument — presumably X_train / X_test are DataFrames with those columns; confirm.
X_train,y_train,X_test,y_test=split_train_test()

## Timed base predictor: global mean + user/item biases trained with SGD

In [None]:
from sklearn.base import BaseEstimator

class TimedBasePredictor(BaseEstimator):
    """Baseline rating predictor: global mean plus a per-user and a per-item bias.

    A rating for user ``u`` and item ``i`` is predicted as
    ``g_mean + b_users[u] + b_items[i]``.  The biases are learned with
    stochastic gradient descent on the squared error, with an L2 penalty of
    weight ``alpha`` on the biases.

    Parameters
    ----------
    epsilon : float, default=100
        Training stops once the cost improves by less than this amount
        between two consecutive epochs.
    alpha : float, default=0.01
        L2 regularisation weight applied to the bias terms.
    learning_rate : float, default=0.01
        Initial SGD step size.  It is decayed by a factor 0.9 after every
        epoch; the decayed value is kept in ``learning_rate_`` so this
        hyper-parameter itself is never modified by ``fit``.
    max_iter : int, default=100
        Maximum number of epochs per ``fit`` call.

    Attributes
    ----------
    b_users, b_items : pandas.Series
        Learned biases, indexed by the ``userId`` / ``movieId`` values seen
        in the first call to ``fit``.
    g_mean : float
        Mean of the targets passed to the first call to ``fit``.
    learning_rate_ : float
        Current (decayed) step size.
    init_fit : bool
        True once the biases have been initialised; later ``fit`` calls
        continue training from the current state instead of re-initialising.
    """

    # Constructor hyper-parameters exposed through get_params / set_params.
    _PARAM_NAMES = ('epsilon', 'alpha', 'learning_rate', 'max_iter')

    def __init__(self,epsilon=100,alpha=0.01,learning_rate=0.01,max_iter=100):
        self.epsilon=epsilon
        self.alpha=alpha
        self.learning_rate=learning_rate
        self.max_iter=max_iter

        self.init_fit=False

    def init_fitting(self,X,y):
        """Print the hyper-parameters and, on the first call only, initialise the model.

        Biases are drawn from a standard normal using NumPy's global random
        state (seed it beforehand for reproducible runs).
        """
        print(str(self.get_params()))

        if not self.init_fit:
            # unique() keeps first-appearance order, so the bias layout is
            # deterministic (a set would give an arbitrary order).
            users_train=X['userId'].unique()
            items_train=X['movieId'].unique()

            self.b_users=pd.Series(np.random.randn(len(users_train)),index=users_train)
            self.b_items=pd.Series(np.random.randn(len(items_train)),index=items_train)

            self.g_mean=np.mean(y)

            self.init_fit = True

        # (Re)start the decay schedule on the first fit, or whenever the
        # learning_rate hyper-parameter was changed since the schedule began.
        if getattr(self,'_lr_origin',None) != self.learning_rate:
            self._lr_origin=self.learning_rate
            self.learning_rate_=self.learning_rate

    def fit(self, X, y):
        """Run up to ``max_iter`` SGD epochs over the rows of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must provide ``userId`` and ``movieId`` columns.
        y : indexable
            Targets, looked up as ``y[idx]`` for every label ``idx`` of ``X.index``.

        Returns
        -------
        self
        """
        self.init_fitting(X,y)

        # Pull the id columns out once instead of doing a .loc lookup per row.
        users=X['userId'].values
        items=X['movieId'].values

        last_cost = np.inf
        for it in range(self.max_iter):
            # Penalty term is scaled by alpha so the reported cost matches the
            # objective the updates below actually descend.
            cost=self.alpha*(np.sum(self.b_users**2)+np.sum(self.b_items**2))

            for idx,u,i in zip(X.index,users,items):
                e_ui=y[idx]-self.g_mean-self.b_users[u]-self.b_items[i]
                cost+=e_ui**2

                self.b_users[u] += self.learning_rate_*(e_ui - self.alpha*self.b_users[u])
                self.b_items[i] += self.learning_rate_*(e_ui - self.alpha*self.b_items[i])

            print('iter %d, cost %.2f'%(it+1,cost))

            # Stop on divergence (NaN / no decrease, including a cost stuck at
            # inf) or when the improvement falls below epsilon.
            if np.isnan(cost) or cost >= last_cost or last_cost-cost < self.epsilon:
                break

            last_cost = cost

            # Decay the working step size only; the hyper-parameter stays intact.
            self.learning_rate_*=0.9

        return self

    def predict(self,X):
        """Return a list with one prediction per row of ``X``.

        Rows whose user or item was not seen during fitting fall back to the
        global mean alone.
        """
        y_pred=[self.g_mean]*len(X)
        known_users=self.b_users.index
        known_items=self.b_items.index

        # Reading whole columns avoids the dtype upcast that row-wise
        # X.iloc[i][...] access performs on mixed-dtype frames.
        pairs=zip(X['userId'].values,X['movieId'].values)
        for pos,(u,m) in enumerate(pairs):
            if u in known_users and m in known_items:
                y_pred[pos] += self.b_users[u] + self.b_items[m]

        return y_pred

    def get_params(self,deep=True):
        """Return the constructor hyper-parameters as a dict (``deep`` is accepted but unused)."""
        return {name:getattr(self,name) for name in self._PARAM_NAMES}

    def set_params(self,**params):
        """Update any subset of the hyper-parameters and return ``self``.

        Raises
        ------
        ValueError
            If a key is not one of the constructor hyper-parameters.
        """
        unknown=set(params)-set(self._PARAM_NAMES)
        if unknown:
            raise ValueError('Invalid parameter(s) %s for estimator %s'
                             %(sorted(unknown),type(self).__name__))

        for name,value in params.items():
            setattr(self,name,value)

        return self

In [3]:
# Fit the BasePredictor imported from the project's `base` module, time the fit,
# and report the mean squared error on the test split.
# NOTE(review): this cell benchmarks BasePredictor from `base`, not the
# TimedBasePredictor class defined in the cell above — confirm which is intended.
from base import BasePredictor
t=time()  # wall-clock start, in seconds
# NOTE(review): the recorded output below prints 'max_iter': 200 although 100 is
# passed here, so the saved output appears to come from a different run — re-run to refresh.
bp=BasePredictor(learning_rate=0.01,alpha=0.01,max_iter=100,epsilon=10)
bp.fit(X_train,y_train)
print('time cost %d'%int(time()-t))  # elapsed seconds, truncated to an integer
# Last expression of the cell, so the formatted string is shown as the cell result.
'error %.2f'%mean_squared_error(y_test,bp.predict(X_test))

{'epsilon': 10, 'alpha': 0.01, 'learning_rate': 0.01, 'max_iter': 200}
iter 1, cost 173957.52
iter 2, cost 122886.41
iter 3, cost 108045.95
iter 4, cost 100349.66
iter 5, cost 95571.05
iter 6, cost 92303.54
iter 7, cost 89927.54
iter 8, cost 88124.38
iter 9, cost 86712.44
iter 10, cost 85580.13
iter 11, cost 84654.89
iter 12, cost 83887.28
iter 13, cost 83242.42
iter 14, cost 82694.94
iter 15, cost 82225.96
iter 16, cost 81821.13
iter 17, cost 81469.40
iter 18, cost 81162.10
iter 19, cost 80892.36
iter 20, cost 80654.66
iter 21, cost 80444.49
iter 22, cost 80258.16
iter 23, cost 80092.60
iter 24, cost 79945.20
iter 25, cost 79813.78
iter 26, cost 79696.45
iter 27, cost 79591.58
iter 28, cost 79497.77
iter 29, cost 79413.78
iter 30, cost 79338.54
iter 31, cost 79271.09
iter 32, cost 79210.61
iter 33, cost 79156.35
iter 34, cost 79107.65
iter 35, cost 79063.93
iter 36, cost 79024.67
iter 37, cost 78989.41
iter 38, cost 78957.73
iter 39, cost 78929.26
iter 40, cost 78903.68
iter 41, cost 

'error 0.99'