In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import time
from sklearn.metrics import mean_squared_error

%matplotlib inline

In [None]:
from sklearn.model_selection import train_test_split
# Seed NumPy's global RNG, which the models below draw their initial values from.
np.random.seed(1)

# Load the raw ratings and hold out 10% of the rows as a test split.
ratings=pd.read_csv('raw_data/ratings.csv')
ratings_train,ratings_test=train_test_split(ratings,test_size=0.1,random_state=42)

# Distinct user ids and movie ids that occur in the training split.
users_train=set(ratings_train['userId'].unique())
items_train=set(ratings_train['movieId'].unique())

# Keep only the test rows whose movie also occurs in the training split
# (test rows are not filtered by user here).
seen_in_training=ratings_test['movieId'].isin(items_train)
ratings_test=ratings_test[seen_in_training]
len(ratings_train),len(ratings_test)

In [None]:
# Inputs to score on the test split: one (userId, movieId) pair per row ...
x_test=ratings_test.loc[:,['userId','movieId']]
# ... and the observed ratings those predictions are compared against.
y_true=ratings_test['rating']

## svd

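SVD-style matrix factorization with a user bias and an item bias: the predicted rating is $\hat r_{ui} = \mu + b_u + b_i + p_u^\top q_i$, where $\mu$ is the global mean rating, $b_u$ and $b_i$ are the user and item biases, and $p_u$, $q_i$ are the latent factor vectors.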

### train features at once

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator
from copy import deepcopy

class BiasedSvd(BaseEstimator):
    """Matrix-factorisation rating predictor with user and item bias terms.

    A rating for user ``u`` and movie ``m`` is predicted as
    ``g_mean + b_users[u] + b_items[m] + dot(f_users[u], f_items[m])``.
    All latent factors are trained together: every pass over the training
    rows updates the complete factor vectors.

    Parameters
    ----------
    factors : int
        Length of each user / movie latent factor vector.
    epsilon : float
        Stored and reported by ``get_params``; ``fit`` in this class does not
        read it (there is no convergence test, only a NaN check).
    alpha : float
        Regularisation weight used in the gradient steps and in the factor
        part of the reported cost.
    learning_rate : float
        Step size of the gradient updates.
    max_iter : int
        Maximum number of passes over the training rows.
    """

    # Constructor hyper-parameters exposed through get_params / set_params.
    _PARAM_NAMES=('factors','epsilon','alpha','learning_rate','max_iter')

    def __init__(self,factors=3,epsilon=100,alpha=0.01,learning_rate=0.01,max_iter=100):
        self.factors=factors
        self.epsilon=epsilon
        self.alpha=alpha
        self.learning_rate=learning_rate
        self.max_iter=max_iter

        # Set to True by fit(); a later fit() call then keeps the learned
        # parameters and continues from them instead of re-initialising.
        self.fitted=False

    def _init_model(self,X,y):
        """Draw random factors and biases for every user / movie id found in X."""
        users_train=set(X['userId'].unique())
        self.f_users={u:np.random.randn(self.factors) for u in users_train}

        items_train=set(X['movieId'].unique())
        self.f_items={i:np.random.randn(self.factors) for i in items_train}

        self.b_users=pd.Series(np.random.randn(len(users_train)),index=list(users_train))
        self.b_items=pd.Series(np.random.randn(len(items_train)),index=list(items_train))

        self.g_mean=np.mean(y)

    def _penalty(self):
        """Return the parameter-size part of the reported cost."""
        # NOTE(review): the bias terms are not scaled by alpha here although
        # the gradient steps shrink them with alpha - confirm this is intended.
        cost=np.sum(self.b_users**2)+np.sum(self.b_items**2)
        for table in (self.f_users,self.f_items):
            for vec in table.values():
                cost+=self.alpha*np.sum(vec**2)
        return cost

    @staticmethod
    def _targets_in_row_order(X,y):
        """Return the target values as an array ordered like the rows of X."""
        if isinstance(y,pd.Series):
            if y.index.equals(X.index):
                return y.values
            # Look targets up by X's index labels when y is ordered differently.
            return y.loc[X.index].values
        return np.asarray(y)

    def fit(self,X,y):
        """Train on the ratings ``y`` for the (userId, movieId) rows of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain ``userId`` and ``movieId`` columns; other columns
            are ignored.
        y : pandas.Series or array-like
            One rating per row of ``X``. A Series is matched to ``X`` by
            index label, anything else by position.

        Returns
        -------
        self
        """
        print('{factors %d, alpha %.4f, learning_rate %.4f, max_iter %d}'%(self.factors,self.alpha,self.learning_rate,self.max_iter))

        if not self.fitted:
            self._init_model(X,y)

        # Pull the columns out once instead of doing a label lookup per row.
        user_ids=X['userId'].values
        item_ids=X['movieId'].values
        ratings=self._targets_in_row_order(X,y)

        for it in range(self.max_iter):
            cost=self._penalty()

            # Updates are written to working copies. The dot product and the
            # error-weighted gradient term keep reading self.f_users /
            # self.f_items, i.e. the factors as they were when the pass began,
            # while the biases in the error are the working copies.
            f_users=deepcopy(self.f_users)
            f_items=deepcopy(self.f_items)
            b_users=deepcopy(self.b_users)
            b_items=deepcopy(self.b_items)

            for u,i,rating in zip(user_ids,item_ids,ratings):
                r_pred=np.dot(self.f_users[u],self.f_items[i])
                e_ui=rating-self.g_mean-b_users[u]-b_items[i]-r_pred
                cost+=(e_ui)**2

                f_users[u] += self.learning_rate*(e_ui*self.f_items[i] - self.alpha*f_users[u])
                f_items[i] += self.learning_rate*(e_ui*self.f_users[u] - self.alpha*f_items[i])

                b_users[u] += self.learning_rate*(e_ui - self.alpha*b_users[u])
                b_items[i] += self.learning_rate*(e_ui - self.alpha*b_items[i])

            self.f_users=f_users
            self.f_items=f_items
            self.b_users=b_users
            self.b_items=b_items

            print('iter %d, cost %.2f'%(it+1,cost))

            # Only a NaN cost ends training early; epsilon is not consulted.
            if np.isnan(cost):
                break

        self.fitted = True

        return self

    def predict(self,X):
        """Predict a rating for every (userId, movieId) row of ``X``.

        Rows whose user or movie was not present when the model was
        initialised keep the value 0.0.

        Returns
        -------
        numpy.ndarray
            One prediction per row of ``X``, in row order.
        """
        y_pred=np.zeros(len(X))

        for pos,(u,m) in enumerate(zip(X['userId'].values,X['movieId'].values)):
            if u in self.f_users and m in self.f_items:
                # The item bias is keyed by movie id, not by row position.
                y_pred[pos]=self.g_mean+self.b_users[u]+self.b_items[m]+np.dot(self.f_users[u],self.f_items[m])

        return y_pred

    def get_params(self,deep=True):
        """Return the constructor hyper-parameters as a dict (``deep`` is unused)."""
        return {name:getattr(self,name) for name in self._PARAM_NAMES}

    def set_params(self,**params):
        """Update any subset of the hyper-parameters and return ``self``.

        Raises
        ------
        ValueError
            If a keyword is not one of the constructor hyper-parameters.
        """
        unknown=sorted(set(params)-set(self._PARAM_NAMES))
        if unknown:
            raise ValueError('Invalid parameter(s) %s for %s; valid parameters are %s'
                             %(unknown,type(self).__name__,list(self._PARAM_NAMES)))
        for name,value in params.items():
            setattr(self,name,value)
        return self
        
# b_svd=BiasedSvd(factors=20,learning_rate=0.002,alpha=0.001,max_iter=200)
# b_svd.fit(ratings_train,ratings_train.rating)
# 'error %.2f'%mean_squared_error(y_true,b_svd.predict(x_test))

### train feature by feature

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator
from copy import deepcopy

class BiasedSvd2(BaseEstimator):
    """Biased matrix-factorisation predictor trained one latent factor at a time.

    A rating for user ``u`` and movie ``m`` is predicted as
    ``g_mean + b_users[u] + b_items[m] + dot(f_users[u], f_items[m])``.
    Training loops over the factor positions; for each position it runs
    gradient passes that update only that component of the factor vectors
    (plus the biases) until the cost stops improving, then moves on to the
    next position with a step size reduced by a factor of 0.9.

    Parameters
    ----------
    factors : int
        Length of each user / movie latent factor vector.
    epsilon : float
        A factor position stops training once the cost improvement between
        two consecutive passes falls below this value.
    alpha : float
        Regularisation weight used in the gradient steps and in the factor
        part of the reported cost.
    learning_rate : float
        Step size used for the first factor position.
    max_iter : int
        Maximum number of passes per factor position.
    """

    # Constructor hyper-parameters exposed through get_params / set_params.
    _PARAM_NAMES=('factors','epsilon','alpha','learning_rate','max_iter')

    def __init__(self,factors=3,epsilon=100,alpha=0.01,learning_rate=0.01,max_iter=100):
        self.factors=factors
        self.epsilon=epsilon
        self.alpha=alpha
        self.learning_rate=learning_rate
        self.max_iter=max_iter

        # Set to True by fit(); a later fit() call then keeps the learned
        # parameters and continues from them instead of re-initialising.
        self.fitted=False

    def _init_model(self,X,y):
        """Create constant 0.1 factors and random biases for every id found in X."""
        users_train=set(X['userId'].unique())
        self.f_users={u:np.ones(self.factors)/10 for u in users_train}

        items_train=set(X['movieId'].unique())
        self.f_items={i:np.ones(self.factors)/10 for i in items_train}

        self.b_users=pd.Series(np.random.randn(len(users_train)),index=list(users_train))
        self.b_items=pd.Series(np.random.randn(len(items_train)),index=list(items_train))

        self.g_mean=np.mean(y)

    def _penalty(self):
        """Return the parameter-size part of the reported cost."""
        # NOTE(review): the bias terms are not scaled by alpha here although
        # the gradient steps shrink them with alpha - confirm this is intended.
        cost=np.sum(self.b_users**2)+np.sum(self.b_items**2)
        for table in (self.f_users,self.f_items):
            for vec in table.values():
                cost+=self.alpha*np.sum(vec**2)
        return cost

    @staticmethod
    def _targets_in_row_order(X,y):
        """Return the target values as an array ordered like the rows of X."""
        if isinstance(y,pd.Series):
            if y.index.equals(X.index):
                return y.values
            # Look targets up by X's index labels when y is ordered differently.
            return y.loc[X.index].values
        return np.asarray(y)

    def fit(self,X,y):
        """Train on the ratings ``y`` for the (userId, movieId) rows of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain ``userId`` and ``movieId`` columns; other columns
            are ignored.
        y : pandas.Series or array-like
            One rating per row of ``X``. A Series is matched to ``X`` by
            index label, anything else by position.

        Returns
        -------
        self
        """
        print('{factors %d, alpha %.4f, learning_rate %.4f, max_iter %d}'%(self.factors,self.alpha,self.learning_rate,self.max_iter))

        if not self.fitted:
            self._init_model(X,y)

        # Pull the columns out once instead of doing a label lookup per row.
        user_ids=X['userId'].values
        item_ids=X['movieId'].values
        ratings=self._targets_in_row_order(X,y)

        # Decay a local step size so the learning_rate hyper-parameter keeps
        # the value the caller configured.
        step=self.learning_rate

        early_stop=False
        for f in range(self.factors):
            print('train factors %d'%(f+1))

            last_cost=10e9
            for it in range(self.max_iter):
                cost=self._penalty()

                for u,i,rating in zip(user_ids,item_ids,ratings):
                    p_u=self.f_users[u]
                    q_i=self.f_items[i]
                    e_ui=rating - self.g_mean - self.b_users[u] - self.b_items[i] - np.dot(p_u,q_i)
                    cost+=(e_ui)**2

                    # Keep the pre-update user component for the item step.
                    f_uf = p_u[f]
                    p_u[f] += step*(e_ui*q_i[f] - self.alpha*p_u[f])
                    q_i[f] += step*(e_ui*f_uf - self.alpha*q_i[f])

                    self.b_users[u] += step*(e_ui - self.alpha*self.b_users[u])
                    self.b_items[i] += step*(e_ui - self.alpha*self.b_items[i])

                print('iter %d, cost %.2f'%(it+1,cost))

                # Stop this factor when the cost is NaN, went up, or improved
                # by less than epsilon.
                if np.isnan(cost) or (last_cost > cost and last_cost-cost < self.epsilon) or last_cost<cost:
                    # Stopping on the very first pass of a factor also ends
                    # training of all remaining factors.
                    if it < 1:
                        early_stop = True
                    break

                last_cost = cost

            if early_stop:
                break

            step*=0.9

        self.fitted = True

        return self

    def predict(self,X):
        """Predict a rating for every (userId, movieId) row of ``X``.

        Rows whose user or movie was not present when the model was
        initialised keep the value 0.0.

        Returns
        -------
        numpy.ndarray
            One prediction per row of ``X``, in row order.
        """
        y_pred=np.zeros(len(X))

        for pos,(u,m) in enumerate(zip(X['userId'].values,X['movieId'].values)):
            if u in self.f_users and m in self.f_items:
                y_pred[pos]=self.g_mean + self.b_users[u] + self.b_items[m] + np.dot(self.f_users[u],self.f_items[m])

        return y_pred

    def get_params(self,deep=True):
        """Return the constructor hyper-parameters as a dict (``deep`` is unused)."""
        return {name:getattr(self,name) for name in self._PARAM_NAMES}

    def set_params(self,**params):
        """Update any subset of the hyper-parameters and return ``self``.

        Raises
        ------
        ValueError
            If a keyword is not one of the constructor hyper-parameters.
        """
        unknown=sorted(set(params)-set(self._PARAM_NAMES))
        if unknown:
            raise ValueError('Invalid parameter(s) %s for %s; valid parameters are %s'
                             %(unknown,type(self).__name__,list(self._PARAM_NAMES)))
        for name,value in params.items():
            setattr(self,name,value)
        return self
        
# Build the feature-by-feature model, train it on the training split, and
# show the mean squared error of its predictions on the held-out ratings.
b_svd2=BiasedSvd2(
    factors=10,
    learning_rate=0.01,
    alpha=0.001,
    max_iter=100,
)
b_svd2.fit(ratings_train,ratings_train['rating'])
'error %.2f'%mean_squared_error(y_true,b_svd2.predict(x_test))

In [None]:
import svd

# Hyper-parameter grid for the search below. alpha and learning_rate each take
# the four powers of ten from 1e-4 to 1e-1.
param_grid={
    'factors':[5,10,20,50],
    'alpha':np.logspace(-4,-1,4),
    'learning_rate':np.logspace(-4,-1,4),
    'max_iter':[20,50,100],
    'epsilon':[100.0],
}

# gs=GridSearchCV(estimator=svd.BiasedSvd(),param_grid=param_grid,scoring='neg_mean_squared_error',n_jobs=4,verbose=1,cv=5)
# gs.fit(ratings_train,ratings_train.rating)
# gs.cv_results_,gs.best_estimator_,gs.best_score_