In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import time
from sklearn.metrics import mean_squared_error
from copy import deepcopy
%matplotlib inline

from util import split_train_test

# Seed NumPy's global RNG so later np.random draws are repeatable.
np.random.seed(1)

# Load the raw ratings table and split it with the project-local helper.
ratings = pd.read_csv('raw_data/ratings.csv')
X_train, y_train, X_test, y_test = split_train_test(ratings)

## ALS: alternating least squares matrix factorization

In [None]:
# Scratch check: a (5, 20) by (20, 5) product of random normal matrices.
# Both draws advance the global RNG, left operand first.
np.random.randn(5, 20) @ np.random.randn(20, 5)

In [None]:
from sklearn.base import BaseEstimator

class Als(BaseEstimator):
    """Matrix-factorisation rating predictor trained by alternating least squares.

    The training ratings are pivoted into a dense user x movie matrix whose
    missing cells are filled with the mean of ``y``.  Two factor matrices,
    ``f_users`` and ``f_items``, are then updated in turn with the closed-form
    least-squares solution until the squared reconstruction error stops
    improving.

    Parameters
    ----------
    alpha : float, default 0.01
        Stored on the instance but not used by the fitting procedure.
    epsilon : float, default 100
        Fitting stops once the cost improves by less than this amount between
        two consecutive iterations.
    factors : int, default 5
        Number of latent factors per user and per movie.
    max_iter : int, default 100
        Upper bound on the number of alternating update rounds.
    """

    def __init__(self,alpha=0.01,epsilon=100,factors=5,max_iter=100):
        # Kept so the constructor argument is not silently discarded.
        self.alpha = alpha
        self.epsilon = epsilon
        self.factors = factors
        self.max_iter = max_iter

        # Flipped to True once the rating matrix and factors have been built.
        self.init_fit = False

    def init_fitting(self,X,y):
        """Build the dense rating matrix and random initial factors (once).

        ``X`` must provide the columns ``userId``, ``movieId`` and ``rating``.
        The work is skipped when ``self.init_fit`` is already True, so a later
        ``fit`` on the same instance continues from the existing state.
        """
        print(str(self.get_params()))

        if not self.init_fit:
            self.g_mean = np.mean(y)

            # values='rating' yields flat movieId column labels directly, so
            # no MultiIndex level has to be re-attached by hand.
            self.r = X.pivot_table(
                index='userId', columns='movieId', values='rating'
            ).fillna(self.g_mean)

            self.f_users = np.random.randn(len(self.r.index), self.factors)
            self.f_items = np.random.randn(len(self.r.columns), self.factors)

            self.init_fit = True

    def fit(self, X, y):
        """Run alternating least-squares rounds and return ``self``.

        Each round recomputes the item factors from the user factors, then the
        user factors from the new item factors, and evaluates the summed
        squared error against the filled rating matrix.  The loop stops when
        the cost is NaN, rises, or improves by less than ``epsilon``; the
        factors of that last round are kept.  ``np.linalg.inv`` raises
        ``numpy.linalg.LinAlgError`` if a Gram matrix is singular.
        """
        self.init_fitting(X, y)

        ratings = self.r.values
        # A previous fit leaves DataFrames behind; work on plain arrays.
        user_f = np.asarray(self.f_users, dtype=float)
        item_f = np.asarray(self.f_items, dtype=float)

        last_cost = np.inf
        for it in range(self.max_iter):
            item_f = np.dot(
                np.dot(np.linalg.inv(np.dot(user_f.T, user_f)), user_f.T),
                ratings,
            ).T
            user_f = np.dot(
                np.dot(ratings, item_f),
                np.linalg.inv(np.dot(item_f.T, item_f)),
            )

            r_pred = np.dot(user_f, item_f.T)
            cost = np.sum((ratings - r_pred) ** 2)

            print('iter %d, cost %.2f'%(it+1,cost))

            stalled = last_cost > cost and last_cost - cost < self.epsilon
            if np.isnan(cost) or stalled or last_cost < cost:
                break

            last_cost = cost

        # Always expose labelled factors, including when max_iter ran out
        # before the stop test fired; predict() relies on the index labels.
        self.f_users = pd.DataFrame(user_f, index=self.r.index)
        self.f_items = pd.DataFrame(item_f, index=self.r.columns)

        return self

    def predict(self,X):
        """Return one predicted rating per row of ``X`` as a list.

        The prediction is the dot product of the user's and the movie's factor
        vectors.  Rows whose ``userId`` or ``movieId`` has no fitted factors
        are predicted as 0.
        """
        # get_indexer returns -1 for labels absent from the fitted index.
        user_pos = self.f_users.index.get_indexer(X['userId'])
        item_pos = self.f_items.index.get_indexer(X['movieId'])
        known = (user_pos >= 0) & (item_pos >= 0)

        y_pred = np.zeros(len(X))
        user_vecs = self.f_users.values[user_pos[known]]
        item_vecs = self.f_items.values[item_pos[known]]
        y_pred[known] = np.sum(user_vecs * item_vecs, axis=1)

        return y_pred.tolist()

    def get_params(self,deep=True):
        """Return the tunable hyper-parameters as a dict (``deep`` is ignored)."""
        return {'epsilon':self.epsilon,
                'factors':self.factors,'max_iter':self.max_iter}

    def set_params(self,**params):
        """Update any of ``alpha``, ``epsilon``, ``factors``, ``max_iter``.

        Only the names that are passed are changed; other keyword names are
        ignored.  Changing ``factors`` clears the cached fitting state so the
        next ``fit`` builds factor matrices of the new width.  Returns
        ``self`` so calls can be chained.
        """
        previous_factors = self.factors
        for name in ('alpha', 'epsilon', 'factors', 'max_iter'):
            if name in params:
                setattr(self, name, params[name])

        if self.factors != previous_factors:
            self.init_fit = False

        return self
        
# Train a 10-factor model and print the elapsed wall-clock time in whole seconds.
t = time()
als = Als(epsilon=1e2, factors=10, max_iter=100)
als.fit(X_train, y_train)
print('time cost %d' % int(time() - t))

# Mean squared error on the test split, shown as the cell's value.
'error %.3f' % mean_squared_error(y_test, als.predict(X_test))