In [1]:
import json
import os
import tempfile
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from numpy.typing import NDArray

In [2]:
# Locations of the exported CSV files, relative to the notebook's working directory.
anime_csv_path = 'csv_output/anime.csv'
ratings_csv_path = 'csv_output/rating.csv'

anime_data = pd.read_csv(anime_csv_path)
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form operates on an intermediate object,
# emits a FutureWarning on pandas 2.x and leaves the frame untouched once
# Copy-on-Write is enabled.
anime_data['genre'] = anime_data['genre'].fillna('')

user_ratings_data = pd.read_csv(ratings_csv_path)

In [3]:
# Reshape the long ratings table into a user_id x anime_id matrix; (user, anime)
# pairs without a row in the ratings file come out as NaN.  The -1 placeholder
# (presumably "seen but not rated" -- confirm against the dataset description)
# is mapped to NaN as well so that it is treated as a missing rating.
# NOTE(review): pivot_table averages duplicate (user_id, anime_id) rows, so a -1
# averaged with a real rating would not be caught by this replace -- confirm the
# data has no such duplicates.
ratings_df = (
    user_ratings_data
    .pivot_table(values='rating', index='user_id', columns='anime_id')
    .replace(-1, np.nan)
)
ratings_df

anime_id,1,5,6,7,8,15,16,17,18,19,...,34283,34324,34325,34349,34358,34367,34412,34475,34476,34519
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,8.0,,,6.0,,6.0,6.0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73512,,,,,,,,,,,...,,,,,,,,,,
73513,9.0,8.0,,,,,,,,,...,,,,,,,,,,
73514,,,,,,,,,,,...,,,,,,,,,,
73515,10.0,10.0,10.0,,,,,,,9.0,...,,,,,,,,,,


In [89]:
class MatrixFactorization:
    """Biased matrix factorization trained with full-batch gradient descent.

    A rating is predicted as ``p @ q.T + b_user + b_item + b_mu``.  Entries of
    ``ratings`` that are NaN are treated as unobserved: they contribute nothing
    to the error, the gradients or the reported metrics.

    The model persists itself under ``{models_folder}/{model_name}``:

    * ``config.json``  -- the configuration plus ``curr_iter``
    * ``matrices.npz`` -- ``p``, ``q``, ``b_user``, ``b_item``, ``b_mu``
    * ``log.json``     -- one metrics record per completed epoch

    When ``config.json`` and ``matrices.npz`` already exist, the saved model is
    loaded and the *saved* configuration replaces the one passed in, so
    ``train`` resumes where the previous run stopped.

    Required config keys: ``models_folder``, ``model_name``, ``num_features``,
    ``alpha`` (learning rate), ``lambda_reg`` (L2 strength), ``max_iters``.
    Optional: ``seed`` (seeds the random initialisation).

    ``curr_iter`` is the number of completed training iterations.
    """

    def __init__(self, ratings: NDArray, config: dict):
        """Load the saved model for ``config`` if there is one, else create it.

        Args:
            ratings: 2-D array (users x items); NaN marks an unobserved rating.
            config: Model configuration (see the class docstring for the keys).
        """
        self.r: NDArray = ratings.astype(np.float32)
        # Work on a copy: save_model stores ``curr_iter`` in the config and that
        # bookkeeping must not leak into the caller's dictionary.
        self.config: dict = dict(config)

        self.model_path = f"{config['models_folder']}/{config['model_name']}"
        self.config_file = f'{self.model_path}/config.json'
        self.matrices_file = f'{self.model_path}/matrices.npz'
        self.log_file = f'{self.model_path}/log.json'

        # Decide on the files rather than on the directory, so a directory left
        # behind by an interrupted initialisation does not break loading.
        has_saved_model = (
            os.path.exists(self.config_file) and os.path.exists(self.matrices_file)
        )
        # makedirs also creates models_folder, including nested ones.
        os.makedirs(self.model_path, exist_ok=True)

        if has_saved_model:
            self.load_model()
        else:
            self.init_model()

    def train(self, verbose=True):
        """Run gradient-descent iterations until ``curr_iter`` reaches ``max_iters``.

        After every iteration the model is saved and a record is appended to the
        log.  The logged metrics are computed from the error that drove the
        update, i.e. they describe the parameters *before* that step.

        Args:
            verbose: Print one line per epoch when True.
        """
        while self.curr_iter < self.max_iters:
            error, num_observed = self._masked_error(self.get_predictions())

            train_time = self.gradient_descent(error)

            mse = self.mse(error, num_observed)
            mae = self.mae(error, num_observed)

            self.curr_iter += 1
            self.save_model()
            self.log(mse, mae, train_time, verbose)

    def _masked_error(self, r_pred):
        """Return ``(r - r_pred, number of observed ratings)``.

        The error is exactly 0 wherever the rating is unobserved (NaN).
        """
        observed = ~np.isnan(self.r)
        # A ufunc called with ``where=`` only writes the selected cells; without
        # an explicitly initialised ``out`` the remaining cells would hold
        # arbitrary memory contents, so start from zeros.
        error = np.zeros_like(self.r)
        np.subtract(self.r, r_pred, out=error, where=observed)
        # Neutralise NaN produced at observed cells (e.g. by diverged predictions).
        np.nan_to_num(error, copy=False, nan=0.0)
        return error, int(np.count_nonzero(observed))

    def gradient_descent(self, error):
        """Apply one L2-regularised full-batch update to ``p``, ``q`` and the biases.

        Args:
            error: ``r - r_pred`` with zeros at unobserved entries.

        Returns:
            Wall-clock duration of the update in seconds, rounded to 2 decimals.
        """
        start_time = time.perf_counter()

        # All directions are computed from the current parameters before any of
        # them is modified.
        p_update = (error @ self.q) - (self.lambda_reg * self.p)
        q_update = (error.T @ self.p) - (self.lambda_reg * self.q)

        b_user_update = error.sum(axis=1, keepdims=True) - self.lambda_reg * self.b_user
        b_item_update = error.sum(axis=0, keepdims=True) - self.lambda_reg * self.b_item

        self.p = self.p + self.alpha * p_update
        self.q = self.q + self.alpha * q_update

        self.b_user = self.b_user + self.alpha * b_user_update
        self.b_item = self.b_item + self.alpha * b_item_update

        return round(time.perf_counter() - start_time, 2)

    def get_predictions(self):
        """Return the dense (num_users x num_items) matrix of predicted ratings."""
        return (self.p @ self.q.T) + self.b_user + self.b_item + self.b_mu

    def _count_observed(self):
        """Return how many entries of the rating matrix are not NaN."""
        return int(np.count_nonzero(~np.isnan(self.r)))

    def mse(self, error, num_observed=None):
        """Mean squared error over the observed ratings, rounded to 4 decimals.

        Args:
            error: Error matrix with zeros (or NaN) at unobserved entries.
            num_observed: Number of observed ratings; counted from the rating
                matrix when omitted.

        Returns:
            The metric as a NumPy float; 0.0 when there are no observed ratings.
        """
        if num_observed is None:
            num_observed = self._count_observed()
        if num_observed == 0:
            return np.float64(0.0)
        # Accumulate in float64: the sum runs over the whole matrix.
        total = np.nansum(np.square(error), dtype=np.float64)
        return np.round(total / num_observed, 4)

    def mae(self, error, num_observed=None):
        """Mean absolute error over the observed ratings, rounded to 4 decimals.

        Args:
            error: Error matrix with zeros (or NaN) at unobserved entries.
            num_observed: Number of observed ratings; counted from the rating
                matrix when omitted.

        Returns:
            The metric as a NumPy float; 0.0 when there are no observed ratings.
        """
        if num_observed is None:
            num_observed = self._count_observed()
        if num_observed == 0:
            return np.float64(0.0)
        total = np.nansum(np.abs(error), dtype=np.float64)
        return np.round(total / num_observed, 4)

    def _apply_hyperparameters(self):
        """Copy the matrix dimensions and the hyper-parameters from the config."""
        self.num_users, self.num_items = self.r.shape
        self.k: int = self.config['num_features']
        self.alpha: float = self.config['alpha']
        self.lambda_reg: float = self.config['lambda_reg']
        self.max_iters: int = self.config['max_iters']

    def init_model(self):
        """Create a fresh, randomly initialised model and write it to disk."""
        os.makedirs(self.model_path, exist_ok=True)

        self._apply_hyperparameters()
        self.curr_iter: int = 0  # no iteration completed yet

        # ``seed`` is optional; None draws fresh entropy from the OS.
        rng = np.random.default_rng(self.config.get('seed'))

        self.p = rng.random(size=(self.num_users, self.k), dtype=np.float32)
        self.q = rng.random(size=(self.num_items, self.k), dtype=np.float32)

        self.b_user = rng.random(size=(self.num_users, 1), dtype=np.float32)
        self.b_item = rng.random(size=(1, self.num_items), dtype=np.float32)

        # Global bias = mean of the observed ratings (0 when nothing is observed,
        # which avoids a NaN bias poisoning every prediction).
        if np.isnan(self.r).all():
            self.b_mu = np.float32(0.0)
        else:
            self.b_mu = np.float32(np.nanmean(self.r, dtype=np.float64))

        # Writes both config.json and matrices.npz.
        self.save_model()

    def load_model(self):
        """Restore configuration and parameters from disk.

        The configuration stored in ``config.json`` replaces the one passed to
        the constructor.

        Raises:
            ValueError: If a stored matrix does not match the shape implied by
                the current rating matrix and the stored ``num_features``.
        """
        with open(self.config_file, 'r') as f:
            self.config = json.load(f)

        self._apply_hyperparameters()
        self.curr_iter: int = self.config['curr_iter']

        # Close the archive once the arrays have been read into memory.
        with np.load(self.matrices_file) as loaded_matrices:
            self.p = loaded_matrices['p']
            self.q = loaded_matrices['q']
            self.b_user = loaded_matrices['b_user']
            self.b_item = loaded_matrices['b_item']
            self.b_mu = loaded_matrices['b_mu']

        expected_shapes = {
            'p': (self.num_users, self.k),
            'q': (self.num_items, self.k),
            'b_user': (self.num_users, 1),
            'b_item': (1, self.num_items),
        }
        for name, expected in expected_shapes.items():
            actual = getattr(self, name).shape
            if actual != expected:
                raise ValueError(
                    f"Saved matrix '{name}' in {self.matrices_file} has shape {actual}, "
                    f"expected {expected} for the given ratings"
                )

    def save_model(self):
        """Write the configuration (with ``curr_iter``) and all parameters to disk."""
        self.config['curr_iter'] = self.curr_iter
        with open(self.config_file, 'w') as f:
            json.dump(self.config, f)

        np.savez(
            file=self.matrices_file,
            p=self.p,
            q=self.q,
            b_user=self.b_user,
            b_item=self.b_item,
            b_mu=self.b_mu
        )

    def log(self, mse, mae, train_time, verbose):
        """Append the metrics of the epoch just completed to ``log.json``.

        Args:
            mse: Mean squared error of the epoch.
            mae: Mean absolute error of the epoch.
            train_time: Duration of the gradient step in seconds.
            verbose: Also print the record when True.
        """
        if verbose:
            print(f'Epoch {self.curr_iter} - MSE: {mse}, MAE: {mae}, Train Time: {train_time}')

        new_data = {
            'epoch': self.curr_iter,
            'train_time': train_time,
            # float() accepts NumPy scalars and plain Python numbers alike and
            # yields a JSON-serialisable value.
            'mse': float(mse),
            'mae': float(mae)
        }

        if os.path.exists(self.log_file):
            with open(self.log_file, 'r') as f:
                log_data = json.load(f)
        else:
            log_data = {'train': []}

        log_data['train'].append(new_data)
        with open(self.log_file, 'w') as f:
            json.dump(log_data, f)
    

In [88]:
# Smoke test on a tiny, fully observed ratings matrix.
ratings = np.array([[1, 2, 3],
                    [4, 5, 6]])

# Train inside a throw-away models folder.  With a fixed folder the first run
# leaves a finished model on disk, and every later run of this cell would just
# load it and perform zero training iterations, so the test would stop
# exercising the current code.
with tempfile.TemporaryDirectory() as test_models_folder:
    config = {
        'models_folder': test_models_folder,
        'model_name': 'test_model',
        'num_features': 2,
        'alpha': 0.001,
        'lambda_reg': 0.1,
        'max_iters': 100
    }

    model = MatrixFactorization(ratings, config)
    model.train(verbose=False)

# The trained parameters live in memory, so predictions remain available after
# the temporary folder has been removed.
model.get_predictions()

array([[3.6409948, 3.563891 , 3.6779559],
       [4.3693714, 4.313853 , 4.611257 ]], dtype=float32)

In [None]:
# Dense user x anime array behind the ratings frame (NaN where nothing was rated).
ratings = ratings_df.to_numpy()
num_users, num_items = ratings.shape

# Hyper-parameters and storage location of the full-data model.
config = dict(
    models_folder='models',
    model_name='model_1',
    num_features=2,
    alpha=1e-05,
    lambda_reg=0.1,
    max_iters=51,
)

# Creates the model, or picks up the one already stored under models/model_1,
# then trains it and predicts a rating for every (user, anime) pair.
model = MatrixFactorization(ratings, config)
model.train(verbose=True)
r_pred = model.get_predictions()

# Label the predictions with the same user / anime ids as the ratings frame.
pd.DataFrame(
    r_pred,
    index=ratings_df.index[:num_users],
    columns=ratings_df.columns[:num_items],
).round(2)

anime_id,1,5,6,7,8,15,16,17,18,19,...,34283,34324,34325,34349,34358,34367,34412,34475,34476,34519
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,8.79,8.33,8.21,7.46,9.25,8.28,8.33,8.64,8.52,8.98,...,9.41,9.11,9.43,9.67,8.32,9.59,9.19,9.87,9.83,8.91
2,8.84,8.49,8.50,7.88,9.10,8.59,8.52,8.87,8.67,8.95,...,9.40,9.06,9.26,9.59,8.55,9.65,9.29,9.74,9.84,9.08
3,9.24,8.61,8.37,8.33,9.34,8.60,8.54,8.73,8.53,9.33,...,9.52,9.20,9.85,9.84,8.66,9.41,9.76,10.00,9.66,9.19
4,8.60,8.00,7.80,7.79,8.63,8.03,7.94,8.14,7.93,8.66,...,8.86,8.53,9.14,9.16,8.07,8.77,9.13,9.30,9.01,8.59
5,8.15,7.66,7.53,7.17,8.36,7.68,7.64,7.90,7.72,8.26,...,8.59,8.26,8.69,8.85,7.70,8.66,8.63,9.00,8.89,8.24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73512,9.24,8.96,9.01,8.23,9.56,9.06,9.01,9.40,9.20,9.37,...,9.88,9.54,9.63,10.04,9.01,10.21,9.67,10.19,10.40,9.55
73513,9.29,8.66,8.43,8.41,9.37,8.66,8.59,8.78,8.58,9.37,...,9.56,9.24,9.89,9.88,8.72,9.45,9.82,10.03,9.70,9.25
73514,8.84,8.34,8.22,7.94,8.98,8.39,8.32,8.58,8.38,8.92,...,9.23,8.90,9.34,9.49,8.40,9.28,9.33,9.63,9.51,8.93
73515,8.62,8.12,8.00,7.89,8.65,8.21,8.08,8.33,8.10,8.67,...,8.93,8.59,9.07,9.19,8.20,8.94,9.15,9.31,9.16,8.71


In [7]:
# Show the observed ratings matrix again (NaN = no rating); presumably kept for
# eyeballing against the predicted ratings displayed by the previous cell.
ratings_df

anime_id,1,5,6,7,8,15,16,17,18,19,...,34283,34324,34325,34349,34358,34367,34412,34475,34476,34519
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,8.0,,,6.0,,6.0,6.0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73512,,,,,,,,,,,...,,,,,,,,,,
73513,9.0,8.0,,,,,,,,,...,,,,,,,,,,
73514,,,,,,,,,,,...,,,,,,,,,,
73515,10.0,10.0,10.0,,,,,,,9.0,...,,,,,,,,,,
