In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from numpy.typing import NDArray

In [2]:
anime_csv_path = 'csv_output/anime.csv'
ratings_csv_path = 'csv_output/rating.csv'

anime_data = pd.read_csv(anime_csv_path)
anime_data['genre'].fillna('', inplace=True)

user_ratings_data = pd.read_csv(ratings_csv_path)

In [3]:
ratings_df = user_ratings_data.pivot_table(index='user_id', columns='anime_id', values='rating')
ratings_df.replace(-1, np.nan, inplace=True)
ratings_df

anime_id,1,5,6,7,8,15,16,17,18,19,...,34283,34324,34325,34349,34358,34367,34412,34475,34476,34519
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,8.0,,,6.0,,6.0,6.0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73512,,,,,,,,,,,...,,,,,,,,,,
73513,9.0,8.0,,,,,,,,,...,,,,,,,,,,
73514,,,,,,,,,,,...,,,,,,,,,,
73515,10.0,10.0,10.0,,,,,,,9.0,...,,,,,,,,,,


In [4]:
class MatrixFactorization:
    def __init__(self, ratings: NDArray, num_features: int, alpha: float, lambda_reg: float, iterations: int):
        self.r = ratings
        self.num_users, self.num_items = self.r.shape
        self.k = num_features
        self.alpha = alpha
        self.lambda_reg = lambda_reg
        self.iterations = iterations
        
        
    def train(self, verbose=True):
        self.p = np.random.random(size=(self.num_users, self.k))
        self.q = np.random.random(size=(self.num_items, self.k))
        
        self.b_user = np.random.random(size=(self.num_users, 1))
        self.b_item = np.random.random(size=(1, self.num_items))
        self.b_mu = np.nanmean(self.r)
        
        for i in range(1, self.iterations + 1):
            r_pred = self.get_predictions()
            
            self.gradient_descent(r_pred)
            mse = self.mse(r_pred)
            mae = self.mae(r_pred)
            
            if verbose:
                print(f'Epoch {i} - MSE: {mse}, MAE: {mae}')
    
    
    def gradient_descent(self, r_pred: NDArray):
        error = self.r - r_pred
        error = np.nan_to_num(error, nan=0)
        
        p_update = (error @ self.q) - (self.lambda_reg * self.p)
        q_update = (error.T @ self.p) - (self.lambda_reg * self.q)
        
        b_user_update = error.sum(axis=1, keepdims=True) - self.lambda_reg * self.b_user
        b_item_update = error.sum(axis=0, keepdims=True) - self.lambda_reg * self.b_item
        
        self.p = self.p + self.alpha * p_update
        self.q = self.q + self.alpha * q_update
        
        self.b_user = self.b_user + self.alpha * b_user_update
        self.b_item = self.b_item + self.alpha * b_item_update
    
    
    def get_predictions(self):
        return (self.p @ self.q.T) + self.b_user + self.b_item + self.b_mu
    
    
    def mse(self, r_pred: NDArray):
        # if r_pred == None:
        #     r_pred = self.get_predictions()
        error = np.nan_to_num(self.r - r_pred, nan=0)
        return np.round(np.sum(error**2), 2)
    
    
    def mae(self, r_pred: NDArray):
        # if r_pred == None:
        #     r_pred = self.get_predictions()
        error = np.nan_to_num(self.r - r_pred, nan=0)
        return np.round(np.sum(np.abs(error)), 1)
    

In [5]:
# Dummy test
ratings = np.array([[1, 2, 3],
                    [4, 5, 6]])

num_features = 2
alpha = 0.001
lambda_reg = 0.1
iterations = 1000

model = MatrixFactorization(ratings, num_features, alpha, lambda_reg, iterations)
model.train(verbose=False)
model.get_predictions()

array([[1.26960102, 2.21884507, 2.72099066],
       [4.05431766, 4.88651449, 5.93008587]])

In [6]:
ratings = ratings_df.to_numpy()
num_users, num_items = ratings.shape

num_features = 2
alpha = 0.00001
lambda_reg = 0.1
iterations = 30

model = MatrixFactorization(ratings, num_features, alpha, lambda_reg, iterations)
model.train(verbose=True)
r_pred = model.get_predictions()

pd.DataFrame(r_pred, index=ratings_df.index[:num_users], columns=ratings_df.columns[:num_items]).round(2)

Epoch 1 - MSE: 31555610.62, MAE: 11085903.2
Epoch 2 - MSE: 29501907.03, MAE: 10656149.6
Epoch 3 - MSE: 27927826.5, MAE: 10325570.9
Epoch 4 - MSE: 26663857.47, MAE: 10058504.9
Epoch 5 - MSE: 25616671.85, MAE: 9835894.1
Epoch 6 - MSE: 24729354.0, MAE: 9646218.1
Epoch 7 - MSE: 23964510.44, MAE: 9481938.0
Epoch 8 - MSE: 23296201.32, MAE: 9337790.3
Epoch 9 - MSE: 22705688.7, MAE: 9209943.6
Epoch 10 - MSE: 22179009.59, MAE: 9095498.3
Epoch 11 - MSE: 21705498.99, MAE: 8992223.3
Epoch 12 - MSE: 21276843.84, MAE: 8898416.4
Epoch 13 - MSE: 20886452.12, MAE: 8812682.2
Epoch 14 - MSE: 20529018.37, MAE: 8733921.0
Epoch 15 - MSE: 20200216.39, MAE: 8661204.5
Epoch 16 - MSE: 19896477.3, MAE: 8593806.0
Epoch 17 - MSE: 19614826.16, MAE: 8531126.2
Epoch 18 - MSE: 19352759.77, MAE: 8472646.7
Epoch 19 - MSE: 19108153.95, MAE: 8417925.0
Epoch 20 - MSE: 18879192.26, MAE: 8366566.7
Epoch 21 - MSE: 18664310.53, MAE: 8318247.3
Epoch 22 - MSE: 18462153.26, MAE: 8272686.0
Epoch 23 - MSE: 18271538.93, MAE: 8229636

anime_id,1,5,6,7,8,15,16,17,18,19,...,34283,34324,34325,34349,34358,34367,34412,34475,34476,34519
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,8.92,8.58,8.49,8.33,9.35,8.93,8.65,8.73,8.85,8.96,...,9.20,9.76,9.43,10.05,8.78,9.35,9.48,9.79,9.60,9.05
2,8.34,7.93,8.01,7.92,8.68,8.33,8.06,8.02,8.05,8.32,...,8.41,8.90,8.64,9.02,8.11,8.36,8.71,8.98,8.67,8.30
3,8.25,8.10,7.81,7.96,8.89,8.69,7.94,8.11,8.34,8.64,...,9.13,9.28,8.70,9.61,8.34,8.82,8.98,9.44,9.16,8.26
4,8.90,8.56,8.44,8.18,9.32,8.83,8.65,8.74,8.89,8.89,...,9.14,9.81,9.50,10.15,8.75,9.45,9.50,9.78,9.66,9.13
5,8.32,8.08,7.93,8.01,8.85,8.61,8.02,8.11,8.26,8.56,...,8.90,9.16,8.70,9.40,8.29,8.66,8.91,9.30,8.99,8.29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73512,8.57,8.19,8.18,7.98,8.95,8.51,8.31,8.33,8.42,8.54,...,8.70,9.31,9.03,9.55,8.37,8.87,9.06,9.33,9.12,8.68
73513,8.77,8.27,8.34,7.81,9.01,8.33,8.57,8.56,8.61,8.46,...,8.48,9.51,9.41,9.80,8.42,9.17,9.22,9.36,9.31,9.10
73514,9.53,9.23,9.01,8.69,10.01,9.46,9.30,9.46,9.66,9.55,...,9.89,10.63,10.27,11.07,9.44,10.34,10.25,10.56,10.51,9.88
73515,9.19,8.96,8.76,8.76,9.73,9.43,8.90,9.03,9.22,9.42,...,9.80,10.14,9.68,10.46,9.18,9.71,9.85,10.24,10.00,9.26


In [7]:
ratings_df

anime_id,1,5,6,7,8,15,16,17,18,19,...,34283,34324,34325,34349,34358,34367,34412,34475,34476,34519
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,8.0,,,6.0,,6.0,6.0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73512,,,,,,,,,,,...,,,,,,,,,,
73513,9.0,8.0,,,,,,,,,...,,,,,,,,,,
73514,,,,,,,,,,,...,,,,,,,,,,
73515,10.0,10.0,10.0,,,,,,,9.0,...,,,,,,,,,,


In [8]:
a = np.array([[1,2,3],[4,5,np.nan]])
b = np.array([[1, 2, 3],[4, 5, 6]])
a - b

array([[ 0.,  0.,  0.],
       [ 0.,  0., nan]])