In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
anime_csv_path = 'csv_output/anime.csv'
ratings_csv_path = 'csv_output/rating.csv'

In [3]:
anime_data = pd.read_csv(anime_csv_path)
anime_data['genre'].fillna('', inplace=True)

ratings_data = pd.read_csv(ratings_csv_path)

In [4]:
user_anime_ratings = ratings_data.pivot_table(index='user_id', columns='anime_id', values='rating')
user_anime_ratings.replace(-1, np.nan, inplace=True)
user_anime_ratings

anime_id,1,5,6,7,8,15,16,17,18,19,...,34283,34324,34325,34349,34358,34367,34412,34475,34476,34519
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,8.0,,,6.0,,6.0,6.0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73512,,,,,,,,,,,...,,,,,,,,,,
73513,9.0,8.0,,,,,,,,,...,,,,,,,,,,
73514,,,,,,,,,,,...,,,,,,,,,,
73515,10.0,10.0,10.0,,,,,,,9.0,...,,,,,,,,,,


In [5]:
# Normalize and scale ratings across users
normalized_ratings = user_anime_ratings.apply(
    lambda user: user - user.mean(),
    axis=1
)
# normalized_ratings = normalized_ratings.apply(
#     lambda user: (user) / (user.max() - user.min()),
#     axis=1
# )
normalized_ratings

anime_id,1,5,6,7,8,15,16,17,18,19,...,34283,34324,34325,34349,34358,34367,34412,34475,34476,34519
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,3.644880,,,1.64488,,1.64488,1.64488,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73512,,,,,,,,,,,...,,,,,,,,,,
73513,1.484848,0.484848,,,,,,,,,...,,,,,,,,,,
73514,,,,,,,,,,,...,,,,,,,,,,
73515,1.452514,1.452514,1.452514,,,,,,,0.452514,...,,,,,,,,,,


In [None]:
num_features = 3
# p_len, q_len = normalized_ratings.shape
# r = normalized_ratings

p_len, q_len = 70000, 5000
r = normalized_ratings.iloc[0:p_len, 0:q_len]

p = np.random.random(size=(p_len, num_features))
q = np.random.random(size=(q_len, num_features))

epochs = 20
alpha = 0.00001
lambda_reg = 0.1

for _ in range(epochs):
    r_hat = p @ q.T
    error = (r - r_hat)
    error.fillna(0, inplace=True)
    
    loss = error**2
    
    p_update = error.to_numpy() @ q
    q_update = error.to_numpy().T @ p

    # Update p and q
    p = p + alpha * (p_update - lambda_reg * p)
    q = q + alpha * (q_update - lambda_reg * q)

    print(loss.sum().sum())

r_pred = p @ q.T
r_pred = pd.DataFrame(r_pred, index=r.index, columns=r.columns)
r_pred = r_pred.round(2)
r_pred


8799657.59446765
7521236.022190613
8822218.426970769
18400127.031925853
64782489.18010231
100053039.121139
21113742.964853168


KeyboardInterrupt: 

In [9]:
r

anime_id,1,5,6,7,8,15,16,17,18,19,...,7416,7419,7420,7423,7427,7430,7435,7436,7445,7446
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,3.64488,,,1.644880,,1.64488,1.644880,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69997,,,,,,,,,0.688889,,...,,,,,,,,,,
69998,,,,,,,,,,,...,,,,,,,,,,
69999,,,,,,-0.016393,,,,,...,,,,,,,,,,
70000,,,,,,,,,,,...,,,,,,,,,,


In [8]:
a = {
    1: [1, 2, 3],
    2: [4, 5, 6],
    3: [np.nan, np.nan, np.nan]
}
a = pd.DataFrame(a)
print(a)

b = a.apply(
    lambda user: user - user.mean(),
    axis=1
)
print(b)

c = b.apply(
    lambda user: (user - user.min()) / (user.max() - user.min()),
    axis=1
)
print(c)

d = {
    1: [1, 2, 3],
    2: [4, 5, 6],
    3: [7, 8, 9]
}
d = pd.DataFrame(d)
print(d)

print(a - d)


   1  2   3
0  1  4 NaN
1  2  5 NaN
2  3  6 NaN
     1    2   3
0 -1.5  1.5 NaN
1 -1.5  1.5 NaN
2 -1.5  1.5 NaN
     1    2   3
0  0.0  1.0 NaN
1  0.0  1.0 NaN
2  0.0  1.0 NaN
   1  2  3
0  1  4  7
1  2  5  8
2  3  6  9
   1  2   3
0  0  0 NaN
1  0  0 NaN
2  0  0 NaN


In [9]:
a = np.array([[1, 2, 3],
              [3, 4, 5]])
b = np.array([[3, 4, 5],
              [5, 6, 7]])

a @ b.T

array([[26, 38],
       [50, 74]])