In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.base import RegressorMixin
from tqdm import tqdm
from itertools import combinations
import math
from multiprocessing import Pool
import warnings
from functools import partial

In [2]:
# Seed NumPy's global RNG so the random hold-out mask drawn later with
# np.random.uniform is reproducible on a fresh Restart & Run All.
np.random.seed(42)

# warnings.filterwarnings("ignore", category=RuntimeWarning)

In [3]:
def parallelize(groups, func, num_workers=16):
    """Apply ``func`` to every group and concatenate the results.

    Parameters
    ----------
    groups : iterable of (name, group) pairs
        For example a pandas ``GroupBy`` object. Only the ``group`` part of
        each pair is handed to ``func``; the name is ignored.
    func : callable
        Called once per group; must return a pandas object that ``pd.concat``
        accepts. When ``num_workers > 1`` it is sent to worker processes, so
        it has to be picklable (lambdas and locally defined functions are not).
    num_workers : int, default 16
        Size of the process pool. A value of 1 or less runs everything
        serially in the current process, which is handy for debugging or for
        functions that cannot be pickled.

    Returns
    -------
    pandas.DataFrame or pandas.Series
        The concatenated per-group results, sorted by index.
    """
    pieces = [group for _, group in groups]
    if num_workers <= 1:
        results = [func(piece) for piece in pieces]
    else:
        with Pool(num_workers) as pool:
            results = pool.map(func, pieces)
    return pd.concat(results).sort_index()

In [4]:
# Load the training ratings (`df`) and the anime table, re-indexed by its 'title' column.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files from a
# trusted source.
# NOTE(review): `anime` is not referenced again in the cells visible here; confirm it is
# still needed.
df = pd.read_pickle('../data/train.pkl.xz')
anime = pd.read_pickle('../data/anime.pkl.xz').set_index('title')

# user_sim_df = pd.read_pickle('../data/user_sim.pkl')
# anime_sim_df = pd.read_pickle('../data/anime_sim.pkl')

Remove any users who rated fewer than 10 shows

In [5]:
# Keep only users (rows) with at least 10 non-missing ratings; copy so that `df`
# owns its data instead of referencing the unfiltered frame.
df = df[df.count(axis=1) >= 10].copy()

In [6]:
df.head()

title,.hack//Legend Of The Twilight,.hack//Roots,.hack//SIGN,009-1,07-Ghost,100 Sleeping Princes & the Kingdom of Dreams,100% Pascal-sensei (2017),11eyes,12-Sai: Chiccha na Mune no Tokimeki,12-Sai: Chiccha na Mune no Tokimeki 2,...,gdgd Fairies,gdgd Fairies 2,gdgd men's party,number24,revisions,sola,www.Working!!,xxxHOLiC,xxxHOLiC Kei,ēlDLIVE
username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Anilucard,,,,,,,,,,,...,,,,,,,,,,
Thallasian,,,,,,,,,,,...,,,,,0.84261,,,,,
shegicaesario,,,,,,,,,,,...,,,,,,,,,,
bearofwisdom,,,,,,,,,,,...,,,,,,,,,,
Potatocat11,,,,,,,,,,,...,,,,,,,,,,


In [24]:
def get_similar_users(username=None, userdata=None, df=df, min_common=10, sortby='cosine_sim', asc=False, num_sim_users=10):
    """Rank the users in ``df`` by how closely their ratings match one user's.

    Parameters
    ----------
    username : label, optional
        Row label of the target user. Defaults to ``userdata.name``.
    userdata : pandas.Series, optional
        The target user's ratings indexed by title (NaN where unrated).
        Defaults to ``df.loc[username]``. The user does not have to be a row
        of ``df`` when ``userdata`` is supplied.
    df : pandas.DataFrame
        Ratings matrix with one row per user and one column per title.
    min_common : int
        Minimum number of titles that both the target user and a candidate
        must have rated for the candidate to be kept.
    sortby : {'cosine_sim', 'corr', 'dist', 'num_common'}
        Score used for ranking: cosine similarity on zero-filled ratings,
        correlation of the rating rows, summed squared difference of
        zero-filled ratings divided by the number of common titles, or simply
        the number of common titles.
    asc : bool
        Sort direction for ``sortby``.
    num_sim_users : int
        Maximum number of users to return.

    Returns
    -------
    pandas.DataFrame
        Indexed by user, with a ``sortby`` column (unless ``sortby`` is
        ``'num_common'``) and a ``'num_common'`` column, sorted by ``sortby``.

    Raises
    ------
    ValueError
        If ``sortby`` is not one of the supported scores.
    """
    username = userdata.name if username is None else username
    userdata = df.loc[username] if userdata is None else userdata

    # Per candidate: number of titles rated by both the candidate and the target user.
    common_booled = np.bitwise_and(df.drop(username, axis=0, errors='ignore').notnull(), userdata.notnull())
    num_common = common_booled.sum(axis=1)

    score = None
    if sortby == 'corr':
        score = df.T.corrwith(userdata)

    elif sortby == 'dist':
        # Compare against the supplied `userdata` rather than looking the user up in
        # `df`: the target may be absent from `df` (e.g. a validation user), and this
        # keeps the branch consistent with the cosine branch.
        filled = df.fillna(0)
        dist = (filled.subtract(userdata.fillna(0), axis=1) ** 2).sum(axis=1)
        dist = dist / num_common
        score = dist.loc[dist != 0]

    elif sortby == 'cosine_sim':
        sparse = sp.sparse.csr_matrix(df.fillna(0).values, dtype=np.float32)
        sims = cosine_similarity(sparse, np.array(userdata.fillna(0)).reshape(1, -1))
        score = pd.Series(sims.flatten(), index=df.index).drop(username, errors='ignore')

    elif sortby != 'num_common':
        raise ValueError(
            f"Unknown sortby {sortby!r}; expected 'cosine_sim', 'corr', 'dist' or 'num_common'."
        )

    # For 'num_common' the count column itself is the ranking score.
    columns = {'num_common': num_common}
    if score is not None:
        columns = {sortby: score, 'num_common': num_common}
    data = pd.DataFrame(columns)

    # Label-based filtering instead of a string-built query: works for usernames that
    # contain quotes and does not depend on the index being named 'username'.
    data = data.drop(username, errors='ignore')
    data = data.loc[data['num_common'] >= min_common]

    return data.sort_values(sortby, ascending=asc).head(num_sim_users)

In [25]:
def get_recommendations(username=None, userdata=None, df=df, min_common=10, sortby='cosine_sim', asc=False, num_sim_users=10):
    """Score titles the user has not rated by averaging similar users' ratings.

    Parameters
    ----------
    username : label, optional
        Row label of the target user. Defaults to ``userdata.name``.
    userdata : pandas.Series, optional
        The target user's ratings indexed by title (NaN where unrated).
        Defaults to ``df.loc[username]``.
    df : pandas.DataFrame
        Ratings matrix with one row per user and one column per title.
    min_common, sortby, asc, num_sim_users
        Forwarded unchanged to ``get_similar_users`` to pick the neighbours.

    Returns
    -------
    pandas.Series
        Mean neighbour rating per candidate title, sorted from highest to
        lowest. Only titles rated by at least one neighbour and not rated by
        the target user are included. The mean is unweighted (neighbour
        similarity is not used as a weight).
    """
    username = userdata.name if username is None else username
    userdata = df.loc[username] if userdata is None else userdata

    sim_users = get_similar_users(username=username, userdata=userdata, df=df, min_common=min_common,
                                  sortby=sortby, asc=asc, num_sim_users=num_sim_users)
    neighbour_ratings = df.loc[sim_users.index]

    rated_by_neighbours = neighbour_ratings.columns[neighbour_ratings.notnull().any(axis=0)]
    already_rated = userdata.index[userdata.notnull()]
    # Use Index.difference rather than a Python set: pandas >= 2.0 raises TypeError
    # when a set is used as an indexer.
    candidates = rated_by_neighbours.difference(already_rated)

    return neighbour_ratings[candidates].mean().sort_values(ascending=False)

In [9]:
username = 'JohnTamer'

In [10]:
get_similar_users(username)

Unnamed: 0_level_0,cosine_sim,num_common
username,Unnamed: 1_level_1,Unnamed: 2_level_1
NVSRYA,0.56631,70
JSuperAnime,0.558657,49
Iguanalana,0.55397,60
RuneofAces,0.537463,45
artymisk,0.524306,61
eraserrain,0.522316,65
Perko995,0.522301,69
gedrue,0.516915,46
Xynene,0.512099,78
Katelynn,0.506667,77


In [11]:
recs = get_recommendations(username)

In [12]:
recs

title
Mobile Suit Gundam: Iron-Blooded Orphans      4.288570
Barakamon                                     4.217963
Death Parade                                  4.217963
Black Bullet                                  3.979759
Gargantia on the Verdurous Planet             3.953519
                                                ...   
Naruto Spin-Off: Rock Lee & His Ninja Pals    0.299954
A Little Snow Fairy Sugar                     0.286639
Kämpfer                                       0.208187
Mythical Detective Loki Ragnarok              0.161663
Reign: The Conqueror                          0.100036
Length: 309, dtype: float64

In [17]:
validate_true = pd.read_pickle('../data/validate.pkl.xz')

In [18]:
# Random boolean mask with the same shape and labels as `validate_true`; each cell is
# True with probability 0.25. Despite the name, True marks the cells that get hidden
# when this mask is passed to `DataFrame.mask`.
# The comparison is vectorised on the raw array instead of an element-wise applymap
# (deprecated in recent pandas); the random draws and the resulting mask are unchanged.
keeper_mask = pd.DataFrame(np.random.uniform(size=validate_true.shape) > 0.75,
                           index=validate_true.index,
                           columns=validate_true.columns)

In [19]:
# DataFrame.mask replaces entries with NaN wherever the mask is True, so the cells
# flagged in `keeper_mask` (uniform draw > 0.75) are hidden in `validate` and survive
# only in `validate_true`, where they can serve as held-out ground truth.
validate = validate_true.mask(keeper_mask)

In [None]:
# get_recommendations() names its neighbour-count parameter `num_sim_users`;
# the former `num_users=` keyword raised TypeError.
pred_userdata = get_recommendations(userdata=validate.iloc[0], num_sim_users=200)

In [None]:
def get_rmse(pred_data, true_data):
    """Root-mean-squared error between predicted and true ratings.

    Parameters
    ----------
    pred_data : pandas.Series
        Predicted ratings indexed by title.
    true_data : pandas.Series
        True ratings indexed by title; NaN entries are ignored.

    Returns
    -------
    float
        RMSE over the titles that have a non-null true rating and also appear
        in ``pred_data``. NaN when there is no such title.
    """
    true_data = true_data[true_data.notnull()]
    # Index.intersection instead of a Python set: pandas >= 2.0 raises TypeError
    # when a set is used as an indexer.
    shared_titles = true_data.index.intersection(pred_data.index)
    true_ratings = true_data.loc[shared_titles]
    pred_ratings = pred_data.loc[shared_titles]
    return np.sqrt(np.mean((true_ratings - pred_ratings) ** 2))

In [26]:
class RecommenderSystem(RegressorMixin):
    """Estimator-style wrapper around ``get_recommendations``.

    Parameters
    ----------
    num_sim_users : int, default 10
        Number of similar users whose ratings are averaged.
    min_common : int, default 10
        Minimum number of titles a neighbour must share with the target user.
    sortby : str, default 'cosine_sim'
        Similarity score used to rank neighbours.
    asc : bool, default False
        Sort direction for ``sortby``.

    The defaults mirror those of ``get_recommendations``.
    """

    def __init__(self, num_sim_users=10, min_common=10, sortby='cosine_sim', asc=False):
        self.num_sim_users = num_sim_users
        self.min_common = min_common
        self.sortby = sortby
        self.asc = asc
        self.df = None  # ratings frame, set by fit()

    def fit(self, X):
        """Store the ratings frame ``X`` to search for neighbours; returns ``self``."""
        self.df = X
        return self

    def predict(self, X_new):
        """Return recommendation scores for one user's rating Series ``X_new``.

        The user is identified by ``X_new.name``.

        Raises
        ------
        RuntimeError
            If ``fit`` has not been called yet.
        """
        if self.df is None:
            raise RuntimeError('RecommenderSystem.predict() called before fit().')
        # Each keyword is passed exactly once; the username is left to default to
        # `X_new.name` inside get_recommendations.
        return get_recommendations(userdata=X_new, df=self.df, min_common=self.min_common,
                                   sortby=self.sortby, asc=self.asc,
                                   num_sim_users=self.num_sim_users)

    def set_params(self, num_sim_users=None, min_common=None, sortby=None, asc=None):
        """Update any subset of the hyper-parameters; ``None`` leaves a value unchanged.

        Returns ``self`` so calls can be chained.
        """
        updates = {
            'num_sim_users': num_sim_users,
            'min_common': min_common,
            'sortby': sortby,
            'asc': asc,
        }
        for name, value in updates.items():
            if value is not None:
                setattr(self, name, value)
        return self

In [27]:
recommender = RecommenderSystem()

In [28]:
recommender.fit(df)

In [29]:
recommender.set_params(num_sim_users = 100)

In [31]:
recommender.predict(validate.iloc[1])

title
Ashita no Joe 2                                              4.368842
Sexy Commando Gaiden Sugoiyo!! Masaru-san                    4.356941
Gallery Fake                                                 4.340704
Tiger Mask                                                   4.058205
Yes! Pretty Cure 5                                           4.048426
                                                               ...   
SD Gundam Force                                              0.195431
Girl Friend BETA                                             0.163037
Conception                                                   0.109300
High School Prodigies Have It Easy Even in Another World!    0.061464
Dimension High School                                        0.035610
Length: 2774, dtype: float64

In [None]:
# Sum the per-user RMSE over the first 10 validation users.
# NOTE: a NaN RMSE (no overlap between predictions and held-out ratings) adds 0 to the
# total, yet that user is still counted when the total is later divided by 10, which
# pulls the reported average down.
total_rmse = 0
for idx in tqdm(range(10)):
    userdata = validate.iloc[idx]
    # get_recommendations() takes `num_sim_users` (not `num_users`) and has no
    # `num_recs` argument. The former `num_recs=100` is reproduced as a top-100 cut on
    # its sorted output -- NOTE(review): presumed intent, confirm.
    pred_userdata = get_recommendations(userdata=userdata, num_sim_users=1000).head(100)
    true_userdata = validate_true.iloc[idx]
    rmse = get_rmse(pred_userdata, true_userdata)
    total_rmse += rmse if not(pd.isna(rmse)) else 0

In [None]:
# Score user 0 against a prediction made for user 0. Reusing `pred_userdata` here would
# pick up whatever user the evaluation loop processed last when the notebook is run
# top to bottom.
first_user_pred = get_recommendations(userdata=validate.iloc[0], num_sim_users=200)
get_rmse(first_user_pred, validate_true.iloc[0])

1000 users

In [None]:
total_rmse / 10

100 users

In [None]:
total_rmse / 10

60 users

In [None]:
total_rmse / 10

50 users

In [None]:
total_rmse / 10

40 users

In [None]:
total_rmse / 10

30 users

In [None]:
total_rmse / 10

20 users

In [None]:
total_rmse / 10

10 users

In [None]:
total_rmse / 10

5 users

In [None]:
total_rmse / 10

In [None]:
# Spot-check a single validation user with 5 neighbours.
idx = 3
userdata = validate.iloc[idx]
# Fixed the function name (`get_recommendations`) and its keyword (`num_sim_users`).
# The function has no `num_recs` argument, so the former `num_recs=100` is reproduced
# as a top-100 cut on its sorted output -- NOTE(review): presumed intent, confirm.
pred_userdata = get_recommendations(userdata=userdata, num_sim_users=5).head(100)
true_userdata = validate_true.iloc[idx]
get_rmse(pred_userdata, true_userdata)