In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from itertools import combinations
import math
from multiprocessing import Pool
import warnings
from functools import partial

In [2]:
# Suppress every RuntimeWarning raised from here on (the filter is process-wide).
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [3]:
def parallelize(groups, func, num_workers=16):
    """Apply ``func`` to every group in a process pool and combine the results.

    Parameters
    ----------
    groups : iterable of (name, group) pairs
        For example a pandas ``groupby`` object. Only the ``group`` half of
        each pair is passed on; the name is discarded.
    func : callable
        Called once per group inside a worker process, so it has to be
        picklable (a module-level function or a ``functools.partial`` of one).
        Each call must return an object that ``pd.concat`` accepts.
    num_workers : int, default 16
        Number of worker processes in the pool.

    Returns
    -------
    The per-group results concatenated with ``pd.concat`` and sorted by index.
    """
    # Materialise the groups first so the pool receives a plain list.
    pieces = [group for _, group in groups]
    with Pool(num_workers) as pool:
        results = pool.map(func, pieces)
    return pd.concat(results).sort_index()

In [4]:
# Training ratings and per-title metadata. `anime` is re-indexed by title so
# rows can be looked up by show name.
# NOTE: pickle files can execute arbitrary code on load; only read trusted files.
df = pd.read_pickle('../data/train.pkl.xz')
anime = pd.read_pickle('../data/anime.pkl.xz').set_index('title')

# Precomputed similarity matrices. The same paths are written by the
# cosine-similarity cells later in this notebook, so those cells must have
# been run at least once before this load can succeed.
user_sim_df = pd.read_pickle('../data/user_sim.pkl')
anime_sim_df = pd.read_pickle('../data/anime_sim.pkl')

Remove any users who rated fewer than 10 shows

In [5]:
# Keep only the rows (users) that have at least 10 non-missing ratings.
df = df.loc[df.count(axis=1) >= 10].copy()

# Pairwise correlation between the columns of `df` (one column per title).
anime_corr = df.corr()

# Save the matrix as both pickle and CSV; the shell cell that follows
# compresses these two files with xz.
anime_corr.to_pickle('../data/anime_corr.pkl')
anime_corr.to_csv('../data/anime_corr.csv')

%%bash
# Compress the exported correlation matrix files with xz (14 threads).
# Abort if the data directory is missing so rm/xz never run somewhere else.
cd ../data || exit 1

# xz refuses to overwrite an existing archive, so remove any stale one first.
# -f keeps a first run (no archive yet) from printing an error.
rm -f anime_corr.pkl.xz
xz -vT14 anime_corr.pkl

rm -f anime_corr.csv.xz
xz -vT14 anime_corr.csv

In [6]:
df.head()

title,.hack//Legend Of The Twilight,.hack//Roots,.hack//SIGN,009-1,07-Ghost,100 Sleeping Princes & the Kingdom of Dreams,100% Pascal-sensei (2017),11eyes,12-Sai: Chiccha na Mune no Tokimeki,12-Sai: Chiccha na Mune no Tokimeki 2,...,gdgd Fairies,gdgd Fairies 2,gdgd men's party,number24,revisions,sola,www.Working!!,xxxHOLiC,xxxHOLiC Kei,ēlDLIVE
username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Anilucard,,,,,,,,,,,...,,,,,,,,,,
Thallasian,,,,,,,,,,,...,,,,,0.84261,,,,,
shegicaesario,,,,,,,,,,,...,,,,,,,,,,
bearofwisdom,,,,,,,,,,,...,,,,,,,,,,
Potatocat11,,,,,,,,,,,...,,,,,,,,,,


def get_similar_shows(title, sort_by='corr', num=10, min_count=1000, min_num_common_tags=2):
    """Return the shows most similar to ``title``.

    Builds one row per show with four columns -- ``corr`` (correlation with
    ``title`` taken from ``anime_corr``), ``count`` (from ``anime_counts``),
    ``num_common_tags`` (tags shared with ``title``) and ``avg_rating`` (the
    ``rating`` column of ``anime``) -- then filters and ranks those rows.

    Parameters
    ----------
    title : str
        Show to compare against; it is excluded from the result.
    sort_by : str, default 'corr'
        Column to rank by, highest first.
    num : int, default 10
        Maximum number of rows returned.
    min_count : int, default 1000
        Rows whose ``count`` is below this are dropped.
    min_num_common_tags : int, default 2
        Rows sharing fewer tags than this with ``title`` are dropped.

    Raises
    ------
    KeyError
        If ``title`` is missing from ``anime_corr`` or from ``anime``.

    Notes
    -----
    Reads the notebook-level globals ``anime``, ``anime_corr`` and
    ``anime_counts``.
    NOTE(review): ``anime_counts`` is not defined in the visible part of this
    notebook -- confirm it exists in the kernel before calling.
    """
    def num_common_tags(title):
        """Count, for every tagged show, how many tags it shares with ``title``."""
        # Explicit label match + positional .iloc: works for titles containing
        # quotes and avoids the deprecated positional ``series[0]`` lookup.
        matches = anime.loc[anime.index == title, 'tags']
        if matches.empty:
            raise KeyError(title)
        tags = set(matches.iloc[0])
        tagged = anime.loc[anime['tags'].notnull(), 'tags']
        shared = tagged.apply(lambda x: len(tags.intersection(x)))
        return shared.sort_values(ascending=False)

    dfa = pd.DataFrame({'corr': anime_corr[title], 'count': anime_counts,
                        'num_common_tags': num_common_tags(title), 'avg_rating': anime['rating']})
    # Boolean masks instead of a string-built query: no quoting problems and
    # no dependence on the index being named "title".
    keep = (
        (dfa['count'] >= min_count)
        & (dfa['num_common_tags'] >= min_num_common_tags)
        & (dfa.index != title)
    )
    return dfa[keep].sort_values(sort_by, ascending=False).head(num)

# Cosine similarity cannot handle NaN, so missing ratings are treated as 0.
# The zero-filled frame gets its own name so `df` keeps its NaNs even if one
# of the cells below fails or is skipped.
df_filled = df.fillna(0)

df_sparse = sp.sparse.csr_matrix(df_filled.values, dtype=np.float32)

# Rows of df_sparse are users, columns are titles; transposing gives the
# title-to-title similarities.
user_similarity = cosine_similarity(df_sparse)
anime_similarity = cosine_similarity(df_sparse.T)

user_sim_df = pd.DataFrame(user_similarity, index=df.index, columns=df.index)
anime_sim_df = pd.DataFrame(anime_similarity, index=df.columns, columns=df.columns)

user_sim_df

anime_sim_df

# These are the files the data-loading cell at the top of the notebook reads.
user_sim_df.to_pickle('../data/user_sim.pkl')
anime_sim_df.to_pickle('../data/anime_sim.pkl')

# Same end state as before: any rating equal to 0 is treated as "not rated".
# `np.nan` replaces `np.NaN`, which no longer exists in NumPy 2.0.
df = df.replace(0, np.nan)

In [106]:
def get_similar_users(username=None, userdata=None, min_common=10, sortby='cosine_sim', asc=False, num_users=10):
    """Find the users in ``df`` that are most similar to a given user.

    The target user is identified by ``username`` (looked up in ``df``), by
    ``userdata`` (a ratings Series whose ``name`` is the username), or both.

    Parameters
    ----------
    username : str, optional
        Row label of the user; defaults to ``userdata.name``.
    userdata : pandas.Series, optional
        Ratings indexed by title; defaults to ``df.loc[username]``.
    min_common : int, default 10
        Minimum number of titles rated by both users for a candidate to be kept.
    sortby : {'cosine_sim', 'corr', 'dist', 'num_common'}
        Similarity measure used for ranking:

        * ``'corr'`` -- correlation between each user's ratings and ``userdata``.
        * ``'dist'`` -- squared distance on zero-filled ratings divided by the
          number of common titles; candidates with distance exactly 0 are dropped.
        * ``'cosine_sim'`` -- the precomputed ``user_sim_df`` column when
          ``username`` is present there (``userdata`` is then not consulted),
          otherwise computed on the fly from ``userdata``.
        * ``'num_common'`` -- the number of titles rated by both users.
    asc : bool, default False
        Sort order for ``sortby``.
    num_users : int, default 10
        Maximum number of rows returned.

    Returns
    -------
    pandas.DataFrame
        Indexed by username with a ``sortby`` column and a ``num_common``
        column (a single column when ``sortby == 'num_common'``).

    Raises
    ------
    ValueError
        If ``sortby`` is not one of the supported measures, or if neither
        ``username`` nor ``userdata`` is supplied.
    """
    valid_sortby = ('corr', 'dist', 'cosine_sim', 'num_common')
    if sortby not in valid_sortby:
        raise ValueError(f'sortby must be one of {valid_sortby}, got {sortby!r}')
    if username is None and userdata is None:
        raise ValueError('either username or userdata must be provided')

    username = userdata.name if username is None else username
    userdata = df.loc[username] if userdata is None else userdata

    # Titles rated by both the target user and each other user.
    others = df.drop(username, axis=0, errors='ignore')
    num_common = (others.notnull() & userdata.notnull()).sum(axis=1)

    columns = {}
    if sortby == 'corr':
        columns[sortby] = df.T.corrwith(userdata)

    elif sortby == 'dist':
        # Use the supplied ratings rather than df.loc[username], so users that
        # are not part of `df` (e.g. validation users) work as well.
        # NOTE(review): unrated titles count as a rating of 0 on both sides.
        filled = df.fillna(0)
        target = userdata.fillna(0)
        dist = (filled.subtract(target, axis=1) ** 2).sum(axis=1)
        dist = dist / num_common
        columns[sortby] = dist.loc[dist != 0]

    elif sortby == 'cosine_sim':
        if username in user_sim_df.columns:
            res = user_sim_df[username].drop(username, errors='ignore')
        else:
            sparse = sp.sparse.csr_matrix(df.fillna(0).values, dtype=np.float32)
            # Align the user's ratings to df's column order before comparing.
            target = np.array(userdata.reindex(df.columns).fillna(0)).reshape(1, -1)
            sims = cosine_similarity(sparse, target)
            res = pd.Series(sims.flatten(), index=df.index).drop(username, errors='ignore')
        columns[sortby] = res.sort_values(ascending=False)

    # For sortby == 'num_common' this is the only column.
    columns['num_common'] = num_common

    data = pd.DataFrame(columns)
    # Boolean masks instead of a string-built query: safe for usernames that
    # contain quotes and independent of the index name.
    keep = (data['num_common'] >= min_common) & (data.index != username)
    return data[keep].sort_values(sortby, ascending=asc).head(num_users)

In [122]:
def get_recommendation(username=None, userdata=None, num_recs=10, min_common=10, sortby='cosine_sim', asc=False, num_users=10):
    """Recommend titles to a user from the ratings of similar users.

    The ``num_users`` most similar users are found with ``get_similar_users``;
    every title at least one of them rated and the target user has not rated
    is scored by the plain (unweighted) mean of those users' ratings.

    Parameters
    ----------
    username : str, optional
        Row label of the user in ``df``; defaults to ``userdata.name``.
    userdata : pandas.Series, optional
        The user's ratings indexed by title; defaults to ``df.loc[username]``.
    num_recs : int, default 10
        Maximum number of recommendations returned.
    min_common, sortby, asc, num_users
        Passed through unchanged to ``get_similar_users``.

    Returns
    -------
    pandas.Series
        Mean rating per recommended title, highest first.
    """
    username = userdata.name if username is None else username
    userdata = df.loc[username] if userdata is None else userdata

    sim_users = get_similar_users(username=username, userdata=userdata, min_common=min_common,
                                  sortby=sortby, asc=asc, num_users=num_users)
    sim_users_data = df.loc[sim_users.index]

    # Candidate titles: rated by at least one similar user ...
    rated_by_neighbours = sim_users_data.notnull().any(axis=0)
    # ... and not yet rated by the target user.
    user_watched = userdata.loc[userdata.notnull()].index
    # A boolean column mask replaces set-based indexing, which pandas >= 2.0
    # rejects and which gave an arbitrary column order before that.
    candidates = rated_by_neighbours & ~sim_users_data.columns.isin(user_watched)

    recs = sim_users_data.loc[:, candidates].mean().sort_values(ascending=False)
    return recs.head(num_recs)

In [75]:
# Example user (a row label of `df`) used for the recommendation demo below.
username = 'JohnTamer'

In [57]:
# Top-100 recommendations for the example user, with all other settings at their defaults.
recs = get_recommendation(username, num_recs=100)

In [58]:
recs

title
Mobile Suit Gundam: Iron-Blooded Orphans    3.955297
Knights of Sidonia                          3.953519
Gargantia on the Verdurous Planet           3.953519
Death Parade                                3.814186
Barakamon                                   3.814186
                                              ...   
Le Chevalier D'Eon                          2.345870
Tower of Druaga: the Sword of Uruk          2.345870
Hayate the Combat Butler!                   2.345870
Phantom Thief Jeanne                        2.345870
Romeo x Juliet                              2.345870
Length: 100, dtype: float64

In [59]:
# Validation ratings, kept unmodified as the ground truth for the checks below.
# NOTE: pickle files can execute arbitrary code on load; only read trusted files.
validate_true = pd.read_pickle('../data/validate.pkl.xz')

In [60]:
# One uniform draw in [0, 1) per cell of the validation matrix. These draws
# decide which ratings stay visible and which are held out.
# A fixed seed makes the hold-out split reproducible across kernel restarts.
keeper_mask = pd.DataFrame(np.random.default_rng(42).uniform(size=validate_true.shape),
                           index=validate_true.index,
                           columns=validate_true.columns)

In [61]:
# Turn the draws into a 0/1 mask: 1 (keep) where the draw exceeds 0.3, else 0.
# The vectorised comparison replaces an element-wise `applymap`, which is
# deprecated in recent pandas and very slow on a matrix of this size.
keeper_mask = (keeper_mask > 0.3).astype(int)

In [62]:
# Multiplying by the 0/1 mask zeroes the held-out ratings; those zeros are
# then turned into NaN so they look like "not rated".
# `np.nan` replaces `np.NaN`, which no longer exists in NumPy 2.0.
validate = validate_true.mul(keeper_mask)
validate = validate.replace(0, np.nan)

In [63]:
# Number of ratings per user in the untouched validation set.
validate_true.count(axis=1)

username
Lililou           70
GhostyBae         20
RyeHawk           28
SummerRose316     35
omgusername      220
                ... 
kirbix50           2
justinplaice      26
jeffy90            1
Papichulo12        4
IIMokuChanII      54
Length: 23432, dtype: int64

In [64]:
# Number of ratings per user that remain after masking.
validate.count(axis=1)

username
Lililou           49
GhostyBae         10
RyeHawk           24
SummerRose316     24
omgusername      146
                ... 
kirbix50           2
justinplaice      21
jeffy90            1
Papichulo12        3
IIMokuChanII      35
Length: 23432, dtype: int64

In [67]:
# Masked ratings of the first validation user, used as input for recommendations.
userdata = validate.iloc[0]

In [70]:
userdata

title
.hack//Legend Of The Twilight   NaN
.hack//Roots                    NaN
.hack//SIGN                     NaN
009-1                           NaN
07-Ghost                        NaN
                                 ..
sola                            NaN
www.Working!!                   NaN
xxxHOLiC                        NaN
xxxHOLiC Kei                    NaN
ēlDLIVE                         NaN
Name: Lililou, Length: 4489, dtype: float64

In [129]:
# Top-100 recommendations for the validation user, based on the 5 most similar users.
recs = get_recommendation(userdata=userdata, num_recs=100, num_users=5)

In [130]:
recs

title
Steins;Gate 0             3.989500
Hunter x Hunter (2011)    3.936916
Gakuen Alice              3.884333
Ghost Hunt                3.884333
Maid-sama!                3.884333
                            ...   
Your lie in April         2.219784
Bleach                    2.191606
My Hero Academia 2        2.154735
Deadman Wonderland        2.123242
Eden of the East          2.075920
Length: 100, dtype: float64

In [131]:
# Unmasked ratings of the same (first) validation user, for comparison with `recs`.
true_userdata = validate_true.iloc[0]

In [132]:
# Keep only the titles this user actually rated.
true_userdata = true_userdata.dropna()

In [133]:
# The user's real ratings for the titles that were also recommended.
# Index.intersection replaces set-based indexing, which pandas >= 2.0 rejects.
true_userdata[true_userdata.index.intersection(recs.index)].sort_values(ascending=False)

title
Yuri!!! on Ice                             4.142217
Noragami Aragoto                           3.784257
Great Teacher Onizuka                      3.784257
Ajin: Demi-Human 2nd Season                3.336893
Psycho-Pass                                3.336893
Tokyo Ghoul √A                             2.819753
Darker than Black: Gemini of the Meteor    1.286344
Name: Lililou, dtype: float64

In [134]:
# The recommendation scores for the same titles, for side-by-side comparison.
# Index.intersection replaces set-based indexing, which pandas >= 2.0 rejects.
recs[recs.index.intersection(true_userdata.index)].sort_values(ascending=False)

title
Yuri!!! on Ice                             3.852527
Tokyo Ghoul √A                             3.852527
Great Teacher Onizuka                      3.559451
Psycho-Pass                                3.222334
Noragami Aragoto                           3.057373
Darker than Black: Gemini of the Meteor    2.604866
Ajin: Demi-Human 2nd Season                2.441234
dtype: float64