In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from itertools import combinations
import math
from multiprocessing import Pool
import warnings
from functools import partial

In [2]:
# Suppress every warning of category RuntimeWarning from here on, so they do
# not flood cell output.
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [3]:
def parallelize(groups, func, num_workers=16):
    """Apply ``func`` to every group in a process pool and stitch the results together.

    Parameters
    ----------
    groups : iterable of (name, group) pairs
        For example a pandas ``GroupBy``. Only the ``group`` objects are sent
        to the workers; the names are discarded.
    func : callable
        Called once per group. It must be picklable (``multiprocessing`` ships
        it to worker processes) and must return a pandas object that
        ``pd.concat`` accepts.
    num_workers : int, default 16
        Number of worker processes. The default matches the previously
        hard-coded pool size.

    Returns
    -------
    pandas object
        The per-group results concatenated and sorted by index.
    """
    with Pool(num_workers) as p:
        # Pool.map blocks until every group has been processed and preserves
        # input order, so the list is complete before the pool is torn down.
        results = p.map(func, [group for _, group in groups])
    return pd.concat(results).sort_index()

In [4]:
# Load the training ratings table and the anime metadata (re-indexed by title).
# Pickle files can execute arbitrary code on load: only read files you trust.
df = pd.read_pickle('../data/train.pkl.xz')
anime = pd.read_pickle('../data/anime.pkl.xz').set_index('title')

# Alternative to recomputing below: reload a previously saved correlation matrix.
# anime_corr = pd.read_pickle('../data/anime_corr.pkl.xz')

# Pairwise correlation between the columns of df. This runs on the raw table,
# i.e. before df is rebound by the later fillna(0) cell.
anime_corr = df.corr()

# Persist the matrix uncompressed in both formats.
anime_corr.to_pickle('../data/anime_corr.pkl')
anime_corr.to_csv('../data/anime_corr.csv')

%%bash
# Replace any older compressed correlation files with freshly compressed ones.
# Abort if the data directory is missing so nothing runs in the wrong place.
cd ../data || exit 1

# -f: do not complain when no previous archive exists (e.g. on the first run).
rm -f anime_corr.pkl.xz
# -v: verbose progress, -T14: use up to 14 threads.
xz -vT14 anime_corr.pkl

rm -f anime_corr.csv.xz
xz -vT14 anime_corr.csv

In [5]:
# Preview the first rows of the ratings table (rows: username, columns: title).
df.head()

title,.hack//Legend Of The Twilight,.hack//Roots,.hack//SIGN,009-1,07-Ghost,100 Sleeping Princes & the Kingdom of Dreams,100% Pascal-sensei (2017),11eyes,12-Sai: Chiccha na Mune no Tokimeki,12-Sai: Chiccha na Mune no Tokimeki 2,...,gdgd Fairies,gdgd Fairies 2,gdgd men's party,number24,revisions,sola,www.Working!!,xxxHOLiC,xxxHOLiC Kei,ēlDLIVE
username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Anilucard,,,,,,,,,,,...,,,,,,,,,,
Thallasian,,,,,,,,,,,...,,,,,0.84261,,,,,
QueenUmbra,,,,,,,,,,,...,,,,,,,,,,
andura,,,,,,,,,,,...,,,,,,,,,,
shegicaesario,,,,,,,,,,,...,,,,,,,,,,


In [6]:
# Preview the first rows of the anime metadata table, indexed by title.
anime.head()

Unnamed: 0_level_0,type,num_eps,is_ongoing,duration,studio,start_year,end_year,season,rating,num_votes,synopsis,tags,content_warnings,url
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Gag Manga Biyori 2,TV,12.0,False,5.0,Artland,2006.0,2006.0,Summer,3.583,233.0,"The lupine detective Usami-chan is back, and r...","[Comedy, Shounen, Crude, Episodic, Gag, Short ...",,https://www.anime-planet.com/anime/gag-manga-b...
Fu Yu Nu,Movie,1.0,False,5.0,,2016.0,2016.0,,,,No synopsis yet - check back soon!,"[Chinese Animation, Shorts]",,https://www.anime-planet.com/anime/fu-yu-nu
Kijeu CSI: Gwahaksusadae,TV,104.0,False,23.0,,2012.0,2014.0,,,,No synopsis yet - check back soon!,"[Adventure, Mystery, Korean Animation]",,https://www.anime-planet.com/anime/kijeu-csi-g...
Zuoshou Shanglan,Web,,False,,,,,,,,No synopsis yet - check back soon!,"[Sports, Basketball, Chinese Animation]",,https://www.anime-planet.com/anime/zuoshou-sha...
Jeonsa Ryan,Movie,1.0,False,80.0,,1997.0,1997.0,,,,No synopsis yet - check back soon!,"[Adventure, Fantasy, Family Friendly, Korean A...",,https://www.anime-planet.com/anime/jeonsa-ryan


In [7]:
# Count non-null entries per column (axis=0) and per row (axis=1) of df.
# Per the preview of df, columns are titles and rows are usernames.
# These must be computed while df still contains NaNs for unrated entries.
anime_counts = df.notnull().sum(axis=0)
users_counts = df.notnull().sum(axis=1)

def get_similar_shows(title, sort_by='corr', num=10, min_count=1000, min_num_common_tags=2):
    """Rank other shows by similarity to ``title``.

    Reads the module-level ``anime`` (metadata indexed by title),
    ``anime_corr`` (title-by-title correlation matrix) and ``anime_counts``
    (non-null rating count per title).

    Parameters
    ----------
    title : str
        Show to compare against. It must be a label of ``anime`` and a column
        of ``anime_corr``; otherwise a ``KeyError`` is raised.
    sort_by : str, default 'corr'
        Result column to sort by, descending. One of ``'corr'``, ``'count'``,
        ``'num_common_tags'``, ``'avg_rating'``.
    num : int, default 10
        Maximum number of rows returned.
    min_count : int, default 1000
        Minimum ``count`` a show needs to be kept.
    min_num_common_tags : int, default 2
        Minimum number of tags a show must share with ``title``.

    Returns
    -------
    pandas.DataFrame
        Columns ``corr``, ``count``, ``num_common_tags`` and ``avg_rating``,
        excluding ``title`` itself.
    """
    def num_common_tags(title):
        # Label-based lookup: works for titles containing quotes and avoids
        # positional [0] indexing on a label-indexed Series.
        tags = set(anime.loc[[title], 'tags'].iloc[0])
        tagged = anime.loc[anime['tags'].notnull(), 'tags']
        # No sorting needed: the DataFrame constructor below aligns by index.
        return tagged.apply(lambda x: len(set(x).intersection(tags)))

    dfa = pd.DataFrame({'corr': anime_corr[title], 'count': anime_counts,
                        'num_common_tags': num_common_tags(title), 'avg_rating': anime['rating']})

    # Boolean masks instead of a string-built query(): no quoting problems and
    # no dependence on the index being named "title". Rows with NaN in either
    # column compare False and are dropped, as before.
    keep = (
        (dfa['count'] >= min_count)
        & (dfa['num_common_tags'] >= min_num_common_tags)
        & (dfa.index != title)
    )
    return dfa.loc[keep].sort_values(sort_by, ascending=False).head(num)

In [9]:
# Replace every missing entry with 0 and rebind df. After this point
# df.notnull() is True everywhere, so the counts computed earlier must not be
# recomputed from this df.
df = df.fillna(0)

Remove any users who did not rate any shows

In [10]:
# Keep only rows that have at least one non-zero entry.
# Assumes 0 is never a genuine rating value, only the fill value - TODO confirm.
df = df.loc[(df != 0).any(axis=1)]

In [14]:
# Compressed-sparse-row copy of the table's values, requested as float16.
# NOTE(review): confirm that scipy.sparse and sklearn handle float16 here
# without errors or silent upcasting.
df_sparse = sp.sparse.csr_matrix(df.values, dtype=np.float16)

In [None]:
# Cosine similarity between the rows of df_sparse (users) and, via the
# transpose, between its columns (titles). Each result is a square matrix
# with one row and one column per user / per title.
user_similarity = cosine_similarity(df_sparse)
anime_similarity = cosine_similarity(df_sparse.T)

In [None]:
# Attach labels: usernames (df.index) for the user matrix, titles (df.columns)
# for the anime matrix.
user_sim_df = pd.DataFrame(user_similarity, index=df.index, columns=df.index)
anime_sim_df = pd.DataFrame(anime_similarity, index=df.columns, columns=df.columns)

In [None]:
# Display the user-by-user similarity frame.
user_sim_df

In [None]:
# Display the title-by-title similarity frame.
anime_sim_df

In [None]:
# Persist both similarity matrices uncompressed, as pickle and as CSV.
# The user CSV is written as user_sim.csv (previously user_sim_df.csv) so that
# it matches the other file names and the file the compression cell runs xz on.
user_sim_df.to_pickle('../data/user_sim.pkl')
user_sim_df.to_csv('../data/user_sim.csv')

anime_sim_df.to_pickle('../data/anime_sim.pkl')
anime_sim_df.to_csv('../data/anime_sim.csv')

In [None]:
%%bash
# Replace any older compressed similarity files with freshly compressed ones.
# Abort if the data directory is missing so nothing runs in the wrong place.
cd ../data || exit 1

# -f: do not complain when no previous archive exists (e.g. on the first run).
# xz flags: -v verbose progress, -T14 use up to 14 threads.
rm -f user_sim.pkl.xz
xz -vT14 user_sim.pkl

rm -f user_sim.csv.xz
xz -vT14 user_sim.csv

rm -f anime_sim.pkl.xz
xz -vT14 anime_sim.pkl

rm -f anime_sim.csv.xz
xz -vT14 anime_sim.csv

In [None]:
# Example: the shows ranked most similar to 'Clannad' with the default filters.
get_similar_shows('Clannad')

In [None]:
def find_corr(table, array):
    """Correlate ``array`` against ``table`` using ``DataFrame.corrwith``.

    Defined at module level so it can be wrapped with ``functools.partial``
    and dispatched through ``Pool.map``.

    Parameters
    ----------
    table : pandas.DataFrame
        Frame whose columns are each correlated with ``array``.
    array : pandas.Series or pandas.DataFrame
        The object to correlate with; aligned on index by pandas.

    Returns
    -------
    pandas.Series
        One correlation value per column of ``table``.
    """
    correlations = table.corrwith(other=array)
    return correlations

In [None]:
def get_similar_users(username, userdata=None, num=10, min_common=10, chunksize=1500, num_workers=15):
    """Find the users whose ratings are closest to those of ``username``.

    Reads the module-level frames ``dfa`` and ``dfu``. ``dfa`` is indexed by
    username on its columns (``dfa[username]``) and ``dfu`` on its rows
    (``dfu.loc[username]``).
    NOTE(review): neither frame is defined in the visible notebook cells;
    presumably ``dfa`` is the transpose of ``dfu`` - confirm before relying
    on this.

    Parameters
    ----------
    username : str
        User to compare against. It must exist in both ``dfa`` and ``dfu``,
        even when ``userdata`` is supplied, because it is dropped from / looked
        up in both frames.
    userdata : pandas.Series, optional
        Ratings used for the correlation and the common-rating count.
        Defaults to ``dfa[username]``.
    num : int, default 10
        Maximum number of users returned.
    min_common : int, default 10
        Minimum ``num_common`` a user needs to be kept.
    chunksize : int, default 1500
        Number of ``dfa`` columns handed to a worker at a time.
    num_workers : int, default 15
        Size of the process pool.

    Returns
    -------
    pandas.DataFrame
        Columns ``corr``, ``total_dist``, ``num_common`` and ``avg_dist``,
        sorted by ascending ``avg_dist``.
    """
    userdata = dfa[username] if userdata is None else userdata

    # Slice dfa into column blocks with iloc. np.split on a DataFrame depends
    # on the deprecated DataFrame.swapaxes and also produced a trailing empty
    # chunk; the non-empty blocks here are the same as before.
    chunks = [dfa.iloc[:, start:start + chunksize]
              for start in range(0, dfa.shape[1], chunksize)]
    with Pool(num_workers) as p:
        partial_f = partial(find_corr, array=userdata)
        user_corr = pd.concat(p.map(partial_f, chunks)).drop(username).fillna(0)

    # Release the column blocks before building the remaining intermediates.
    del chunks

    # Per other user: which entries are non-null for both them and userdata.
    common_booled = np.bitwise_and(dfu.drop(username, axis=0).notnull(), userdata.notnull())

    # Sum of squared differences between every row of dfu and the target row.
    dist = np.sum(dfu.subtract(dfu.loc[username], axis=1) ** 2, axis=1)
    # A distance of exactly 0 is the target user (or an identical row).
    dist = dist.loc[dist != 0]

    data = pd.DataFrame({'corr': user_corr, 'total_dist': dist,  'num_common': common_booled.sum(axis=1)})
    data['avg_dist'] = data['total_dist'] / data['num_common']

    # Boolean mask instead of a string-built query(): usernames containing
    # quotes no longer break the expression and the index name is irrelevant.
    keep = (data['num_common'] >= min_common) & (data.index != username)
    data = data.loc[keep]
    data = data.sort_values(['avg_dist'], ascending=True).head(num)

    return data

In [None]:
# User whose nearest neighbours are looked up in the cells below.
username = 'JohnTamer'

In [None]:
# Only consider users with at least 40 entries in common with the target user.
res = get_similar_users(username, min_common=40)

In [None]:
# Display the similar-users result.
res