In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from tqdm import tqdm
from itertools import combinations
import math
from multiprocessing import Pool
import warnings
from functools import partial

In [2]:
# Silence every RuntimeWarning for the rest of the kernel session.
# NOTE(review): presumably aimed at the NaN-related numpy warnings raised by the
# correlation steps further down -- confirm; unrelated RuntimeWarnings are hidden too.
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [3]:
def parallelize(groups, func, num_workers=16):
    """Apply ``func`` to every group and stitch the results back together.

    Parameters
    ----------
    groups : iterable of (name, group) pairs
        For example a pandas ``GroupBy`` object. Only the ``group`` half of
        each pair is handed to ``func``; the name is ignored.
    func : callable
        Called once per group. It has to be picklable whenever worker
        processes are used, and must return something ``pd.concat`` accepts.
    num_workers : int, default 16
        Upper bound on the number of worker processes. No more processes
        than groups are started, and when a single worker is enough the
        groups are processed in the current process without a pool.

    Returns
    -------
    pandas object
        The concatenated per-group results, sorted by index. An empty
        ``DataFrame`` when ``groups`` yields nothing.
    """
    pieces = [group for _, group in groups]
    if not pieces:
        # pd.concat raises ValueError on an empty list; hand back an empty
        # frame so callers can treat "no groups" like any other result.
        return pd.DataFrame()

    workers = min(num_workers, len(pieces))
    if workers <= 1:
        # Nothing to gain from a pool here, and skipping it avoids the
        # process start-up and pickling round trip.
        results = [func(piece) for piece in pieces]
    else:
        with Pool(workers) as pool:
            results = pool.map(func, pieces)

    return pd.concat(results).sort_index()

In [4]:
# Load the training table and the anime metadata (the latter indexed by title).
# NOTE(review): dfu looks like a users x anime ratings matrix judging by how it
# is used further down (column-wise corr, per-row/per-column counts) -- confirm.
# NOTE: read_pickle can execute arbitrary code; only load files you trust.
dfu = pd.read_pickle('../data/train.pkl.xz')
anime = pd.read_pickle('../data/anime.pkl.xz').set_index('title')

# Uncomment to reuse a previously saved correlation matrix instead of
# recomputing it with dfu.corr().
# anime_corr = pd.read_pickle('../data/anime_corr.pkl.xz')

In [None]:
# Pairwise correlation between the columns of dfu (pandas default: Pearson,
# computed over the rows where both columns are non-null).
anime_corr = dfu.corr()

In [None]:
# Persist the correlation matrix under ../data, once as a pickle and once as CSV.
anime_corr.to_pickle('../data/anime_corr.pkl')
anime_corr.to_csv('../data/anime_corr.csv')

In [None]:
%%bash
# Compress the correlation dumps in ../data with xz.
# Abort on the first failure so that a failed `cd` can never make the
# rm/xz commands below run in the wrong directory.
set -e
cd ../data

# xz will not overwrite an existing archive, so remove any stale one first;
# -f keeps the very first run (no archive yet) from reporting an error.
rm -f anime_corr.pkl.xz
xz -vT14 anime_corr.pkl

rm -f anime_corr.csv.xz
xz -vT14 anime_corr.csv

In [None]:
# Peek at the first rows of the training table.
dfu.head()

In [None]:
# Peek at the first rows of the title-indexed anime metadata.
anime.head()

In [None]:
# Transposed view of the training table: dfu's columns become dfa's rows.
dfa = dfu.transpose()

# Non-null entries per dfu column (anime_counts) and per dfu row (users_counts).
anime_counts = dfu.notna().sum(axis="index")
users_counts = dfu.notna().sum(axis="columns")

In [None]:
def get_similar_shows(title, sort_by='corr', num=10, min_count=1000, min_num_common_tags=2):
    """Rank the other shows by their similarity to ``title``.

    Reads the notebook-level tables ``anime_corr``, ``anime_counts`` and
    ``anime`` (indexed by title).

    Parameters
    ----------
    title : str
        Show to compare against; must be a label of ``anime`` and a column
        of ``anime_corr``.
    sort_by : str or list of str, default 'corr'
        Column(s) of the result to sort by, in descending order.
    num : int, default 10
        Maximum number of rows returned.
    min_count : int, default 1000
        Minimum value of the ``count`` column (taken from ``anime_counts``).
    min_num_common_tags : int, default 2
        Minimum number of tags a show has to share with ``title``.

    Returns
    -------
    pandas.DataFrame
        Columns ``corr``, ``dist``, ``count``, ``num_common_tags`` and
        ``avg_rating``, one row per remaining show, best match first.

    Raises
    ------
    KeyError
        If ``title`` is missing from ``anime`` or ``anime_corr``.
    """
    def num_common_tags(title):
        # Label-based lookup on the title index. The list selector plus
        # .iloc[0] takes the first match without relying on positional
        # fallback for an integer key on a string index.
        tags = set(anime.loc[[title], 'tags'].iloc[0])
        tagged = anime.loc[anime['tags'].notnull(), 'tags']
        num_common_series = tagged.apply(lambda x: len(set(x).intersection(tags)))
        return num_common_series.sort_values(ascending=False)

    # NOTE(review): `dist` is not defined anywhere in this notebook, so a bare
    # reference raises NameError on a fresh kernel. Use it when some other
    # cell has provided it and fall back to an all-NaN column otherwise, so
    # the result keeps the same columns either way.
    dist_values = globals().get('dist', np.nan)

    candidates = pd.DataFrame({
        'corr': anime_corr[title],
        'dist': dist_values,
        'count': anime_counts,
        'num_common_tags': num_common_tags(title),
        'avg_rating': anime['rating'],
    })

    # Plain boolean masks instead of an f-string query: titles containing a
    # double quote would otherwise produce an invalid query expression.
    keep = (
        (candidates['count'] >= min_count)
        & (candidates['num_common_tags'] >= min_num_common_tags)
        & (candidates.index != title)
    )
    return candidates[keep].sort_values(sort_by, ascending=False).head(num)

In [None]:
# Example lookup using the default thresholds and sort column.
get_similar_shows('Clannad')

In [None]:
def find_corr(table, array):
    """Correlate each column of ``table`` with ``array``.

    A thin named wrapper around ``DataFrame.corrwith``, so the call can be
    combined with ``functools.partial`` and dispatched through ``Pool.map``.

    Returns the Series produced by ``table.corrwith(array)``.
    """
    correlations = table.corrwith(array)
    return correlations

In [None]:
def get_similar_users(username, num=10, min_common=None, chunksize=1500, num_workers=15):
    """Find the users whose entries correlate best with those of ``username``.

    Reads the notebook-level tables ``dfu`` and ``dfa`` (``dfa`` is ``dfu``
    transposed, so every user is a column of ``dfa``) and the ``find_corr``
    helper.

    Parameters
    ----------
    username : str
        Column of ``dfa`` / row of ``dfu`` to compare everyone else against.
    num : int, default 10
        Maximum number of rows returned.
    min_common : number, optional
        Minimum number of entries that are non-null for both users.
        Defaults to the square root of the number of non-null entries of
        ``username``.
    chunksize : int, default 1500
        Number of ``dfa`` columns handed to a worker at a time.
    num_workers : int, default 15
        Size of the process pool that computes the correlations.

    Returns
    -------
    pandas.DataFrame
        Columns ``corr`` (undefined correlations are reported as 0) and
        ``num_common``, sorted by both in descending order; ``username``
        itself is excluded.

    Raises
    ------
    KeyError
        If ``username`` is not present in ``dfa`` / ``dfu``.
    """
    target = dfa[username]
    target_rated = target.notnull()

    # Slice column blocks with iloc. np.split on a DataFrame goes through
    # DataFrame.swapaxes (deprecated in pandas 2.1), and its split points
    # always left an empty trailing chunk for a worker to process.
    chunks = [dfa.iloc[:, start:start + chunksize]
              for start in range(0, dfa.shape[1], chunksize)]
    with Pool(num_workers) as p:
        partial_f = partial(find_corr, array=target)
        user_corr = pd.concat(p.map(partial_f, chunks)).drop(username).fillna(0)

    # Release the column blocks before building the next large temporary.
    del chunks

    # True where both the other user (row) and `username` have an entry; the
    # Series is aligned on the columns of the frame.
    common_booled = dfu.drop(username, axis=0).notnull() & target_rated

    if min_common is None:
        min_common = np.sqrt(target_rated.sum())

    data = pd.DataFrame({'corr': user_corr, 'num_common': common_booled.sum(axis=1)})
    # `username` was already dropped from both inputs above, so only the
    # overlap threshold is left to apply. A boolean mask avoids building a
    # query string out of the user name (which breaks on embedded quotes).
    data = data[data['num_common'] >= min_common]
    return data.sort_values(['corr', 'num_common'], ascending=False).head(num)

In [None]:
# User whose nearest neighbours are looked up in the cells below.
username = 'JohnTamer'

In [None]:
# Best-correlated users that have at least 30 entries in common with `username`.
res = get_similar_users(username, min_common=30)

In [None]:
# Show the ranked neighbours.
res

In [None]:
# `res` is sorted best match first, so its first index label is the closest user.
closest_user = res.index[0]

In [None]:
# Side-by-side entries of the two users, limited to rows where both have a value.
dfa[[username, closest_user]].dropna()

In [None]:
# Plot one user's entries against the other's, using only rows both have filled in.
sns.scatterplot(
    data=dfa.dropna(subset=[username, closest_user]),
    x=username,
    y=closest_user,
)