In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.base import RegressorMixin
from sklearn.decomposition import TruncatedSVD
from tqdm import tqdm
from itertools import combinations
import math
from multiprocessing import Pool
import warnings
from functools import partial
from pandarallel import pandarallel

In [2]:
# Seed NumPy's global RNG so the np.random.uniform hold-out mask built later is repeatable.
np.random.seed(42)
# Hook tqdm into pandas.
tqdm.pandas()
# Initialise pandarallel with its progress bar enabled.
pandarallel.initialize(progress_bar=True)
# Suppress RuntimeWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=RuntimeWarning)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
def chunker(seq, size):
    """Lazily yield consecutive slices of ``seq`` holding at most ``size`` items each.

    ``seq`` must support ``len()`` and slicing. The final slice may be
    shorter than ``size``.
    """
    starts = range(0, len(seq), size)
    return (seq[start:start + size] for start in starts)

In [4]:
def parallelize(groups, func, num_workers=16):
    """Apply ``func`` to every group in a process pool and stitch the results together.

    Parameters
    ----------
    groups : iterable of ``(name, group)`` pairs, e.g. a pandas ``groupby`` object.
    func : picklable callable taking one group and returning a pandas object.
    num_workers : number of worker processes (defaults to the previous fixed value, 16).

    Returns
    -------
    The concatenation of all per-group results, sorted by index.
    """
    # Materialise the groups first so the pool only receives the data, not the names.
    frames = [group for _, group in groups]
    with Pool(num_workers) as pool:
        return pd.concat(pool.map(func, frames)).sort_index()

In [5]:
# Load the training ratings (users x titles, see the df.head() preview below).
# NOTE(review): read_pickle executes arbitrary code on load — only use trusted files.
df = pd.read_pickle('../data/train.pkl.xz')
# presumably per-title metadata; `anime` is not referenced by any other visible cell — TODO confirm it is needed
anime = pd.read_pickle('../data/anime.pkl.xz')

# Disabled loaders for similarity tables:
# user_sim_df = pd.read_pickle('../data/user_sim.pkl')
# anime_sim_df = pd.read_pickle('../data/anime_sim.pkl')

In [6]:
# Full validation ratings; split into a visible part and a held-out part in the next cells.
validate_full = pd.read_pickle('../data/validate.pkl.xz')

In [7]:
# Boolean hold-out mask shaped like validate_full: True for roughly 25% of the cells
# (uniform draw > 0.75). The comparison is done on the whole array at once instead
# of cell by cell through applymap; the random draw itself is unchanged.
keeper_mask = pd.DataFrame(np.random.uniform(size=validate_full.shape) > 0.75,
                           index=validate_full.index,
                           columns=validate_full.columns)

In [8]:
# Visible part of the validation data: cells where keeper_mask is True are blanked to NaN.
validate_masked = validate_full.mask(keeper_mask)

In [9]:
# Held-out part: only cells where keeper_mask is True keep their value; the rest become NaN.
validate_actual = validate_full.mask(~keeper_mask)

Remove any users who rated fewer than 10 shows.

In [10]:
# Keep only the rows (users) with at least 10 non-null ratings; copy so later edits don't alias the original frame.
df = df.loc[df.count(axis=1) >= 10].copy()

In [11]:
# Preview the first rows of the filtered training frame.
df.head()

title,.hack//Legend Of The Twilight,.hack//Roots,.hack//SIGN,009-1,07-Ghost,100 Sleeping Princes & the Kingdom of Dreams,100% Pascal-sensei (2017),11eyes,12-Sai: Chiccha na Mune no Tokimeki,12-Sai: Chiccha na Mune no Tokimeki 2,...,gdgd Fairies,gdgd Fairies 2,gdgd men's party,number24,revisions,sola,www.Working!!,xxxHOLiC,xxxHOLiC Kei,ēlDLIVE
username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Varelaw,,,,,,,,,,,...,,,,,,,,,,
Hayomi,,,,,,,,,,,...,,,,,,,,,,
fudizilla,,,,,,,,,0.416939,0.286725,...,,,,,,,0.559873,,,
lindatjuh,,,,,,,,,,,...,,,,,,,,,,
Spiderwebzz,,,,,,,,,,,...,,,,,,,,,,


In [12]:
# Encode "not rated" as 0 so the frame can be turned into a sparse matrix for cosine similarity.
# A later cell converts the zeros back to NaN with df.replace(0, ...).
df = df.fillna(0)

In [13]:
# Sparse float32 copy of the training ratings; read as a global by similarity_calculator.
sparse = sp.sparse.csr_matrix(df.values, dtype=np.float32)

def similarity_calculator(data):
    """Cosine similarity between every row of ``sparse`` and every row of ``data``.

    Returns a float16 DataFrame indexed by ``df.index`` with one column per
    row label of ``data``.
    """
    scores = cosine_similarity(sparse, data)
    labelled = pd.DataFrame(scores, index=df.index, columns=data.index)
    return labelled.astype(np.float16)

# Cut the zero-filled validation rows into chunks and score them in a 14-process pool,
# then join the per-chunk results side by side.
chunksize = validate_masked.shape[0] // 14 + 1
chunks = chunker(validate_masked.fillna(0), chunksize)
with Pool(14) as p:
    similarity_matrix = pd.concat(p.map(similarity_calculator, chunks), axis=1)

In [14]:
# Persist the similarity matrix so it can be reloaded without recomputing it.
similarity_matrix.to_pickle('../data/similarity_matrix.pkl')

In [15]:
# Drop the chunk generator; it was consumed when the similarity matrix was built.
del chunks

In [29]:
def n_similar_users(column, n=50):
    """Return the index labels of the ``n`` largest values in ``column``.

    The result is a Series (positions 0..n-1) named after ``column``, with
    labels ordered from highest to lowest value.
    """
    ranked = column.sort_values(ascending=False)
    top_labels = ranked.head(n).index
    return pd.Series(top_labels, name=column.name)

# Rank the rows of every similarity_matrix column in parallel; one output column per input column.
with Pool(14) as p:
    most_similar_users = pd.concat(
        p.map(n_similar_users, (col for _, col in similarity_matrix.items())),
        axis=1,
    )

In [None]:
# Persist the per-user neighbour lists; a later cell reloads this file.
most_similar_users.to_pickle('../data/most_similar_users.pkl')

In [None]:
# Turn the zero placeholders written by the earlier fillna(0) back into NaN.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
# NOTE(review): this also converts any genuine 0 rating to NaN — confirm none exist.
df = df.replace(0, np.nan)
most_similar_users = pd.read_pickle('../data/most_similar_users.pkl')

def get_recommendations(username, prior_weight=1.0, incremental_base=1.01, num_sim_users=50):
    """Predict ratings for titles ``username`` has not rated in ``validate_masked``.

    Reads the notebook globals ``df`` (training ratings, NaN = unrated),
    ``most_similar_users`` (one column of neighbour labels per user, most
    similar first) and ``validate_masked``.

    Parameters
    ----------
    username : label of a row in ``validate_masked`` / column in ``most_similar_users``.
    prior_weight : weight given to the global per-title mean (the "average" row).
    incremental_base : base of the geometric neighbour weights; the most
        similar neighbour gets the largest exponent.
    num_sim_users : maximum number of neighbours used.

    Returns
    -------
    pandas.Series of predicted ratings indexed by title, sorted descending.
    """
    sim_users_data = df.loc[most_similar_users[username].values]

    # Drop titles none of the neighbours rated.
    sim_users_data = sim_users_data.loc[:, sim_users_data.notnull().any(axis=0)]

    watch_data = validate_masked.loc[username]
    user_watched = watch_data.loc[watch_data.notnull()].index

    # Boolean column mask rather than a set indexer: pandas >= 2.0 rejects sets
    # as indexers, and this also keeps the column order deterministic.
    unseen = ~sim_users_data.columns.isin(user_watched)
    sim_users_data = sim_users_data.loc[:, unseen].head(num_sim_users)

    # The global mean rating of each candidate title acts as a prior row.
    priors = df[sim_users_data.columns].mean()
    sim_users_data = pd.concat([priors.to_frame('average').T, sim_users_data], axis=0)

    n_neighbours = sim_users_data.shape[0] - 1
    user_weights = [prior_weight] + [incremental_base ** rank for rank in range(n_neighbours, 0, -1)]

    # Weighted mean per title over the rows that actually hold a rating.
    weighted_sum = sim_users_data.mul(user_weights, axis=0).sum()
    anime_weights = sim_users_data.notnull().astype('int').mul(user_weights, axis=0).sum()

    pred_data = (weighted_sum / anime_weights).sort_values(ascending=False)

    return pred_data

# Compute recommendations for every validation user in a 14-process pool.
# imap yields results in input order, so the rows line up with validate_masked.index.
with Pool(14) as p:
    rec_stream = p.imap(get_recommendations, validate_masked.index)
    all_recs = list(tqdm(rec_stream, total=validate_masked.shape[0]))

validate_pred = pd.DataFrame(all_recs, index=validate_masked.index)

validate_pred.to_pickle('../data/validation_recs.pkl')

In [None]:
# Reload the predictions saved by the previous cell.
validate_pred = pd.read_pickle('../data/validation_recs.pkl')

In [None]:
# Per-user RMSE between predictions and held-out ratings.
# mean() skips NaN cells, so only titles present in both frames contribute.
squared_error = validate_pred.sub(validate_actual).pow(2)
rmse = np.sqrt(squared_error.mean(axis=1))

In [None]:
# Average of the per-user RMSE values.
rmse.mean()

### Model

In [None]:
class User:
    """One validation user, with helpers to find neighbours, predict ratings and score them.

    All methods read the notebook-level ``df`` (training ratings, users x titles,
    NaN = unrated).

    Attributes
    ----------
    username : name of ``watch_data``.
    watch_data : Series of the ratings visible to the model (NaN = hidden/unrated).
    true_data : Series holding the held-out ratings used for scoring.
    similar_users : cached index of neighbour labels, filled on first use.
    pred_data : cached Series of predicted ratings, filled by ``get_recommendations``.
    """

    def __init__(self, watch_data, true_data):
        self.username = watch_data.name
        self.watch_data = watch_data
        self.similar_users = None
        self.true_data = true_data
        # Initialised here so get_rmse can tell whether predictions exist yet.
        self.pred_data = None

    def get_similar_users(self, min_common=10, sortby='cosine_sim', asc=False, num=50):
        """Return the labels of the ``num`` training users most similar to this user.

        Similarity is the cosine similarity between zero-filled rating vectors.
        Only cosine similarity is implemented; ``sortby`` is used solely as the
        name of the score column. Users sharing fewer than ``min_common`` rated
        titles with this user are discarded.
        """
        # Never recommend from the user's own row if it happens to be in df.
        candidates = df.drop(self.username, errors='ignore')
        watched = self.watch_data.reindex(candidates.columns)

        # Number of titles rated by both this user and each candidate.
        rated_by_user = watched.notnull().to_numpy()
        num_common = candidates.notnull().loc[:, rated_by_user].sum(axis=1)

        sparse = sp.sparse.csr_matrix(candidates.fillna(0).values, dtype=np.float32)
        target = watched.fillna(0).to_numpy(dtype=np.float32).reshape(1, -1)
        res = pd.Series(cosine_similarity(sparse, target).ravel(), index=candidates.index)

        data = pd.DataFrame({sortby: res, 'num_common': num_common})
        # Plain boolean filter: no string-built query, so any username is safe.
        data = data.loc[data['num_common'] >= min_common]
        data = data.sort_values(sortby, ascending=asc).head(num)

        return data.index

    def get_recommendations(self, num=10, prior_weight=1.0, incremental_base=1.01):
        """Predict ratings for titles this user has not rated.

        Uses the first ``num`` similar users plus a prior row holding each
        title's mean rating over ``df``. The prior gets ``prior_weight``;
        neighbours get ``incremental_base ** rank`` with the most similar
        neighbour receiving the largest exponent. The result is stored in
        ``self.pred_data`` and returned sorted descending.
        """
        if self.similar_users is None:
            self.similar_users = self.get_similar_users()

        sim_users_data = df.loc[self.similar_users]
        sim_users_data = sim_users_data.loc[:, sim_users_data.notnull().any(axis=0)]
        user_watched = self.watch_data.loc[self.watch_data.notnull()].index

        # Boolean column mask instead of a set indexer (rejected by pandas >= 2.0).
        unseen = ~sim_users_data.columns.isin(user_watched)
        sim_users_data = sim_users_data.loc[:, unseen].head(num)

        priors = df[sim_users_data.columns].mean()
        sim_users_data = pd.concat([priors.to_frame('average').T, sim_users_data], axis=0)

        n_neighbours = sim_users_data.shape[0] - 1
        user_weights = [prior_weight] + [incremental_base ** rank for rank in range(n_neighbours, 0, -1)]
        weighted_sum = sim_users_data.mul(user_weights, axis=0).sum()
        anime_weights = sim_users_data.notnull().astype('int').mul(user_weights, axis=0).sum()
        self.pred_data = (weighted_sum / anime_weights).sort_values(ascending=False)

        return self.pred_data

    def get_rmse(self):
        """RMSE between ``pred_data`` and ``true_data`` over the titles present in both.

        Computes the predictions first if they have not been generated yet.
        Returns NaN when there is no overlap.
        """
        if self.pred_data is None:
            self.get_recommendations()

        true_data = self.true_data[self.true_data.notnull()]
        # Index.intersection gives a list-like indexer; sets are rejected by pandas >= 2.0.
        common = true_data.index.intersection(self.pred_data.index)
        errors = true_data.loc[common] - self.pred_data.loc[common]
        return np.sqrt(np.mean(errors ** 2))

In [None]:
def evaluate_model(idx):
    """Build a ``User`` for validation row ``idx`` and return that user's RMSE.

    The visible ratings come from ``validate_masked`` and the held-out
    ratings from ``validate_actual``. The former names ``validate`` and
    ``validate_true`` are not defined anywhere in this notebook.
    """
    user = User(watch_data=validate_masked.iloc[idx], true_data=validate_actual.iloc[idx])
    user.get_recommendations()
    return user.get_rmse()

In [None]:
# Score the first 16 validation users, one per worker process.
with Pool(16) as p:
    rmses = pd.Series(
        data=p.map(evaluate_model, range(16)),
        index=range(16),
    )

In [None]:
# Mean RMSE over the users evaluated in the previous cell.
rmses.mean()

### Hidden

In [None]:
def get_similar_users(username=None, userdata=None, df=df, min_common=10, sortby='cosine_sim', asc=False, num_sim_users=50):
    """Rank the users in ``df`` by similarity to one target user.

    Parameters
    ----------
    username : label of the target user; defaults to ``userdata.name``.
    userdata : Series of the target's ratings; defaults to ``df.loc[username]``.
    df : ratings frame (users x titles, NaN = unrated). The default is bound
        to the global ``df`` as it was when this cell was run.
    min_common : minimum number of titles rated by both users.
    sortby : ``'corr'``, ``'dist'``, ``'cosine_sim'`` or ``'num_common'``.
    asc : sort direction for the ``sortby`` column.
    num_sim_users : maximum number of rows returned.

    Returns
    -------
    DataFrame indexed by user with the ``sortby`` score and ``num_common``.

    Raises
    ------
    ValueError : if ``sortby`` is not one of the supported values.
    """
    username = userdata.name if username is None else username
    userdata = df.loc[username] if userdata is None else userdata

    # Titles rated by both the target and each other user.
    common_booled = np.bitwise_and(df.drop(username, axis=0, errors='ignore').notnull(), userdata.notnull())
    num_common = common_booled.sum(axis=1)

    if sortby == 'corr':
        tmp = df.T
        res = tmp.corrwith(userdata)

    elif sortby == 'dist':
        tmp = df.fillna(0)

        # Compare against the supplied ratings rather than tmp.loc[username],
        # which fails for users that are not rows of df.
        dist = np.sum(tmp.subtract(userdata.fillna(0), axis=1) ** 2, axis=1)
        dist = dist / num_common
        res = dist.loc[dist != 0]

    elif sortby == 'cosine_sim':
        tmp = df.fillna(0)
        sparse = sp.sparse.csr_matrix(tmp.values, dtype=np.float32)
        res = cosine_similarity(sparse, np.array(userdata.fillna(0)).reshape(1, -1))

        res = pd.Series(res.flatten(), index=df.index).drop(username, errors='ignore').sort_values(ascending=False)

    elif sortby == 'num_common':
        # The score column is the overlap count itself.
        res = num_common

    else:
        raise ValueError(f"unsupported sortby: {sortby!r}")

    data = pd.DataFrame({sortby: res, 'num_common': num_common})
    # Boolean indexing instead of an f-string query: works for usernames
    # containing quotes and does not depend on the index being named.
    keep = (data.index != username) & (data['num_common'] >= min_common)
    data = data.loc[keep]
    data = data.sort_values(sortby, ascending=asc).head(num_sim_users)

    return data

In [None]:
def get_recommendations(username=None, userdata=None, df=df, min_common=10, sortby='cosine_sim', asc=False, num_sim_users=50,
                        prior_weight=1.5, incremental_base=1.01):
    """Predict ratings for the titles a user has not rated yet.

    Running this cell rebinds the name used by the earlier single-argument
    ``get_recommendations(username)`` cell.

    Parameters
    ----------
    username, userdata, df, min_common, sortby, asc, num_sim_users :
        forwarded to ``get_similar_users`` to pick the neighbours.
    prior_weight : weight of the per-title mean over ``df`` (the "average" row).
    incremental_base : base of the geometric neighbour weights; the first
        neighbour returned by ``get_similar_users`` gets the largest exponent.

    Returns
    -------
    Series of predicted ratings indexed by title, sorted descending.
    """
    username = userdata.name if username is None else username
    userdata = df.loc[username] if userdata is None else userdata

    sim_users = get_similar_users(username=username, userdata=userdata, df=df, min_common=min_common,
                                  sortby=sortby, asc=asc, num_sim_users=num_sim_users)
    sim_users_data = df.loc[sim_users.index]
    sim_users_data = sim_users_data.loc[:, sim_users_data.notnull().any(axis=0)]
    user_watched = userdata.loc[userdata.notnull()].index

    # Boolean column mask instead of a set indexer (rejected by pandas >= 2.0).
    unseen = ~sim_users_data.columns.isin(user_watched)
    sim_users_data = sim_users_data.loc[:, unseen]

    priors = df[sim_users_data.columns].mean()
    sim_users_data = pd.concat([priors.to_frame('average').T, sim_users_data], axis=0)

    n_neighbours = sim_users_data.shape[0] - 1
    user_weights = [prior_weight] + [incremental_base ** rank for rank in range(n_neighbours, 0, -1)]
    # Weighted mean per title over the rows that actually hold a rating.
    weighted_sum = sim_users_data.mul(user_weights, axis=0).sum()
    anime_weights = sim_users_data.notnull().astype('int').mul(user_weights, axis=0).sum()
    recs = (weighted_sum / anime_weights).sort_values(ascending=False)

    return recs

In [None]:
def get_rmse(pred_data, true_data):
    """Root-mean-square error over the labels present in both Series.

    Parameters
    ----------
    pred_data : Series of predicted ratings.
    true_data : Series of true ratings; NaN entries are ignored.

    Returns
    -------
    float RMSE, or NaN when the two Series share no rated label.
    """
    true_data = true_data[true_data.notnull()]
    # Index.intersection yields a list-like indexer; a set is rejected by pandas >= 2.0.
    common = true_data.index.intersection(pred_data.index)
    errors = true_data.loc[common] - pred_data.loc[common]
    return np.sqrt(np.mean(errors ** 2))

In [None]:
# Recommendations for the first validation user, from their visible (masked) ratings.
# `validate` is not defined in this notebook; validate_masked is the visible validation frame.
pred_userdata = get_recommendations(userdata=validate_masked.iloc[0])

In [None]:
# Score against the held-out ratings. `validate_true` is not defined in this
# notebook; validate_actual holds the held-out cells.
get_rmse(pred_userdata, validate_actual.iloc[0])

In [None]:
# Sum the RMSE of the first 10 validation users, using correlation-ranked neighbours.
# `validate` / `validate_true` are not defined in this notebook, so the visible and
# held-out validation frames are used instead.
# NOTE(review): a NaN RMSE is added as 0 and `rmse` is rebound to a scalar here.
total_rmse = 0
for idx in tqdm(range(10)):
    userdata = validate_masked.iloc[idx]
    pred_userdata = get_recommendations(userdata=userdata, num_sim_users=50, sortby='corr')
    true_userdata = validate_actual.iloc[idx]
    rmse = get_rmse(pred_userdata, true_userdata)
    total_rmse += rmse if not(pd.isna(rmse)) else 0

Mean RMSE using the 50 most similar users:

In [None]:
# Divide by the 10 users iterated in the loop above.
# NOTE(review): users whose RMSE was NaN were added as 0 there, so they still count in this denominator.
total_rmse / 10

In [None]:
# Spot-check one validation user with a small neighbourhood.
# The original call targeted `get_recommendation` (undefined) with `num_recs` /
# `num_users`, which `get_recommendations` does not accept; `num_sim_users` is the
# matching parameter, and the top-100 cut is applied to the returned Series.
# NOTE(review): confirm that limiting to 100 predictions before scoring is intended.
idx = 3
userdata = validate_masked.iloc[idx]
pred_userdata = get_recommendations(userdata=userdata, num_sim_users=5).head(100)
true_userdata = validate_actual.iloc[idx]
get_rmse(pred_userdata, true_userdata)