In [None]:
import numpy as np
import random


# One shared seed for every random number generator used in this notebook.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# Data

## Data Info

In [None]:
import pandas as pd


# Load the two source tables from the local data/ directory.
anime = pd.read_csv('data/anime.csv')
rating = pd.read_csv('data/rating.csv')

In [None]:
# Display the anime table as loaded.
anime

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [None]:
# Display the ratings table as loaded.
rating

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1
...,...,...,...
7813732,73515,16512,7
7813733,73515,17187,9
7813734,73515,22145,10
7813735,73516,790,9


In [None]:
# NaN cells per column of the anime table.
missing = anime.isna().sum()
# The share is taken over the anime table's own row count; dividing by
# len(rating) (a different and far larger table) understates the percentages.
mis_per = 100 * missing / len(anime)
mis_table = pd.DataFrame({"Miss count": missing, 
                          "Miss percent": mis_per}).sort_values(by = "Miss percent", ascending=False)
# Show only the columns that actually have missing values.
mis_table[mis_table['Miss count'] != 0]

Unnamed: 0,Miss count,Miss percent
rating,230,0.002944
genre,62,0.000793
type,25,0.00032


In [None]:
# Cells of the anime table holding the placeholder string 'Unknown'.
missing = anime.isin(['Unknown']).sum()
# The share is taken over the anime table's own row count; dividing by
# len(rating) (a different and far larger table) understates the percentages.
mis_per = 100 * missing / len(anime)
mis_table = pd.DataFrame({"Miss count": missing, 
                          "Miss percent": mis_per}).sort_values(by = "Miss percent", ascending=False)
# Show only the columns where the placeholder occurs.
mis_table[mis_table['Miss count'] != 0]

Unnamed: 0,Miss count,Miss percent
episodes,187,0.002951


In [None]:
# Per-column NaN counts in the ratings table and their share of all rows.
missing = rating.isna().sum()
mis_per = 100 * missing / len(rating)
mis_table = pd.DataFrame({"Miss count": missing, "Miss percent": mis_per})
mis_table = mis_table.sort_values(by="Miss percent", ascending=False)
# Keep only the columns that have at least one missing value.
mis_table.loc[mis_table['Miss count'] != 0]

Unnamed: 0,Miss count,Miss percent


In [None]:
# Per-column counts of the value -1 in the ratings table and their share of all rows.
missing = rating.isin([-1]).sum()
mis_per = 100 * missing / len(rating)
mis_table = pd.DataFrame({"Miss count": missing, "Miss percent": mis_per})
mis_table = mis_table.sort_values(by="Miss percent", ascending=False)
# Keep only the columns in which -1 occurs.
mis_table.loc[mis_table['Miss count'] != 0]

Unnamed: 0,Miss count,Miss percent
rating,1476496,18.896157


In [None]:
# Shapes of the distinct user ids and distinct anime ids found in the ratings.
tuple(rating[column].unique().shape for column in ('user_id', 'anime_id'))

((73515,), (11200,))

## Data Cleaning

In [None]:
# Drop rows with rating -1, then rows with NaN and exact duplicates.
rating = rating.loc[rating['rating'].ne(-1)].dropna().drop_duplicates()
# Same for the catalogue: remove titles whose episode count is 'Unknown'.
anime = anime.loc[anime['episodes'].ne('Unknown')].dropna().drop_duplicates()

# Recommendation System I

## User DATA

In [None]:
# Collect every distinct genre label; each title stores its genres as one
# ', '-separated string.
GENRES = set()
for genre in anime['genre']:
    GENRES.update(genre.split(', '))

In [None]:
# Number of distinct users that remain in the ratings table.
n_users = len(rating['user_id'].unique())

In [None]:
# Column store for the user profile: the ids first, then one zero-filled
# accumulator array per genre.
users = {'user_id': rating['user_id'].unique()}
users.update({g: np.zeros(n_users) for g in GENRES})

In [None]:
from IPython.display import clear_output


def progress(i, n):
    """Replace the cell output with a one-line 'Progress: i/n' counter."""
    message = 'Progress: %d/%d' % (i, n)
    # wait=True defers clearing until the new output is ready to be shown.
    clear_output(wait=True)
    print(message)

    
# Sum, for every user, the ratings they gave per genre.
#
# Each rating is paired with the genre string of the anime it belongs to by
# looking up its anime_id. Zipping the ratings against a genre list filtered
# with isin() pairs them in catalogue order instead, and silently truncates
# when a rated title is missing from `anime`.
genre_by_anime = anime.drop_duplicates('anime_id').set_index('anime_id')['genre']
# Row position of every user inside the accumulator arrays.
user_row = {user: i for i, user in enumerate(users['user_id'])}
n_total = len(users['user_id'])
# A single groupby pass replaces one full scan of `rating` per user.
for done, (user, rate) in enumerate(rating.groupby('user_id', sort=False), start=1):
    progress(done, n_total)
    i = user_row[user]
    genre = rate['anime_id'].map(genre_by_anime)
    for r, gre in zip(rate['rating'], genre):
        # The lookup yields NaN for titles that are not in `anime`; skip them.
        if not isinstance(gre, str):
            continue
        for g in gre.split(', '):
            users[g][i] += r

Progress: 69599/69600


In [None]:
# Turn the dict of per-genre arrays into a table with one row per user.
users = pd.DataFrame(users)
users.head()

Unnamed: 0,user_id,Martial Arts,Mecha,Yuri,Mystery,Magic,Demons,Seinen,Shounen,Drama,...,School,Sci-Fi,Cars,Parody,Thriller,Psychological,Vampire,Romance,Sports,Josei
0,1,0.0,0.0,0.0,0.0,0.0,20.0,0.0,0.0,0.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0,0.0,0.0
1,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,...,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0
2,3,24.0,8.0,0.0,77.0,101.0,28.0,25.0,303.0,237.0,...,95.0,63.0,0.0,0.0,33.0,79.0,8.0,72.0,72.0,0.0
3,5,121.0,32.0,0.0,98.0,142.0,97.0,234.0,755.0,375.0,...,608.0,326.0,13.0,83.0,33.0,72.0,36.0,671.0,190.0,8.0
4,7,46.0,173.0,0.0,200.0,314.0,185.0,393.0,605.0,371.0,...,844.0,569.0,0.0,114.0,86.0,190.0,65.0,946.0,87.0,0.0


In [None]:
# Keep user_id first and order the genre columns alphabetically.
users = users[['user_id', *sorted(col for col in users.columns if col != 'user_id')]]

In [None]:
# Save the per-user genre profile without the index column.
users.to_csv('users.csv', index=False)

## Clusters

In [None]:
from sklearn.cluster import KMeans
import pickle


# Cluster users on their per-genre rating sums. random_state is passed
# explicitly so the fit does not depend on how much of the global NumPy
# random stream earlier cells consumed.
kmeans = KMeans(n_clusters=256, random_state=SEED)
# 'cluster' is excluded as well so that re-running this cell after the labels
# were attached to `users` does not cluster on the labels themselves.
kmeans.fit(users.drop(columns=['user_id', 'cluster'], errors='ignore'))
# The context manager closes the file handle once the model is written.
with open('kmeans.model', 'wb') as model_file:
    pickle.dump(kmeans, model_file)

KMeans(n_clusters=256)

In [None]:
# Largest, mean and smallest number of users per k-means label.
cluster_sizes = np.unique(kmeans.labels_, return_counts=True)[1]
cluster_sizes.max(), cluster_sizes.mean(), cluster_sizes.min()

(10600, 271.875, 1)

In [None]:
# Attach each user's k-means label and place it right after the id column.
users['cluster'] = kmeans.labels_
keys_g = [col for col in users.columns if col not in ('user_id', 'cluster')]
users = users[['user_id', 'cluster', *keys_g]]
users.head()

Unnamed: 0,user_id,cluster,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,...,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri
0,1,8,30.0,10.0,0.0,20.0,0.0,20.0,0.0,30.0,...,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0
1,2,8,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,236,472.0,366.0,0.0,292.0,0.0,28.0,237.0,42.0,...,0.0,36.0,0.0,72.0,65.0,184.0,33.0,8.0,0.0,0.0
3,5,217,666.0,410.0,13.0,1398.0,6.0,97.0,375.0,428.0,...,0.0,317.0,16.0,190.0,156.0,446.0,33.0,36.0,0.0,0.0
4,7,10,933.0,453.0,0.0,1449.0,26.0,185.0,371.0,546.0,...,0.0,375.0,0.0,87.0,253.0,570.0,86.0,65.0,0.0,0.0


In [None]:
# Label of the most populated cluster. argmax() over the counts is only a
# position within the sorted array of labels that actually occur, so it is
# mapped back through that array instead of being read as a label itself.
cluster_labels, cluster_counts = np.unique(users['cluster'], return_counts=True)
cluster_labels[cluster_counts.argmax()]

8

In [None]:
# Column means of the most populated cluster. The label is looked up from the
# data rather than hard-coded, because the number a cluster receives may
# change whenever the model is refitted.
_labels, _counts = np.unique(users['cluster'], return_counts=True)
users[users['cluster'] == _labels[_counts.argmax()]].mean()

user_id          36967.602453
cluster              8.000000
Action              10.422075
Adventure            5.085943
Cars                 0.036226
Comedy              11.943679
Dementia             0.344717
Demons               1.277642
Drama                7.873302
Ecchi                1.833396
Fantasy              6.038396
Game                 0.965755
Harem                1.849528
Hentai               0.728679
Historical           1.181981
Horror               1.802264
Josei                0.269057
Kids                 0.184434
Magic                2.476132
Martial Arts         1.111604
Mecha                1.559057
Military             1.441226
Music                0.710660
Mystery              3.730566
Parody               0.862830
Police               1.225472
Psychological        3.024906
Romance              8.391792
Samurai              0.389340
School               6.305943
Sci-Fi               4.097547
Seinen               1.817075
Shoujo               3.250849
Shoujo Ai 

In [None]:
# Save the user profiles together with their cluster label, without the index column.
users.to_csv('users_cl.csv', index=False)

## Top Anime per Cluster

In [None]:
# Read the cluster count from the fitted model so it cannot drift from the
# value KMeans was constructed with.
n_cluster = kmeans.n_clusters
# One (initially empty) ranking list per cluster label.
anime_top = {cluster: [] for cluster in range(n_cluster)}

In [None]:
# For every cluster, rank the anime its members rated by mean rating, best first.
for cluster in anime_top:
    in_cluster = users['cluster'] == cluster
    usr = np.array(users[in_cluster]['user_id'])
    # Ratings given by this cluster's members, reduced to the two needed columns.
    cluster_ratings = rating[rating['user_id'].isin(usr)][['anime_id', 'rating']]
    rtg = cluster_ratings.groupby('anime_id').mean().sort_values('rating', ascending=False)
    anime_top[cluster] = rtg.index.tolist()

In [None]:
import json


# Write the per-cluster rankings to disk as JSON.
with open('anime_top_for_cluster.json', 'w') as file:
    json.dump(anime_top, file)

# Recommendation System II

## Data

### Anime DATA

In [None]:
# Feature table for the anime side: everything except the title.
data_anime = anime.drop('name', axis=1)

In [None]:
# Length of the one-hot buffers built from this value. They are filled by row
# position and assigned back as whole columns, so they must match the number
# of rows; the count of distinct ids would be too short if an id ever repeated.
n_anime = len(data_anime)

In [None]:
# Zero-initialised integer indicator arrays: one per anime type and one per genre.
tpe = {t: np.zeros(n_anime, dtype=int) for t in set(data_anime['type'])}
gnr = {g: np.zeros(n_anime, dtype=int) for g in GENRES}

In [None]:
# Set the indicator to 1 at each title's row position: once for its type and
# once for every genre in its ', '-separated genre string.
for row, (genres, kind) in enumerate(zip(data_anime['genre'], data_anime['type'])):
    tpe[kind][row] = 1
    for label in genres.split(', '):
        gnr[label][row] = 1

In [None]:
# Remove the two raw text columns from the feature table.
data_anime = data_anime.drop(['genre', 'type'], axis='columns')

In [None]:
# Append the indicator columns: anime types first, then genres, each group
# in alphabetical order.
for key in sorted(tpe):
    # A label can be both a type and a genre ('Music' is). Such a type column
    # gets a '_type' suffix, otherwise the genre loop below would overwrite it.
    column = key + '_type' if key in gnr else key
    data_anime[column] = tpe[key]

for key in sorted(gnr):
    data_anime[key] = gnr[key]

In [None]:
# Preview the first rows of the encoded anime table.
data_anime.head()

Unnamed: 0,anime_id,episodes,rating,members,Movie,Music,ONA,OVA,Special,TV,...,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri
0,32281,1,9.37,200630,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,5114,64,9.26,793665,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,28977,51,9.25,114262,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,9253,24,9.17,673572,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
4,9969,51,9.16,151266,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Print column dtypes and non-null counts of the encoded anime table.
data_anime.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11830 entries, 0 to 12293
Data columns (total 52 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   anime_id       11830 non-null  int64  
 1   episodes       11830 non-null  object 
 2   rating         11830 non-null  float64
 3   members        11830 non-null  int64  
 4   Movie          11830 non-null  int32  
 5   Music          11830 non-null  int32  
 6   ONA            11830 non-null  int32  
 7   OVA            11830 non-null  int32  
 8   Special        11830 non-null  int32  
 9   TV             11830 non-null  int32  
 10  Action         11830 non-null  int32  
 11  Adventure      11830 non-null  int32  
 12  Cars           11830 non-null  int32  
 13  Comedy         11830 non-null  int32  
 14  Dementia       11830 non-null  int32  
 15  Demons         11830 non-null  int32  
 16  Drama          11830 non-null  int32  
 17  Ecchi          11830 non-null  int32  
 18  Fantas

In [None]:
# Convert the episode counts to integers.
data_anime['episodes'] = data_anime['episodes'].values.astype(int)

In [None]:
# Save the encoded anime table without the index column.
data_anime.to_csv('anime_cl.csv', index=False)

### Final Dataset

In [None]:
# One row per rating, with the user's profile columns and then the anime's
# feature columns joined on.
data = (
    rating
    .merge(users, how='left', on='user_id')
    .merge(data_anime, how='left', on='anime_id')
)

In [None]:
# Drop the two id columns and rename 'rating_x' to 'target'.
data = data.drop(columns=['user_id', 'anime_id']).rename(columns={'rating_x': 'target'})
data.head()

Unnamed: 0,target,cluster,Action_x,Adventure_x,Cars_x,Comedy_x,Dementia_x,Demons_x,Drama_x,Ecchi_x,...,Shounen Ai_y,Slice of Life_y,Space_y,Sports_y,Super Power_y,Supernatural_y,Thriller_y,Vampire_y,Yaoi_y,Yuri_y
0,10,8,30.0,10.0,0.0,20.0,0.0,20.0,0.0,30.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,10,8,30.0,10.0,0.0,20.0,0.0,20.0,0.0,30.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10,8,30.0,10.0,0.0,20.0,0.0,20.0,0.0,30.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10,8,30.0,10.0,0.0,20.0,0.0,20.0,0.0,30.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10,8,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


## Dataset

In [None]:
# Draw a seeded random subset of two million rows.
data_sample = data.sample(n=int(2e6), random_state=SEED)

In [None]:
from sklearn.model_selection import train_test_split


# Hold out 10% of the sample for validation. random_state pins the shuffle to
# SEED, matching the seeded sampling and the seeded models.
X_train, X_valid, y_train, y_valid = train_test_split(
    np.array(data_sample.drop(columns='target')),
    np.array(data_sample['target']),
    test_size=0.1,
    shuffle=True,
    random_state=SEED,
)

## Train

### Regression

In [None]:
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import r2_score


def score(model, X_valid, y_valid):
    """Print the R^2 of the model's predictions on the validation set."""
    r2 = r2_score(y_valid, model.predict(X_valid))
    print('r2_score:', r2)

    
#rm_1 = LGBMRegressor(random_state=SEED)
#rm_2 = XGBRegressor(random_state=SEED)
#rm_3 = CatBoostRegressor(random_seed=SEED, task_type="GPU")
#reg_model = VotingRegressor([('lgb', rm_1), ('xgb', rm_2), ('cbr', rm_3)])
# Fit a single seeded CatBoost regressor (GPU task type) and report its
# validation score.
reg_model = CatBoostRegressor(
    iterations=2048,
    random_seed=SEED,
    task_type="GPU",
)
reg_model.fit(X_train, y_train)
score(reg_model, X_valid, y_valid)

    r2_score: 0.3795679678525802

In [None]:
# Save the fitted regressor in CatBoost's "cbm" format.
reg_model.save_model('cat_reg.model', format="cbm")

### Classification

In [None]:
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score, precision_score


def score(model, X_valid, y_valid):
    """Print the per-class report and the macro-averaged recall on the validation set."""
    predicted = model.predict(X_valid)
    print(classification_report(y_valid, predicted))
    # zero_division=0 is passed so an undefined recall is reported as 0.
    macro_recall = recall_score(y_valid, predicted, average="macro", zero_division=0)
    print('recall: ', macro_recall)

    
# Fit a seeded CatBoost multi-class classifier (GPU task type) on the same
# split, using the rating values as class labels, and report its validation scores.
clf_model = CatBoostClassifier(
    iterations=2048,
    loss_function='MultiClass',
    task_type="GPU",
    random_state=SEED,
)
clf_model.fit(X_train, y_train)
score(clf_model, X_valid, y_valid)

              precision    recall  f1-score   support

           1       0.39      0.17      0.23       503
           2       0.20      0.04      0.06       769
           3       0.24      0.05      0.09      1296
           4       0.28      0.04      0.06      3407
           5       0.33      0.09      0.14      8955
           6       0.33      0.13      0.19     20094
           7       0.35      0.46      0.40     43426
           8       0.37      0.52      0.43     51958
           9       0.39      0.30      0.34     39371
          10       0.54      0.51      0.52     30221

    accuracy                           0.39    200000
    macro avg       0.34      0.23      0.25    200000
    weighted avg       0.38      0.39      0.37    200000

    recall:  0.22889428506982382

In [None]:
# Save the fitted classifier in CatBoost's "cbm" format.
clf_model.save_model('cat_cls.model', format="cbm")