In [1]:
import sys
sys.path.append('../../src')

import pandas as pd
import numpy as np

from run import run
from utils import save_all_columns, save_columns

register model: lightgbm
register model: catboost
register model: catboost_class


In [2]:
train = pd.read_csv('../../data/train.csv')
test = pd.read_csv('../../data/test.csv')
anime = pd.read_csv('../../data/anime.csv')
profile = pd.read_csv('../../data/profile.csv')

In [3]:
# seen のみ user を target encoding, frequency encoding 

In [4]:
test_seen = test[test["user"].isin(train["user"])].reset_index(drop=True)
test_unseen = test[~test["user"].isin(train["user"])].reset_index(drop=True)

In [17]:
test_seen

Unnamed: 0,user,anime_id,ID
0,df5ce90323,2191,df5ce90323/2191
1,fd47680337,12355,fd47680337/12355
2,8bfc6cea02,3287,8bfc6cea02/3287
3,5ba4de2706,5114,5ba4de2706/5114
4,6f78771003,33966,6f78771003/33966
...,...,...,...
22124,513878a55e,2501,513878a55e/2501
22125,f60d21c6ff,34914,f60d21c6ff/34914
22126,76b6afecea,1606,76b6afecea/1606
22127,ae8792e8c9,14467,ae8792e8c9/14467


In [5]:
save_all_columns(test_seen.rename(columns=(lambda x: "test_seen_" + x)), "test")
save_all_columns(test_unseen.rename(columns=(lambda x: "test_unseen_" + x)), "test")

save columns: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 266.95it/s, save test_seen_ID]
save columns: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 713.11it/s, save test_unseen_ID]


In [6]:
from sklearn.model_selection import KFold

In [7]:
kf = KFold(n_splits=7, shuffle=True, random_state=34)

In [8]:
train_result_df = train.copy()
testseen_result_df = test_seen.copy()

train_mean = train.groupby('user')['score'].mean()
testseen_result_df['user_target_encoding'] = test_seen['user'].map(train_mean)

tmp = np.zeros(train.shape[0])
for (train_idx, val_idx) in kf.split(train):
    target = train.iloc[train_idx]
    target_mean = target.groupby('user')['score'].mean()
    tmp[val_idx] = train['user'].iloc[val_idx].map(target_mean)

train_result_df['user_target_encoding'] = tmp

In [9]:
save_columns(train_result_df['user_target_encoding'], 'train', col_rename='user_target_encoding')
save_columns(testseen_result_df['user_target_encoding'], 'test', col_rename='test_seen_user_target_encoding')

In [10]:
train_result_df['user_count_encoding'] = train['user'].map(train['user'].value_counts())
testseen_result_df['user_count_encoding'] = test['user'].map(train['user'].value_counts())

In [11]:
save_columns(train_result_df['user_count_encoding'], 'train', col_rename='user_count_encoding')
save_columns(testseen_result_df['user_count_encoding'], 'test', col_rename='test_seen_user_count_encoding')

In [12]:
train_result_df['text_len'] = train['text'].apply(len)

In [13]:
train_result_df['text_len_mean'] = train_result_df['user'].map(train_result_df.groupby('user')['text_len'].mean())
train_result_df['text_len_max'] = train_result_df['user'].map(train_result_df.groupby('user')['text_len'].max())
train_result_df['text_len_min'] = train_result_df['user'].map(train_result_df.groupby('user')['text_len'].min())
train_result_df['text_len_std'] = train_result_df['user'].map(train_result_df.groupby('user')['text_len'].std())


testseen_result_df['test_seen_text_len_mean'] = testseen_result_df['user'].map(train_result_df.groupby('user')['text_len'].mean())
testseen_result_df['test_seen_text_len_max'] = testseen_result_df['user'].map(train_result_df.groupby('user')['text_len'].max())
testseen_result_df['test_seen_text_len_min'] = testseen_result_df['user'].map(train_result_df.groupby('user')['text_len'].min())
testseen_result_df['test_seen_text_len_std'] = testseen_result_df['user'].map(train_result_df.groupby('user')['text_len'].std())


In [14]:
save_columns = ['text_len_mean', 'text_len_max', 'text_len_min', 'text_len_std']
save_all_columns(train_result_df[save_columns], 'train')

test_save_columns = ['test_seen_text_len_mean', 'test_seen_text_len_max', 'test_seen_text_len_min', 'test_seen_text_len_std']
save_all_columns(testseen_result_df[test_save_columns], 'test')

save columns: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 295.97it/s, save text_len_std]
save columns: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 333.72it/s, save test_seen_text_len_std]


In [15]:
testseen_result_df

Unnamed: 0,user,anime_id,ID,user_target_encoding,user_count_encoding,test_seen_text_len_mean,test_seen_text_len_max,test_seen_text_len_min,test_seen_text_len_std
0,df5ce90323,2191,df5ce90323/2191,8.250000,32.0,2615.343750,4645,1042,804.416062
1,fd47680337,12355,fd47680337/12355,6.800000,5.0,7529.600000,11034,3124,3434.728344
2,8bfc6cea02,3287,8bfc6cea02/3287,4.491803,61.0,3867.098361,9091,1527,1871.456925
3,5ba4de2706,5114,5ba4de2706/5114,8.000000,5.0,4035.600000,6489,2441,1613.404568
4,6f78771003,33966,6f78771003/33966,6.084906,106.0,5276.650943,16812,1445,2459.406115
...,...,...,...,...,...,...,...,...,...
22124,513878a55e,2501,513878a55e/2501,5.000000,,2482.000000,3636,1512,1073.888262
22125,f60d21c6ff,34914,f60d21c6ff/34914,7.000000,,2091.000000,2091,2091,
22126,76b6afecea,1606,76b6afecea/1606,10.000000,,904.000000,904,904,
22127,ae8792e8c9,14467,ae8792e8c9/14467,6.666667,1.0,1386.000000,1616,1230,203.361747
