### 사용한 TabNet 라이브러리는 [PyTorch-TabNet](https://github.com/dreamquark-ai/tabnet) 으로 scikit-learn 인터페이스를 따르고 있음.

# Import

In [1]:
#import data
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
import os

import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_row', 50)

SEED = 42
def seed_everything(seed: int = 42):
    """Seed the RNGs this notebook relies on.

    Exports ``seed`` as the ``PYTHONHASHSEED`` environment variable and
    seeds both Python's ``random`` module and NumPy's global generator.

    Args:
        seed: Integer seed applied to every generator (defaults to 42).
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    # Both seeders take the same integer, so apply them uniformly.
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)

seed_everything(SEED)

# Load data and split

In [100]:
# Directory prefix (with trailing slash) shared by every CSV read below.
path = 'data/'

# Load the train and test rating tables.
train = pd.read_csv(path + 'ksy_train_rating_10n.csv')
test = pd.read_csv(path + 'ksy_test_rating_10n.csv')

# Show (rows, columns) for each table, train first.
for frame in (train, test):
    print(frame.shape)

(306795, 10)
(76699, 10)


# Simple preprocessing

In [103]:
# Cast these columns to strings so the object-dtype check below picks them
# up as categorical features.
for frame in (train, test):
    for col in ('user_id', 'years', 'fix_age'):
        frame[col] = frame[col].astype('str')

# Every object-dtype column of train is treated as a categorical feature.
categorical_columns = [col for col in train.columns if train.dtypes[col] == 'object']
categorical_dims = {}

# Build each value -> integer code table over train and test combined, so
# values that appear only in test still receive a code.
_data = pd.concat([train, test])
for col in categorical_columns:
    code_of = {value: code for code, value in enumerate(_data[col].unique())}
    categorical_dims[col] = len(code_of)  # number of distinct values
    train[col] = train[col].map(code_of)
    test[col] = test[col].map(code_of)

# Define categorical features for categorical embeddings

In [104]:
train.nunique()

user_id                59803
isbn                  129777
rating                    10
book_author             2102
publisher               1217
language                  11
category_high            209
years                      5
fix_location_state       234
fix_age                    7
dtype: int64

In [105]:
# Column the model is trained to predict.
target = 'rating'

# One index per entry of categorical_columns: 0, 1, ..., len - 1.
cat_idxs = list(range(len(categorical_columns)))

# Cardinality of each categorical column, in the same order as cat_idxs.
cat_dims = [categorical_dims[name] for name in categorical_columns]

In [None]:
# Define the embedding sizes: an arbitrary choice here (default = 1).
# cat_emb_dim must be either a single int or a list shaped like
# cat_idxs / cat_dims (one entry per categorical feature).
# cat_emb_dim = [10, 10, 10, 10, 10, 10, 10, 10]

# Network parameters

In [106]:
from pytorch_tabnet.tab_model import TabNetRegressor
import torch

# TabNet regressor over the integer-encoded categorical features.
# The seed comes from the notebook-wide SEED constant so that changing SEED
# in one place also changes the model's seed.
clf = TabNetRegressor(
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    cat_emb_dim=10,
    optimizer_fn=torch.optim.Adam,  # (default=torch.optim.Adam)
    optimizer_params=dict(lr=2e-2),  # (default=dict(lr=2e-2))
    scheduler_fn=torch.optim.lr_scheduler.StepLR,  # (default=None)
    scheduler_params={"step_size": 10, "gamma": 0.95},
    mask_type='entmax',  # alternative: "sparsemax"
    # (default=3) Number of sequential steps; the author's note likens this
    # to the number of boosting stages and says 3-10 is the usual range.
    n_steps=5,
    device_name='cuda',
    seed=SEED,
)

# Training

In [107]:
from sklearn.model_selection import train_test_split


def _to_target_matrix(values, n_targets):
    """Stack a 1-D target vector into an (n_samples, n_targets) array."""
    return np.transpose(np.tile(values, (n_targets, 1)))


# Hold out 20% of train for validation. random_state is pinned to SEED so
# the split is the same on every re-run of this cell, instead of depending
# on whatever state the global NumPy RNG happens to be in.
X_train, X_valid, y_train, y_valid = train_test_split(
    train[categorical_columns],
    train[target],
    test_size=0.2,
    random_state=SEED,
)

# Number of target columns; the targets are reshaped to 2-D to match X.
# This value can be changed.
n_targets = 1

X_train = X_train.values
y_train = _to_target_matrix(y_train.values, n_targets)

X_valid = X_valid.values
y_valid = _to_target_matrix(y_valid.values, n_targets)

X_test = test[categorical_columns].values
y_test = _to_target_matrix(test[target].values, n_targets)

In [56]:
# Cap training at 2 epochs when the CI environment variable is set to a
# non-empty value; otherwise allow up to 1000 epochs.
max_epochs = 2 if os.getenv("CI", False) else 1000

In [108]:
from torch import nn

# Splits evaluated during training, paired with the names shown in the log.
eval_sets = [(X_train, y_train), (X_valid, y_valid)]
eval_names = ['train', 'valid']

clf.fit(
    X_train=X_train,
    y_train=y_train,
    eval_set=eval_sets,
    eval_name=eval_names,
    eval_metric=['rmse'],  # other options: 'rmsle', 'mae', 'mse'
    max_epochs=max_epochs,
    patience=10,  # author's note: run at least 10 epochs
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=True,  # (default=False)
)

# default loss_fn: mse

epoch 0  | loss: 7.2959  | train_rmse: 2.41906 | valid_rmse: 2.41509 |  0:00:42s
epoch 1  | loss: 5.83729 | train_rmse: 2.3667  | valid_rmse: 2.38357 |  0:01:17s
epoch 2  | loss: 5.41863 | train_rmse: 2.20293 | valid_rmse: 2.32913 |  0:01:58s
epoch 3  | loss: 4.6854  | train_rmse: 1.98883 | valid_rmse: 2.31756 |  0:02:39s
epoch 4  | loss: 3.92533 | train_rmse: 1.78254 | valid_rmse: 2.29335 |  0:03:19s
epoch 5  | loss: 3.2928  | train_rmse: 1.61805 | valid_rmse: 2.34874 |  0:03:59s
epoch 6  | loss: 2.8126  | train_rmse: 1.49865 | valid_rmse: 2.41215 |  0:04:41s
epoch 7  | loss: 2.42915 | train_rmse: 1.4091  | valid_rmse: 2.39286 |  0:05:22s


KeyboardInterrupt: 

In [None]:
print(f"BEST VALID SCORE FOR bookrec - best epoch {clf.best_epoch}: {clf.best_cost}")

# Test

In [None]:
preds = clf.predict(X_test)

In [None]:
preds

# Submit

In [None]:
submit = pd.read_csv(path+'sample_submission.csv')

In [None]:
submit['rating'] = preds

In [None]:
submit

In [None]:
import os
import time

# Timestamp for the file name, formatted as YYYYMMDD_HHMMSS. Explicit
# %H%M%S is used instead of the locale-dependent %X so the name has the
# same shape under every locale.
save_time = time.strftime('%Y%m%d_%H%M%S', time.localtime())

# Create the output directory if needed; to_csv does not create it.
os.makedirs('submit', exist_ok=True)

submit_path = 'submit/KCH_{}_TabNet_{}{}_{}epoch_{:.4f}.csv'.format(
    save_time, clf.mask_type, clf.n_steps, clf.best_epoch, clf.best_cost
)

# index=False must be an argument of to_csv. It was previously passed to
# str.format(), where it was ignored, so the index column was written out.
submit.to_csv(submit_path, index=False)

# Global explainability : feat importance summing to 1

In [None]:
clf.feature_importances_

# Local explainability and masks

In [None]:
explain_matrix, masks = clf.explain(X_test)

In [None]:
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# One panel per step. squeeze=False keeps axs as a 2-D array even when
# clf.n_steps == 1; plt.subplots would otherwise return a bare Axes that
# cannot be indexed.
fig, axs = plt.subplots(1, clf.n_steps, figsize=(20, 20), squeeze=False)

# axs[0] is the single row of panels; masks is keyed by step index.
for step, ax in enumerate(axs[0]):
    ax.imshow(masks[step][:50])  # first 50 rows of this step's mask
    ax.set_title(f"mask {step}")

In [None]:
explain_matrix.shape

In [None]:
masks.keys()

In [None]:
submit['rating'].hist()

# Model Save / Load

In [None]:
# Example code copied as-is; it will probably need some adjustment before use.
"""
# save tabnet model
saving_path_name = "./tabnet_model_test_1"
saved_filepath = clf.save_model(saving_path_name)

# define new model with basic parameters and load state dict weights
loaded_clf = TabNetRegressor()
loaded_clf.load_model(saved_filepath)
"""