In [2]:
import sys
import torch, gc
import pandas as pd
import numpy as np
from pytorch_tabnet.tab_model import TabNetRegressor
from pytorch_tabnet.pretraining import TabNetPretrainer
from pytorch_tabnet.metrics import Metric
from sklearn.model_selection import KFold

from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import optuna

In [3]:
class Preprocessed_data_loader:
    def __init__(self, path="",IS_CUSTOM=False,DROPS=[]):

        self.other_features = [
            "answerCode",
            "Timestamp",
        ]
        self.cat_features = [
            "userID",
            "assessmentItemID",
            "testId",
            "KnowledgeTag",
            "year",
            "month",
            "day",
            "hour",
            "minute",
            "second",
            "dayofweek",
            "first3",
            "mid3",
            "last3",
            "hour_answerCode_Level",
            
        ]
        self.cont_features = [
            "userID_answerCode_mean",
            "userID_answerCode_count",
            "userID_answerCode_sum",
            "userID_answerCode_var",
            "userID_answerCode_median",
            "testId_answerCode_mean",
            "testId_answerCode_count",
            "testId_answerCode_sum",
            "testId_answerCode_var",
            "testId_answerCode_median",
            "assessmentItemID_answerCode_mean",
            "assessmentItemID_answerCode_count",
            "assessmentItemID_answerCode_sum",
            "assessmentItemID_answerCode_var",
            "assessmentItemID_answerCode_median",
            "KnowledgeTag_answerCode_mean",
            "KnowledgeTag_answerCode_count",
            "KnowledgeTag_answerCode_sum",
            "KnowledgeTag_answerCode_var",
            "KnowledgeTag_answerCode_median",
            "dayofweek_answerCode_mean",
            "dayofweek_answerCode_count",
            "dayofweek_answerCode_sum",
            "dayofweek_answerCode_var",
            "dayofweek_answerCode_median",
            "userID_first3_answerCode_mean",
            "userID_first3_answerCode_count",
            "userID_first3_answerCode_sum",
            "userID_first3_answerCode_var",
            "userID_first3_answerCode_median",
            "hour_answerCode_mean",
            "hour_answerCode_count",
            "hour_answerCode_sum",
            "hour_answerCode_var",
            "hour_answerCode_median",
            "month_answerCode_mean",
            "month_answerCode_count",
            "month_answerCode_sum",
            "month_answerCode_var",
            "month_answerCode_median",
            "user_acc",
            "assessmentItemID_elo_pred",
            "testId_elo_pred",
            "KnowledgeTag_elo_pred",
            "feature_ensemble_elo_pred",
            "userID_elapsedTime_median",
            "KnowledgeTag_elapsedTime_median",
            "assessmentItemID_elapsedTime_median",
            "testId_elapsedTime_median",
            "userID_answerCode_elapsedTime_median",
            "KnowledgeTag_answerCode_elapsedTime_median",
            "assessmentItemID_answerCode_elapsedTime_median",
            "elapsedTime",
            "testId_answerCode_elapsedTime_median",
            "user_correct_answer",
            "user_total_answer",
        ]
        self.data_path = path
        train_name = "preprocessed_custom_train_data.csv" if IS_CUSTOM else "/preprocessed_train_data.csv"
        test_name = "preprocessed_custom_test_data.csv" if IS_CUSTOM else "/preprocessed_test_data.csv"
        self.train_df = pd.read_csv(path+train_name, index_col=0)
        self.test_df = pd.read_csv(path+test_name, index_col=0)
        self.train = self.train_df.drop(DROPS, axis=1)
        self.test = self.train_df.drop(DROPS, axis=1)

In [4]:
class Eval(Metric):
    def __init__(self):
        self._name = "roc_auc_score"
        self._maximize = True

    def __call__(self, y_true, y_score):
        auc = roc_auc_score(y_true, y_score)
        return auc

In [5]:
IS_CUSTOM = True
USE_VALID = True
show=True
use = [
    'KnowledgeTag',
    'KnowledgeTag_answerCode_median',
    'KnowledgeTag_answerCode_sum',
    'KnowledgeTag_answerCode_var',
    'assessmentItemID_answerCode_median',
    'assessmentItemID_elo_pred',
    'dayofweek_answerCode_count',
    'dayofweek_answerCode_var',
    'elapsedTime',
    'feature_ensemble_elo_pred',
    'hour',
    'hour_answerCode_count',
    'mid3',
    'testId_elapsedTime_median',
    'testId_elo_pred',
    'userID_answerCode_median',
    'userID_answerCode_sum',
    'userID_first3_answerCode_count',
    'userID_first3_answerCode_mean',
    'userID_first3_answerCode_var',
    'user_acc',
    'user_correct_answer',
    'user_total_answer'
 ]+['answerCode']
data = Preprocessed_data_loader(IS_CUSTOM=IS_CUSTOM)
train = data.train_df[use]
_test = data.test_df[use]
test = _test[_test.answerCode==-1]
valid = _test[_test.answerCode!=-1]

X_train = train.drop("answerCode",axis=1).values
y_train = train.answerCode.values.reshape(-1,1)

X_valid = valid.drop("answerCode",axis=1).values
y_valid = valid.answerCode.values.reshape(-1,1)

X_test = test.drop("answerCode",axis=1).values
y_test = test.answerCode.values.reshape(-1,1)

cat_features = [i for i in data.cat_features if i in use]
cat_features

['KnowledgeTag', 'hour', 'mid3']

In [6]:
temp = data.train_df[use].drop("answerCode",axis=1).columns
temp
cat_idx = [i for i,j in enumerate(temp) if j in cat_features]
cat_dim = [data.train_df[use].drop("answerCode",axis=1)[temp[i]].nunique() for i in cat_idx]
cat_dim

[912, 24, 198]

In [9]:
df = pd.concat([data.train_df[use],data.test_df[use]])
edf = df[df.answerCode!=-1].drop("answerCode",axis=1).to_numpy()
target = df[df.answerCode!=-1].answerCode.to_numpy().reshape(-1,1)
BS = 2**19

In [12]:
unsupervised_model = TabNetPretrainer().load_model("pretrained/unsupervised_model.zip")



In [None]:
def Objective(trial):
        mask_type = trial.suggest_categorical("mask_type", ["entmax", "sparsemax"])
        n_da = trial.suggest_int("n_da", 8, 64, step=4)
        n_steps = trial.suggest_int("n_steps", 1, 3, step=1)
        gamma = trial.suggest_float("gamma", 1.0, 2.0, step=0.2)
        n_independent = trial.suggest_int("n_independent", 1, 5)
        n_shared = trial.suggest_int("n_shared", 1, 5)
        lambda_sparse = trial.suggest_float("lambda_sparse", 1e-6, 1e-3, log=True)
        tabnet_params = dict(n_d=n_da, n_a=n_da, n_steps=n_steps, gamma=gamma,
                     lambda_sparse=lambda_sparse, optimizer_fn=torch.optim.Adam,
                     optimizer_params=dict(lr=1e-1, weight_decay=1e-5),
                     mask_type=mask_type, n_shared=n_shared, n_independent=n_independent,
                     scheduler_params=dict(mode="min",
                                           patience=trial.suggest_int("patienceScheduler",low=3,high=10), # changing sheduler patience to be lower than early stopping patience 
                                           min_lr=1e-4,
                                           factor=0.9,),
                     scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
                     verbose=1,
                     ) #early stopping
        kf = KFold(n_splits=5, random_state=42, shuffle=True)
        CV_score_array=[]
        for train_index, test_index in kf.split(edf):
            X_train, X_valid = edf[train_index], edf[test_index]
            y_train, y_valid = target[train_index], target[test_index]
            regressor = TabNetRegressor(**tabnet_params)
            regressor.fit(
                X_train=X_train, y_train=y_train,
                eval_set=[(X_valid, y_valid)],
                patience=trial.suggest_int("patience",low=15,high=30), max_epochs=trial.suggest_int('epochs', 1, 100),
                batch_size=BS,
                virtual_batch_size=BS,from_unsupervised=unsupervised_model,
                eval_metric=[Eval])
            CV_score_array.append(regressor.best_cost)
        avg = np.mean(CV_score_array)
        return avg

study = optuna.create_study(direction="maximize", study_name='TabNet optimization')
study.optimize(Objective, n_trials=3) #timeout=60
TabNet_params = study.best_params
final_params = dict(n_d=TabNet_params['n_da'], n_a=TabNet_params['n_da'], n_steps=TabNet_params['n_steps'], gamma=TabNet_params['gamma'],
                 lambda_sparse=TabNet_params['lambda_sparse'], optimizer_fn=torch.optim.Adam,
                 optimizer_params=dict(lr=1e-1, weight_decay=1e-4),
                 mask_type=TabNet_params['mask_type'], n_shared=TabNet_params['n_shared'],
                 scheduler_params=dict(mode="min",
                                       patience=TabNet_params['patienceScheduler'],
                                       min_lr=1e-4,
                                       factor=0.9,),
                 scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
                 verbose=0,
                 )
epochs = TabNet_params['epochs']
kf = KFold(n_splits=2, random_state=42, shuffle=True)
predictions_array =[]
CV_score_array    =[]
for train_index, test_index in kf.split(edf):
    X_train, X_valid = edf[train_index], edf[test_index]
    y_train, y_valid = target[train_index], target[test_index]
    regressor = TabNetRegressor(**final_params)
    regressor.fit(
        X_train=X_train, y_train=y_train,
        eval_set=[(X_valid, y_valid)],
        patience=TabNet_params['patience'], max_epochs=epochs,
        batch_size=BS,
        virtual_batch_size=BS,from_unsupervised=unsupervised_model,
        eval_metric=[Eval])
    CV_score_array.append(regressor.best_cost)
    predictions_array.append(regressor.predict(X_test))
study = optuna.create_study(direction="maximize", study_name='TabNet optimization')
study.optimize(Objective, n_trials=3)#timeout=6*60

[32m[I 2022-12-07 22:28:11,142][0m A new study created in memory with name: TabNet optimization[0m


epoch 0  | loss: 0.5597  | val_0_roc_auc_score: 0.45064 |  0:00:16s
epoch 1  | loss: 0.21754 | val_0_roc_auc_score: 0.40731 |  0:00:31s
epoch 2  | loss: 0.19018 | val_0_roc_auc_score: 0.42708 |  0:00:46s
epoch 3  | loss: 0.1822  | val_0_roc_auc_score: 0.55566 |  0:01:02s
epoch 4  | loss: 0.1766  | val_0_roc_auc_score: 0.59802 |  0:01:17s
epoch 5  | loss: 0.17231 | val_0_roc_auc_score: 0.59771 |  0:01:32s
epoch 6  | loss: 0.16838 | val_0_roc_auc_score: 0.57371 |  0:01:47s
epoch 7  | loss: 0.16534 | val_0_roc_auc_score: 0.54561 |  0:02:02s
epoch 8  | loss: 0.16333 | val_0_roc_auc_score: 0.59354 |  0:02:17s
epoch 9  | loss: 0.16193 | val_0_roc_auc_score: 0.65934 |  0:02:33s
epoch 10 | loss: 0.16054 | val_0_roc_auc_score: 0.69984 |  0:02:48s
epoch 11 | loss: 0.15967 | val_0_roc_auc_score: 0.71136 |  0:03:03s
epoch 12 | loss: 0.15889 | val_0_roc_auc_score: 0.71396 |  0:03:18s
epoch 13 | loss: 0.15787 | val_0_roc_auc_score: 0.71874 |  0:03:33s
epoch 14 | loss: 0.15706 | val_0_roc_auc_score: 



epoch 0  | loss: 0.5645  | val_0_roc_auc_score: 0.47137 |  0:00:14s
epoch 1  | loss: 0.20517 | val_0_roc_auc_score: 0.51946 |  0:00:29s
epoch 2  | loss: 0.18778 | val_0_roc_auc_score: 0.50854 |  0:00:44s
epoch 3  | loss: 0.18295 | val_0_roc_auc_score: 0.55161 |  0:00:59s
epoch 4  | loss: 0.17959 | val_0_roc_auc_score: 0.56591 |  0:01:14s
epoch 5  | loss: 0.17712 | val_0_roc_auc_score: 0.55421 |  0:01:29s
epoch 6  | loss: 0.17505 | val_0_roc_auc_score: 0.62007 |  0:01:43s
epoch 7  | loss: 0.17333 | val_0_roc_auc_score: 0.66305 |  0:01:58s
epoch 8  | loss: 0.17186 | val_0_roc_auc_score: 0.67916 |  0:02:13s
epoch 9  | loss: 0.16967 | val_0_roc_auc_score: 0.69035 |  0:02:28s
epoch 10 | loss: 0.16733 | val_0_roc_auc_score: 0.69865 |  0:02:43s
epoch 11 | loss: 0.16512 | val_0_roc_auc_score: 0.70869 |  0:02:57s
epoch 12 | loss: 0.16312 | val_0_roc_auc_score: 0.72143 |  0:03:12s
epoch 13 | loss: 0.1618  | val_0_roc_auc_score: 0.73612 |  0:03:27s
epoch 14 | loss: 0.16042 | val_0_roc_auc_score: 



epoch 0  | loss: 0.5237  | val_0_roc_auc_score: 0.5033  |  0:00:14s
epoch 1  | loss: 0.34487 | val_0_roc_auc_score: 0.49473 |  0:00:29s
epoch 2  | loss: 0.2009  | val_0_roc_auc_score: 0.44829 |  0:00:44s
epoch 3  | loss: 0.18512 | val_0_roc_auc_score: 0.55298 |  0:00:59s
epoch 4  | loss: 0.17953 | val_0_roc_auc_score: 0.50644 |  0:01:14s
epoch 5  | loss: 0.1766  | val_0_roc_auc_score: 0.54484 |  0:01:29s
epoch 6  | loss: 0.17414 | val_0_roc_auc_score: 0.60099 |  0:01:44s
epoch 7  | loss: 0.17243 | val_0_roc_auc_score: 0.60258 |  0:01:59s
epoch 8  | loss: 0.17013 | val_0_roc_auc_score: 0.61394 |  0:02:14s
epoch 9  | loss: 0.16781 | val_0_roc_auc_score: 0.63394 |  0:02:28s
epoch 10 | loss: 0.1653  | val_0_roc_auc_score: 0.65339 |  0:02:43s
epoch 11 | loss: 0.16284 | val_0_roc_auc_score: 0.66667 |  0:02:58s
epoch 12 | loss: 0.16135 | val_0_roc_auc_score: 0.67    |  0:03:13s
epoch 13 | loss: 0.16051 | val_0_roc_auc_score: 0.68684 |  0:03:28s
epoch 14 | loss: 0.15975 | val_0_roc_auc_score: 



epoch 0  | loss: 0.52821 | val_0_roc_auc_score: 0.52893 |  0:00:15s
epoch 1  | loss: 0.2023  | val_0_roc_auc_score: 0.49488 |  0:00:30s
epoch 2  | loss: 0.18037 | val_0_roc_auc_score: 0.4894  |  0:00:45s
epoch 3  | loss: 0.1763  | val_0_roc_auc_score: 0.56502 |  0:01:00s
epoch 4  | loss: 0.17246 | val_0_roc_auc_score: 0.5726  |  0:01:14s
epoch 5  | loss: 0.16948 | val_0_roc_auc_score: 0.6044  |  0:01:30s
epoch 6  | loss: 0.16644 | val_0_roc_auc_score: 0.63832 |  0:01:45s
epoch 7  | loss: 0.16353 | val_0_roc_auc_score: 0.67007 |  0:02:00s
epoch 8  | loss: 0.16143 | val_0_roc_auc_score: 0.71342 |  0:02:15s
epoch 9  | loss: 0.15982 | val_0_roc_auc_score: 0.74871 |  0:02:30s
epoch 10 | loss: 0.15819 | val_0_roc_auc_score: 0.76574 |  0:02:45s
epoch 11 | loss: 0.15679 | val_0_roc_auc_score: 0.77401 |  0:03:00s
epoch 12 | loss: 0.15557 | val_0_roc_auc_score: 0.77821 |  0:03:14s
epoch 13 | loss: 0.15459 | val_0_roc_auc_score: 0.78336 |  0:03:29s
epoch 14 | loss: 0.15378 | val_0_roc_auc_score: 



epoch 0  | loss: 0.53075 | val_0_roc_auc_score: 0.51688 |  0:00:14s
epoch 1  | loss: 0.20526 | val_0_roc_auc_score: 0.45367 |  0:00:29s
epoch 2  | loss: 0.1844  | val_0_roc_auc_score: 0.49267 |  0:00:44s
epoch 3  | loss: 0.17742 | val_0_roc_auc_score: 0.5081  |  0:00:59s
epoch 4  | loss: 0.17348 | val_0_roc_auc_score: 0.55572 |  0:01:13s
epoch 5  | loss: 0.16934 | val_0_roc_auc_score: 0.63177 |  0:01:28s
epoch 6  | loss: 0.16578 | val_0_roc_auc_score: 0.67068 |  0:01:43s
epoch 7  | loss: 0.16359 | val_0_roc_auc_score: 0.70786 |  0:01:57s
epoch 8  | loss: 0.16201 | val_0_roc_auc_score: 0.74041 |  0:02:12s
epoch 9  | loss: 0.16073 | val_0_roc_auc_score: 0.75223 |  0:02:28s
epoch 10 | loss: 0.15948 | val_0_roc_auc_score: 0.76329 |  0:02:44s
epoch 11 | loss: 0.15849 | val_0_roc_auc_score: 0.77505 |  0:03:00s
epoch 12 | loss: 0.15789 | val_0_roc_auc_score: 0.78232 |  0:03:16s
epoch 13 | loss: 0.15697 | val_0_roc_auc_score: 0.78795 |  0:03:32s
epoch 14 | loss: 0.15626 | val_0_roc_auc_score: 

[32m[I 2022-12-07 23:14:16,796][0m Trial 0 finished with value: 0.8323825061382537 and parameters: {'mask_type': 'entmax', 'n_da': 8, 'n_steps': 2, 'gamma': 2.0, 'n_independent': 3, 'n_shared': 2, 'lambda_sparse': 7.953064356675222e-06, 'patienceScheduler': 6, 'patience': 28, 'epochs': 36}. Best is trial 0 with value: 0.8323825061382537.[0m


epoch 0  | loss: 17.24265| val_0_roc_auc_score: 0.53568 |  0:04:36s
epoch 1  | loss: 4.65627 | val_0_roc_auc_score: 0.50156 |  0:09:11s
epoch 2  | loss: 0.54723 | val_0_roc_auc_score: 0.53357 |  0:13:42s
epoch 3  | loss: 0.23375 | val_0_roc_auc_score: 0.56377 |  0:18:16s
epoch 4  | loss: 0.20428 | val_0_roc_auc_score: 0.48194 |  0:22:48s
epoch 5  | loss: 0.19229 | val_0_roc_auc_score: 0.5786  |  0:27:17s
epoch 6  | loss: 0.18521 | val_0_roc_auc_score: 0.57942 |  0:31:46s
epoch 7  | loss: 0.18038 | val_0_roc_auc_score: 0.57823 |  0:36:19s
epoch 8  | loss: 0.17692 | val_0_roc_auc_score: 0.58771 |  0:40:49s
epoch 9  | loss: 0.17486 | val_0_roc_auc_score: 0.60968 |  0:45:23s


In [None]:
predictions = np.mean(predictions_array,axis=0)
print("The CV score is %.5f" % np.mean(CV_score_array,axis=0) )
submission[target_list[i]] = predictions
submission.to_csv(path + 'tabnet_optuna.csv', index=False)