In [28]:
import torch
import torch.nn as nn
import numpy as np
import torch.nn.functional as F 
import re
import math
import os
import pandas as pd

from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from pytorch_tabnet.tab_model import TabNetClassifier
from pytorch_tabnet.multitask import TabNetMultiTaskClassifier


In [29]:
# Location of the feature-engineered training table.
data_dir = '/opt/ml/input/data/'
file_name = 'train_feature_engineering.csv'
csv_file_path = os.path.join(data_dir, file_name)

# Reads the whole file; for a quick smoke run, pass nrows=100000 to read_csv.
df = pd.read_csv(filepath_or_buffer=csv_file_path)

In [30]:
# Fraction of unique users assigned to the training split; the remaining users
# form the validation split.
ratio = 0.7

In [31]:
# Columns that the model treats as categorical (embedded) inputs.
categorical_columns = [
    'userID',
    'assessmentItemID',
    'testId',
    'KnowledgeTag',
    'assessmentItemID_last',
    'testId_first',
    'testId_last',
]

# Column name -> number of distinct encoded classes in that column.
categorical_dims = {}

# NOTE(review): the loop runs over *every* column of df, not only
# `categorical_columns`, so all other columns (including the target and
# Timestamp) are also replaced by the integer codes of their sorted unique
# values. Confirm this is intended before narrowing the loop.
for column_name in tqdm(df.columns):
    encoder = LabelEncoder()
    encoded_values = encoder.fit_transform(df[column_name].values)
    df[column_name] = encoded_values
    categorical_dims[column_name] = len(encoder.classes_)

100%|██████████| 27/27 [00:15<00:00,  1.79it/s]


In [32]:
# Sanity check: for each categorical column print the number of distinct codes
# followed by the largest code.
for name in categorical_columns:
    encoded_column = df[name]
    print(encoded_column.nunique())
    print(encoded_column.max())

6698
6697
9454
9453
1537
1536
912
911
13
12
9
8
198
197


In [33]:
# Split by user rather than by row, so that every row of a given user lands in
# exactly one of the two sets.
SPLIT_SEED = 42  # fixed seed: the user-level split is identical after a kernel restart
split_rng = np.random.default_rng(SPLIT_SEED)

unique_users = df['userID'].unique()
n = len(unique_users)
user_permute_list = split_rng.permutation(unique_users)

# The first `ratio` share of the shuffled users goes to train, the rest to valid.
n_train_users = int(n * ratio)
train_userid = user_permute_list[:n_train_users]
valid_userid = user_permute_list[n_train_users:]

# Sort first and reset the index afterwards, so each frame ends up with a
# contiguous 0..N-1 index that follows the (userID, Timestamp) order.
train = (
    df[df['userID'].isin(train_userid)]
    .sort_values(["userID", "Timestamp"])
    .reset_index(drop=True)
)
valid = (
    df[df['userID'].isin(valid_userid)]
    .sort_values(["userID", "Timestamp"])
    .reset_index(drop=True)
)

In [34]:
# Columns removed from the model inputs: the target itself plus two columns
# that are not used as features.
non_feature_columns = ['answerCode', 'Timestamp', 'relative_answered_correctly']

train_x = train.drop(columns=non_feature_columns)
valid_x = valid.drop(columns=non_feature_columns)

# Selecting with a list keeps the targets as single-column DataFrames.
train_y = train[['answerCode']]
valid_y = valid[['answerCode']]

In [35]:
# Ordered list of the model's input columns.
features = list(train_x.columns)

# (position, name) pairs of the features that are declared categorical.
categorical_positions = [
    (position, name)
    for position, name in enumerate(features)
    if name in categorical_columns
]

# Positions of the categorical features within `features` ...
cat_idxs = [position for position, _ in categorical_positions]

# ... and, in the same order, the number of classes of each one.
cat_dims = [categorical_dims[name] for _, name in categorical_positions]


In [36]:
# Convert the frames to plain NumPy arrays for the model. The targets come from
# single-column DataFrames, so y_train / y_valid are 2-D with one column.
X_train, y_train = train_x[features].to_numpy(), train_y.to_numpy()
X_valid, y_valid = valid_x[features].to_numpy(), valid_y.to_numpy()

In [37]:
# Build the TabNet model. The categorical feature positions and their class
# counts tell it which input columns to embed.
#
# NOTE(review): gamma is greater than 1, so StepLR presumably *multiplies* the
# learning rate by ~1.88 every 5 scheduler steps instead of decaying it.
# Confirm this is the intended schedule.
clf = TabNetMultiTaskClassifier(
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    mask_type='sparsemax',
    optimizer_fn=torch.optim.Adam,
    optimizer_params={"lr": 0.0006715856786942948},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    scheduler_params=dict(step_size=5, gamma=1.8841685063804285),
)



In [39]:
max_epochs = 53

# Evaluation splits scored with AUC after every epoch, keyed by the name shown
# in the training log. Dict insertion order keeps 'train' first and 'valid' last.
# NOTE(review): pytorch-tabnet presumably early-stops on the last eval set
# ('valid') — confirm against the installed version.
evaluation_splits = {
    'train': (X_train, y_train),
    'valid': (X_valid, y_valid),
}

clf.fit(
    X_train,
    y_train,
    eval_set=list(evaluation_splits.values()),
    eval_name=list(evaluation_splits.keys()),
    eval_metric=['auc'],
    max_epochs=max_epochs,
    patience=30,
    batch_size=14598,
    virtual_batch_size=4430,
    drop_last=False,
)

epoch 0  | loss: 0.71687 | train_auc: 0.62375 | valid_auc: 0.62537 |  0:00:51s
epoch 1  | loss: 0.6136  | train_auc: 0.6905  | valid_auc: 0.69128 |  0:01:43s
epoch 2  | loss: 0.58305 | train_auc: 0.73139 | valid_auc: 0.73116 |  0:02:35s
epoch 3  | loss: 0.55789 | train_auc: 0.76006 | valid_auc: 0.76047 |  0:03:26s
epoch 4  | loss: 0.54201 | train_auc: 0.77421 | valid_auc: 0.77523 |  0:04:17s
epoch 5  | loss: 0.529   | train_auc: 0.78944 | valid_auc: 0.79133 |  0:05:09s
epoch 6  | loss: 0.51813 | train_auc: 0.79726 | valid_auc: 0.79942 |  0:06:00s
epoch 7  | loss: 0.51236 | train_auc: 0.80126 | valid_auc: 0.80345 |  0:06:52s
epoch 8  | loss: 0.5088  | train_auc: 0.80404 | valid_auc: 0.8062  |  0:07:44s
epoch 9  | loss: 0.50657 | train_auc: 0.80544 | valid_auc: 0.80767 |  0:08:35s
epoch 10 | loss: 0.50468 | train_auc: 0.80732 | valid_auc: 0.80934 |  0:09:26s
epoch 11 | loss: 0.50303 | train_auc: 0.80842 | valid_auc: 0.81041 |  0:10:17s
epoch 12 | loss: 0.50187 | train_auc: 0.80917 | vali



In [42]:
# Persist the fitted model. The path is given without extension; the cell's
# recorded output shows the archive being written as './abc.zip'.
model_path_stem = './abc'
clf.save_model(model_path_stem)

Successfully saved model at ./abc.zip


'./abc.zip'