# TabNet

In [1]:
from pytorch_tabnet.tab_model import TabNetRegressor, TabNetClassifier

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import torch
import torch.nn as nn
import wandb
from tqdm import tqdm

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

  from .autonotebook import tqdm as notebook_tqdm


## 1. 데이터 로드

Load Data

In [7]:
def set_seeds(seed: int = 42):
    # 랜덤 시드를 설정하여 매 코드를 실행할 때마다 동일한 결과를 얻게 합니다.
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
set_seeds(42)

In [8]:
data_dir = '../data/'
train_data_path = os.path.join(data_dir, 'train_data_3PL+level.csv')
test_data_path = os.path.join(data_dir, 'test_data_3PL+level.csv') 
df = pd.read_csv(train_data_path)

## 2. Feature Engineering

In [9]:
def feature_engineering(df):

    # 유저별로 정렬
    df.sort_values(by=['userID', 'Timestamp'], inplace=True)
    
    # 데이터 타입 변경
    dtype = {
        'userID': 'int16',
        'answerCode': 'int8',
        'KnowledgeTag': 'int16'
    }
    df = df.astype(dtype)
    
    # 'Timestamp' 열을 날짜/시간 형식으로 파싱
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%Y-%m-%d %H:%M:%S')

    # testTag 추가
    df['testTag'] = df['testId'].apply(lambda x: x[2]).astype('int16')

    # 유저별로 정답 누적 횟수 계산, 결측치 0
    df['user_correct_answer'] = df.groupby('userID')['answerCode'].transform(lambda x: x.cumsum().shift(1))
    df['user_correct_answer'].fillna(0, inplace=True)
    
    # 유저별로 제출 누적 횟수 계산
    df['user_total_answer'] = df.groupby('userID')['answerCode'].cumcount() 
    
    # 유저별로 누적 정답률 계산, 결측치 0.75
    df['user_acc'] = df['user_correct_answer'] / df['user_total_answer']
    df['user_acc'].fillna(0.75, inplace=True)

    # userID별 정답률 추가
    df['user_sum'] = df.groupby('userID')['answerCode'].transform('sum')
    df['user_mean'] = df.groupby('userID')['answerCode'].transform('mean')
    
    # assessmentItemID별 정답률 추가
    df['assessment_sum'] = df.groupby('assessmentItemID')['answerCode'].transform('sum')
    df['assessment_mean'] = df.groupby('assessmentItemID')['answerCode'].transform('mean')
    
    # testId별 정답률 추가
    df['test_sum'] = df.groupby('testId')['answerCode'].transform('sum')
    df['test_mean'] = df.groupby('testId')['answerCode'].transform('mean')
    
    # KnowledgeTag별 정답률 추가
    df['knowledgeTag_sum'] = df.groupby('KnowledgeTag')['answerCode'].transform('sum')
    df['knowledgeTag_mean'] = df.groupby('KnowledgeTag')['answerCode'].transform('mean')
    
    # testTag별 정답률 추가
    df['testTag_sum'] = df.groupby('testTag')['answerCode'].transform('sum')
    df['testTag_mean'] = df.groupby('testTag')['answerCode'].transform('mean')

    # 상대적 정답률
    df['relative_answer_assessment'] = df['answerCode'] - df.groupby('assessmentItemID')['answerCode'].transform('mean')
    
    # 유저별 상대적 정답률 평균 - 학습 수준 레벨
    df['relative_answer_mean'] = df.groupby('userID')['relative_answer_assessment'].transform('mean')

    # 유저가 문항을 푼 시간
    df['time_to_solve'] = df.groupby(['userID', 'testId'])['Timestamp'].diff().dt.total_seconds().shift(-1)
    
    # 결측치 이전 행의 값으로 채움
    df['time_to_solve'].fillna(method='ffill', inplace=True)

    # 유저별 문항 시간 평균
    #df['time_to_solve_mean'] = df.groupby('userID')['time_to_solve'].transform('mean')
    df['time_to_solve_mean'] = df.groupby(['userID', 'testId'])['time_to_solve'].transform('mean')

    # clip(0, 255)는 메모리를 위해 uint8 데이터 타입을 쓰기 위함
    df['prior_assessment_frequency'] = df.groupby(['userID', 'assessmentItemID']).cumcount().clip(0, 255)

    # 각 태그별로 이전에 몇번 풀었는지
    df['prior_KnowledgeTag_frequency'] = df.groupby(['userID', 'KnowledgeTag']).cumcount()
    
    # 시험지 태그별 학년별 몇번 풀었는지
    df['prior_testTag_frequency'] = df.groupby(['userID', 'testTag']).cumcount()
    
    return df


In [10]:
df = feature_engineering(df)
df.head()

  df['time_to_solve'].fillna(method='ffill', inplace=True)


Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,Dffclt,Dscrmn,Gussng,user_level,...,knowledgeTag_mean,testTag_sum,testTag_mean,relative_answer_assessment,relative_answer_mean,time_to_solve,time_to_solve_mean,prior_assessment_frequency,prior_KnowledgeTag_frequency,prior_testTag_frequency
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,-2.017182,20.079513,0.052178,0.015996,...,0.955022,187545,0.709232,0.017937,-0.025899,3.0,7.833333,0,0,0
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,-1.723821,4.616495,0.056888,0.015996,...,0.913187,187545,0.709232,0.035874,-0.025899,8.0,7.833333,0,0,1
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,-0.167255,18.583456,0.754422,0.015996,...,0.913187,187545,0.709232,0.089686,-0.025899,7.0,7.833333,0,1,2
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,0.496282,39.87703,0.946875,0.015996,...,0.913187,187545,0.709232,0.03139,-0.025899,7.0,7.833333,0,2,3
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,-1.3351,6.965071,0.237969,0.015996,...,0.913187,187545,0.709232,0.058296,-0.025899,11.0,7.833333,0,3,4


## 3. Train/Test 데이터 셋 분리

In [11]:
def custom_split_and_encoding(df, feats, cat_feats, ratio=0.7):
    
    for col in cat_feats:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].values)
    
    users = list(zip(df['userID'].value_counts().index, df['userID'].value_counts()))
    random.shuffle(users)

    max_train_data_len = ratio*len(df)
    sum_of_train_data = 0
    user_ids =[]

    for user_id, count in users:
        sum_of_train_data += count
        if max_train_data_len < sum_of_train_data:
            break
        user_ids.append(user_id)
    
    train = df[df['userID'].isin(user_ids)][feats]
    valid = df[df['userID'].isin(user_ids) == False][feats]
    
    valid = valid[valid['userID'] != valid['userID'].shift(-1)]

    cat_idxs = [ i for i, f in enumerate(feats) if f in cat_feats]
    cat_dims = [ len(df[f].unique()) for f in feats if f in cat_feats]
    
    return train, valid, cat_idxs, cat_dims
        

In [12]:
FEATS = [
    'userID', 'assessmentItemID', 'testId', 
    'KnowledgeTag',
    'Dffclt',
    'Dscrmn',
    'Gussng',
    # 'testTag',
    'user_correct_answer',
    'user_total_answer',
    'user_acc',
    'user_mean',
    'relative_answer_mean',
    'time_to_solve',
    'time_to_solve_mean',
    'prior_testTag_frequency',
    'answerCode'
]

In [13]:
cat_cols = ['userID', 'assessmentItemID', 'testId', 'KnowledgeTag']
train, valid, cat_idxs, cat_dims = custom_split_and_encoding(df, cat_feats=cat_cols, feats=FEATS)

In [14]:
X_train = train.drop(['answerCode'], axis = 1)
y_train = train[['answerCode']]
X_valid = valid.drop(['answerCode'], axis = 1)
y_valid = valid[['answerCode']]

## 4. 훈련 및 검증

In [15]:
model = TabNetClassifier(
    n_d = 64,
    n_a = 64,
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    cat_emb_dim=10,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=1e-2),
    scheduler_params={"step_size":50,
                        "gamma":0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    mask_type='sparsemax', # "sparsemax", entmax
    verbose=1,
    device_name='cuda'
)

model.fit(
    X_train=X_train.values, y_train=y_train.values.flatten(),
    eval_set=[(X_train.values, y_train.values.flatten()), (X_valid.values, y_valid.values.flatten())],
    eval_name=['train', 'valid'],
    eval_metric=['auc', 'accuracy', 'logloss'],
    max_epochs=2,
    patience=5,
    batch_size=1024,
    virtual_batch_size=128,
    drop_last=False,
)

Device used : cuda
epoch 0  | loss: 0.53224 | train_auc: 0.79517 | train_accuracy: 0.75078 | train_logloss: 0.51365 | valid_auc: 0.75453 | valid_accuracy: 0.69258 | valid_logloss: 0.59191 |  0:01:50s
epoch 1  | loss: 0.51526 | train_auc: 0.79662 | train_accuracy: 0.75114 | train_logloss: 0.51287 | valid_auc: 0.75374 | valid_accuracy: 0.68859 | valid_logloss: 0.59038 |  0:03:44s
Stop training because you reached max_epochs = 2 with best_epoch = 1 and best_valid_logloss = 0.59038
Best weights from best epoch are automatically used!


In [21]:
result = model.predict_proba(X_valid.values)
preds_proba = np.max(result, axis=1)
preds = np.argmax(result, axis=1)

In [22]:
acc = accuracy_score(y_valid, preds)
auc = roc_auc_score(y_valid, preds_proba)
print(f'VALID AUC : {auc} ACC : {acc}\n')

VALID AUC : 0.49783641423176306 ACC : 0.6885899352267065



## 5. Inference

In [23]:
test_df = pd.read_csv(test_data_path)

test_df = feature_engineering(test_df)

for col in cat_cols:
    le = LabelEncoder()
    test_df[col] = le.fit_transform(test_df[col].values)

# y_test = test_df['answerCode'].values
# X_test = test_df.drop(['answerCode'], axis=1)

# #FEATS = [col for col in X_test.columns]
# X_text = X_test[FEATS]
test_df = test_df[FEATS]
test_df = test_df[test_df['userID'] != test_df['userID'].shift(-1)]

# DROP ANSWERCODE
test_df = test_df.drop(['answerCode'], axis=1)

  df['time_to_solve'].fillna(method='ffill', inplace=True)


In [24]:
# MAKE PREDICTION
total_preds = model.predict_proba(test_df.values)
total_preds = np.max(result, axis=1)

In [25]:
output_dir = 'output/'
write_path = os.path.join(output_dir, "submission.csv")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
with open(write_path, 'w', encoding='utf8') as w:
    print("writing prediction : {}".format(write_path))
    w.write("id,prediction\n")
    for id, p in enumerate(total_preds):
        w.write('{},{}\n'.format(id,p))

writing prediction : output/submission.csv
