Import Libraries

In [None]:
from pytorch_tabnet.tab_model import TabNetRegressor, TabNetClassifier

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import torch
import torch.nn as nn
import wandb
from tqdm import tqdm

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

Load Data

In [None]:
# Root folder that holds the train and test CSV files.
data_dir = '/opt/ml/input/data/'

# Both file paths are derived from the same root folder.
train_data_path, test_data_path = (
    os.path.join(data_dir, csv_name)
    for csv_name in ('train_data.csv', 'test_data.csv')
)

# Only the training file is read here; the test file is read further down.
df = pd.read_csv(train_data_path)

In [None]:
def feature_engineering_base(df):
    """Add time-gap, running per-user and aggregate correctness features.

    Rows are ordered by ``userID`` and ``Timestamp`` *before* any sequential
    feature is derived, so the result does not depend on the row order of the
    input.

    Args:
        df: Interaction log containing at least the columns ``userID``,
            ``testId``, ``KnowledgeTag``, ``Timestamp`` and ``answerCode``.

    Returns:
        A new DataFrame (the argument itself is not modified), sorted by
        ``userID``/``Timestamp``, with these additional columns:

        * ``datetime`` - ``Timestamp`` parsed with ``pd.to_datetime``.
        * ``elapsed`` - seconds since the previous row of the same
          ``(userID, testId)`` group, 0 for the first row of a group.
        * ``user_correct_answer`` / ``user_total_answer`` / ``user_acc`` -
          sum of ``answerCode``, row count and their ratio over the user's
          *earlier* rows only (all 0 on the user's first row).
        * ``test_mean`` / ``test_sum`` and ``tag_mean`` / ``tag_sum`` - mean
          and sum of ``answerCode`` per ``testId`` and per ``KnowledgeTag``.
    """
    # Sort first so that every per-user sequence is in chronological order.
    # sort_values returns a new frame, which keeps the caller's data untouched.
    df = df.sort_values(by=['userID', 'Timestamp'])
    df['datetime'] = pd.to_datetime(df['Timestamp'])

    # Gap to the previous interaction within the same user/test pair.
    gaps = df.groupby(['userID', 'testId'])['datetime'].diff()
    df['elapsed'] = gaps.fillna(pd.Timedelta(seconds=0)).dt.total_seconds()

    # Running statistics per user. The shift(1) / cumcount() pair only looks at
    # rows before the current one, so the current answer is not included.
    df['user_correct_answer'] = df.groupby('userID')['answerCode'].transform(
        lambda answers: answers.cumsum().shift(1)
    )
    df['user_total_answer'] = df.groupby('userID')['answerCode'].cumcount()
    df['user_acc'] = df['user_correct_answer'] / df['user_total_answer']

    # The first row of each user has no history (NaN sum, 0/0 ratio).
    # This fills every NaN in the frame, exactly like the original code did.
    df = df.fillna(0)

    # Mean and sum of answerCode per test and per knowledge tag, computed in
    # one go over the frame that was passed in.
    # NOTE(review): the original comment said these aggregates are meant to be
    # reused for the submission dataset, but this function recomputes them from
    # whatever frame it receives. If the submission file marks unlabeled rows
    # with a placeholder answerCode, those rows enter the aggregates - confirm.
    correct_t = df.groupby(['testId'])['answerCode'].agg(['mean', 'sum'])
    correct_t.columns = ["test_mean", 'test_sum']
    correct_k = df.groupby(['KnowledgeTag'])['answerCode'].agg(['mean', 'sum'])
    correct_k.columns = ["tag_mean", 'tag_sum']

    df = pd.merge(df, correct_t, on=['testId'], how="left")
    df = pd.merge(df, correct_k, on=['KnowledgeTag'], how="left")

    return df

In [None]:
def elo(df):
    """Attach an Elo-style probability of a correct answer to every row.

    One pass over the rows, in their current order, updates a per-student
    parameter ``theta`` and a per-item parameter ``beta``. The predicted
    probability of a correct answer grows with ``theta - beta``. After the
    pass, the *final* ``theta`` of the row's student and the final ``beta`` of
    the row's item are combined into the ``elo`` column.

    Args:
        df: Frame with the columns ``userID``, ``assessmentItemID`` and
            ``answerCode``.

    Returns:
        The same DataFrame object, modified in place: a constant
        ``left_asymptote`` column (0) and the ``elo`` column are added.
    """

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    def expected_score(theta, beta, left_asymptote):
        # Probability of a correct answer with a lower bound of left_asymptote.
        return left_asymptote + (1 - left_asymptote) * sigmoid(theta - beta)

    def theta_step(nb_answers):
        # Student learning rate: decays with the answer count, floored at 0.04.
        return max(0.3 / (1 + 0.01 * nb_answers), 0.04)

    def beta_step(nb_answers):
        # Item learning rate: decays with the answer count, no floor.
        return 1 / (1 + 0.05 * nb_answers)

    def estimate_parameters(answers_df, granularity_feature_name="assessmentItemID"):
        # Every item and every student starts at 0 with no recorded answers.
        items = {
            item_key: {"beta": 0, "nb_answers": 0}
            for item_key in np.unique(answers_df[granularity_feature_name])
        }
        students = {
            student_key: {"theta": 0, "nb_answers": 0}
            for student_key in np.unique(answers_df.userID)
        }

        print("Parameter estimation is starting...", flush=True)

        interactions = zip(
            answers_df.userID.values,
            answers_df[granularity_feature_name].values,
            answers_df.left_asymptote.values,
            answers_df.answerCode.values,
        )
        for student_id, item_id, left_asymptote, answered_correctly in tqdm(
            interactions, total=len(answers_df)
        ):
            student = students[student_id]
            item = items[item_id]
            theta = student["theta"]
            beta = item["beta"]

            # Both updates use the same error, computed from the values the
            # parameters had before this interaction.
            error = answered_correctly - expected_score(theta, beta, left_asymptote)

            # An unexpectedly correct answer lowers beta and raises theta.
            item["beta"] = beta - beta_step(item["nb_answers"]) * error
            student["theta"] = theta + theta_step(student["nb_answers"]) * error

            item["nb_answers"] += 1
            student["nb_answers"] += 1

        print(f"Theta & beta estimations on {granularity_feature_name} are completed.")
        return students, items

    # A left asymptote of 0 reduces expected_score to the plain sigmoid.
    df["left_asymptote"] = 0

    print(f"Dataset of shape {df.shape}")
    print(f"Columns are {list(df.columns)}")

    student_parameters, item_parameters = estimate_parameters(df)

    # Score every row with the parameters reached at the end of the pass.
    df["elo"] = [
        sigmoid(student_parameters[student]["theta"] - item_parameters[item]["beta"])
        for student, item in zip(df.userID.values, df.assessmentItemID.values)
    ]

    return df

In [None]:
random.seed(42)
def custom_split_and_encoding(df, ratio=0.7, seed=None):
    """Label-encode the categorical columns and split the rows by user.

    Users are shuffled and added to the training set one by one. The first
    user whose rows would push the training row count above ``ratio * len(df)``
    stops the loop; that user and all remaining users form the validation pool.
    From the validation pool only the last row of each user is kept.

    Args:
        df: Feature frame containing ``userID``, ``assessmentItemID``,
            ``testId``, ``KnowledgeTag``, ``answerCode``, ``Timestamp`` and
            ``datetime``. The rows of each user must be contiguous, because
            the "last row per user" filter compares neighbouring rows.
            The frame is not modified.
        ratio: Upper bound for the share of rows placed in the training set.
        seed: ``None`` (default) shuffles with the global ``random`` state,
            exactly as before. An integer shuffles with a private
            ``random.Random(seed)``, so the split no longer depends on the
            global RNG state or on how often the cell has been run.

    Returns:
        Tuple ``(X_train, y_train, X_valid, y_valid, cat_idxs, cat_dims,
        features, categorical_columns)``. ``features`` is the column order of
        ``X_train``; ``cat_idxs`` / ``cat_dims`` give the positions and the
        number of distinct values of the categorical columns in that order.
    """
    categorical_columns = ['userID', 'assessmentItemID', 'testId', 'KnowledgeTag']
    categorical_dims = {}

    # Encode on a copy so the caller's frame keeps its raw identifiers.
    df = df.copy()
    for col in categorical_columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].values)
        categorical_dims[col] = len(le.classes_)

    # (user id, row count) pairs; value_counts is computed only once.
    users = list(df['userID'].value_counts().items())
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(users)

    max_train_data_len = ratio * len(df)
    sum_of_train_data = 0
    user_ids = []

    for user_id, count in users:
        sum_of_train_data += count
        if max_train_data_len < sum_of_train_data:
            break
        user_ids.append(user_id)

    in_train = df['userID'].isin(user_ids)
    train = df[in_train]
    valid = df[~in_train]

    # Keep only the last row of every validation user.
    valid = valid[valid['userID'] != valid['userID'].shift(-1)]

    non_feature_columns = ['answerCode', 'Timestamp', 'datetime']
    X_train = train.drop(non_feature_columns, axis=1)
    y_train = train[['answerCode']]
    X_valid = valid.drop(non_feature_columns, axis=1)
    y_valid = valid[['answerCode']]

    features = list(X_train.columns)
    cat_idxs = [i for i, f in enumerate(features) if f in categorical_columns]
    cat_dims = [categorical_dims[f] for f in features if f in categorical_columns]

    return X_train, y_train, X_valid, y_valid, cat_idxs, cat_dims, features, categorical_columns
        

In [None]:
# Build the hand-crafted features first, then add the Elo column on top.
df = elo(feature_engineering_base(df))
df.head()

In [None]:
# Encode the categorical columns and split by user with the default ratio.
(
    X_train,
    y_train,
    X_valid,
    y_valid,
    cat_idxs,
    cat_dims,
    features,
    categorical_columns,
) = custom_split_and_encoding(df)

Training

In [None]:
# TabNet classifier. The label-encoded columns are declared as categorical
# through their positions (cat_idxs) and cardinalities (cat_dims).
# n_d / n_a are not set; the original kept `n_d = 64`, `n_a = 64` commented out.
clf = TabNetClassifier(
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    cat_emb_dim=10,
    optimizer_fn=torch.optim.Adam,
    optimizer_params={"lr": 1e-2},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    scheduler_params={"step_size": 50, "gamma": 0.9},
    mask_type='sparsemax',  # the other option noted by the author is entmax
    verbose=1,
    device_name='cuda',
)

# Convert the frames to arrays once and reuse them for fitting and evaluation.
train_inputs = X_train[features].values
train_labels = y_train.values.flatten()
valid_inputs = X_valid[features].values
valid_labels = y_valid.values.flatten()

# NOTE(review): early stopping presumably follows the last metric ('auc') on
# the last eval set ('valid') - confirm against the pytorch-tabnet docs.
clf.fit(
    X_train=train_inputs,
    y_train=train_labels,
    eval_set=[(train_inputs, train_labels), (valid_inputs, valid_labels)],
    eval_name=['train', 'valid'],
    eval_metric=['accuracy', 'auc'],
    max_epochs=41,
    patience=10,
    batch_size=14598,
    virtual_batch_size=4430,
    drop_last=False,
)

Load Test Data

In [None]:
# Run the test file through the same feature pipeline as the training data.
test_df = pd.read_csv(test_data_path)

test_df = feature_engineering_base(test_df)
test_df = elo(test_df)

# NOTE(review): these encoders are fitted on the test data alone, so the
# integer codes are not guaranteed to match the codes the model saw during
# training for the same raw value. Reusing the training encoders would need
# them to be returned by custom_split_and_encoding - confirm and follow up.
for col in categorical_columns:
    le = LabelEncoder()
    test_df[col] = le.fit_transform(test_df[col].values)

y_test = test_df['answerCode'].values
X_test = test_df.drop(['answerCode', 'Timestamp', 'datetime'], axis=1)

# Keep only the last row of every user (rows of a user are contiguous after
# feature_engineering_base). The columns are selected through `features`, so
# the matrix has exactly the column order that was used to fit the model.
is_last_row_of_user = X_test['userID'] != X_test['userID'].shift(-1)
test = X_test.loc[is_last_row_of_user, features].to_numpy()


Prediction

In [None]:
# predict_proba returns one column per class. Column 1 is used as the score;
# presumably that is the probability of answerCode == 1 - confirm.
POSITIVE_CLASS_COLUMN = 1
preds = clf.predict_proba(test)
total_preds = preds[:, POSITIVE_CLASS_COLUMN]

Save Output

In [None]:

# Write the predictions as a two-column CSV: running row id and probability.
output_dir = 'output/'
write_path = os.path.join(output_dir, "submission.csv")

# exist_ok=True replaces the separate existence check, which could race with
# another process creating the directory in between.
os.makedirs(output_dir, exist_ok=True)

with open(write_path, 'w', encoding='utf8') as out_file:
    print("writing prediction : {}".format(write_path))
    out_file.write("id,prediction\n")
    # row_id instead of `id`, so the builtin id() is not shadowed.
    out_file.writelines(
        f"{row_id},{p}\n" for row_id, p in enumerate(total_preds)
    )