# CatBoost Baseline

In [1]:
# Uncomment to install CatBoost into the active kernel's environment.
# %pip install catboost

In [2]:
import os
import pandas as pd
import numpy as np
import random
from catboost import CatBoostRegressor as cbt
from catboost import Pool
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

In [3]:
# Read the training interactions from CSV. data_dir is also used later in the
# notebook to locate the test CSV.
data_dir = './data/' # Adjust this path to match your environment!
csv_file_path = os.path.join(data_dir, 'train_data_3PL+level.csv')
df = pd.read_csv(csv_file_path)
# Display the loaded frame.
df

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,Dffclt,Dscrmn,Gussng,user_level
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,-2.017182,20.079513,0.052178,0.015996
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,-1.723821,4.616495,0.056888,0.015996
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,-0.167255,18.583456,0.754422,0.015996
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,0.496282,39.877030,0.946875,0.015996
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,-1.335100,6.965071,0.237969,0.015996
...,...,...,...,...,...,...,...,...,...,...
2266581,7441,A030071005,A030000071,0,2020-06-05 06:50:21,438,0.767458,0.882364,0.123318,-1.407234
2266582,7441,A040165001,A040000165,1,2020-08-21 01:06:39,8836,-0.277564,5.384278,0.099105,0.552926
2266583,7441,A040165002,A040000165,1,2020-08-21 01:06:50,8836,-0.267161,10.263590,0.035658,0.552926
2266584,7441,A040165003,A040000165,1,2020-08-21 01:07:36,8836,-0.229779,1.516802,0.513883,0.552926


In [4]:
# Optional: uncomment the next line to remove the 'user_level' column.
# FEATS (defined further down) lists 'user_level', so it would have to be
# removed from that list as well.
#df = df.drop(['user_level'], axis=1)
df

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,Dffclt,Dscrmn,Gussng,user_level
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,-2.017182,20.079513,0.052178,0.015996
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,-1.723821,4.616495,0.056888,0.015996
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,-0.167255,18.583456,0.754422,0.015996
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,0.496282,39.877030,0.946875,0.015996
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,-1.335100,6.965071,0.237969,0.015996
...,...,...,...,...,...,...,...,...,...,...
2266581,7441,A030071005,A030000071,0,2020-06-05 06:50:21,438,0.767458,0.882364,0.123318,-1.407234
2266582,7441,A040165001,A040000165,1,2020-08-21 01:06:39,8836,-0.277564,5.384278,0.099105,0.552926
2266583,7441,A040165002,A040000165,1,2020-08-21 01:06:50,8836,-0.267161,10.263590,0.035658,0.552926
2266584,7441,A040165003,A040000165,1,2020-08-21 01:07:36,8836,-0.229779,1.516802,0.513883,0.552926


In [5]:
def feature_engineering(df):
    """Return the interactions ordered by user and, within each user, by timestamp.

    The input frame is left untouched: sorting produces a new DataFrame instead
    of reordering the caller's object in place, so re-running a cell that calls
    this function never changes a frame that was displayed earlier.

    Args:
        df: DataFrame containing at least the 'userID' and 'Timestamp' columns.

    Returns:
        A new DataFrame with the same rows, columns and index labels as ``df``,
        sorted by ('userID', 'Timestamp').
    """

    # Order rows per user so that each user's interactions form one time-ordered
    # sequence (later cells pick a user's last row by comparing neighbouring rows).
    df = df.sort_values(by=['userID', 'Timestamp'])

    # Disabled features: per-user running count of solved items, running count of
    # correct answers, and running accuracy, accumulated in time order.
    # df['user_correct_answer'] = df.groupby('userID')['answerCode'].transform(lambda x: x.cumsum().shift(1))
    # df['user_total_answer'] = df.groupby('userID')['answerCode'].cumcount()
    # df['user_acc'] = df['user_correct_answer']/df['user_total_answer']

    # Disabled features: overall correctness mean/sum per testId and per
    # KnowledgeTag, computed in one pass. The original note says these tables
    # are meant to be reused for the submission dataset as well.
    # correct_t = df.groupby(['testId'])['answerCode'].agg(['mean', 'sum'])
    # correct_t.columns = ["test_mean", 'test_sum']
    # correct_k = df.groupby(['KnowledgeTag'])['answerCode'].agg(['mean', 'sum'])
    # correct_k.columns = ["tag_mean", 'tag_sum']

    # df = pd.merge(df, correct_t, on=['testId'], how="left")
    # df = pd.merge(df, correct_k, on=['KnowledgeTag'], how="left")

    return df

In [6]:
# Run the shared preprocessing step and rebind df to its result
# (rows ordered by userID, then Timestamp).
df = feature_engineering(df)

## Train/Test 데이터 셋 분리

In [7]:
random.seed(42)
def custom_train_test_split(df, ratio=0.7, split=True, seed=None):
    """Split interactions by user into a training frame and a one-row-per-user test frame.

    Users are shuffled and then assigned to the training side one at a time
    until adding the next user would push the number of training rows past
    ``ratio * len(df)``. Every remaining user goes to the test side, where only
    that user's last row (in the frame's current row order) is kept.

    Args:
        df: DataFrame with a 'userID' column. Sort it by user and time first if
            "last row" is supposed to mean "most recent interaction".
        ratio: Upper bound on the fraction of rows placed in the training frame.
        split: Unused; kept so existing callers that pass it keep working.
        seed: Optional seed. When given, a private ``random.Random(seed)`` does
            the shuffling, so the split is reproducible regardless of how often
            the global generator has been used. When None, the module-level
            ``random`` generator (seeded above) is used, as before.

    Returns:
        Tuple ``(train, test)`` of DataFrames with disjoint sets of users.
    """

    # Count rows per user once and reuse the result for ids and counts.
    user_counts = df['userID'].value_counts()
    users = list(zip(user_counts.index, user_counts))
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(users)

    max_train_data_len = ratio * len(df)
    sum_of_train_data = 0
    user_ids = []

    for user_id, count in users:
        sum_of_train_data += count
        # Stop at the first user that would exceed the training-row budget.
        if max_train_data_len < sum_of_train_data:
            break
        user_ids.append(user_id)

    in_train = df['userID'].isin(user_ids)
    train = df[in_train]
    test = df[~in_train]

    # Keep only each test user's last row. Unlike comparing a row with its
    # neighbour, this yields exactly one row per user even when a user's rows
    # are not contiguous; on input grouped by user the result is the same.
    test = test.drop_duplicates(subset='userID', keep='last')
    return train, test

In [8]:
# List the columns available after the feature-engineering step.
df.columns

Index(['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp',
       'KnowledgeTag', 'Dffclt', 'Dscrmn', 'Gussng', 'user_level'],
      dtype='object')

In [9]:
# Split by user: each user's rows end up in exactly one of the two frames.
train, test = custom_train_test_split(df)

# Columns fed to the model.
FEATS = ['KnowledgeTag', 'Dffclt', 'Dscrmn', 'Gussng', 'user_level']

# Pull the target out of both frames, then remove it from the feature frames.
y_train, y_test = train['answerCode'], test['answerCode']
train = train.drop(columns=['answerCode'])
test = test.drop(columns=['answerCode'])

In [10]:
# Wrap features and labels in CatBoost Pools. The categorical column is named
# explicitly instead of being addressed as position 0, so reordering FEATS
# cannot silently mark a different column as categorical.
CAT_FEATS = ['KnowledgeTag']
data_train = Pool(data=train[FEATS], label=y_train, cat_features=CAT_FEATS)
data_test = Pool(data=test[FEATS], label=y_test, cat_features=CAT_FEATS)

## 훈련 및 검증

In [11]:
# NOTE(review): cbt is the alias the import cell gives to CatBoostRegressor, so
# the model is fit as a regressor on the 0/1 answerCode target and its
# predictions are raw scores that get thresholded at 0.5 below.
params = {
    'iterations': 500,
    'depth': 6,
    'learning_rate': 0.05,
    'eval_metric': 'AUC',
}
model = cbt(**params)
model.fit(data_train, eval_set=data_test, verbose=50)

# Score the held-out rows: accuracy on thresholded labels, AUC on raw scores.
preds = model.predict(test[FEATS])
pred_labels = (preds >= 0.5).astype(int)
acc = accuracy_score(y_test, pred_labels)
auc = roc_auc_score(y_test, preds)

print(f'VALID AUC : {auc} ACC : {acc}\n')

0:	test: 0.8430776	best: 0.8430776 (0)	total: 333ms	remaining: 2m 46s
50:	test: 0.8808069	best: 0.8808069 (50)	total: 10.8s	remaining: 1m 34s
100:	test: 0.8891737	best: 0.8891737 (100)	total: 20s	remaining: 1m 19s
150:	test: 0.8931703	best: 0.8931703 (150)	total: 29.3s	remaining: 1m 7s
200:	test: 0.8960420	best: 0.8960420 (200)	total: 38.6s	remaining: 57.4s
250:	test: 0.8981223	best: 0.8981223 (250)	total: 47.6s	remaining: 47.2s
300:	test: 0.9000560	best: 0.9000560 (300)	total: 57.2s	remaining: 37.8s
350:	test: 0.9024735	best: 0.9024735 (350)	total: 1m 6s	remaining: 28.4s
400:	test: 0.9039185	best: 0.9039405 (399)	total: 1m 16s	remaining: 18.8s
450:	test: 0.9048031	best: 0.9048031 (450)	total: 1m 24s	remaining: 9.16s
499:	test: 0.9055631	best: 0.9055631 (499)	total: 1m 32s	remaining: 0us

bestTest = 0.9055630574
bestIteration = 499

VALID AUC : 0.9055630573578719 ACC : 0.8211260587942202



In [17]:
# Print one importance score per feature; scores are paired with FEATS by position.
feature_importance = model.get_feature_importance()
print("Feature Importance:")
for feat_name, score in zip(FEATS, feature_importance):
    print(f"{feat_name}: {score}")

Feature Importance:
KnowledgeTag: 1.680318016119981
Dffclt: 39.9119064467376
Dscrmn: 5.539187641829382
Gussng: 4.986386316001236
user_level: 47.88220157931177


In [14]:
# Show the raw importance array returned by model.get_feature_importance().
feature_importance

array([ 1.68031802, 39.91190645,  5.53918764,  4.98638632, 47.88220158])

## Inference

In [None]:
# Read the submission data from the same data directory as the training CSV.
test_csv_file_path = os.path.join(data_dir, 'test_data_3PL+level.csv')
test_df = pd.read_csv(test_csv_file_path)
# Optional: uncomment to remove 'user_level' (must match the training setup).
#test_df = test_df.drop(['user_level'], axis=1)

# Apply the same preprocessing that was used on the training frame.
test_df = feature_engineering(test_df)

# A row is a user's last interaction when the next row belongs to another user
# (or when there is no next row).
is_last_row = test_df['userID'].ne(test_df['userID'].shift(-1))
test_df = test_df[is_last_row]

# The target column is not a model input.
test_df = test_df.drop(columns=['answerCode'])

In [None]:
# MAKE PREDICTION
# NOTE(review): model is built from cbt, which the import cell aliases to
# CatBoostRegressor, so these are regression scores rather than bounded
# probabilities; the recorded output below contains values above 1 and below 0.
total_preds = model.predict(test_df[FEATS])

In [None]:
# Inspect the predictions made for the rows kept in test_df.
total_preds

array([ 5.93765598e-01,  9.43161677e-01,  2.34103666e-01,  9.58672101e-01,
        2.47250253e-01,  8.02447078e-01,  1.66960977e-01,  1.87971917e-01,
        1.95944187e-01,  1.00128501e+00,  3.35103558e-01,  1.74791342e-01,
        1.02514796e+00,  2.72487277e-01,  1.91416571e-01,  9.62348812e-01,
        2.16837167e-01,  9.00248409e-01,  1.00318905e+00,  1.30559367e-01,
        7.00183640e-01,  9.83949211e-01,  7.14567216e-01,  8.76227286e-02,
        1.90877867e-01,  4.49750397e-01,  6.29134602e-01,  9.78886342e-01,
        4.09987280e-01,  9.21869138e-01,  7.77234176e-01,  5.93719631e-01,
        6.98098136e-01,  2.82491764e-01,  8.62295309e-01,  8.72201801e-01,
        9.77653226e-01,  7.91939668e-01, -1.27440847e-03,  3.23025548e-01,
        2.87702037e-02,  2.68019356e-01,  3.83304041e-01,  4.27424087e-01,
        7.83301134e-01,  9.54082853e-01,  4.80763774e-01,  4.12040407e-01,
        8.74733623e-01,  5.71862600e-01,  9.64806405e-01, -7.29467891e-04,
        5.83439243e-01,  

In [None]:
# SAVE OUTPUT
# The export below is currently disabled. Uncommenting it creates ./output/ if it
# does not exist and writes total_preds to submission_CatBoost.csv as
# "id,prediction" rows, where id is the position of the prediction in total_preds.
# output_dir = './output/'
# write_path = os.path.join(output_dir, "submission_CatBoost.csv")
# if not os.path.exists(output_dir):
#     os.makedirs(output_dir)
# with open(write_path, 'w', encoding='utf8') as w:
#     print("writing prediction : {}".format(write_path))
#     w.write("id,prediction\n")
#     for id, p in enumerate(total_preds):
#         w.write('{},{}\n'.format(id,p))

writing prediction : ./output/submission_CatBoost.csv
