# CatBoost Baseline

In [1]:
# %pip install catboost

In [2]:
import os
import pandas as pd
import numpy as np
import random
from catboost import CatBoostRegressor as cbt
from catboost import Pool
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

In [3]:
# Folder that holds the competition CSV files; adjust it to your environment.
data_dir = '/data/ephemeral/level2-dkt-recsys-02/data/'
csv_file_path = os.path.join(data_dir, 'train_data_3PL+level.csv')

# Read the training interactions and discard the `user_level` column
# before any feature engineering is applied.
df = pd.read_csv(csv_file_path).drop(columns=['user_level'])
df

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,Dffclt,Dscrmn,Gussng
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,-2.017182,20.079513,0.052178
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,-1.723821,4.616495,0.056888
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,-0.167255,18.583456,0.754422
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,0.496282,39.877030,0.946875
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,-1.335100,6.965071,0.237969
...,...,...,...,...,...,...,...,...,...
2266581,7441,A030071005,A030000071,0,2020-06-05 06:50:21,438,0.767458,0.882364,0.123318
2266582,7441,A040165001,A040000165,1,2020-08-21 01:06:39,8836,-0.277564,5.384278,0.099105
2266583,7441,A040165002,A040000165,1,2020-08-21 01:06:50,8836,-0.267161,10.263590,0.035658
2266584,7441,A040165003,A040000165,1,2020-08-21 01:07:36,8836,-0.229779,1.516802,0.513883


In [4]:
def feature_engineering(df):
    """Derive user-, item- and time-based features from a raw interaction log.

    Parameters
    ----------
    df : pandas.DataFrame
        Interaction log with the columns ``userID``, ``assessmentItemID``,
        ``testId``, ``answerCode``, ``Timestamp`` and ``KnowledgeTag``.
        ``Timestamp`` must match the format ``%Y-%m-%d %H:%M:%S``.
        The frame passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame, sorted by ``userID`` then ``Timestamp`` (original index
        labels are kept), with all feature columns appended.

    Notes
    -----
    * The ``*_sum`` / ``*_mean`` columns, ``relative_answer_assessment`` and
      ``relative_answer_mean`` aggregate ``answerCode`` over every row of the
      group, including the row itself.
      NOTE(review): when such a column is used to predict that same row's
      ``answerCode`` it carries the label - confirm this is intended.
    * ``time_to_solve`` shifts and forward-fills over the whole column rather
      than per group, so values can be carried across test/user boundaries.
    """
    # Order every user's interactions chronologically. sort_values() returns a
    # new frame, so the caller's frame keeps its original row order.
    df = df.sort_values(by=['userID', 'Timestamp'])

    # Shrink the integer columns to save memory.
    dtype = {
        'userID': 'int16',
        'answerCode': 'int8',
        'KnowledgeTag': 'int16'
    }
    df = df.astype(dtype)

    # Parse 'Timestamp' into real datetimes so time differences can be taken.
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%Y-%m-%d %H:%M:%S')

    # testTag: the third character of testId, stored as an integer.
    df['testTag'] = df['testId'].str[2].astype('int16')

    # Number of correct answers the user gave *before* this row (0 on the first row).
    # The result is assigned back instead of calling fillna(inplace=True) on a
    # column selection, which does not update the frame under Copy-on-Write.
    df['user_correct_answer'] = (
        df.groupby('userID')['answerCode']
        .transform(lambda x: x.cumsum().shift(1))
        .fillna(0)
    )

    # Number of answers the user submitted before this row.
    df['user_total_answer'] = df.groupby('userID')['answerCode'].cumcount()

    # Running accuracy before this row; the first row (0 / 0) falls back to 0.75.
    df['user_acc'] = (df['user_correct_answer'] / df['user_total_answer']).fillna(0.75)

    # Total and mean of answerCode per user, item, test, knowledge tag and test tag
    # (computed over all rows of each group).
    for key, prefix in (
        ('userID', 'user'),
        ('assessmentItemID', 'assessment'),
        ('testId', 'test'),
        ('KnowledgeTag', 'knowledgeTag'),
        ('testTag', 'testTag'),
    ):
        answers = df.groupby(key)['answerCode']
        df[f'{prefix}_sum'] = answers.transform('sum')
        df[f'{prefix}_mean'] = answers.transform('mean')

    # Answer relative to the item's mean correctness.
    df['relative_answer_assessment'] = df['answerCode'] - df['assessment_mean']

    # Per-user mean of the relative answer - a proxy for the user's skill level.
    df['relative_answer_mean'] = df.groupby('userID')['relative_answer_assessment'].transform('mean')

    # Seconds until the next recorded interaction: the within-(user, test) time
    # difference is moved up by one row, then gaps are filled from the previous
    # row. ffill() replaces the deprecated fillna(method='ffill').
    df['time_to_solve'] = (
        df.groupby(['userID', 'testId'])['Timestamp']
        .diff()
        .dt.total_seconds()
        .shift(-1)
        .ffill()
    )

    # Mean solving time per (user, test).
    df['time_to_solve_mean'] = df.groupby(['userID', 'testId'])['time_to_solve'].transform('mean')

    # How many times the user has already seen this exact item. The count is
    # clipped to 0..255 so it would fit a uint8; the dtype itself is not changed.
    df['prior_assessment_frequency'] = df.groupby(['userID', 'assessmentItemID']).cumcount().clip(0, 255)

    # How many times the user has already answered an item with this knowledge tag.
    df['prior_KnowledgeTag_frequency'] = df.groupby(['userID', 'KnowledgeTag']).cumcount()

    # How many times the user has already answered an item with this test tag.
    df['prior_testTag_frequency'] = df.groupby(['userID', 'testTag']).cumcount()

    return df

In [5]:
# Run the feature pipeline on the training interactions; from here on `df`
# refers to the engineered frame returned by feature_engineering().
df = feature_engineering(df)

  df['time_to_solve'].fillna(method='ffill', inplace=True)


## Train/Test 데이터 셋 분리

In [6]:
random.seed(42)
def custom_train_test_split(df, ratio=0.7, split=True):
    """Split interactions into train/test so that no user appears in both.

    Users are shuffled with the global ``random`` module and added to the
    train set one by one until adding the next user would push the number of
    train rows above ``ratio * len(df)``. All remaining users form the test
    set, reduced to one row per user.

    Parameters
    ----------
    df : pandas.DataFrame
        Interaction log with a ``userID`` column. Rows of a user are expected
        to be in chronological order so that "last row" means "latest".
    ratio : float, default 0.7
        Upper bound on the fraction of rows that may go into the train set.
    split : bool, default True
        Unused; kept so existing calls keep working.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        ``train`` holds every row of the selected users; ``test`` holds the
        last row (by position in ``df``) of each remaining user.
    """
    # Rows per user, computed once and paired as (user_id, row_count).
    user_counts = df['userID'].value_counts()
    users = list(zip(user_counts.index, user_counts))
    random.shuffle(users)

    max_train_data_len = ratio * len(df)
    sum_of_train_data = 0
    user_ids = []

    for user_id, count in users:
        sum_of_train_data += count
        # Stop at the first user that would exceed the row budget.
        if max_train_data_len < sum_of_train_data:
            break
        user_ids.append(user_id)

    in_train = df['userID'].isin(user_ids)
    train = df[in_train]
    test = df[~in_train]

    # Keep only each held-out user's last interaction. groupby().tail(1)
    # yields exactly one row per user even when a user's rows are not
    # contiguous, and it preserves the original row order.
    test = test.groupby('userID').tail(1)
    return train, test

In [7]:
# List the columns available after feature engineering; the FEATS list in
# the next cell is chosen from these.
df.columns

Index(['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp',
       'KnowledgeTag', 'Dffclt', 'Dscrmn', 'Gussng', 'testTag',
       'user_correct_answer', 'user_total_answer', 'user_acc', 'user_sum',
       'user_mean', 'assessment_sum', 'assessment_mean', 'test_sum',
       'test_mean', 'knowledgeTag_sum', 'knowledgeTag_mean', 'testTag_sum',
       'testTag_mean', 'relative_answer_assessment', 'relative_answer_mean',
       'time_to_solve', 'time_to_solve_mean', 'prior_assessment_frequency',
       'prior_KnowledgeTag_frequency', 'prior_testTag_frequency'],
      dtype='object')

In [8]:
# Split by user: held-out users contribute only their last interaction.
train, test = custom_train_test_split(df)

# Columns fed to the model.
# Not used: userID, assessmentItemID, testId, answerCode, Timestamp, user_sum,
# assessment_sum/mean, test_sum/mean, knowledgeTag_sum/mean, testTag_sum/mean,
# relative_answer_assessment, prior_assessment_frequency,
# prior_KnowledgeTag_frequency.
# NOTE(review): user_mean and relative_answer_mean are aggregates of answerCode
# over all of a user's rows, so they include the label of the held-out row -
# confirm the validation score is not inflated by this.
FEATS = [
    'KnowledgeTag',
    'Dffclt',
    'Dscrmn',
    'Gussng',
    'testTag',
    'user_correct_answer',
    'user_total_answer',
    'user_acc',
    'user_mean',
    'relative_answer_mean',
    'time_to_solve',
    'time_to_solve_mean',
    'prior_testTag_frequency',
]

# Separate the target from the inputs for both splits.
y_train = train['answerCode']
train = train.drop(columns=['answerCode'])

y_test = test['answerCode']
test = test.drop(columns=['answerCode'])

In [13]:
# Categorical inputs are given by column name rather than by position in
# FEATS (these are the columns at positions 0 and 4), so reordering FEATS
# cannot silently mark the wrong columns as categorical.
CAT_FEATS = ['KnowledgeTag', 'testTag']
data_train = Pool(data=train[FEATS], label=y_train, cat_features=CAT_FEATS)
data_test = Pool(data=test[FEATS], label=y_test, cat_features=CAT_FEATS)

## 훈련 및 검증

In [14]:
# Model settings; AUC on the eval set is reported every 50 iterations.
params = dict(iterations=500, depth=8, learning_rate=0.05, eval_metric='AUC')
model = cbt(**params)
model.fit(data_train, eval_set=data_test, verbose=50)

# Score the held-out rows: AUC on the raw predictions, accuracy after
# thresholding them at 0.5.
preds = model.predict(test[FEATS])
hard_labels = (preds >= 0.5).astype(int)
auc = roc_auc_score(y_test, preds)
acc = accuracy_score(y_test, hard_labels)

print(f'VALID AUC : {auc} ACC : {acc}\n')

0:	test: 0.7561921	best: 0.7561921 (0)	total: 515ms	remaining: 4m 16s
50:	test: 0.7840049	best: 0.7840049 (50)	total: 19.6s	remaining: 2m 52s
100:	test: 0.7912642	best: 0.7912642 (100)	total: 37.3s	remaining: 2m 27s
150:	test: 0.7946320	best: 0.7946320 (150)	total: 55.5s	remaining: 2m 8s
200:	test: 0.7964680	best: 0.7964680 (200)	total: 1m 13s	remaining: 1m 48s
250:	test: 0.7977495	best: 0.7977495 (250)	total: 1m 31s	remaining: 1m 30s
300:	test: 0.7988605	best: 0.7988605 (300)	total: 1m 49s	remaining: 1m 12s
350:	test: 0.7994280	best: 0.7994329 (348)	total: 2m 7s	remaining: 54s
400:	test: 0.8000632	best: 0.8000632 (400)	total: 2m 25s	remaining: 36s
450:	test: 0.8005798	best: 0.8005798 (450)	total: 2m 43s	remaining: 17.7s
499:	test: 0.8008790	best: 0.8009219 (489)	total: 3m	remaining: 0us

bestTest = 0.8009218906
bestIteration = 489

Shrink model to first 490 iterations.
VALID AUC : 0.8009218906137493 ACC : 0.726457399103139



In [15]:
# Show the importance score the fitted model reports for each input column,
# in the same order as FEATS.
feature_importance = model.get_feature_importance()
print("Feature Importance:")
for feat_name, score in zip(FEATS, feature_importance):
    print(f"{feat_name}: {score}")

Feature Importance:
KnowledgeTag: 1.1678089120795294
Dffclt: 32.43888138112129
Dscrmn: 4.626252971956579
Gussng: 6.696824402388
testTag: 5.77825932499787
user_correct_answer: 1.2169726555491673
user_total_answer: 0.668994063488848
user_acc: 3.580148620004684
user_mean: 2.472197760939635
relative_answer_mean: 13.606148962335576
time_to_solve: 15.515624394494708
time_to_solve_mean: 10.286859771111818
prior_testTag_frequency: 1.9450267795322005


## Inference

In [16]:
# Read the test interactions. Unlike the training frame, `user_level` is not
# dropped here.
test_csv_file_path = os.path.join(data_dir, 'test_data_3PL+level.csv')
test_df = pd.read_csv(test_csv_file_path)

# Build the same features that were built for training.
test_df = feature_engineering(test_df)

# Keep only the final row of each run of identical userIDs (the interaction
# to predict), then remove the target column.
is_last_of_user = test_df['userID'].ne(test_df['userID'].shift(-1))
test_df = test_df[is_last_of_user].drop(columns=['answerCode'])

  df['time_to_solve'].fillna(method='ffill', inplace=True)


In [17]:
# MAKE PREDICTION
# Score the remaining test rows with the fitted model, selecting the same
# feature columns (FEATS) that were used for training.
total_preds = model.predict(test_df[FEATS])

In [18]:
# SAVE OUTPUT
output_dir = './output/'
write_path = os.path.join(output_dir, "submission_CatReg_FE.csv")
# exist_ok=True creates the folder when missing and is a no-op otherwise,
# which avoids the race between a separate existence check and the creation.
os.makedirs(output_dir, exist_ok=True)
with open(write_path, 'w', encoding='utf8') as out_file:
    print("writing prediction : {}".format(write_path))
    out_file.write("id,prediction\n")
    # The id column is the 0-based position of each prediction.
    for row_id, p in enumerate(total_preds):
        out_file.write('{},{}\n'.format(row_id, p))

writing prediction : ./output/submission_CatReg_FE.csv
