# CatBoost를 활용한 베이스라인

In [2]:
import pandas as pd
import os
import random
import numpy as np

## 1. 데이터 로딩

In [3]:
data_dir = '/opt/ml/input/data' # Adjust this path to match your environment.
csv_file_path = os.path.join(data_dir, 'train_data.csv') # Download the data from the competition homepage.

In [4]:
# Load the raw training interactions; the bare expression on the last line
# makes the notebook display the frame as the cell output.
raw_data = pd.read_csv(csv_file_path) 
raw_data

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225
...,...,...,...,...,...,...
2266581,7441,A030071005,A030000071,0,2020-06-05 06:50:21,438
2266582,7441,A040165001,A040000165,1,2020-08-21 01:06:39,8836
2266583,7441,A040165002,A040000165,1,2020-08-21 01:06:50,8836
2266584,7441,A040165003,A040000165,1,2020-08-21 01:07:36,8836


## 2. Feature Engineering

In [7]:
import datetime

def feature_engineering(df, default_solve_time=60.0):
    """Derive per-interaction features from a raw interaction frame.

    The input must provide the columns ``userID``, ``assessmentItemID``,
    ``testId``, ``answerCode``, ``Timestamp`` (``YYYY-MM-DD HH:MM:SS``
    strings) and ``KnowledgeTag``.  The caller's frame is not modified; a
    new frame is returned.

    Added columns:

    * ``user_correct_answer`` / ``user_total_answer`` / ``user_acc`` -
      per-user running count of correct answers, answered items and their
      ratio, each computed from the rows *before* the current one (the
      first row of every user therefore has NaN / 0 / NaN).
    * ``test_mean`` / ``test_sum`` and ``tag_mean`` / ``tag_sum`` - mean and
      sum of ``answerCode`` over every row of ``df`` sharing the same
      ``testId`` / ``KnowledgeTag`` (the current row included).
    * ``solveTime`` - seconds until the same user's next interaction, when
      that interaction is on the same ``testId`` or has a different
      timestamp on the same calendar day; ``default_solve_time`` otherwise
      (last row of a user, or a gap that cannot be measured).
    * ``testType`` / ``testID`` / ``questionID`` - the numeric pieces of
      ``assessmentItemID`` at character positions ``[2]``, ``[4:7]`` and
      ``[8:]``, stored as floats.

    Args:
        df: Raw interactions, one row per answered item.
        default_solve_time: Value used for ``solveTime`` when no elapsed
            time can be measured.

    Returns:
        A new DataFrame sorted by ``userID`` then ``Timestamp`` with the
        columns above added.  ``testType``, ``testID`` and ``questionID``
        are inserted at positions 1-3, ``solveTime`` ends up right after
        ``testId``, and the remaining new columns are appended at the end.
    """
    # Sort so that each user's rows are contiguous and chronological.
    # sort_values returns a new frame, so the caller's data stays untouched.
    df = df.sort_values(by=['userID', 'Timestamp'])

    # Running per-user totals.  shift(1) keeps the current row's own answer
    # out of its features, which is why a user's first row is NaN.
    df['user_correct_answer'] = df.groupby('userID')['answerCode'].transform(
        lambda answers: answers.cumsum().shift(1)
    )
    df['user_total_answer'] = df.groupby('userID')['answerCode'].cumcount()
    df['user_acc'] = df['user_correct_answer'] / df['user_total_answer']

    # Whole-frame correctness statistics per test and per knowledge tag.
    correct_t = df.groupby(['testId'])['answerCode'].agg(['mean', 'sum'])
    correct_t.columns = ["test_mean", 'test_sum']
    correct_k = df.groupby(['KnowledgeTag'])['answerCode'].agg(['mean', 'sum'])
    correct_k.columns = ["tag_mean", 'tag_sum']

    df = pd.merge(df, correct_t, on=['testId'], how="left")
    df = pd.merge(df, correct_k, on=['KnowledgeTag'], how="left")

    # Solve time = start of the next item - start of the current item.
    # Each row is compared with the row directly below it, which is the
    # user's next interaction thanks to the sort above.
    timestamps = pd.to_datetime(
        df['Timestamp'].astype(str).str.strip(), format='%Y-%m-%d %H:%M:%S'
    )
    upcoming = timestamps.shift(-1)
    same_user = df['userID'].eq(df['userID'].shift(-1))
    same_test = df['testId'].eq(df['testId'].shift(-1))
    later_same_day = timestamps.ne(upcoming) & timestamps.dt.normalize().eq(
        upcoming.dt.normalize()
    )
    measurable = same_user & (same_test | later_same_day)
    elapsed = (upcoming - timestamps).dt.total_seconds()
    # Rows without a measurable gap (including the very last row of the
    # frame) receive the default.
    # NOTE: capping solveTime at 150 s, dropping zero-second rows and
    # per-test / per-tag mean solve times were prototyped earlier but are
    # intentionally not applied here.
    df.insert(3, "solveTime", elapsed.where(measurable, default_solve_time).to_numpy())

    # Split assessmentItemID (e.g. "A060001001") into its numeric parts.
    # They are kept as floats so the produced columns keep their dtype.
    item_ids = df['assessmentItemID']
    df.insert(1, "testType", pd.to_numeric(item_ids.str[2]).astype(float).to_numpy())
    df.insert(2, "testID", pd.to_numeric(item_ids.str[4:7]).astype(float).to_numpy())
    df.insert(3, "questionID", pd.to_numeric(item_ids.str[8:]).astype(float).to_numpy())

    return df

In [8]:
# Read the training CSV again and run it through feature_engineering; the
# bare `df` on the last line displays the engineered frame as the cell output.
df = pd.read_csv(csv_file_path) 
df = feature_engineering(df)
#df = feature_engineering(df).drop(columns=['Timestamp'])
df

Unnamed: 0,userID,testType,testID,questionID,assessmentItemID,testId,solveTime,answerCode,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,test_mean,test_sum,tag_mean,tag_sum
0,0,6.0,1.0,1.0,A060001001,A060000001,3.0,1,2020-03-24 00:17:11,7224,,0,,0.947683,1268,0.955022,637
1,0,6.0,1.0,2.0,A060001002,A060000001,8.0,1,2020-03-24 00:17:14,7225,1.0,1,1.000000,0.947683,1268,0.913187,3040
2,0,6.0,1.0,3.0,A060001003,A060000001,7.0,1,2020-03-24 00:17:22,7225,2.0,2,1.000000,0.947683,1268,0.913187,3040
3,0,6.0,1.0,4.0,A060001004,A060000001,7.0,1,2020-03-24 00:17:29,7225,3.0,3,1.000000,0.947683,1268,0.913187,3040
4,0,6.0,1.0,5.0,A060001005,A060000001,11.0,1,2020-03-24 00:17:36,7225,4.0,4,1.000000,0.947683,1268,0.913187,3040
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2266581,7441,3.0,71.0,5.0,A030071005,A030000071,60.0,0,2020-06-05 06:50:21,438,1.0,4,0.250000,0.662590,921,0.689706,2814
2266582,7441,4.0,165.0,1.0,A040165001,A040000165,11.0,1,2020-08-21 01:06:39,8836,1.0,5,0.200000,0.655109,718,0.697874,2199
2266583,7441,4.0,165.0,2.0,A040165002,A040000165,46.0,1,2020-08-21 01:06:50,8836,2.0,6,0.333333,0.655109,718,0.697874,2199
2266584,7441,4.0,165.0,3.0,A040165003,A040000165,73.0,1,2020-08-21 01:07:36,8836,3.0,7,0.428571,0.655109,718,0.697874,2199


In [8]:
# col = list(df.columns)
# col[-2] = 'test_mean_solveTime'
# col[-1] = 'tag_mean_solveTime'
# df.columns = col 
# df

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,test_mean,test_sum,test_mean_solveTime,tag_mean_solveTime
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,,0,,0.947683,1268,0.955022,637
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,1.0,1,1.000000,0.947683,1268,0.913187,3040
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,2.0,2,1.000000,0.947683,1268,0.913187,3040
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,3.0,3,1.000000,0.947683,1268,0.913187,3040
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,4.0,4,1.000000,0.947683,1268,0.913187,3040
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2266581,7441,A030071005,A030000071,0,2020-06-05 06:50:21,438,1.0,4,0.250000,0.662590,921,0.689706,2814
2266582,7441,A040165001,A040000165,1,2020-08-21 01:06:39,8836,1.0,5,0.200000,0.655109,718,0.697874,2199
2266583,7441,A040165002,A040000165,1,2020-08-21 01:06:50,8836,2.0,6,0.333333,0.655109,718,0.697874,2199
2266584,7441,A040165003,A040000165,1,2020-08-21 01:07:36,8836,3.0,7,0.428571,0.655109,718,0.697874,2199


## 3. Train/Test 데이터 셋 분리

In [9]:
# Train and test sets must be split per user, so that all rows of one user
# land on the same side.
random.seed(42)
def custom_train_test_split(df, ratio=0.7, split=True):
    """Split ``df`` into train / test by whole users.

    Users are shuffled with the module-level ``random`` generator and added
    to the train side one by one until the next user would push the train
    row count above ``ratio * len(df)``.  All remaining users form the test
    side, of which only each user's last row is kept (a row counts as last
    when the following row has a different ``userID``).

    ``split`` is accepted but not used.

    Returns:
        ``(train, test)`` DataFrames.
    """
    rows_per_user = df['userID'].value_counts()
    shuffled = list(rows_per_user.items())
    random.shuffle(shuffled)

    row_budget = ratio * len(df)
    rows_taken = 0
    train_users = []
    for uid, n_rows in shuffled:
        rows_taken += n_rows
        if rows_taken > row_budget:
            break
        train_users.append(uid)

    in_train = df['userID'].isin(train_users)
    train = df[in_train]
    holdout = df[~in_train]

    # Keep only the last interaction of every held-out user.
    last_of_user = holdout['userID'].ne(holdout['userID'].shift(-1))
    return train, holdout[last_of_user]

In [20]:
# Split by user
train, test = custom_train_test_split(df)

# Feature columns fed to the model
FEATS = [
    'KnowledgeTag',
    'user_correct_answer',
    'user_total_answer',
    'user_acc',
    'test_mean',
    'test_sum',
    'tag_mean',
    'tag_sum',
]  # currently left out: 'userID', 'testType', 'testID', 'questionID'

# Separate the target (y) from the feature frames (X)
y_train, y_test = train['answerCode'], test['answerCode']
train = train.drop(columns=['answerCode'])
test = test.drop(columns=['answerCode'])

In [21]:
#!pip install catboost

In [22]:
from catboost import CatBoostClassifier,  Pool
from catboost import CatBoostRegressor
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import numpy as np

In [23]:
# Wrap the feature frames and their targets in CatBoost pools.
cb_train = Pool(data=train[FEATS], label=y_train)
cb_test = Pool(data=test[FEATS], label=y_test)

## 4. 훈련 및 검증

In [48]:
# Fit a shallow CatBoost regressor (RMSE loss, 20 iterations, depth 2) on the
# selected features, then predict on the held-out pool.
# NOTE(review): answerCode appears to be 0/1 in the sample rows, yet a
# regressor is trained, so preds are presumably not bounded to [0, 1] —
# confirm this is intended.
model = CatBoostRegressor(iterations=20,
                           depth=2,
                           learning_rate=1,
                           loss_function='RMSE',
                           verbose=True)

# Training uses the raw frame and target directly; cb_train is not used here.
model.fit(train[FEATS], y_train)
preds = model.predict(cb_test)
# Validation metrics are currently disabled, so preds is computed but not scored.
# acc = accuracy_score(y_test, np.where(preds >= 0.5, 1, 0))
# auc = roc_auc_score(y_test, preds)

# print(f'VALID AUC : {auc} ACC : {acc}\n')

0:	learn: 0.4487468	total: 27.2ms	remaining: 27.2ms
1:	learn: 0.4454708	total: 53.7ms	remaining: 0us


In [49]:
# INSTALL MATPLOTLIB IN ADVANCE
# _ = lgb.plot_importance(model)

## 5. Inference

In [50]:
# LOAD TESTDATA
test_csv_file_path = os.path.join(data_dir, 'test_data.csv')

# FEATURE ENGINEERING (same pipeline as the training frame)
test_df = feature_engineering(pd.read_csv(test_csv_file_path))

# LEAVE LAST INTERACTION ONLY (rows whose successor belongs to another user),
# THEN DROP ANSWERCODE
test_df = test_df[test_df['userID'].ne(test_df['userID'].shift(-1))]
test_df = test_df.drop(columns=['answerCode'])

In [51]:
# MAKE PREDICTION
# Predict on the FEATS columns of the remaining test rows (test_df was reduced
# to each user's last interaction in the previous cell).
total_preds = model.predict(test_df[FEATS])

In [52]:
# SAVE OUTPUT
# Writes one "id,prediction" line per entry of total_preds, where id is the
# 0-based position of the prediction.
output_dir = 'output/'
write_path = os.path.join(output_dir, "submission_cb.csv")
# exist_ok=True creates the directory when missing and is a no-op otherwise,
# without a separate existence check that could race with another process.
os.makedirs(output_dir, exist_ok=True)
with open(write_path, 'w', encoding='utf8') as w:
    print("writing prediction : {}".format(write_path))
    w.write("id,prediction\n")
    # row_id instead of `id` so the builtin id() is not shadowed.
    w.writelines(f"{row_id},{pred}\n" for row_id, pred in enumerate(total_preds))

writing prediction : output/submission_cb.csv
