### 데이터 전처리

In [None]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from dataset import feature_engineering, custom_train_test_split, make_dataset

# Explicit narrow integer dtypes for the ID/label columns, passed to read_csv
# (also reused by the later cells that load the validation and test CSVs).
dtype = dict(userID='int16', answerCode='int8', KnowledgeTag='int16')

# Read cv_train_data.csv with 'Timestamp' parsed as datetime, then run the
# project's feature_engineering helper (imported from `dataset`) on the frame.
train_data = feature_engineering(
    pd.read_csv(
        '/opt/ml/input/data/cv_train_data.csv',
        dtype=dtype,
        parse_dates=['Timestamp'],
    )
)

### 모델


In [None]:
from sklearn.metrics import accuracy_score

# Split the engineered frame and build feature matrices / targets using the
# project helpers imported from `dataset`. FEATS is reused by the prediction
# cells to select the model's input columns.
train, valid = custom_train_test_split(train_data)
FEATS, y_train, x_train, y_valid, x_valid = make_dataset(train, valid)

# Both pools declare the same columns as categorical features.
categorical_columns = ['assessmentItemID', 'testId']
train_pool = Pool(x_train, y_train, cat_features=categorical_columns)
eval_pool = Pool(x_valid, y_valid, cat_features=categorical_columns)

# Hyper-parameters are collected in one mapping and unpacked into the
# classifier constructor.
model_params = {
    'iterations': 300,
    'random_seed': 42,
    'learning_rate': 0.001,
    # Logloss is already the default for CatBoostClassifier (a
    # CatBoostRegressor would default to RMSE); it is spelled out on purpose.
    'loss_function': 'Logloss',
    'custom_metric': ['Logloss', 'AUC'],
    'early_stopping_rounds': 30,
    'use_best_model': True,
    'task_type': "GPU",
    'bagging_temperature': 1,
    'verbose': False,
}
model = CatBoostClassifier(**model_params)

# Train against the validation pool (used for early stopping / best-model
# selection per the parameters above). Optional extra: save_snapshot=True.
model.fit(train_pool, eval_set=eval_pool, plot=True)

### Valid 추출

In [None]:
# `os` is used below for path handling but is not imported anywhere else in
# the notebook, so it is imported in this cell.
import os

# Load cv_valid_data.csv and apply the same feature engineering as for training.
cv_valid_data = pd.read_csv('/opt/ml/input/data/cv_valid_data.csv', dtype=dtype, parse_dates=['Timestamp'])
test_data = feature_engineering(cv_valid_data)

# Keep only each user's last interaction: a row is kept when the following
# row has a different userID (assumes rows are grouped by userID — TODO confirm).
test_data = test_data[test_data['userID'] != test_data['userID'].shift(-1)]
test_data = test_data.drop(['answerCode'], axis=1)

# Predict on the training feature columns only; column index 1 of the
# probability output is taken as the prediction (presumably P(answerCode == 1)).
preds = model.predict(test_data[FEATS], prediction_type='Probability')[:, 1]

output_dir = 'output/'
write_path = os.path.join(output_dir, "catboost_valid.csv")
# exist_ok=True replaces the separate exists() check before creating the directory.
os.makedirs(output_dir, exist_ok=True)
with open(write_path, 'w', encoding='utf8') as w:
    print("writing prediction : {}".format(write_path))
    w.write("id,prediction\n")
    # `row_id` instead of `id` so the builtin is not shadowed at module level.
    w.writelines('{},{}\n'.format(row_id, p) for row_id, p in enumerate(preds))

        

### Test 추출

In [None]:
# `os` is used below for path handling but is not imported anywhere else in
# the notebook, so it is imported in this cell.
import os

# Load test_data.csv and apply the same feature engineering as for training.
test_data = pd.read_csv('/opt/ml/input/data/test_data.csv', dtype=dtype, parse_dates=['Timestamp'])
test_data = feature_engineering(test_data)

# Keep only each user's last interaction: a row is kept when the following
# row has a different userID (assumes rows are grouped by userID — TODO confirm).
test_data = test_data[test_data['userID'] != test_data['userID'].shift(-1)]
test_data = test_data.drop(['answerCode'], axis=1)

# Select the FEATS columns before predicting, exactly as the validation cell
# does; passing the whole engineered frame would hand the model columns it
# was not trained on. Column index 1 of the probability output is taken as
# the prediction (presumably P(answerCode == 1)).
preds = model.predict(test_data[FEATS], prediction_type='Probability')[:, 1]

output_dir = 'output/'
write_path = os.path.join(output_dir, "catboost_test.csv")

# exist_ok=True replaces the separate exists() check before creating the directory.
os.makedirs(output_dir, exist_ok=True)
with open(write_path, 'w', encoding='utf8') as w:
    print("writing prediction : {}".format(write_path))
    w.write("id,prediction\n")
    # `row_id` instead of `id` so the builtin is not shadowed at module level.
    w.writelines('{},{}\n'.format(row_id, p) for row_id, p in enumerate(preds))