In [None]:
%load_ext nb_black
%load_ext lab_black

In [None]:
!jupyter nbextension enable --py widgetsnbextension

# CatBoost

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
import seaborn as sns
import os
import random

from catboost import CatBoostClassifier, Pool, metrics, cv
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

import eli5
from eli5.sklearn import PermutationImportance

import warnings

warnings.filterwarnings(action="ignore")  # 경고 출력 무시

## 1. 데이터로딩

In [None]:
# Location of the input data; adjust this path to match your environment.
data_dir = "/opt/ml/input/data/"
# The CSV has to be downloaded from the competition website beforehand.
csv_file_path = os.path.join(data_dir, "all_feature_data.csv")

# Load the table and order it by user, then by Timestamp, so that each user's
# rows form one contiguous sequence (the split cells rely on this ordering).
df = pd.read_csv(csv_file_path).sort_values(by=["userID", "Timestamp"])

In [None]:
# Preview the first rows of the loaded, sorted frame.
df.head()

## 2. Train/Test 데이터 셋 분리 (option1, option2에서 하나만 실행)

### Option 1
- train 데이터에서 train, valid set을 나눔

In [None]:
# Train and validation rows must be split per user so that no user ends up in both sets.
random.seed(42)


def option1_train_test_split(df, ratio=0.8, split=True):
    """Split the ``dataset == 1`` rows of ``df`` into train/validation sets by user.

    Users are shuffled with the module-level ``random`` generator and added to
    the training set one at a time. As soon as adding a user would push the
    number of training rows above ``ratio * len(rows)``, that user and every
    user after it in the shuffled order go to the validation side instead.
    Only the last row of each validation user is kept.

    Args:
        df: DataFrame with at least ``dataset`` and ``userID`` columns. Rows of
            the same user must be contiguous (e.g. sorted by ``userID``),
            because the last row per user is found by comparing each row's
            ``userID`` with the next row's.
        ratio: Upper bound on the fraction of rows assigned to training.
        split: Unused; kept so existing callers keep working.

    Returns:
        ``(train, test)`` where ``train`` holds every row of the training
        users and ``test`` holds one row (the last) per validation user.
    """
    df = df[df.dataset == 1]

    # Count rows per user once and reuse the result for ids and counts.
    user_counts = df["userID"].value_counts()
    users = list(zip(user_counts.index, user_counts))
    random.shuffle(users)

    max_train_data_len = ratio * len(df)
    sum_of_train_data = 0
    user_ids = []

    for user_id, count in users:
        sum_of_train_data += count
        if max_train_data_len < sum_of_train_data:
            break
        user_ids.append(user_id)

    # One membership mask, negated with ~ for the validation side.
    in_train = df["userID"].isin(user_ids)
    train = df[in_train]
    test = df[~in_train]

    # Keep only each validation user's last interaction: a row is "last" when
    # the following row belongs to a different user (or does not exist).
    test = test[test["userID"] != test["userID"].shift(-1)]
    return train, test

### Option 2
- train 데이터를 모두 훈련에 사용
- valid는 test셋의 뒤에서 두 번째 데이터로 진행

In [None]:
def option2_train_test_split(df, exclude_unlabeled=False):
    """Use all ``dataset == 1`` rows for training and ``dataset == 2`` rows for validation.

    Args:
        df: DataFrame with ``dataset``, ``userID`` and ``answerCode`` columns.
            Rows of the same user must be contiguous (e.g. sorted by
            ``userID``), because the last row per user is found by comparing
            each row's ``userID`` with the next row's.
        exclude_unlabeled: When True, rows whose ``answerCode`` is -1 are
            dropped before the last row per user is taken, so validation uses
            the last row that has a real answer. Defaults to False, which keeps
            the -1 rows exactly as before.
            NOTE(review): -1 presumably marks the rows to be predicted - confirm.

    Returns:
        ``(train, test)`` where ``test`` holds one row (the last remaining one)
        per ``dataset == 2`` user.
    """
    # Training uses the train dataset only.
    train = df[df.dataset == 1]

    # Validation uses the test dataset only.
    test = df[df.dataset == 2]
    if exclude_unlabeled:
        test = test[test.answerCode != -1]

    # Keep only each user's last interaction: a row is "last" when the
    # following row belongs to a different user (or does not exist).
    test = test[test["userID"] != test["userID"].shift(-1)]

    return train, test

## 3. Feature Engineering

In [None]:
def feature_engineering(df, option="option1"):
    """Convert the categorical feature columns of ``df`` to pandas ``category`` dtype.

    Every column listed in ``categories`` that does not already have an integer
    dtype is first label-encoded to integer codes with scikit-learn's
    ``LabelEncoder``; all listed columns are then cast to ``category``.

    Args:
        df: DataFrame containing every column listed in ``categories``.
            It is modified in place.
        option: Unused; kept so existing callers keep working.

    Returns:
        The same ``df`` object, with the listed columns converted.
    """
    # Columns treated as categorical features.
    categories = [
        "assessmentItemID",
        "testId",
        "KnowledgeTag",
        "bigClassAccCate",
        "bigClass",
        "KTAccuracyCate",
        "day",
        "month",
        "year",
        "wday",
        "weekNum",
        "hour",
        "elapsedTimeClass",
        "tagCluster",
        "testLV",
        "userLVbyTest",
        "userLVbyTestAVG",
        "tagLV",
        "userLVbyTag",
        "userLVbyTagAVG",
        "tagClass",
    ]

    le = preprocessing.LabelEncoder()

    for category in categories:
        # Test the dtype kind instead of comparing against the string "int":
        # that string only matches the platform's default integer width, so
        # e.g. int32 columns would be re-encoded for no reason.
        if not pd.api.types.is_integer_dtype(df[category]):
            # float / str values -> integer codes
            df[category] = le.fit_transform(df[category])
        df[category] = df[category].astype("category")

    return df

In [None]:
# Convert the categorical columns of the full frame (train and test rows together).
df = feature_engineering(df)

In [None]:
# Build the train/validation sets with the Option 1 (user-level) split,
# then preview the training rows.
train, valid = option1_train_test_split(df)
train.head()

## 4. 데이터셋 정의

In [None]:
# Separate the target column (answerCode) from the feature columns.
y_train, y_valid = train["answerCode"], valid["answerCode"]

train = train.drop(columns=["answerCode"])
valid = valid.drop(columns=["answerCode"])

In [None]:
# TODO: choose the features to use (commented-out entries are currently disabled)
FEATS = [
    "assessmentItemID",
    # "testId",
    # "KnowledgeTag",
    "accuracy",
    # "user_total_answer",
    "testMean",
    # "testSum",
    "testStd",
    "tagMean",
    # "tagSum",
    "tagStd",
    # "assessSum",
    "assessMean",
    "assessStd",
    # -- custom engineered features start here
    # "bigClass",
    "bigClassAcc",
    "bigClassElapsedTimeAvg",
    "bigClassAccCate",
    # "recAccuracy",
    "cumAccuracy",
    # "cumCorrect",
    # "day",
    "month",
    # "year",
    # "wday",
    # "weekNum",
    # "hour",
    "elapsedTime",
    "elapsedTimeClass",
    # "KnowledgeTagAcc",
    # "KTAccuracyCate",
    # "seenCount",
    "tagCluster",
    # "tagCount",
    # "testLV",
    # "userLVbyTest",
    "userLVbyTestAVG",
    # "tagLV",
    # "userLVbyTag",
    "userLVbyTagAVG",
    # "bigClassCount",
    "recCount",
    "elo",
    "eloTest",
    "eloTag",
    "tagClass",
    "GradeAcc",
    "RepeatedTime",
]

In [None]:
# Partition the selected features by dtype: pandas "category" columns
# versus everything else.
feat_dtypes = train[FEATS].dtypes
is_categorical = feat_dtypes == "category"

cat_cols = feat_dtypes.index[is_categorical].to_list()
num_cols = feat_dtypes.index[~is_categorical].to_list()

print(f"cat_cols: {cat_cols}")
print(f"num_cols: {num_cols}")

## 5. 훈련 및 검증

In [None]:
# Wrap the train/validation features and labels in CatBoost Pool objects,
# telling CatBoost which of the selected columns are categorical.
train_pool = Pool(data=train[FEATS], label=y_train, cat_features=cat_cols)
eval_pool = Pool(data=valid[FEATS], label=y_valid, cat_features=cat_cols)

In [None]:
# CatBoost hyper-parameters. The categorical columns are passed here rather
# than to fit(). Add task_type="GPU" to this dict to train on a GPU.
params = dict(
    iterations=1500,
    learning_rate=0.1,
    eval_metric="AUC",
    random_seed=42,
    logging_level="Silent",
    early_stopping_rounds=50,
    use_best_model=True,
    bagging_temperature=1,
    cat_features=cat_cols,
)
model = CatBoostClassifier(**params)

# Fit on the training users and monitor the validation set; plot=True draws
# the interactive learning curve. Pass logging_level="Verbose" to fit() for
# text output instead.
validation_set = (valid[FEATS], y_valid)
model.fit(train[FEATS], y_train, eval_set=[validation_set], plot=True)

# Score the validation rows: probability of class 1, thresholded at 0.5 for accuracy.
preds = model.predict_proba(valid[FEATS])[:, 1]
predicted_labels = (preds >= 0.5).astype(int)
acc = accuracy_score(y_valid, predicted_labels)
auc = roc_auc_score(y_valid, preds)

print(f"VALID AUC : {auc} ACC : {acc}\n")

### Permutation Importance 출력

In [None]:
# Permutation importance of each feature on the validation set, scored by
# ROC AUC with a single shuffle per feature; refit=False keeps the model
# trained above instead of fitting a new one.
perm = PermutationImportance(
    model, scoring="roc_auc", n_iter=1, random_state=42, cv=None, refit=False
)
perm.fit(valid[FEATS], y_valid)

# Show every selected feature in the weights table.
eli5.show_weights(perm, top=len(FEATS), feature_names=FEATS)

### Feature Importance 출력

In [None]:
# Horizontal bar chart of the model's built-in feature importances,
# ordered by ascending importance.
feature_importance = model.feature_importances_
sorted_idx = np.argsort(feature_importance)
bar_positions = range(len(sorted_idx))

fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(bar_positions, feature_importance[sorted_idx], align="center")
ax.set_yticks(bar_positions)
ax.set_yticklabels(np.array(FEATS)[sorted_idx])
ax.set_title("Feature Importance")

## 6. Inference

### 6-1. Inference by test [-2] (test dataset 뒤에서 두번째 값으로 성능 측정)

In [None]:
# Evaluate on the test-dataset users, using only rows whose answerCode is not -1.
is_labeled_test = (df.dataset == 2) & (df.answerCode != -1)
test = df[is_labeled_test]

# Keep each user's last remaining interaction: a row is "last" when the
# following row belongs to a different user (or does not exist).
is_last_row = test["userID"] != test["userID"].shift(-1)
test = test[is_last_row]

y_test = test["answerCode"]
test = test.drop(columns=["answerCode"])

# Probability of class 1, thresholded at 0.5 for accuracy.
preds = model.predict_proba(test[FEATS])[:, 1]
acc = accuracy_score(y_test, (preds >= 0.5).astype(int))
auc = roc_auc_score(y_test, preds)

print(f"VALID AUC : {auc} ACC : {acc}")

### 6-2. 제출

In [None]:
# Submission inputs: the last interaction of every test-dataset user,
# with the answerCode column removed.
test_df = df[df.dataset == 2]
next_user = test_df["userID"].shift(-1)
test_df = test_df[test_df["userID"] != next_user].drop(columns=["answerCode"])

# Predicted probability of class 1 for each of those rows.
total_preds = model.predict_proba(test_df[FEATS])[:, 1]

In [None]:
# Write the predictions as a two-column CSV: running row number, predicted probability.
output_dir = "output/"
write_path = os.path.join(output_dir, "CatBoost_submission_(8411).csv")
# exist_ok=True creates the directory if needed without a separate
# check-then-create step.
os.makedirs(output_dir, exist_ok=True)
with open(write_path, "w", encoding="utf8") as w:
    print("writing prediction : {}".format(write_path))
    w.write("id,prediction\n")
    # Named row_id rather than id so the builtin id() is not shadowed in
    # the notebook namespace after this cell runs.
    for row_id, p in enumerate(total_preds):
        w.write("{},{}\n".format(row_id, p))