In [34]:
%load_ext nb_black
%load_ext lab_black

The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black
The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


<IPython.core.display.Javascript object>

In [35]:
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


<IPython.core.display.Javascript object>

# CatBoost

In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
import seaborn as sns
import os
import random

from catboost import CatBoostClassifier, Pool, metrics, cv
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

import eli5
from eli5.sklearn import PermutationImportance

import warnings

# Silence every warning for the rest of the session. Note that this also hides
# warnings raised by pandas and other libraries that may point at real problems.
warnings.filterwarnings(action="ignore")  # suppress warning output

<IPython.core.display.Javascript object>

## 1. 데이터로딩

In [37]:
# Location of the input data -- adjust this path to your environment.
data_dir = "/opt/ml/input/data/"
# The feature file has to be obtained from the competition page beforehand.
csv_file_path = os.path.join(data_dir, "all_feature_data.csv")

# Read the file and order the rows by user, then by timestamp, so that each
# user's interactions form one contiguous, ordered sequence.
df = pd.read_csv(csv_file_path).sort_values(by=["userID", "Timestamp"])

<IPython.core.display.Javascript object>

## 2. Train/Test 데이터 셋 분리 (option1, option2에서 하나만 실행)

### Option 1
- train 데이터에서 train, valid set을 나눔

In [5]:
# The train and test sets must be separated per user (a user's rows stay together).
# Seed the stdlib RNG that is used to shuffle the users during the split.
random.seed(42)


def option1_train_test_split(df, ratio=0.8, split=True):
    """Split ``df`` into train and validation parts that share no user.

    Users are shuffled with the module-level ``random`` generator (seed it
    beforehand for a reproducible split) and assigned to the training side
    one at a time.  Assignment stops at the first user whose rows would push
    the training row count above ``ratio * len(df)``; that user and every
    user after it form the validation side, of which only the last row per
    user is returned.

    Args:
        df: DataFrame with a ``userID`` column.  Rows of the same user are
            expected to be contiguous (e.g. sorted by ``userID``); otherwise
            the "last row" filter keeps the last row of every contiguous run.
        ratio: Upper bound on the fraction of rows that may go to train.
        split: Unused; kept only so existing callers keep working.

    Returns:
        tuple: ``(train, test)`` DataFrames.
    """
    # Number of rows per user, computed once.
    rows_per_user = df["userID"].value_counts()
    users = list(zip(rows_per_user.index, rows_per_user))
    random.shuffle(users)

    max_train_data_len = ratio * len(df)
    sum_of_train_data = 0
    user_ids = []

    for user_id, count in users:
        sum_of_train_data += count
        if max_train_data_len < sum_of_train_data:
            break
        user_ids.append(user_id)

    # Build the membership mask once and negate it for the validation side.
    is_train_user = df["userID"].isin(user_ids)
    train = df[is_train_user]
    test = df[~is_train_user]

    # Keep only the last interaction of each validation user: a row is the
    # last one when the following row belongs to a different user.
    test = test[test["userID"] != test["userID"].shift(-1)]
    return train, test

<IPython.core.display.Javascript object>

### Option 2
- train 데이터를 모두 훈련에 사용
- test 셋의 뒤에서 두 번째 데이터를 valid로 사용

In [38]:
def option2_train_test_split(df):
    """Train on the train dataset and validate on labelled test-dataset rows.

    Rows with ``dataset == 1`` form the training set.  Rows with
    ``dataset == 2`` whose ``answerCode`` is not ``-1`` are the validation
    candidates; from those, only the final row of each consecutive
    ``userID`` run is kept.

    Returns:
        tuple: ``(train, test)`` DataFrames.
    """
    from_train_dataset = df["dataset"].eq(1)
    labelled_test_rows = df["dataset"].eq(2) & df["answerCode"].ne(-1)

    train = df.loc[from_train_dataset]
    candidates = df.loc[labelled_test_rows]

    # A row is a user's last interaction when the next row has another userID.
    following_user = candidates["userID"].shift(-1)
    return train, candidates.loc[candidates["userID"].ne(following_user)]

<IPython.core.display.Javascript object>

## 3. Feature Engineering

In [39]:
def feature_engineering(df, option="option1"):
    """Cast the categorical feature columns to ``category`` and split the data.

    Non-integer categorical columns are label-encoded before the cast;
    integer columns are cast as they are.  The input frame is never modified:
    all writes happen on copies.

    Args:
        df: DataFrame containing every column listed in ``categories`` below
            (the split options additionally read ``dataset``, ``userID`` and,
            for ``"option2"``, ``answerCode``).
        option: Which flow to run:
            ``"option1"`` - keep ``dataset == 1`` rows and split them by user
            with ``option1_train_test_split``;
            ``"option2"`` - split with ``option2_train_test_split``;
            ``"total"`` - no split, convert the whole frame.

    Returns:
        ``(train, valid)`` for ``"option1"`` / ``"option2"``, or a single
        converted DataFrame for ``"total"``.

    Raises:
        ValueError: if ``option`` is not one of the values listed above.
    """
    if option not in ("total", "option1", "option2"):
        raise ValueError(
            "option must be 'total', 'option1' or 'option2', got {!r}".format(option)
        )

    # Categorical features
    categories = [
        "assessmentItemID",
        "testId",
        "KnowledgeTag",
        "bigClassAccCate",
        "bigClass",
        "KTAccuracyCate",
        "day",
        "month",
        "year",
        "wday",
        "weekNum",
        "hour",
        "elapsedTimeClass",
        "tagCluster",
        "testLV",
        "userLVbyTest",
        "userLVbyTestAVG",
        "tagLV",
        "userLVbyTag",
        "userLVbyTagAVG",
    ]

    def _needs_encoding(column):
        # Float / string / other non-integer columns are label-encoded first.
        # is_integer_dtype does not depend on the platform's default int width.
        return not pd.api.types.is_integer_dtype(column)

    if option == "total":
        # Work on a copy so that a slice passed in by the caller is not
        # written to (chained assignment) and the caller's frame stays intact.
        df = df.copy()
        for category in categories:
            if _needs_encoding(df[category]):
                # NOTE(review): the encoder is fitted on this frame alone, so
                # the resulting codes need not match the codes produced by the
                # train/valid flow below -- confirm before relying on this
                # path for non-integer columns.
                df[category] = preprocessing.LabelEncoder().fit_transform(df[category])
            df[category] = df[category].astype("category")

        return df

    if option == "option1":
        # Split by user, using the train dataset only.
        train, valid = option1_train_test_split(df[df.dataset == 1])
    else:  # option == "option2"
        train, valid = option2_train_test_split(df)

    # The split functions return slices of ``df``; copy before assigning.
    train, valid = train.copy(), valid.copy()

    for category in categories:
        if _needs_encoding(train[category]):
            # Fit on train only; ``transform`` raises for labels that appear
            # in valid but were never seen in train.
            le = preprocessing.LabelEncoder()
            train[category] = le.fit_transform(train[category])
            valid[category] = le.transform(valid[category])

        train[category] = train[category].astype("category")
        valid[category] = valid[category].astype("category")

    return train, valid

<IPython.core.display.Javascript object>

In [40]:
# Build the train / validation frames with the "option1" flow and preview
# the first rows of the training frame.
train, valid = feature_engineering(df, option="option1")
train.head()

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,dataset,tagLV,cumCorrect,elo,...,testLV,seenCount,userLVbyTag,userLVbyTagAVG,tagCount,weekNum,bigClassCount,wday,recCount,year
0,0,5354,975,1,2020-03-24 00:17:11,7224,1,0,0.0,0.980768,...,0,0,10,34,0,13,274,1,1,2020
1,0,5355,975,1,2020-03-24 00:17:14,7225,1,0,1.0,0.973315,...,0,0,10,34,0,13,274,1,2,2020
2,0,5356,975,1,2020-03-24 00:17:22,7225,1,0,2.0,0.947292,...,0,0,10,34,1,13,274,1,3,2020
3,0,5357,975,1,2020-03-24 00:17:29,7225,1,0,3.0,0.974914,...,0,0,10,34,2,13,274,1,4,2020
4,0,5358,975,1,2020-03-24 00:17:36,7225,1,0,4.0,0.961391,...,0,0,10,34,3,13,274,1,5,2020


<IPython.core.display.Javascript object>

## 4. 데이터셋 정의

In [41]:
# Separate the target (y) from the remaining columns (X) for both splits.
y_train, y_valid = train["answerCode"], valid["answerCode"]
train, valid = (frame.drop(columns=["answerCode"]) for frame in (train, valid))

<IPython.core.display.Javascript object>

In [42]:
# TODO: choose the features to use.
# Columns the model is trained and evaluated on; commented-out entries are
# currently disabled.
FEATS = [
    # "assessmentItemID",
    "testId",
    "KnowledgeTag",
    # "user_acc",
    # "user_total_answer",
    # "test_mean",
    # "test_sum",
    # "tag_mean",
    # "tag_sum",
    # -- custom engineered features start here
    "bigClass",
    # "bigClassAcc",
    # "bigClassAccCate",
    # "recAccuracy",
    # "cumAccuracy",
    # "cumCorrect",
    "day",
    "month",
    "year",
    "wday",
    "weekNum",
    "hour",
    "elapsedTime",
    "elapsedTimeClass",
    # "KnowledgeTagAcc",
    # "KTAccuracyCate",
    "seenCount",
    "tagCluster",
    "tagCount",
    # "testLV",
    # "userLVbyTest",
    "userLVbyTestAVG",
    # "tagLV",
    # "userLVbyTag",
    "userLVbyTagAVG",
    # "bigClassCount",
    "bigClassElaspedTimeAvg",
    # "recCount",
    "elo",
]

<IPython.core.display.Javascript object>

In [43]:
# Inspect the dtype of every selected feature column.
train[FEATS].dtypes

testId                    category
KnowledgeTag              category
bigClass                  category
day                       category
month                     category
year                      category
wday                      category
weekNum                   category
hour                      category
elapsedTime                  int64
elapsedTimeClass          category
seenCount                    int64
tagCluster                category
tagCount                     int64
userLVbyTestAVG           category
userLVbyTagAVG            category
bigClassElaspedTimeAvg       int64
elo                        float64
dtype: object

<IPython.core.display.Javascript object>

In [44]:
# Partition the selected features by dtype: categorical columns vs. the rest.
cat_cols = train[FEATS].select_dtypes(include="category").columns.to_list()
num_cols = train[FEATS].select_dtypes(exclude="category").columns.to_list()

print(f"cat_cols: {cat_cols}", f"num_cols: {num_cols}", sep="\n")

cat_cols: ['testId', 'KnowledgeTag', 'bigClass', 'day', 'month', 'year', 'wday', 'weekNum', 'hour', 'elapsedTimeClass', 'tagCluster', 'userLVbyTestAVG', 'userLVbyTagAVG']
num_cols: ['elapsedTime', 'seenCount', 'tagCount', 'bigClassElaspedTimeAvg', 'elo']


<IPython.core.display.Javascript object>

## 5. 훈련 및 검증

In [13]:
# CatBoost Pool objects for the training and validation splits; the
# categorical columns are declared explicitly.
train_pool = Pool(data=train[FEATS], label=y_train, cat_features=cat_cols)
eval_pool = Pool(data=valid[FEATS], label=y_valid, cat_features=cat_cols)

<IPython.core.display.Javascript object>

In [51]:
# Hyper-parameters passed to the CatBoost classifier.
params = dict(
    iterations=1000,
    learning_rate=0.06,  # 0.1
    eval_metric="AUC",
    random_seed=42,
    logging_level="Silent",
    early_stopping_rounds=50,
    use_best_model=True,
    # task_type="GPU",
    bagging_temperature=1,
    cat_features=cat_cols,
)

model = CatBoostClassifier(**params)

# Both the training split and the validation split are given as eval sets;
# plot=True requests the interactive metric chart.
model.fit(
    X=train[FEATS],
    y=y_train,
    eval_set=[(train[FEATS], y_train), (valid[FEATS], y_valid)],
    plot=True,
)

# Score the validation split: column 1 of predict_proba is used as the score,
# and a 0.5 threshold turns it into hard labels for accuracy.
preds = model.predict_proba(valid[FEATS])[:, 1]
auc = roc_auc_score(y_valid, preds)
acc = accuracy_score(y_valid, (preds >= 0.5).astype(int))

print(f"VALID AUC : {auc} ACC : {acc}\n")

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

KeyboardInterrupt: 

<IPython.core.display.Javascript object>

### Permutation Importance 출력

In [33]:
# Permutation importance measured with ROC-AUC on the validation split,
# using a single shuffle per feature (n_iter=1) and no refitting (refit=False).
perm = PermutationImportance(
    model, scoring="roc_auc", n_iter=1, random_state=42, cv=None, refit=False
)
perm.fit(valid[FEATS], y_valid)

eli5.show_weights(perm, top=len(FEATS), feature_names=FEATS)

<IPython.core.display.Javascript object>

### Feature Importance 출력

In [35]:
# Tabulate the model's feature importances computed on the training pool,
# sorted so that the least important feature comes first.
feature_importances = pd.DataFrame(
    {
        "feature": train[FEATS].columns,
        "importance": model.get_feature_importance(train_pool),
    }
).sort_values(by=["importance"])
feature_importances

Unnamed: 0,feature,importance
10,year,0.0
16,seenCount,0.133828
11,wday,0.216741
2,KnowledgeTag,0.438534
18,tagCount,0.494794
9,month,0.611772
13,hour,0.676635
7,cumCorrect,0.756245
12,weekNum,0.849879
19,userLVbyTestAVG,0.885907


<IPython.core.display.Javascript object>

## 6. Inference

### 6-1. Inference by test [-2] (test dataset 뒤에서 두 번째 값으로 성능 측정)

In [26]:
# use test dataset only for valid
test = df[(df.dataset == 2) & (df.answerCode != -1)]  # -1 인 answerCode 제외

# test데이터셋은 각 유저의 마지막 interaction만 추출
test = test[test["userID"] != test["userID"].shift(-1)]
test = feature_engineering(test, option="total")

y_test = test["answerCode"]
test = test.drop(["answerCode"], axis=1)

preds = model.predict_proba(test[FEATS])[:, 1]
acc = accuracy_score(y_test, np.where(preds >= 0.5, 1, 0))
auc = roc_auc_score(y_test, preds)

print(f"VALID AUC : {auc} ACC : {acc}")

VALID AUC : 0.8232006612876088 ACC : 0.7580645161290323


<IPython.core.display.Javascript object>

### 6-2. 종테기 결과

In [46]:
# Load the ground-truth file from the submission tester workspace; its "gt"
# column is compared against the predictions further below.
ground_truth = pd.read_csv("/opt/ml/workspace/submission_tester/ground_truth.csv")

<IPython.core.display.Javascript object>

In [47]:
# Restrict to the test dataset and keep only each user's final interaction.
test_df = df.loc[df["dataset"] == 2]
test_df = test_df.loc[test_df["userID"].ne(test_df["userID"].shift(-1))]

# Cast the categorical columns, then discard the answerCode column.
test_df = feature_engineering(test_df, option="total").drop(columns=["answerCode"])

<IPython.core.display.Javascript object>

In [48]:
# MAKE PREDICTION
total_preds = model.predict_proba(test_df[FEATS])[:, 1]

<IPython.core.display.Javascript object>

In [49]:
# Compare the predictions with the ground truth row by row (both are assumed
# to be in the same order -- TODO confirm); accuracy uses a 0.5 threshold.
acc = accuracy_score(ground_truth["gt"], (total_preds >= 0.5).astype(int))
auc = roc_auc_score(ground_truth["gt"], total_preds)

print("* Contains hidden testcase *", f"auc : {auc}\nacc : {acc}", sep="\n")

* Contains hidden testcase *
auc : 0.7974905860840279
acc : 0.7204301075268817


<IPython.core.display.Javascript object>

### 6-3. 제출

In [40]:
# SAVE OUTPUT
output_dir = "output/"
write_path = os.path.join(output_dir, "submission.csv")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
with open(write_path, "w", encoding="utf8") as w:
    print("writing prediction : {}".format(write_path))
    w.write("id,prediction\n")
    for id, p in enumerate(total_preds):
        w.write("{},{}\n".format(id, p))

writing prediction : output/submission.csv


<IPython.core.display.Javascript object>