In [None]:
%load_ext nb_black
%load_ext lab_black

# LGBM

In [14]:
import pandas as pd
import numpy as np
import os
import random

import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

import warnings

warnings.filterwarnings(action="ignore")

## 1. 데이터 로딩

In [15]:
# Read the feature CSV into `df`, the frame every later cell works from.
data_dir = "/opt/ml/input/data/"  # Adjust this path to match your environment!
csv_file_path = os.path.join(data_dir, "all_feature_data.csv")  # Download the data from the competition homepage :)
df = pd.read_csv(csv_file_path)

## 2. Feature Engineering
- Special mission의 Feature Engineering 코드
- Category feature의 변환

In [21]:
def feature_engineering(df):
    """Return a sorted copy of ``df`` with the categorical columns retyped.

    The rows are ordered by ``userID`` and then ``Timestamp`` so that each
    user's interactions are contiguous and in sequence, and the columns
    ``assessmentItemID``, ``testId`` and ``bigClass`` are cast to the pandas
    ``category`` dtype.

    The input frame is left untouched: sorting and casting both produce new
    objects, so re-running the calling cell (or calling this function on a
    frame that other cells still use) has no hidden side effects.

    Args:
        df: DataFrame containing at least ``userID``, ``Timestamp``,
            ``assessmentItemID``, ``testId`` and ``bigClass`` columns.

    Returns:
        A new DataFrame with the same columns and original index labels,
        sorted and with the three columns above as ``category`` dtype.
    """
    # Categorical features.
    # TODO: category features still need to be converted/encoded.
    categories = ["assessmentItemID", "testId", "bigClass"]

    # Sort so that each user's sequence can be taken into account downstream;
    # sort_values without inplace returns a new frame instead of mutating df.
    ordered = df.sort_values(by=["userID", "Timestamp"])

    # astype with a mapping casts only the listed columns and returns a copy.
    return ordered.astype({category: "category" for category in categories})


# Rebind `df` to the sorted, category-typed frame and preview its first rows.
df = feature_engineering(df)
df.head()

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,dataset,testMean,month,totalAnswer,...,recAccuracy,elapsedTimeClass,tagMean,elapsedTime,tagSum,testLV,seenCount,userLVbyTest,bigClassAcc,day
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,1,0.952667,3,0,...,1.0,1,0.957333,3,718,1.0,0,37.0,0.791908,24
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,1,0.952667,3,1,...,1.0,2,0.917067,8,3439,1.0,0,37.0,0.791908,24
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,1,0.952667,3,2,...,1.0,2,0.917067,7,3439,1.0,0,37.0,0.791908,24
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,1,0.952667,3,3,...,1.0,2,0.917067,7,3439,1.0,0,37.0,0.791908,24
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,1,0.952667,3,4,...,1.0,3,0.917067,11,3439,1.0,0,37.0,0.791908,24


In [22]:
# Keep only the rows with dataset == 1 (the rows option2_train_test_split
# also uses as its training set), then display the resulting frame.
train_df = df[(df.dataset == 1)] 
train_df

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,dataset,testMean,month,totalAnswer,...,recAccuracy,elapsedTimeClass,tagMean,elapsedTime,tagSum,testLV,seenCount,userLVbyTest,bigClassAcc,day
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,1,0.952667,3,0,...,1.000000,1,0.957333,3,718,1.0,0,37.0,0.791908,24
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,1,0.952667,3,1,...,1.000000,2,0.917067,8,3439,1.0,0,37.0,0.791908,24
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,1,0.952667,3,2,...,1.000000,2,0.917067,7,3439,1.0,0,37.0,0.791908,24
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,1,0.952667,3,3,...,1.000000,2,0.917067,7,3439,1.0,0,37.0,0.791908,24
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,1,0.952667,3,4,...,1.000000,3,0.917067,11,3439,1.0,0,37.0,0.791908,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2526695,7441,A030071005,A030000071,0,2020-06-05 06:50:21,438,1,0.666000,6,4,...,0.200000,12,0.694889,55,3127,4.0,0,40.0,0.200000,5
2526696,7441,A040165001,A040000165,1,2020-08-21 01:06:39,8836,1,0.652500,8,5,...,0.333333,3,0.698551,11,2410,4.0,0,40.0,1.000000,21
2526697,7441,A040165002,A040000165,1,2020-08-21 01:06:50,8836,1,0.652500,8,6,...,0.428571,10,0.698551,46,2410,4.0,0,40.0,1.000000,21
2526698,7441,A040165003,A040000165,1,2020-08-21 01:07:36,8836,1,0.652500,8,7,...,0.500000,15,0.698551,73,2410,4.0,0,40.0,1.000000,21


## 2. Train/Test 데이터 셋 분리

### Option 1
- train 데이터에서 train, valid set을 나눔

In [None]:
# The train and test sets must be split user by user, keeping each user's rows together.
# Seed Python's `random` module so the shuffle in option1_train_test_split is repeatable.
random.seed(42)


def option1_train_test_split(df, ratio=0.8, split=True):
    """Split ``df`` into train and validation rows without splitting any user.

    Users are shuffled with the module-level ``random`` generator (seed it
    before calling for a repeatable split) and added to the train set one by
    one until adding the next user would push the train row count above
    ``ratio * len(df)``. All remaining users form the validation pool, from
    which only the last row of each contiguous run of ``userID`` is kept.

    Args:
        df: DataFrame with a ``userID`` column. The "last row per user"
            selection compares each row with the next one, so it yields one
            row per user only when each user's rows are contiguous.
        ratio: Upper bound on the fraction of rows assigned to train.
        split: Unused; kept so existing callers that pass it keep working.

    Returns:
        Tuple ``(train, test)`` of DataFrames: every row of the train users,
        and the final row of each remaining user's run.
    """
    # Count rows per user once and reuse the result for both ids and counts.
    rows_per_user = df["userID"].value_counts()
    users = list(zip(rows_per_user.index, rows_per_user))
    random.shuffle(users)

    max_train_data_len = ratio * len(df)
    sum_of_train_data = 0
    user_ids = []

    for user_id, count in users:
        sum_of_train_data += count
        # Stop at the first user that would exceed the train-row budget;
        # that user and everyone after it in the shuffle go to validation.
        if max_train_data_len < sum_of_train_data:
            break
        user_ids.append(user_id)

    in_train = df["userID"].isin(user_ids)
    train = df[in_train]
    test = df[~in_train]

    # Keep only the last interaction of each user: a row is "last" when the
    # next row belongs to a different user (or there is no next row).
    test = test[test["userID"] != test["userID"].shift(-1)]
    return train, test

# Split by user: whole users go to train, the rest supply one validation row each.
train, test = option1_train_test_split(train_df)

### Option 2
- train 데이터를 모두 훈련에 사용
- valid는 test셋에서 각 유저의 마지막에서 두 번째 데이터로 진행

In [23]:
def option2_train_test_split(df):
    """Train on every ``dataset == 1`` row; validate on the ``dataset == 2`` rows.

    Validation rows are the ``dataset == 2`` rows whose ``answerCode`` is not
    -1, reduced to the last row of each contiguous run of ``userID``.

    Args:
        df: DataFrame with ``dataset``, ``answerCode`` and ``userID`` columns.

    Returns:
        Tuple ``(train, test)`` of DataFrames.
    """
    is_train_row = df["dataset"] == 1
    # Rows with answerCode == -1 are excluded from validation.
    is_labelled_test_row = (df["dataset"] == 2) & (df["answerCode"] != -1)

    train = df[is_train_row]
    labelled = df[is_labelled_test_row]

    # A row is a user's last interaction when the following row belongs to
    # another user (or does not exist).
    following_user = labelled["userID"].shift(-1)
    test = labelled[labelled["userID"] != following_user]

    return train, test

# Rebinds `train`/`test`, replacing whatever an earlier split cell assigned to them.
train, test = option2_train_test_split(df)

- FEATS 에 사용할 feature를 설정

In [24]:


# TODO: choose the features to use
# FEATS lists the columns selected from `train`/`test` for the model;
# commented-out entries are candidate features that are currently disabled.
FEATS = [
    "assessmentItemID",
    "testId",
    "KnowledgeTag",
    # "user_acc",
    # "user_total_answer",
    # "test_mean",
    # "test_sum",
    # "tag_mean",
    # "tag_sum",
    # -- custom feature-engineering columns start here
     "bigClass",
     "bigClassAcc",
    # "bigClassAccCate",
    # "cumAccuracy",
    # "cumCorrect",
     "elapsedTime",
    # "elapsedTimeClass",
    # "KnowledgeTagAcc",
    # "KTAccuracyCate",
     "recAccuracy",
    # "seenCount",
    # "tagCluster",
     "tagCount",
    # "testLV",
    # "userLVbyTest",
]

# Separate the target (y) from the feature frames (X).
# Both targets are read before `train`/`test` are rebound without the
# answerCode column.
y_train, y_test = train["answerCode"], test["answerCode"]

train = train.drop(columns=["answerCode"])
test = test.drop(columns=["answerCode"])

## 3. Dataset 정의

In [25]:
# Wrap the selected feature columns and their labels in LightGBM Dataset
# objects: one for fitting, one for validation during training.
lgb_train = lgb.Dataset(train[FEATS], y_train)
lgb_test = lgb.Dataset(test[FEATS], y_test)

## 4. 훈련 및 검증

### hyper parameter 참고
- https://smecsm.tistory.com/133
- https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html

In [26]:
# Hyperparameters passed to lgb.train; commented-out entries are tuning
# candidates that currently fall back to LightGBM's defaults.
# TODO: tuning
params = {
    # "learning_rate": 0.01,
    # "max_depth": 8,
    # "boosting": "gbdt",  # rf, gbdt, dart, goss
    "objective": "binary",
    "metric": "auc",
    # "num_leaves": 40,
    # "feature_fraction": 0.8,
    # "bagging_fraction": 1,
    # "bagging_freq": 5,
    "seed": 42,
}


# Train with both sets registered for evaluation so the log reports the
# training AUC and the validation AUC every 100 rounds; training stops once
# the validation score has not improved for 100 rounds.
# NOTE(review): `verbose_eval` and `early_stopping_rounds` are keyword
# arguments of older LightGBM releases; newer releases expect the
# `lgb.log_evaluation` / `lgb.early_stopping` callbacks instead — confirm the
# pinned LightGBM version before upgrading.
model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_train, lgb_test],
    verbose_eval=100,
    num_boost_round=10000,
    early_stopping_rounds=100,
)

# Score the validation rows: accuracy uses a 0.5 threshold on the predicted
# probability, AUC uses the raw probabilities.
preds = model.predict(test[FEATS])
acc = accuracy_score(y_test, np.where(preds >= 0.5, 1, 0))
auc = roc_auc_score(y_test, preds)

print(f"VALID AUC : {auc} ACC : {acc}\n")

[LightGBM] [Info] Number of positive: 1483205, number of negative: 783381
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10940
[LightGBM] [Info] Number of data points in the train set: 2266586, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654378 -> initscore=0.638341
[LightGBM] [Info] Start training from score 0.638341
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.791263	valid_1's auc: 0.765378
[200]	training's auc: 0.796151	valid_1's auc: 0.765045
Early stopping, best iteration is:
[121]	training's auc: 0.792985	valid_1's auc: 0.766534
VALID AUC : 0.7665340190047036 ACC : 0.6895161290322581



In [None]:
# INSTALL MATPLOTLIB IN ADVANCE
# Assigning to `_` keeps the returned object from being echoed as cell output.
_ = lgb.plot_importance(model)

## 5. Inference

In [None]:
# Rows of the test split (dataset == 2).
test_df = df[df["dataset"] == 2]

# LEAVE LAST INTERACTION ONLY: a row is kept when the next row belongs to a
# different user (or there is no next row).
is_last_interaction = test_df["userID"] != test_df["userID"].shift(-1)
test_df = test_df[is_last_interaction]

# DROP ANSWERCODE
test_df = test_df.drop(columns=["answerCode"])

In [None]:
# MAKE PREDICTION
# One prediction per row of `test_df`, using the same FEATS columns the model was trained on.
total_preds = model.predict(test_df[FEATS])

In [None]:
# SAVE OUTPUT
# Writes one "id,prediction" line per entry of `total_preds`, where the id is
# the entry's position in the array.
output_dir = "output/"
write_path = os.path.join(output_dir, "submission.csv")
# exist_ok=True creates the directory only when it is missing, without a
# separate existence check that another process could invalidate in between.
os.makedirs(output_dir, exist_ok=True)
with open(write_path, "w", encoding="utf8") as w:
    print("writing prediction : {}".format(write_path))
    w.write("id,prediction\n")
    # Named `row_id` rather than `id` so the builtin `id` is not shadowed for
    # the rest of the kernel session.
    for row_id, p in enumerate(total_preds):
        w.write("{},{}\n".format(row_id, p))