# LGBM

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os
import random
import warnings
import time
import math

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
from lightgbm import LGBMClassifier

warnings.filterwarnings(action="ignore")  # 경고 출력 무시

In [10]:
seed = 42
os.environ["PYTHONHASHSEED"] = str(seed)
random.seed(seed)
np.random.seed(seed)

## 1. Data Load

In [None]:
path = '/DKT'
print(os.listdir(path))
data_path = path + '/data'

train_path = data_path + '/train_data.csv'
test_path = data_path + '/test_data.csv'

In [12]:
df_train = pd.read_csv(train_path, parse_dates=["Timestamp"])
df_test = pd.read_csv(test_path, parse_dates=["Timestamp"])

df_train['dataset'] = 1
df_test['dataset'] = 2

df = pd.concat([df_train, df_test])
df

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,dataset
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,1
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,1
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,1
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,1
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,1
...,...,...,...,...,...,...,...
260109,7439,A040130001,A040000130,0,2020-10-14 23:07:23,8832,2
260110,7439,A040130002,A040000130,1,2020-10-14 23:07:41,8832,2
260111,7439,A040130003,A040000130,1,2020-10-14 23:08:02,8244,2
260112,7439,A040130004,A040000130,1,2020-10-14 23:09:31,8244,2


## 2. Feature Engineering

In [13]:
def feature_engineering(df):

    ########## 기본 feature ##########

    # 유저별 시퀀스를 고려하기 위해 정렬
    df.sort_values(by=["userID", "Timestamp"], inplace=True)
    df.reset_index(inplace = True)

    # 카테고리형 feature
    categories = ["assessmentItemID", "testId"]

    for category in categories:
        df[category] = df[category].astype("category")

    ########## 문제 관련 ##########

    # 문제 대분류 : 시험지 카테고리, 시험지 번호, 문제 번호
    df["category"] = df["testId"].apply(lambda x: int(x[2]))
    df["test_number"] = df["testId"].apply(lambda x: int(x[-3:]))
    df["problem_number"] = df["assessmentItemID"].apply(lambda x: int(x[-3:]))
    
    # 인접한 testId grouping
    index = df[df['testId'] != df['testId'].shift(-1)].index
    grouping = [0] * (index[0] + 1)
    for i in range(1, len(index)):
        grouping += [i] * (index[i] - index[i-1])
    df['grouping'] = grouping

    # 문항별 Mean Encoding
    per_test = df.groupby(["testId"])["answerCode"].agg(["mean", "sum"])
    per_test.columns = ["answerRate_per_test", "answerCount_per_test"]
    df = pd.merge(df, per_test, on=["testId"], how="left")

    # Tag별 Mean Encoding
    per_tag = df.groupby(["KnowledgeTag"])["answerCode"].agg(["mean", "sum"])
    per_tag.columns = ["answerRate_per_tag", "answerCount_per_tag"]
    df = pd.merge(df, per_tag, on=["KnowledgeTag"], how="left")

    # 시험지별 Mean Encoding
    per_ass = df.groupby(["assessmentItemID"])["answerCode"].agg(["mean", "sum"])
    per_ass.columns = ["answerRate_per_ass", "answerCount_per_ass"]
    df = pd.merge(df, per_ass, on=["assessmentItemID"], how="left")

    # 문제 번호별 Mean Encoding
    per_pnum = df.groupby(["problem_number"])["answerCode"].agg(["mean", "sum"])
    per_pnum.columns = ["answerRate_per_pnum", "answerCount_per_pnum"]
    df = pd.merge(df, per_pnum, on=["problem_number"], how="left")

    # 시험지 별 문제 수와 태그 수
    f = lambda x: len(set(x))
    test = df.groupby(["testId"]).agg({"problem_number": "max", "KnowledgeTag": f})
    test.reset_index(inplace=True)
    test.columns = ["testId", "problem_count", "tag_count"]
    df = pd.merge(df, test, on="testId", how="left")
    df["problem_position"] = df["problem_number"] / df["problem_count"]

    ########## Time 관련 ##########

    # 문제 풀이 시간 ver1 : elapsed shift(1)
    diff = df[["userID", "grouping", "Timestamp"]].groupby(["userID", "grouping"]).diff().fillna(pd.Timedelta(seconds=0))
    df["elapsed_shift"] = diff["Timestamp"].apply(lambda x: x.total_seconds())

    # 문제 풀이 시간 ver2 : 문제 푸는 시간
    df['elapsed'] = df['elapsed_shift'].shift(-1).fillna(value = 0)
    temp = df[df['testId'] == df['testId'].shift(-1)].groupby('grouping')['elapsed'].mean().to_dict()
    df.loc[df['testId'] != df['testId'].shift(-1), 'elapsed'] = df.loc[df['testId'] != df['testId'].shift(-1), 'grouping'].map(temp).fillna(value = 0).astype(int)

    # 맞춘 문제와 틀린 문제
    correct_df = df[df["answerCode"] == 1]
    wrong_df = df[df["answerCode"] == 0]

    # Tag별 모든 문제, 맞춘 문제, 틀린 문제별 풀이 시간의 평균
    mean_elapsed_tag = df.groupby(["KnowledgeTag"])["elapsed"].agg("mean").reset_index()
    mean_elapsed_tag.columns = ["KnowledgeTag", "mean_elp_tag_all"]
    df = pd.merge(df, mean_elapsed_tag, on=["KnowledgeTag"], how="left")

    mean_elapsed_tag_o = correct_df.groupby(["KnowledgeTag"])["elapsed"].agg("mean").reset_index()
    mean_elapsed_tag_o.columns = ["KnowledgeTag", "mean_elp_tag_o"]
    df = pd.merge(df, mean_elapsed_tag_o, on=["KnowledgeTag"], how="left")

    mean_elapsed_tag_x = wrong_df.groupby(["KnowledgeTag"])["elapsed"].agg("mean").reset_index()
    mean_elapsed_tag_x.columns = ["KnowledgeTag", "mean_elp_tag_x"]
    df = pd.merge(df, mean_elapsed_tag_x, on=["KnowledgeTag"], how="left")

    # 문제별 모든 문제, 맞춘 문제, 틀린 문제별 풀이 시간의 평균
    mean_elapsed_ass = df.groupby(["assessmentItemID"])["elapsed"].agg("mean").reset_index()
    mean_elapsed_ass.columns = ["assessmentItemID", "mean_elp_ass_all"]
    df = pd.merge(df, mean_elapsed_ass, on=["assessmentItemID"], how="left")

    mean_elapsed_ass_o = correct_df.groupby(["assessmentItemID"])["elapsed"].agg("mean").reset_index()
    mean_elapsed_ass_o.columns = ["assessmentItemID", "mean_elp_ass_o"]
    df = pd.merge(df, mean_elapsed_ass_o, on=["assessmentItemID"], how="left")

    mean_elapsed_ass_x = wrong_df.groupby(["assessmentItemID"])["elapsed"].agg("mean").reset_index()
    mean_elapsed_ass_x.columns = ["assessmentItemID", "mean_elp_ass_x"]
    df = pd.merge(df, mean_elapsed_ass_x, on=["assessmentItemID"], how="left")

    # 문제 번호별 모든 문제, 맞춘 문제, 틀린 문제별 풀이 시간의 평균
    mean_elapsed_pnum = df.groupby(["problem_number"])["elapsed"].agg("mean").reset_index()
    mean_elapsed_pnum.columns = ["problem_number", "mean_elp_pnum_all"]
    df = pd.merge(df, mean_elapsed_pnum, on=["problem_number"], how="left")

    mean_elapsed_pnum_o = correct_df.groupby(["problem_number"])["elapsed"].agg("mean").reset_index()
    mean_elapsed_pnum_o.columns = ["problem_number", "mean_elp_pnum_o"]
    df = pd.merge(df, mean_elapsed_pnum_o, on=["problem_number"], how="left")

    mean_elapsed_pnum_x = wrong_df.groupby(["problem_number"])["elapsed"].agg("mean").reset_index()
    mean_elapsed_pnum_x.columns = ["problem_number", "mean_elp_pnum_x"]
    df = pd.merge(df, mean_elapsed_pnum_x, on=["problem_number"], how="left")

    # 유저 평균과의 시간 차이
    for i in range(1,6):
        df[f"timestep_{i}"] = df.groupby("userID")["answerCode"].shift(i).fillna(1).astype(int)

    df_time = df[["userID", "elapsed"]].groupby(["userID"]).agg("median").reset_index()
    df_time.rename(columns={"elapsed": "user_median_elapsed"}, inplace=True)
    df = df.merge(df_time, on="userID", how="left")
    df["timeDelta_userAverage"] = df["elapsed"] - df["user_median_elapsed"]

    # 문제 정답 / 오답자들의 문제 풀이 시간 중위수
    col_name = ["median_elapsed_wrong_users", "median_elapsed_correct_users"]
    for i in range(2):
        df_median_elapsed = (df[["assessmentItemID", "answerCode", "elapsed"]].groupby(["assessmentItemID", "answerCode"]).agg("median").reset_index())
        df_median_elapsed = df_median_elapsed[df_median_elapsed["answerCode"] == i].drop("answerCode", axis=1)
        df_median_elapsed.rename(columns={"elapsed": col_name[i]}, inplace=True)
        df = df.merge(df_median_elapsed, on=["assessmentItemID"], how="left")
    
    
    ########## User 관련 ##########
    
    # User별 정답률, 문제푼 횟수, 맞춘 문제수
    df["problem_correct_per_user"] = (df.groupby("userID")["answerCode"].transform(lambda x: x.cumsum().shift(1)).fillna(0))
    df["problem_solved_per_user"] = df.groupby("userID")["answerCode"].cumcount()
    df["cum_answerRate_per_user"] = (df["problem_correct_per_user"] / df["problem_solved_per_user"]).fillna(0)
    
    # 유저별 Tag 문제 누적 값
    df["acc_tag_count_per_user"] = df.groupby(["userID", "KnowledgeTag"]).cumcount()

    # User별로 대분류별 맞춘 문제 개수, 대분류별 맞춘 문제 개수, 대분류별 정답률
    df["correct_answer_per_cat"] = (df.groupby(["userID", "category"])["answerCode"].transform(lambda x: x.cumsum().shift(1)).fillna(0))
    df["acc_count_per_cat"] = df.groupby(["userID", "category"]).cumcount()
    df["acc_answerRate_per_cat"] = (df["correct_answer_per_cat"] / df["acc_count_per_cat"]).fillna(0)
    df["acc_elapsed_per_cat"] = (df.groupby(["userID", "category"])["elapsed"].transform(lambda x: x.cumsum()).fillna(0))
    
    return df

In [14]:
df = feature_engineering(df)

## 3. Train/Test Split

- 마지막 interaction 때문에 custom split 진행
- 반복문을 이용해서 train -> Split

In [22]:
# Feature Selection
FEATS = [
    "KnowledgeTag",
    "answerRate_per_tag", 
    "answerCount_per_tag",
    "tag_count",
    "mean_elp_tag_all",
    "mean_elp_tag_o",
    "mean_elp_tag_x",
    "answerRate_per_test", 
    "answerCount_per_test",
    "cum_answerRate_per_user",
    "problem_correct_per_user",
    "problem_solved_per_user",
    "mean_elp_ass_all",
    "mean_elp_ass_o",
    "mean_elp_ass_x",
    "answerRate_per_ass", 
    "answerCount_per_ass",
    "elapsed",
    'elapsed_shift',
    "category",
    "acc_answerRate_per_cat",
    "acc_count_per_cat",
    "acc_elapsed_per_cat",
    "correct_answer_per_cat",
    "test_number",
    "mean_elp_pnum_all",
    "mean_elp_pnum_o",
    "mean_elp_pnum_x",
    "acc_tag_count_per_user",
    "problem_count",
    "problem_number",
    "answerRate_per_pnum",
    "answerCount_per_pnum",
    "problem_position",
    'timeDelta_userAverage',
    'timestep_1', 'timestep_2', 'timestep_3', 'timestep_4', 'timestep_5',
    "median_elapsed_wrong_users", "median_elapsed_correct_users"
]

In [23]:
# User별 split
train, valid = train_test_split(df[df['answerCode'] != -1], test_size = 0.2, random_state = seed, shuffle = True)

y_train = train["answerCode"]
train = train.drop(["answerCode"], axis=1)

y_valid = valid["answerCode"]
valid = valid.drop(["answerCode"], axis=1)

print(train.shape, y_train.shape, valid.shape, y_valid.shape)

(2020764, 50) (2020764,) (505192, 50) (505192,)


## 4. 훈련 및 검증

In [25]:
model = LGBMClassifier(n_estimators=1000)

model.fit(X=train[FEATS],
          y=y_train,
          eval_set=[(valid[FEATS], y_valid)],
          eval_metric="auc",
          # early_stopping_rounds=100,
          verbose=5,
         )

[5]	valid_0's auc: 0.835951	valid_0's binary_logloss: 0.541557
[10]	valid_0's auc: 0.842638	valid_0's binary_logloss: 0.496088
[15]	valid_0's auc: 0.846491	valid_0's binary_logloss: 0.472959
[20]	valid_0's auc: 0.849438	valid_0's binary_logloss: 0.46015
[25]	valid_0's auc: 0.851582	valid_0's binary_logloss: 0.452668
[30]	valid_0's auc: 0.85305	valid_0's binary_logloss: 0.448117
[35]	valid_0's auc: 0.854234	valid_0's binary_logloss: 0.445051
[40]	valid_0's auc: 0.855107	valid_0's binary_logloss: 0.443048
[45]	valid_0's auc: 0.855966	valid_0's binary_logloss: 0.441292
[50]	valid_0's auc: 0.856484	valid_0's binary_logloss: 0.440242
[55]	valid_0's auc: 0.856999	valid_0's binary_logloss: 0.439245
[60]	valid_0's auc: 0.857542	valid_0's binary_logloss: 0.438306
[65]	valid_0's auc: 0.857916	valid_0's binary_logloss: 0.437654
[70]	valid_0's auc: 0.858295	valid_0's binary_logloss: 0.437037
[75]	valid_0's auc: 0.858607	valid_0's binary_logloss: 0.436502
[80]	valid_0's auc: 0.858927	valid_0's bina

In [26]:
#  LGBoost 모델 추론
preds = model.predict_proba(valid[FEATS])[:, 1]
acc = accuracy_score(y_valid, np.where(preds >= 0.5, 1, 0))
auc = roc_auc_score(y_valid, preds)

print(f"VALID AUC : {auc} ACC : {acc}\n")

VALID AUC : 0.8641703025809075 ACC : 0.8067922690778951



In [27]:
test_df = df[df.dataset == 2]
# LEAVE LAST INTERACTION ONLY
test_df = test_df[test_df["userID"] != test_df["userID"].shift(-1)]

# DROP ANSWERCODE
test_df = test_df.drop(["answerCode"], axis=1)

# MAKE PREDICTION
total_probs = model.predict_proba(test_df[FEATS])[:,-1]

In [None]:
# SAVE OUTPUT
write_path = os.path.join(data_path, "submission_Lgbm.csv")
with open(write_path, "w", encoding="utf8") as w:
    w.write("id,prediction\n")
    for id, p in enumerate(total_probs):
        w.write("{},{}\n".format(id, p))

###**콘텐츠 라이선스**

<font color='red'><b>**WARNING**</b></font> : **본 교육 콘텐츠의 지식재산권은 재단법인 네이버커넥트에 귀속됩니다. 본 콘텐츠를 어떠한 경로로든 외부로 유출 및 수정하는 행위를 엄격히 금합니다.** 다만, 비영리적 교육 및 연구활동에 한정되어 사용할 수 있으나 재단의 허락을 받아야 합니다. 이를 위반하는 경우, 관련 법률에 따라 책임을 질 수 있습니다.

