# LGBM을 활용한 베이스라인

In [140]:
import pandas as pd
import os
import random
import optuna
from optuna import Trial
from optuna.samplers import TPESampler

In [141]:
import warnings 
warnings.filterwarnings("ignore")

In [142]:
# !pip install optuna

## 1. 데이터 로딩

In [143]:
data_dir = '/opt/ml/input/data/' # 경로는 상황에 맞춰서 수정해주세요!
csv_file_path = os.path.join(data_dir, 'train_data.csv') # 데이터는 대회홈페이지에서 받아주세요 :)
df = pd.read_csv(csv_file_path)

## 2. Feature Engineering

In [144]:
def feature_engineering(df):
    
    #유저별 시퀀스를 고려하기 위해 아래와 같이 정렬
    df.sort_values(by=['userID','Timestamp'], inplace=True)
    
    #유저들의 문제 풀이수, 정답 수, 정답률을 시간순으로 누적해서 계산
    df['user_correct_answer'] = df.groupby('userID')['answerCode'].transform(lambda x: x.cumsum().shift(1))
    df['user_total_answer'] = df.groupby('userID')['answerCode'].cumcount()
    df['user_acc'] = df['user_correct_answer']/df['user_total_answer']

    # testId와 KnowledgeTag의 전체 정답률은 한번에 계산
    # 아래 데이터는 제출용 데이터셋에 대해서도 재사용
    correct_t = df.groupby(['testId'])['answerCode'].agg(['mean', 'sum'])
    correct_t.columns = ["test_mean", 'test_sum']
    correct_k = df.groupby(['KnowledgeTag'])['answerCode'].agg(['mean', 'sum'])
    correct_k.columns = ["tag_mean", 'tag_sum']

    df = pd.merge(df, correct_t, on=['testId'], how="left")
    df = pd.merge(df, correct_k, on=['KnowledgeTag'], how="left")
    
    return df

In [145]:
df = feature_engineering(df)
df.head()

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,test_mean,test_sum,tag_mean,tag_sum
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,,0,,0.947683,1268,0.955022,637
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,1.0,1,1.0,0.947683,1268,0.913187,3040
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,2.0,2,1.0,0.947683,1268,0.913187,3040
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,3.0,3,1.0,0.947683,1268,0.913187,3040
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,4.0,4,1.0,0.947683,1268,0.913187,3040


## 3. Train/Test 데이터 셋 분리

In [146]:
# train과 test 데이터셋은 사용자 별로 묶어서 분리를 해주어야함
random.seed(42)
def custom_train_test_split(df, ratio=0.7, split=True):
    
    users = list(zip(df['userID'].value_counts().index, df['userID'].value_counts()))
    random.shuffle(users)
    
    max_train_data_len = ratio*len(df)
    sum_of_train_data = 0
    user_ids =[]

    for user_id, count in users:
        sum_of_train_data += count
        if max_train_data_len < sum_of_train_data:
            break
        user_ids.append(user_id)


    train = df[df['userID'].isin(user_ids)]
    test = df[df['userID'].isin(user_ids) == False]

    #test데이터셋은 각 유저의 마지막 interaction만 추출
    test = test[test['userID'] != test['userID'].shift(-1)]
    return train, test

In [147]:
# 유저별 분리
train, test = custom_train_test_split(df)

# 사용할 Feature 설정
FEATS = ['KnowledgeTag', 'user_correct_answer', 'user_total_answer', 
         'user_acc', 'test_mean', 'test_sum', 'tag_mean','tag_sum']

# X, y 값 분리
y_train = train['answerCode']
train = train.drop(['answerCode'], axis=1)

y_test = test['answerCode']
test = test.drop(['answerCode'], axis=1)

In [148]:
# !pip install lightgbm

In [149]:
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import numpy as np

In [150]:
lgb_train = lgb.Dataset(train[FEATS], y_train)
lgb_test = lgb.Dataset(test[FEATS], y_test)

## 하이퍼파라미터 튜닝

## 4. 훈련 및 검증 (+ 하이퍼파라미터 튜닝(optuna))

In [152]:
def objective(trial: Trial):
    params = {
        'objective': 'binary',
        'bagging_fraction': trial.suggest_float("bagging_fraction", 0.5, 0.6, step=0.01),
        'bagging_seed': 11, ##
        'learning_rate': trial.suggest_categorical("lr", [0.001, 0.005, 0.01, 0.05, 0.1]),
        'num_iterations': trial.suggest_int("n_iter", 100, 1000, 100),
        'max_depth': trial.suggest_categorical('max_depth', [-1, 1]), # need to consider
        'boosting': 'gbdt',
        'early_stopping': trial.suggest_categorical('patience', [0, 5, 10, 15, 20])

    }
    model = lgb.train(
        params, 
        lgb_train,
        valid_sets=[lgb_train, lgb_test],
        verbose_eval=100,
        num_boost_round=500,
        early_stopping_rounds=100
    )

    preds = model.predict(test[FEATS])
    acc = accuracy_score(y_test, np.where(preds >= 0.5, 1, 0))
    auc = roc_auc_score(y_test, preds)

    return auc

sampler = TPESampler(seed=42)
study = optuna.create_study(
    study_name="lgbm_parameter_opt",
    direction="maximize",
    sampler=sampler,
)
study.optimize(objective, n_trials=10)
print("Best Score:", study.best_value)
print("Best trial:", study.best_trial.params)

[32m[I 2022-11-25 17:56:30,819][0m A new study created in memory with name: lgbm_parameter_opt[0m


[LightGBM] [Info] Number of positive: 1039565, number of negative: 546592
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2031
[LightGBM] [Info] Number of data points in the train set: 1586157, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.655399 -> initscore=0.642855
[LightGBM] [Info] Start training from score 0.642855
Training until validation scores don't improve for 10 rounds


[32m[I 2022-11-25 17:56:34,596][0m Trial 0 finished with value: 0.6802742993440667 and parameters: {'bagging_fraction': 0.54, 'lr': 0.001, 'n_iter': 100, 'max_depth': -1, 'patience': 10}. Best is trial 0 with value: 0.6802742993440667.[0m


[100]	training's binary_logloss: 0.630835	valid_1's binary_logloss: 0.74029
Did not meet early stopping. Best iteration is:
[100]	training's binary_logloss: 0.630835	valid_1's binary_logloss: 0.74029
[LightGBM] [Info] Number of positive: 1039565, number of negative: 546592
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2031
[LightGBM] [Info] Number of data points in the train set: 1586157, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.655399 -> initscore=0.642855
[LightGBM] [Info] Start training from score 0.642855
Training until validation scores don't improve for 10 rounds
[100]	training's binary_logloss: 0.604284	valid_1's binary_logloss: 0.714731
[200]	training's binary_logloss: 0.587834	valid_1's binary_logloss: 0.699589
[300]	training's binary_logloss: 0.579609	valid_1's binary_logloss: 0.692499
[400]	training's binary_logloss: 0.575022	valid_1's 

[32m[I 2022-11-25 17:56:51,883][0m Trial 1 finished with value: 0.6831847545219638 and parameters: {'bagging_fraction': 0.52, 'lr': 0.01, 'n_iter': 700, 'max_depth': 1, 'patience': 10}. Best is trial 1 with value: 0.6831847545219638.[0m


[700]	training's binary_logloss: 0.56945	valid_1's binary_logloss: 0.686116
Did not meet early stopping. Best iteration is:
[700]	training's binary_logloss: 0.56945	valid_1's binary_logloss: 0.686116
[LightGBM] [Info] Number of positive: 1039565, number of negative: 546592
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2031
[LightGBM] [Info] Number of data points in the train set: 1586157, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.655399 -> initscore=0.642855
[LightGBM] [Info] Start training from score 0.642855
Training until validation scores don't improve for 5 rounds


[32m[I 2022-11-25 17:56:53,354][0m Trial 2 finished with value: 0.6880669846948917 and parameters: {'bagging_fraction': 0.56, 'lr': 0.1, 'n_iter': 1000, 'max_depth': -1, 'patience': 5}. Best is trial 2 with value: 0.6880669846948917.[0m


Early stopping, best iteration is:
[36]	training's binary_logloss: 0.562779	valid_1's binary_logloss: 0.682851
[LightGBM] [Info] Number of positive: 1039565, number of negative: 546592
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2031
[LightGBM] [Info] Number of data points in the train set: 1586157, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.655399 -> initscore=0.642855
[LightGBM] [Info] Start training from score 0.642855
Training until validation scores don't improve for 5 rounds
[100]	training's binary_logloss: 0.637805	valid_1's binary_logloss: 0.74749
[200]	training's binary_logloss: 0.632429	valid_1's binary_logloss: 0.742155
[300]	training's binary_logloss: 0.627607	valid_1's binary_logloss: 0.737391
[400]	training's binary_logloss: 0.623266	valid_1's binary_logloss: 0.733155
[500]	training's binary_logloss: 0.61934	valid_1's binary_logloss:

[32m[I 2022-11-25 17:57:07,385][0m Trial 3 finished with value: 0.6619310276287022 and parameters: {'bagging_fraction': 0.5, 'lr': 0.001, 'n_iter': 600, 'max_depth': 1, 'patience': 5}. Best is trial 2 with value: 0.6880669846948917.[0m


[600]	training's binary_logloss: 0.61578	valid_1's binary_logloss: 0.725826
Did not meet early stopping. Best iteration is:
[600]	training's binary_logloss: 0.61578	valid_1's binary_logloss: 0.725826
[LightGBM] [Info] Number of positive: 1039565, number of negative: 546592
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2031
[LightGBM] [Info] Number of data points in the train set: 1586157, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.655399 -> initscore=0.642855
[LightGBM] [Info] Start training from score 0.642855
Training until validation scores don't improve for 20 rounds
[100]	training's binary_logloss: 0.561524	valid_1's binary_logloss: 0.683336
[200]	training's binary_logloss: 0.559385	valid_1's binary_logloss: 0.681498


[32m[I 2022-11-25 17:57:15,010][0m Trial 4 finished with value: 0.6897207314649175 and parameters: {'bagging_fraction': 0.5, 'lr': 0.05, 'n_iter': 900, 'max_depth': -1, 'patience': 20}. Best is trial 4 with value: 0.6897207314649175.[0m


Early stopping, best iteration is:
[220]	training's binary_logloss: 0.559046	valid_1's binary_logloss: 0.681123
[LightGBM] [Info] Number of positive: 1039565, number of negative: 546592
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2031
[LightGBM] [Info] Number of data points in the train set: 1586157, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.655399 -> initscore=0.642855
[LightGBM] [Info] Start training from score 0.642855
Training until validation scores don't improve for 5 rounds
[100]	training's binary_logloss: 0.604284	valid_1's binary_logloss: 0.714731
[200]	training's binary_logloss: 0.587834	valid_1's binary_logloss: 0.699589
[300]	training's binary_logloss: 0.579609	valid_1's binary_logloss: 0.692499
[400]	training's binary_logloss: 0.575022	valid_1's binary_logloss: 0.68912
[500]	training's binary_logloss: 0.572282	valid_1's binary_loglos

[32m[I 2022-11-25 17:57:31,262][0m Trial 5 finished with value: 0.6830714569668057 and parameters: {'bagging_fraction': 0.58, 'lr': 0.01, 'n_iter': 800, 'max_depth': 1, 'patience': 5}. Best is trial 4 with value: 0.6897207314649175.[0m


Early stopping, best iteration is:
[673]	training's binary_logloss: 0.569708	valid_1's binary_logloss: 0.686239
[LightGBM] [Info] Number of positive: 1039565, number of negative: 546592
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2031
[LightGBM] [Info] Number of data points in the train set: 1586157, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.655399 -> initscore=0.642855
[LightGBM] [Info] Start training from score 0.642855
Training until validation scores don't improve for 5 rounds
[100]	training's binary_logloss: 0.572103	valid_1's binary_logloss: 0.687404


[32m[I 2022-11-25 17:57:35,525][0m Trial 6 finished with value: 0.6836409262572054 and parameters: {'bagging_fraction': 0.53, 'lr': 0.05, 'n_iter': 200, 'max_depth': 1, 'patience': 5}. Best is trial 4 with value: 0.6897207314649175.[0m


Early stopping, best iteration is:
[166]	training's binary_logloss: 0.568453	valid_1's binary_logloss: 0.685447
[LightGBM] [Info] Number of positive: 1039565, number of negative: 546592
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2031
[LightGBM] [Info] Number of data points in the train set: 1586157, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.655399 -> initscore=0.642855
[LightGBM] [Info] Start training from score 0.642855
[100]	training's binary_logloss: 0.604284	valid_1's binary_logloss: 0.714731
[200]	training's binary_logloss: 0.587834	valid_1's binary_logloss: 0.699589
[300]	training's binary_logloss: 0.579609	valid_1's binary_logloss: 0.692499
[400]	training's binary_logloss: 0.575022	valid_1's binary_logloss: 0.68912
[500]	training's binary_logloss: 0.572282	valid_1's binary_logloss: 0.687455
[600]	training's binary_logloss: 0.570565	valid_

[32m[I 2022-11-25 17:58:00,280][0m Trial 7 finished with value: 0.6842213277678394 and parameters: {'bagging_fraction': 0.5, 'lr': 0.01, 'n_iter': 1000, 'max_depth': 1, 'patience': 0}. Best is trial 4 with value: 0.6897207314649175.[0m


[1000]	training's binary_logloss: 0.567772	valid_1's binary_logloss: 0.684876
[LightGBM] [Info] Number of positive: 1039565, number of negative: 546592
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2031
[LightGBM] [Info] Number of data points in the train set: 1586157, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.655399 -> initscore=0.642855
[LightGBM] [Info] Start training from score 0.642855
[100]	training's binary_logloss: 0.604284	valid_1's binary_logloss: 0.714731
[200]	training's binary_logloss: 0.587834	valid_1's binary_logloss: 0.699589
[300]	training's binary_logloss: 0.579609	valid_1's binary_logloss: 0.692499
[400]	training's binary_logloss: 0.575022	valid_1's binary_logloss: 0.68912
[500]	training's binary_logloss: 0.572282	valid_1's binary_logloss: 0.687455
[600]	training's binary_logloss: 0.570565	valid_1's binary_logloss: 0.686622
[700]	training's binary_logloss: 0.56945	valid_1's binary_logloss: 0.

[32m[I 2022-11-25 17:58:24,128][0m Trial 8 finished with value: 0.6837885112303717 and parameters: {'bagging_fraction': 0.6, 'lr': 0.01, 'n_iter': 900, 'max_depth': 1, 'patience': 0}. Best is trial 4 with value: 0.6897207314649175.[0m


[900]	training's binary_logloss: 0.568167	valid_1's binary_logloss: 0.685323
[LightGBM] [Info] Number of positive: 1039565, number of negative: 546592
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2031
[LightGBM] [Info] Number of data points in the train set: 1586157, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.655399 -> initscore=0.642855
[LightGBM] [Info] Start training from score 0.642855
Training until validation scores don't improve for 20 rounds
[100]	training's binary_logloss: 0.637805	valid_1's binary_logloss: 0.74749


[32m[I 2022-11-25 17:58:28,811][0m Trial 9 finished with value: 0.6530426356589147 and parameters: {'bagging_fraction': 0.58, 'lr': 0.001, 'n_iter': 200, 'max_depth': 1, 'patience': 20}. Best is trial 4 with value: 0.6897207314649175.[0m


[200]	training's binary_logloss: 0.632429	valid_1's binary_logloss: 0.742155
Did not meet early stopping. Best iteration is:
[200]	training's binary_logloss: 0.632429	valid_1's binary_logloss: 0.742155
Best Score: 0.6897207314649175
Best trial: {'bagging_fraction': 0.5, 'lr': 0.05, 'n_iter': 900, 'max_depth': -1, 'patience': 20}


In [153]:
# INSTALL MATPLOTLIB IN ADVANCE
# _ = lgb.plot_importance(model)

## 5. Inference

In [154]:
# LOAD TESTDATA
test_csv_file_path = os.path.join(data_dir, 'test_data.csv')
test_df = pd.read_csv(test_csv_file_path)

# FEATURE ENGINEERING
test_df = feature_engineering(test_df)

# LEAVE LAST INTERACTION ONLY
test_df = test_df[test_df['userID'] != test_df['userID'].shift(-1)]

# DROP ANSWERCODE
test_df = test_df.drop(['answerCode'], axis=1)

In [155]:
# MAKE PREDICTION
total_preds = model.predict(test_df[FEATS])

In [156]:
# SAVE OUTPUT
output_dir = 'output/'
write_path = os.path.join(output_dir, "submission.csv")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
with open(write_path, 'w', encoding='utf8') as w:
    print("writing prediction : {}".format(write_path))
    w.write("id,prediction\n")
    for id, p in enumerate(total_preds):
        w.write('{},{}\n'.format(id,p))

writing prediction : output/submission.csv
