# LGBM

In [1]:
import pandas as pd
import numpy as np
import os
import random
import matplotlib.pyplot as plt
import seaborn as sns

## 1. 데이터 로딩

In [2]:
data_dir = '../data' # 경로는 상황에 맞춰서 수정해주세요!
csv_file_path = os.path.join(data_dir, 'train_data.csv') # 데이터는 대회홈페이지에서 받아주세요 :)

dtype = {
    'userID': 'int16',
    'answerCode': 'int8',
    'KnowledgeTag': 'int16'
}   

df = pd.read_csv(csv_file_path, dtype=dtype, parse_dates=['Timestamp']) 

In [3]:
df['testId'].nunique()

1537

## 2. Feature Engineering

In [4]:
def feature_engineering(df):
    
    #유저별 시퀀스를 고려하기 위해 아래와 같이 정렬
    df.sort_values(by=['userID','Timestamp'], inplace=True)
    
    #유저들의 문제 풀이수, 정답 수, 정답률을 시간순으로 누적해서 계산
    df['user_correct_answer'] = df.groupby('userID')['answerCode'].transform(lambda x: x.cumsum().shift(1))
    df['user_total_answer'] = df.groupby('userID')['answerCode'].cumcount()
    df['user_acc'] = df['user_correct_answer']/df['user_total_answer']

    ### 소요시간
    diff = df.loc[:, ['userID', 'Timestamp']].groupby('userID').diff().fillna(pd.Timedelta(seconds=0))
    diff = diff.fillna(pd.Timedelta(seconds=0))
    diff = diff['Timestamp'].apply(lambda x: x.total_seconds())
    df['elapsed'] = diff
    elapsed_median = df[df['elapsed'] < 3600]['elapsed'].median()
    def elapsed_preprocessing(x):
        if x >= 3600 or x == 0:
            return elapsed_median
        return x
    df['elapsed'] = df['elapsed'].apply(elapsed_preprocessing)

    ### 문제 푼 시간 추가
    df['Time'] = df['Timestamp'].dt.hour

    ### 연속적으로 푼 태그 정보 추가
    df['continuous_tag'] = (df.groupby(['userID', 'KnowledgeTag']).cumcount() + 1).astype(int)
    
    ### 유저 별 평균 소요 시간 추가
    def percentile(x):
        return np.sum(x) / len(x)

    user_time_groupby = df.groupby('userID').agg({
        'elapsed' : percentile
    }).reset_index()

    user_time_groupby.columns = ['userID', 'userTime']

    df = pd.merge(df, user_time_groupby, on=['userID'], how="left")
    
    
    ### 사용자가 마지막으로 푼 문항과 같은 지식 태그를 가진 문항을 푼 시간과 
    # 사용자가 마지막으로 푼 문항과의 시간 간격을 계산
    df['time_since_last_question'] = df.groupby('userID')['Timestamp'].diff().apply(lambda x: x.total_seconds()) 
    df['time_since_last_question'] = df.groupby(['userID', 'KnowledgeTag'])['time_since_last_question'].shift(1) # elapsed
    # 'similar_question_answered' 컬럼을 생성합니다.
    # 사용자가 마지막으로 푼 문항과 같은 지식 태그를 가진 문항을 푼 지 여부
    df['similar_question_answered'] = df['time_since_last_question'].apply(lambda x: 1 if x is not None and x <= 60*60*24*3 else 0)
    df = df.drop(['time_since_last_question'], axis=1)
    
    ### 사용자가 문항을 푼 시간대 (오전/오후/밤/새벽)
    time = df['Timestamp'].dt.hour
    def timezone_preprocessing(x):
        if 7 <= x < 13: # 7~12
            x = 0
        elif 13 <= x < 20: # 13 ~ 19
            x = 1 
        elif 20 <= x < 24: # 20 ~ 23
            x = 2
        else:
            x = 3
        return x
    df['Timezone'] = time.apply(timezone_preprocessing)
    
    ### 주말 여부
    time = df['Timestamp'].dt.day_name()
    def isWeekend_preprocessing(x):
        if x =='Saturday' or x =='Sunday':
            x = 1
        else:
            x = 0
        return x
    df['isWeekend'] = time.apply(isWeekend_preprocessing)
    
    ### 문항이 속한 태그 중 사용자가 이전에 푼 문항에서 맞춘 비율
    df['user_tag_cum_acc'] = df.groupby(['userID', 'KnowledgeTag'])['answerCode'].transform(lambda x: x.expanding().mean().shift(1))
    
    
    
    
    return df

In [5]:
def feature_engineering_train(df):
    global correct_t, correct_k, question_accuracy
    # testId와 KnowledgeTag의 전체 정답률은 한번에 계산
    # 아래 데이터는 제출용 데이터셋에 대해서도 재사용
    correct_t = df.groupby(['testId'])['answerCode'].agg(['mean', 'sum'])
    correct_t.columns = ["test_mean", 'test_sum']
    correct_k = df.groupby(['KnowledgeTag'])['answerCode'].agg(['mean', 'sum'])
    correct_k.columns = ["tag_mean", 'tag_sum']

    df = pd.merge(df, correct_t, on=['testId'], how="left")
    df = pd.merge(df, correct_k, on=['KnowledgeTag'], how="left")
    
    ### 사용자가 이전에 푼 문항과 동일한 지식 태그를 포함한 문항일 경우 정답률이 높아지는 경향이 있는 지 여부
    # 문항별 정답률 계산
    question_accuracy = df.groupby('assessmentItemID')['answerCode'].mean()
    
    # 이전에 푼 문항과 동일한 태그를 가진 문항일 경우 1, 그렇지 않을 경우 0으로 구성된 새로운 변수 생성
    df['similar_tag_answered'] = df.groupby('userID')['KnowledgeTag'].apply(lambda x: x.eq(x.shift()).astype(int)).reset_index(drop=True)
    # 이전에 푼 문항과 동일한 태그를 가진 문항일 경우 그 문항의 정답률 계산
    df['similar_tag_accuracy'] = df['assessmentItemID'].map(question_accuracy) * df['similar_tag_answered']
    # 만약 이전에 푼 문항과 동일한 태그를 가진 문항이 없을 경우에는 정답률이 0이 되도록 함
    df['similar_tag_accuracy'].fillna(0, inplace=True)
    df = df.drop(['similar_tag_answered'], axis=1)
    
    return df

In [6]:
def feature_engineering_test(df):
    global correct_t, correct_k, question_accuracy
    df = pd.merge(df, correct_t, on=['testId'], how="left")
    df = pd.merge(df, correct_k, on=['KnowledgeTag'], how="left")
    
    # 이전에 푼 문항과 동일한 태그를 가진 문항일 경우 1, 그렇지 않을 경우 0으로 구성된 새로운 변수 생성
    df['similar_tag_answered'] = df.groupby('userID')['KnowledgeTag'].apply(lambda x: x.eq(x.shift()).astype(int)).reset_index(drop=True)
    # 이전에 푼 문항과 동일한 태그를 가진 문항일 경우 그 문항의 정답률 계산
    df['similar_tag_accuracy'] = df['assessmentItemID'].map(question_accuracy) * df['similar_tag_answered']
    # 만약 이전에 푼 문항과 동일한 태그를 가진 문항이 없을 경우에는 정답률이 0이 되도록 함
    df['similar_tag_accuracy'].fillna(0, inplace=True)
    df = df.drop(['similar_tag_answered'], axis=1)
    
    return df

In [7]:
df = feature_engineering(df)
df = feature_engineering_train(df)
df.head()

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,elapsed,...,userTime,similar_question_answered,Timezone,isWeekend,user_tag_cum_acc,test_mean,test_sum,tag_mean,tag_sum,similar_tag_accuracy
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,,0,,28.0,...,49.851007,0,3,0,,0.947683,1268,0.955022,637,0.0
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,1.0,1,1.0,3.0,...,49.851007,0,3,0,,0.947683,1268,0.913187,3040,0.0
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,2.0,2,1.0,8.0,...,49.851007,1,3,0,1.0,0.947683,1268,0.913187,3040,0.910314
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,3.0,3,1.0,7.0,...,49.851007,1,3,0,1.0,0.947683,1268,0.913187,3040,0.96861
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,4.0,4,1.0,7.0,...,49.851007,1,3,0,1.0,0.947683,1268,0.913187,3040,0.941704


### Feature Engineering 연습장

In [9]:
df.head()

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,elapsed,...,userTime,similar_question_answered,Timezone,isWeekend,user_tag_cum_acc,test_mean,test_sum,tag_mean,tag_sum,similar_tag_accuracy
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,,0,,28.0,...,49.851007,0,3,0,,0.947683,1268,0.955022,637,0.0
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,1.0,1,1.0,3.0,...,49.851007,0,3,0,,0.947683,1268,0.913187,3040,0.0
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,2.0,2,1.0,8.0,...,49.851007,1,3,0,1.0,0.947683,1268,0.913187,3040,0.910314
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,3.0,3,1.0,7.0,...,49.851007,1,3,0,1.0,0.947683,1268,0.913187,3040,0.96861
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,4.0,4,1.0,7.0,...,49.851007,1,3,0,1.0,0.947683,1268,0.913187,3040,0.941704


## 3. Train/Test 데이터 셋 분리

In [10]:
# train과 test 데이터셋은 사용자 별로 묶어서 분리를 해주어야함
random.seed(42)
def custom_train_test_split(df, ratio=0.7, split=True):
    
    users = list(zip(df['userID'].value_counts().index, df['userID'].value_counts()))
    random.shuffle(users)
    
    max_train_data_len = ratio*len(df)
    sum_of_train_data = 0
    user_ids =[]

    for user_id, count in users:
        sum_of_train_data += count
        if max_train_data_len < sum_of_train_data:
            break
        user_ids.append(user_id)


    train = df[df['userID'].isin(user_ids)]
    test = df[df['userID'].isin(user_ids) == False]

    #test데이터셋은 각 유저의 마지막 interaction만 추출
    test = test[test['userID'] != test['userID'].shift(-1)]
    return train, test

In [11]:
# 유저별 분리
train, test = custom_train_test_split(df)

# 사용할 Feature 설정
### 추가할 Feature 뒤에 넣기
FEATS = ['KnowledgeTag', 'user_correct_answer', 'user_total_answer', 
         'user_acc', 'test_mean', 'test_sum', 'tag_mean','tag_sum', 'elapsed', 'Time', 'continuous_tag', 'userTime' , 'similar_question_answered', 'isWeekend', 'user_tag_cum_acc', 'similar_tag_accuracy']

# X, y 값 분리
y_train = train['answerCode']
train = train.drop(['answerCode'], axis=1)

y_test = test['answerCode']
test = test.drop(['answerCode'], axis=1)

In [12]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import numpy as np

## 4. 훈련 및 검증

In [14]:
# Baseline 성능 추출

from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

lgb_train = LGBMClassifier().fit(train[FEATS], y_train)
lgb_test = LGBMClassifier().fit(test[FEATS], y_test)

model = LGBMClassifier(
objective='binary',
num_boost_round=500,
early_stopping_rounds=100
)

model.fit(
train[FEATS], y_train,
eval_set=[(train[FEATS], y_train), (test[FEATS], y_test)],
eval_metric='binary_logloss',
verbose=100
)

preds = model.predict_proba(test[FEATS])[:, 1]
acc = accuracy_score(y_test, np.where(preds >= 0.5, 1, 0))
auc = roc_auc_score(y_test, preds)

print(f'VALID AUC : {auc} ACC : {acc}\n')



[100]	valid_0's binary_logloss: 0.473688	valid_1's binary_logloss: 0.524646
[200]	valid_0's binary_logloss: 0.469613	valid_1's binary_logloss: 0.522083
[300]	valid_0's binary_logloss: 0.466806	valid_1's binary_logloss: 0.520194
[400]	valid_0's binary_logloss: 0.464281	valid_1's binary_logloss: 0.519301
[500]	valid_0's binary_logloss: 0.462104	valid_1's binary_logloss: 0.519008
VALID AUC : 0.8208159411647784 ACC : 0.7463876432486298



In [15]:
# Optuna Libraries
import optuna
from optuna import Trial
from optuna.samplers import TPESampler

# train_test_split
from sklearn.model_selection import train_test_split

from optuna.integration import LightGBMPruningCallback


# random sampler
sampler = TPESampler(seed=10)

# define function
def objective(trial):

    params = {
        'objective': 'binary',
        'num_leaves': trial.suggest_int('num_leaves', 2, 512),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'max_depth': trial.suggest_int('max_depth', -1, 100),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0)
    }

    # Generate model
    model = LGBMClassifier(**params)
    model.fit(
        train[FEATS], y_train,
        eval_set=[(train[FEATS], y_train), (test[FEATS], y_test)],
        eval_metric='auc',
        verbose=100
        )
                           
	# 평가지표 원하는 평가 지표가 있을 시 바꾸어 준다.
    AUC = roc_auc_score(y_test, model.predict_proba(test[FEATS])[:, 1])
    return AUC

optuna_model = optuna.create_study(direction='maximize', study_name='LGBMClassifier', sampler=sampler)
optuna_model.optimize(objective, n_trials=50)

[32m[I 2023-05-11 23:59:30,979][0m A new study created in memory with name: LGBMClassifier[0m
[32m[I 2023-05-11 23:59:48,140][0m Trial 0 finished with value: 0.7952246074339098 and parameters: {'num_leaves': 396, 'min_child_samples': 6, 'max_depth': 63, 'learning_rate': 0.7513158437132257, 'subsample': 0.5486563110723314, 'colsample_bytree': 0.30231698097776294, 'reg_alpha': 0.19806286475962398, 'reg_lambda': 0.7605307121989587}. Best is trial 0 with value: 0.7952246074339098.[0m


[100]	valid_0's auc: 0.880211	valid_0's binary_logloss: 0.404838	valid_1's auc: 0.795225	valid_1's binary_logloss: 0.558574


[32m[I 2023-05-12 00:00:02,742][0m Trial 1 finished with value: 0.8020691711389386 and parameters: {'num_leaves': 88, 'min_child_samples': 13, 'max_depth': 68, 'learning_rate': 0.9538594127329871, 'subsample': 0.10355343969512301, 'colsample_bytree': 0.5609730370471989, 'reg_alpha': 0.8126209616521135, 'reg_lambda': 0.6125260668293881}. Best is trial 1 with value: 0.8020691711389386.[0m


[100]	valid_0's auc: 0.848249	valid_0's binary_logloss: 0.447761	valid_1's auc: 0.802069	valid_1's binary_logloss: 0.544637


[32m[I 2023-05-12 00:00:19,132][0m Trial 2 finished with value: 0.8049552772808587 and parameters: {'num_leaves': 370, 'min_child_samples': 33, 'max_depth': 92, 'learning_rate': 0.7174300255637137, 'subsample': 0.5882899312101352, 'colsample_bytree': 0.22795304284137427, 'reg_alpha': 0.3733407600514692, 'reg_lambda': 0.6741336150663453}. Best is trial 2 with value: 0.8049552772808587.[0m


[100]	valid_0's auc: 0.872024	valid_0's binary_logloss: 0.416969	valid_1's auc: 0.804955	valid_1's binary_logloss: 0.543264


[32m[I 2023-05-12 00:00:34,453][0m Trial 3 finished with value: 0.8096352613794474 and parameters: {'num_leaves': 227, 'min_child_samples': 46, 'max_depth': 62, 'learning_rate': 0.518006860128847, 'subsample': 0.6853574637383205, 'colsample_bytree': 0.6409350580640899, 'reg_alpha': 0.8052231968327465, 'reg_lambda': 0.5216471523936341}. Best is trial 3 with value: 0.8096352613794474.[0m


[100]	valid_0's auc: 0.86395	valid_0's binary_logloss: 0.429023	valid_1's auc: 0.809635	valid_1's binary_logloss: 0.531155


[32m[I 2023-05-12 00:00:50,031][0m Trial 4 finished with value: 0.8181107135758299 and parameters: {'num_leaves': 466, 'min_child_samples': 35, 'max_depth': 8, 'learning_rate': 0.3076930560698413, 'subsample': 0.2025859256771948, 'colsample_bytree': 0.845813193676909, 'reg_alpha': 0.04689631938924976, 'reg_lambda': 0.6262871483113925}. Best is trial 4 with value: 0.8181107135758299.[0m


[100]	valid_0's auc: 0.849131	valid_0's binary_logloss: 0.447359	valid_1's auc: 0.818111	valid_1's binary_logloss: 0.521473


[32m[I 2023-05-12 00:01:05,400][0m Trial 5 finished with value: 0.7989415623136553 and parameters: {'num_leaves': 281, 'min_child_samples': 83, 'max_depth': 19, 'learning_rate': 0.8582817994331559, 'subsample': 0.41648737548887915, 'colsample_bytree': 0.7791829223768715, 'reg_alpha': 0.2959617068796787, 'reg_lambda': 0.8839364795611863}. Best is trial 4 with value: 0.8181107135758299.[0m


[100]	valid_0's auc: 0.879	valid_0's binary_logloss: 0.406632	valid_1's auc: 0.798942	valid_1's binary_logloss: 0.554704


[32m[I 2023-05-12 00:01:21,145][0m Trial 6 finished with value: 0.8118236931027629 and parameters: {'num_leaves': 168, 'min_child_samples': 20, 'max_depth': 39, 'learning_rate': 0.1025257708130638, 'subsample': 0.8389950920532356, 'colsample_bytree': 0.23603681767830748, 'reg_alpha': 0.3841144486921996, 'reg_lambda': 0.9442607122388011}. Best is trial 4 with value: 0.8181107135758299.[0m


[100]	valid_0's auc: 0.832925	valid_0's binary_logloss: 0.469479	valid_1's auc: 0.811824	valid_1's binary_logloss: 0.529875


[32m[I 2023-05-12 00:01:39,067][0m Trial 7 finished with value: 0.8151073345259392 and parameters: {'num_leaves': 506, 'min_child_samples': 48, 'max_depth': 83, 'learning_rate': 0.25886039286498874, 'subsample': 0.6376344834077958, 'colsample_bytree': 0.9125485842984646, 'reg_alpha': 0.5345579488018151, 'reg_lambda': 0.5902013629854229}. Best is trial 4 with value: 0.8181107135758299.[0m


[100]	valid_0's auc: 0.874328	valid_0's binary_logloss: 0.417937	valid_1's auc: 0.815107	valid_1's binary_logloss: 0.525068


[32m[I 2023-05-12 00:01:53,542][0m Trial 8 finished with value: 0.8165165970979924 and parameters: {'num_leaves': 22, 'min_child_samples': 39, 'max_depth': 7, 'learning_rate': 0.31240531915939007, 'subsample': 0.3976473807839188, 'colsample_bytree': 0.7964472665895362, 'reg_alpha': 0.039959208689977266, 'reg_lambda': 0.42949217843163834}. Best is trial 4 with value: 0.8181107135758299.[0m


[100]	valid_0's auc: 0.829366	valid_0's binary_logloss: 0.470555	valid_1's auc: 0.816517	valid_1's binary_logloss: 0.524341


[32m[I 2023-05-12 00:02:11,160][0m Trial 9 finished with value: 0.8183571854502086 and parameters: {'num_leaves': 162, 'min_child_samples': 66, 'max_depth': 34, 'learning_rate': 0.0526663826429445, 'subsample': 0.8919236570661244, 'colsample_bytree': 0.7869165284293129, 'reg_alpha': 0.8780966427248583, 'reg_lambda': 0.41750914383926696}. Best is trial 9 with value: 0.8183571854502086.[0m


[100]	valid_0's auc: 0.83256	valid_0's binary_logloss: 0.467925	valid_1's auc: 0.818357	valid_1's binary_logloss: 0.52308


[32m[I 2023-05-12 00:02:27,791][0m Trial 10 finished with value: 0.8116209501093221 and parameters: {'num_leaves': 132, 'min_child_samples': 79, 'max_depth': 38, 'learning_rate': 0.01865079068547048, 'subsample': 0.9414633334925344, 'colsample_bytree': 0.9668805726256086, 'reg_alpha': 0.994849816882248, 'reg_lambda': 0.14295565954325912}. Best is trial 9 with value: 0.8183571854502086.[0m


[100]	valid_0's auc: 0.822801	valid_0's binary_logloss: 0.488501	valid_1's auc: 0.811621	valid_1's binary_logloss: 0.546272


[32m[I 2023-05-12 00:02:37,765][0m Trial 11 finished with value: 0.7828031206519579 and parameters: {'num_leaves': 480, 'min_child_samples': 60, 'max_depth': 1, 'learning_rate': 0.20617119982889767, 'subsample': 0.9908721765297974, 'colsample_bytree': 0.7426269343062807, 'reg_alpha': 0.020210255899761886, 'reg_lambda': 0.326424180475558}. Best is trial 9 with value: 0.8183571854502086.[0m


[100]	valid_0's auc: 0.797876	valid_0's binary_logloss: 0.50845	valid_1's auc: 0.782803	valid_1's binary_logloss: 0.574624


[32m[I 2023-05-12 00:02:53,673][0m Trial 12 finished with value: 0.8118535082488572 and parameters: {'num_leaves': 293, 'min_child_samples': 67, 'max_depth': 24, 'learning_rate': 0.39061976449125135, 'subsample': 0.12624448492801954, 'colsample_bytree': 0.9480107876205339, 'reg_alpha': 0.689527894328622, 'reg_lambda': 0.3116104560931572}. Best is trial 9 with value: 0.8183571854502086.[0m


[100]	valid_0's auc: 0.86766	valid_0's binary_logloss: 0.424574	valid_1's auc: 0.811854	valid_1's binary_logloss: 0.527783


[32m[I 2023-05-12 00:03:12,151][0m Trial 13 finished with value: 0.793577817531306 and parameters: {'num_leaves': 383, 'min_child_samples': 100, 'max_depth': 21, 'learning_rate': 0.01313939892698951, 'subsample': 0.8093292124542684, 'colsample_bytree': 0.4407630223327704, 'reg_alpha': 0.5408380341451666, 'reg_lambda': 0.0358932945233954}. Best is trial 9 with value: 0.8183571854502086.[0m


[100]	valid_0's auc: 0.818118	valid_0's binary_logloss: 0.519205	valid_1's auc: 0.793578	valid_1's binary_logloss: 0.597314


[32m[I 2023-05-12 00:03:28,515][0m Trial 14 finished with value: 0.8205992844364938 and parameters: {'num_leaves': 213, 'min_child_samples': 28, 'max_depth': 36, 'learning_rate': 0.15585947509136514, 'subsample': 0.2916359513592697, 'colsample_bytree': 0.6848336148152859, 'reg_alpha': 0.1584307443683503, 'reg_lambda': 0.46210213114531384}. Best is trial 14 with value: 0.8205992844364938.[0m


[100]	valid_0's auc: 0.845591	valid_0's binary_logloss: 0.452403	valid_1's auc: 0.820599	valid_1's binary_logloss: 0.518483


[32m[I 2023-05-12 00:03:44,504][0m Trial 15 finished with value: 0.8194822102961639 and parameters: {'num_leaves': 196, 'min_child_samples': 25, 'max_depth': 44, 'learning_rate': 0.16560898762351217, 'subsample': 0.2912476020834808, 'colsample_bytree': 0.653131114459744, 'reg_alpha': 0.19842040307359388, 'reg_lambda': 0.41477184590870353}. Best is trial 14 with value: 0.8205992844364938.[0m


[100]	valid_0's auc: 0.845203	valid_0's binary_logloss: 0.452789	valid_1's auc: 0.819482	valid_1's binary_logloss: 0.519216


[32m[I 2023-05-12 00:04:00,703][0m Trial 16 finished with value: 0.8204671039554761 and parameters: {'num_leaves': 227, 'min_child_samples': 24, 'max_depth': 52, 'learning_rate': 0.15637821322005174, 'subsample': 0.29516260927156257, 'colsample_bytree': 0.6312906813001123, 'reg_alpha': 0.14195061905782722, 'reg_lambda': 0.24700926868793838}. Best is trial 14 with value: 0.8205992844364938.[0m


[100]	valid_0's auc: 0.846757	valid_0's binary_logloss: 0.451133	valid_1's auc: 0.820467	valid_1's binary_logloss: 0.517444


[32m[I 2023-05-12 00:04:17,231][0m Trial 17 finished with value: 0.7894643212085072 and parameters: {'num_leaves': 289, 'min_child_samples': 24, 'max_depth': 53, 'learning_rate': 0.13555880613238758, 'subsample': 0.2953187326529057, 'colsample_bytree': 0.10946460396278818, 'reg_alpha': 0.16109420147902803, 'reg_lambda': 0.2288310005593477}. Best is trial 14 with value: 0.8205992844364938.[0m


[100]	valid_0's auc: 0.820924	valid_0's binary_logloss: 0.485026	valid_1's auc: 0.789464	valid_1's binary_logloss: 0.560411


[32m[I 2023-05-12 00:04:31,519][0m Trial 18 finished with value: 0.8123593718942556 and parameters: {'num_leaves': 100, 'min_child_samples': 6, 'max_depth': 53, 'learning_rate': 0.4540024555344977, 'subsample': 0.46310044437173425, 'colsample_bytree': 0.4846399661550769, 'reg_alpha': 0.10907714928303691, 'reg_lambda': 0.15826235061741756}. Best is trial 14 with value: 0.8205992844364938.[0m


[100]	valid_0's auc: 0.845768	valid_0's binary_logloss: 0.451611	valid_1's auc: 0.812359	valid_1's binary_logloss: 0.52804


[32m[I 2023-05-12 00:04:48,616][0m Trial 19 finished with value: 0.8196521566289008 and parameters: {'num_leaves': 242, 'min_child_samples': 19, 'max_depth': 82, 'learning_rate': 0.18269294310409137, 'subsample': 0.294034829575933, 'colsample_bytree': 0.6570141867293168, 'reg_alpha': 0.2760019887570968, 'reg_lambda': 0.28002889990281404}. Best is trial 14 with value: 0.8205992844364938.[0m


[100]	valid_0's auc: 0.85005	valid_0's binary_logloss: 0.446846	valid_1's auc: 0.819652	valid_1's binary_logloss: 0.518789


[32m[I 2023-05-12 00:05:05,975][0m Trial 20 finished with value: 0.8209004174120452 and parameters: {'num_leaves': 323, 'min_child_samples': 30, 'max_depth': 28, 'learning_rate': 0.1206628562815722, 'subsample': 0.19511614632546723, 'colsample_bytree': 0.5743198223291003, 'reg_alpha': 0.12822868331665174, 'reg_lambda': 0.4928381997993244}. Best is trial 20 with value: 0.8209004174120452.[0m


[100]	valid_0's auc: 0.84821	valid_0's binary_logloss: 0.449365	valid_1's auc: 0.8209	valid_1's binary_logloss: 0.517441


[32m[I 2023-05-12 00:05:24,382][0m Trial 21 finished with value: 0.8202454780361758 and parameters: {'num_leaves': 357, 'min_child_samples': 30, 'max_depth': 33, 'learning_rate': 0.10098980769848015, 'subsample': 0.2101010374809917, 'colsample_bytree': 0.5701996676860228, 'reg_alpha': 0.10528081368556855, 'reg_lambda': 0.4646419469941677}. Best is trial 20 with value: 0.8209004174120452.[0m


[100]	valid_0's auc: 0.847227	valid_0's binary_logloss: 0.450586	valid_1's auc: 0.820245	valid_1's binary_logloss: 0.518699


[32m[I 2023-05-12 00:05:41,207][0m Trial 22 finished with value: 0.8192566090240508 and parameters: {'num_leaves': 340, 'min_child_samples': 41, 'max_depth': 27, 'learning_rate': 0.2538894159035606, 'subsample': 0.3458354640884884, 'colsample_bytree': 0.6975295989087648, 'reg_alpha': 0.0006752147985487067, 'reg_lambda': 0.5163998967795702}. Best is trial 20 with value: 0.8209004174120452.[0m


[100]	valid_0's auc: 0.862184	valid_0's binary_logloss: 0.431842	valid_1's auc: 0.819257	valid_1's binary_logloss: 0.519622


[32m[I 2023-05-12 00:05:58,554][0m Trial 23 finished with value: 0.8198409858874975 and parameters: {'num_leaves': 321, 'min_child_samples': 55, 'max_depth': 46, 'learning_rate': 0.16588150646455216, 'subsample': 0.1864867228254049, 'colsample_bytree': 0.5986126941434994, 'reg_alpha': 0.14150130498804736, 'reg_lambda': 0.3502162669316501}. Best is trial 20 with value: 0.8209004174120452.[0m


[100]	valid_0's auc: 0.852898	valid_0's binary_logloss: 0.443837	valid_1's auc: 0.819841	valid_1's binary_logloss: 0.519737


[32m[I 2023-05-12 00:06:16,263][0m Trial 24 finished with value: 0.8181484794275493 and parameters: {'num_leaves': 225, 'min_child_samples': 15, 'max_depth': 56, 'learning_rate': 0.11033679640042665, 'subsample': 0.24870526598177442, 'colsample_bytree': 0.5034874565067562, 'reg_alpha': 0.26538638935657544, 'reg_lambda': 0.37396942340463335}. Best is trial 20 with value: 0.8209004174120452.[0m


[100]	valid_0's auc: 0.841956	valid_0's binary_logloss: 0.456948	valid_1's auc: 0.818148	valid_1's binary_logloss: 0.522137


[32m[I 2023-05-12 00:06:34,529][0m Trial 25 finished with value: 0.8164251639833036 and parameters: {'num_leaves': 410, 'min_child_samples': 28, 'max_depth': 13, 'learning_rate': 0.35984712855208156, 'subsample': 0.3527321458032238, 'colsample_bytree': 0.7005236140817173, 'reg_alpha': 0.2243696464112535, 'reg_lambda': 0.4899052848713154}. Best is trial 20 with value: 0.8209004174120452.[0m


[100]	valid_0's auc: 0.872698	valid_0's binary_logloss: 0.418549	valid_1's auc: 0.816425	valid_1's binary_logloss: 0.523244


[32m[I 2023-05-12 00:06:49,913][0m Trial 26 finished with value: 0.8187596899224806 and parameters: {'num_leaves': 197, 'min_child_samples': 44, 'max_depth': 30, 'learning_rate': 0.23695611137036218, 'subsample': 0.14147543399919257, 'colsample_bytree': 0.4148529921952153, 'reg_alpha': 0.0987162933876125, 'reg_lambda': 0.25961860886442484}. Best is trial 20 with value: 0.8209004174120452.[0m


[100]	valid_0's auc: 0.848637	valid_0's binary_logloss: 0.448581	valid_1's auc: 0.81876	valid_1's binary_logloss: 0.520788


[32m[I 2023-05-12 00:07:08,011][0m Trial 27 finished with value: 0.8197982508447623 and parameters: {'num_leaves': 266, 'min_child_samples': 36, 'max_depth': 72, 'learning_rate': 0.0704748788411092, 'subsample': 0.22908988053314355, 'colsample_bytree': 0.5902406387360564, 'reg_alpha': 0.3806168779349701, 'reg_lambda': 0.5319996706961021}. Best is trial 20 with value: 0.8209004174120452.[0m


[100]	valid_0's auc: 0.838918	valid_0's binary_logloss: 0.46098	valid_1's auc: 0.819798	valid_1's binary_logloss: 0.52075


[32m[I 2023-05-12 00:07:26,446][0m Trial 28 finished with value: 0.7963446630888491 and parameters: {'num_leaves': 319, 'min_child_samples': 14, 'max_depth': 43, 'learning_rate': 0.012326482383402837, 'subsample': 0.16210018571454446, 'colsample_bytree': 0.5048510185790789, 'reg_alpha': 0.15969772688118444, 'reg_lambda': 0.38381143133641654}. Best is trial 20 with value: 0.8209004174120452.[0m


[100]	valid_0's auc: 0.81846	valid_0's binary_logloss: 0.519744	valid_1's auc: 0.796345	valid_1's binary_logloss: 0.596409


[32m[I 2023-05-12 00:07:44,091][0m Trial 29 finished with value: 0.8190339892665474 and parameters: {'num_leaves': 425, 'min_child_samples': 8, 'max_depth': 65, 'learning_rate': 0.2026312015120984, 'subsample': 0.5223029560341728, 'colsample_bytree': 0.7210616874269979, 'reg_alpha': 0.21429534916943432, 'reg_lambda': 0.7176858073903785}. Best is trial 20 with value: 0.8209004174120452.[0m


[100]	valid_0's auc: 0.863952	valid_0's binary_logloss: 0.430017	valid_1's auc: 0.819034	valid_1's binary_logloss: 0.519653


[32m[I 2023-05-12 00:07:58,358][0m Trial 30 finished with value: 0.8173772609819121 and parameters: {'num_leaves': 29, 'min_child_samples': 23, 'max_depth': 59, 'learning_rate': 0.12064219554774527, 'subsample': 0.4929690198794331, 'colsample_bytree': 0.6377610772882089, 'reg_alpha': 0.07636790751794897, 'reg_lambda': 0.7810310462207863}. Best is trial 20 with value: 0.8209004174120452.[0m


[100]	valid_0's auc: 0.827239	valid_0's binary_logloss: 0.473214	valid_1's auc: 0.817377	valid_1's binary_logloss: 0.523678


[32m[I 2023-05-12 00:08:16,788][0m Trial 31 finished with value: 0.8207364341085271 and parameters: {'num_leaves': 356, 'min_child_samples': 30, 'max_depth': 33, 'learning_rate': 0.08922757197321178, 'subsample': 0.236604344860122, 'colsample_bytree': 0.5495114743054306, 'reg_alpha': 0.10735013458578471, 'reg_lambda': 0.4509139974413219}. Best is trial 20 with value: 0.8209004174120452.[0m


[100]	valid_0's auc: 0.845559	valid_0's binary_logloss: 0.452764	valid_1's auc: 0.820736	valid_1's binary_logloss: 0.51809


[32m[I 2023-05-12 00:08:34,423][0m Trial 32 finished with value: 0.8192685350824886 and parameters: {'num_leaves': 255, 'min_child_samples': 31, 'max_depth': 16, 'learning_rate': 0.07472693274803258, 'subsample': 0.11875979830253572, 'colsample_bytree': 0.5465218229225322, 'reg_alpha': 0.1476444474743269, 'reg_lambda': 0.5661708489598729}. Best is trial 20 with value: 0.8209004174120452.[0m


[100]	valid_0's auc: 0.838932	valid_0's binary_logloss: 0.460704	valid_1's auc: 0.819269	valid_1's binary_logloss: 0.520975


[32m[I 2023-05-12 00:08:51,377][0m Trial 33 finished with value: 0.8187567084078713 and parameters: {'num_leaves': 201, 'min_child_samples': 19, 'max_depth': 48, 'learning_rate': 0.14013778390519474, 'subsample': 0.22141416011184445, 'colsample_bytree': 0.3945676721553484, 'reg_alpha': 0.06770604848438394, 'reg_lambda': 0.46533243259153273}. Best is trial 20 with value: 0.8209004174120452.[0m


[100]	valid_0's auc: 0.842306	valid_0's binary_logloss: 0.456196	valid_1's auc: 0.818757	valid_1's binary_logloss: 0.519979


[32m[I 2023-05-12 00:09:08,088][0m Trial 34 finished with value: 0.8180689723712979 and parameters: {'num_leaves': 326, 'min_child_samples': 51, 'max_depth': 36, 'learning_rate': 0.256045586869763, 'subsample': 0.2591362932775907, 'colsample_bytree': 0.6075624756140584, 'reg_alpha': 0.006740584238126179, 'reg_lambda': 0.6186470393992993}. Best is trial 20 with value: 0.8209004174120452.[0m


[100]	valid_0's auc: 0.859969	valid_0's binary_logloss: 0.434724	valid_1's auc: 0.818069	valid_1's binary_logloss: 0.520589


[32m[I 2023-05-12 00:09:26,027][0m Trial 35 finished with value: 0.8197445835817928 and parameters: {'num_leaves': 385, 'min_child_samples': 32, 'max_depth': 28, 'learning_rate': 0.19433557713789296, 'subsample': 0.10529356999389781, 'colsample_bytree': 0.5579229760375949, 'reg_alpha': 0.3142068300074361, 'reg_lambda': 0.5606296505087327}. Best is trial 20 with value: 0.8209004174120452.[0m


[100]	valid_0's auc: 0.858977	valid_0's binary_logloss: 0.436623	valid_1's auc: 0.819745	valid_1's binary_logloss: 0.518894


[32m[I 2023-05-12 00:09:42,283][0m Trial 36 finished with value: 0.7970552574040944 and parameters: {'num_leaves': 426, 'min_child_samples': 40, 'max_depth': 71, 'learning_rate': 0.5813326315258445, 'subsample': 0.3505608186557109, 'colsample_bytree': 0.6698819704954544, 'reg_alpha': 0.20550874442166583, 'reg_lambda': 0.48361598087365804}. Best is trial 20 with value: 0.8209004174120452.[0m


[100]	valid_0's auc: 0.888644	valid_0's binary_logloss: 0.394818	valid_1's auc: 0.797055	valid_1's binary_logloss: 0.55126


[32m[I 2023-05-12 00:09:58,228][0m Trial 37 finished with value: 0.8218376068376068 and parameters: {'num_leaves': 295, 'min_child_samples': 11, 'max_depth': 40, 'learning_rate': 0.309499654386327, 'subsample': 0.16944365603392822, 'colsample_bytree': 0.6179541243552774, 'reg_alpha': 0.0743213687587011, 'reg_lambda': 0.651200440144716}. Best is trial 37 with value: 0.8218376068376068.[0m


[100]	valid_0's auc: 0.862455	valid_0's binary_logloss: 0.431481	valid_1's auc: 0.821838	valid_1's binary_logloss: 0.516344


[32m[I 2023-05-12 00:10:14,883][0m Trial 38 finished with value: 0.8149592526336712 and parameters: {'num_leaves': 358, 'min_child_samples': 11, 'max_depth': 42, 'learning_rate': 0.3088898289205759, 'subsample': 0.17379999199516216, 'colsample_bytree': 0.5235953044208411, 'reg_alpha': 0.06312909788087778, 'reg_lambda': 0.6540928128667923}. Best is trial 37 with value: 0.8218376068376068.[0m


[100]	valid_0's auc: 0.86602	valid_0's binary_logloss: 0.427567	valid_1's auc: 0.814959	valid_1's binary_logloss: 0.524149


[32m[I 2023-05-12 00:10:33,195][0m Trial 39 finished with value: 0.8207582985489962 and parameters: {'num_leaves': 301, 'min_child_samples': 5, 'max_depth': 23, 'learning_rate': 0.06800336199567188, 'subsample': 0.17618747246753863, 'colsample_bytree': 0.7504221015295607, 'reg_alpha': 0.06899819469303779, 'reg_lambda': 0.5786596957311649}. Best is trial 37 with value: 0.8218376068376068.[0m


[100]	valid_0's auc: 0.840894	valid_0's binary_logloss: 0.458166	valid_1's auc: 0.820758	valid_1's binary_logloss: 0.518655


[32m[I 2023-05-12 00:10:52,107][0m Trial 40 finished with value: 0.8195776187636652 and parameters: {'num_leaves': 309, 'min_child_samples': 5, 'max_depth': 13, 'learning_rate': 0.05553020585726348, 'subsample': 0.19018331996895532, 'colsample_bytree': 0.8451992409674579, 'reg_alpha': 0.3336464763884882, 'reg_lambda': 0.6769146110368526}. Best is trial 37 with value: 0.8218376068376068.[0m


[100]	valid_0's auc: 0.837273	valid_0's binary_logloss: 0.462007	valid_1's auc: 0.819578	valid_1's binary_logloss: 0.520401


[32m[I 2023-05-12 00:11:10,464][0m Trial 41 finished with value: 0.8224587557145697 and parameters: {'num_leaves': 273, 'min_child_samples': 12, 'max_depth': 21, 'learning_rate': 0.09008688308426457, 'subsample': 0.16873213360536907, 'colsample_bytree': 0.7356898221081205, 'reg_alpha': 0.055959364666762546, 'reg_lambda': 0.6087755634865101}. Best is trial 41 with value: 0.8224587557145697.[0m


[100]	valid_0's auc: 0.842964	valid_0's binary_logloss: 0.455421	valid_1's auc: 0.822459	valid_1's binary_logloss: 0.516612


[32m[I 2023-05-12 00:11:28,715][0m Trial 42 finished with value: 0.8217809580600278 and parameters: {'num_leaves': 270, 'min_child_samples': 11, 'max_depth': 22, 'learning_rate': 0.08132603338541228, 'subsample': 0.14242490157800897, 'colsample_bytree': 0.7492966887587864, 'reg_alpha': 0.055922864120474314, 'reg_lambda': 0.5578629767536827}. Best is trial 41 with value: 0.8224587557145697.[0m


[100]	valid_0's auc: 0.84177	valid_0's binary_logloss: 0.456903	valid_1's auc: 0.821781	valid_1's binary_logloss: 0.517238


[32m[I 2023-05-12 00:11:46,590][0m Trial 43 finished with value: 0.8218594712780759 and parameters: {'num_leaves': 272, 'min_child_samples': 12, 'max_depth': 21, 'learning_rate': 0.08012491230797725, 'subsample': 0.1016165546135878, 'colsample_bytree': 0.7588747922940531, 'reg_alpha': 0.051213767570510826, 'reg_lambda': 0.6039036454876981}. Best is trial 41 with value: 0.8224587557145697.[0m


[100]	valid_0's auc: 0.841749	valid_0's binary_logloss: 0.456966	valid_1's auc: 0.821859	valid_1's binary_logloss: 0.517308


[32m[I 2023-05-12 00:12:00,709][0m Trial 44 finished with value: 0.8156807791691513 and parameters: {'num_leaves': 270, 'min_child_samples': 11, 'max_depth': 6, 'learning_rate': 0.2385813874226118, 'subsample': 0.10125215233622506, 'colsample_bytree': 0.8241905672141414, 'reg_alpha': 0.03163904406332628, 'reg_lambda': 0.6439137992351213}. Best is trial 41 with value: 0.8224587557145697.[0m


[100]	valid_0's auc: 0.833256	valid_0's binary_logloss: 0.46623	valid_1's auc: 0.815681	valid_1's binary_logloss: 0.525316


[32m[I 2023-05-12 00:12:17,892][0m Trial 45 finished with value: 0.8215802027429934 and parameters: {'num_leaves': 247, 'min_child_samples': 17, 'max_depth': 20, 'learning_rate': 0.11542636729384856, 'subsample': 0.14724854429745135, 'colsample_bytree': 0.7753200609308095, 'reg_alpha': 0.0008587291526337137, 'reg_lambda': 0.5999245650086961}. Best is trial 41 with value: 0.8224587557145697.[0m


[100]	valid_0's auc: 0.844373	valid_0's binary_logloss: 0.453702	valid_1's auc: 0.82158	valid_1's binary_logloss: 0.516986


[32m[I 2023-05-12 00:12:35,114][0m Trial 46 finished with value: 0.8154005167958656 and parameters: {'num_leaves': 169, 'min_child_samples': 16, 'max_depth': -1, 'learning_rate': 0.0360336915387833, 'subsample': 0.13770756586835373, 'colsample_bytree': 0.7683931813834635, 'reg_alpha': 0.04382750703914944, 'reg_lambda': 0.5992217719231276}. Best is trial 41 with value: 0.8224587557145697.[0m


[100]	valid_0's auc: 0.829165	valid_0's binary_logloss: 0.473496	valid_1's auc: 0.815401	valid_1's binary_logloss: 0.529485


[32m[I 2023-05-12 00:12:51,103][0m Trial 47 finished with value: 0.8177181474855894 and parameters: {'num_leaves': 259, 'min_child_samples': 11, 'max_depth': 18, 'learning_rate': 0.21327783062928907, 'subsample': 0.15745608863204752, 'colsample_bytree': 0.8900922152944375, 'reg_alpha': 0.000993090246550657, 'reg_lambda': 0.7139000729357876}. Best is trial 41 with value: 0.8224587557145697.[0m


[100]	valid_0's auc: 0.854196	valid_0's binary_logloss: 0.441958	valid_1's auc: 0.817718	valid_1's binary_logloss: 0.52152


[32m[I 2023-05-12 00:13:09,608][0m Trial 48 finished with value: 0.809802226197575 and parameters: {'num_leaves': 284, 'min_child_samples': 18, 'max_depth': 11, 'learning_rate': 0.013463816796995154, 'subsample': 0.10125272112157614, 'colsample_bytree': 0.7320009204336166, 'reg_alpha': 0.059314119999684174, 'reg_lambda': 0.5506406257276487}. Best is trial 41 with value: 0.8224587557145697.[0m


[100]	valid_0's auc: 0.824713	valid_0's binary_logloss: 0.501982	valid_1's auc: 0.809802	valid_1's binary_logloss: 0.567515


[32m[I 2023-05-12 00:13:23,372][0m Trial 49 finished with value: 0.8140419399721726 and parameters: {'num_leaves': 248, 'min_child_samples': 9, 'max_depth': 5, 'learning_rate': 0.2821966326585219, 'subsample': 0.13520435937332292, 'colsample_bytree': 0.8044734114427132, 'reg_alpha': 0.0001365143300456062, 'reg_lambda': 0.6155049249911634}. Best is trial 41 with value: 0.8224587557145697.[0m


[100]	valid_0's auc: 0.829943	valid_0's binary_logloss: 0.470004	valid_1's auc: 0.814042	valid_1's binary_logloss: 0.527023


In [16]:
model_trial = optuna_model.best_trial
model_trial_params = model_trial.params
print('Best Trial: score {},\nparams {}'.format(model_trial.value, model_trial_params))

Best Trial: score 0.8224587557145697,
params {'num_leaves': 273, 'min_child_samples': 12, 'max_depth': 21, 'learning_rate': 0.09008688308426457, 'subsample': 0.16873213360536907, 'colsample_bytree': 0.7356898221081205, 'reg_alpha': 0.055959364666762546, 'reg_lambda': 0.6087755634865101}


In [17]:
# Modeling fit
model = LGBMClassifier(**model_trial_params, verbose=0)
model.fit(train[FEATS], y_train)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.


## 5. Inference

In [18]:
# LOAD TESTDATA
test_csv_file_path = os.path.join(data_dir, 'test_data.csv')
test_df = pd.read_csv(test_csv_file_path, dtype=dtype, parse_dates=['Timestamp'])

# FEATURE ENGINEERING
test_df = feature_engineering(test_df)
test_df = feature_engineering_test(test_df)


# LEAVE LAST INTERACTION ONLY
test_df = test_df[test_df['userID'] != test_df['userID'].shift(-1)]

# DROP ANSWERCODE
test_df = test_df.drop(['answerCode'], axis=1)




In [19]:
# print(set(test_df['assessmentItemID'].unique()) - set(df['assessmentItemID'].unique()))
# print(set(test_df['userID'].unique()) - set(df['userID'].unique()))
# print(set(test_df['testId'].unique()) - set(df['testId'].unique()))
# print(set(test_df['KnowledgeTag'].unique()) - set(df['KnowledgeTag'].unique()))

In [20]:
# MAKE PREDICTION
total_preds = model.predict_proba(test_df[FEATS])[:, 1]

In [22]:
# SAVE OUTPUT
output_dir = 'output/'
write_path = os.path.join(output_dir, "submission.csv")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
with open(write_path, 'w', encoding='utf8') as w:
    print("writing prediction : {}".format(write_path))
    w.write("id,prediction\n")
    for id, p in enumerate(total_preds):
        w.write('{},{}\n'.format(id,p))

writing prediction : output/submission.csv
