In [44]:
import pandas as pd
import numpy as np
import datetime as dt
import optuna
import statsmodels.api as sm
import datetime
import re
import ray

from collections import defaultdict
from optuna.integration.wandb import WeightsAndBiasesCallback
from optuna.samplers import TPESampler
from statsmodels.tsa.seasonal import STL
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, ElasticNet, ridge_regression, SGDRegressor, RANSACRegressor, SGDOneClassSVM
from sklearn.naive_bayes import BernoulliNB, CategoricalNB, ComplementNB, GaussianNB, MultinomialNB
from sklearn.svm import SVR
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import log_loss, roc_auc_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, CatBoostRegressor
import lightgbm as lgb

from tqdm import tqdm
import warnings
import random
import os
seed=777
os.environ['PYTHONHASHSEED']=str(seed)
random.seed(seed)
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
pd.set_option('mode.chained_assignment',  None)
sampler = TPESampler(seed=seed)

train_df = pd.read_csv("train_feature_engineering.csv")
test_df = pd.read_csv("test_feature_engineering.csv")
total = pd.read_csv("total.csv")

# total = pd.read_csv("sequential_total.csv")
# test_df = pd.read_csv("sequential_test.csv")

total = total.loc[total["answerCode"] != -1]
test_df = test_df.loc[test_df['answerCode'] == -1]

def extract_datetime(df):
    df['month'] = pd.to_datetime(df['Timestamp']).apply(lambda x : x.month)
    df['day'] = pd.to_datetime(df['Timestamp']).apply(lambda x : x.day)
    df['hour'] = pd.to_datetime(df['Timestamp']).apply(lambda x : x.hour)
    df['minute'] = pd.to_datetime(df['Timestamp']).apply(lambda x : x.minute)
    df['second'] = pd.to_datetime(df['Timestamp']).apply(lambda x : x.second)
    df.drop(['Timestamp'], axis=1, inplace=True)
    return df

def extract_testId(df):
    df['testClass'] = df['testId'].apply(lambda x : int(x[2]))
    df['testCode'] = df['testId'].apply(lambda x : int(x[7:]))
    df.drop(['testId'], axis=1, inplace=True)
    return df

def extract_assessmentItemID(df):
    df['assessmentItemCode'] = df['assessmentItemID'].apply(lambda x : int(x[7:]))
    df.drop(['assessmentItemID'], axis=1, inplace=True)
    return df

# train_df = extract_datetime(train_df)
# test_df = extract_datetime(test_df)
# train_df = extract_testId(train_df)
# test_df = extract_testId(test_df)
# train_df = extract_assessmentItemID(train_df)
# test_df = extract_assessmentItemID(test_df)

# cut = int(len(train_df)*0.8)
# h_train = train_df[:cut]
# h_valid = train_df[cut:]

# h_train_X = h_train.drop('answerCode', axis=1)
# h_train_y = h_train['answerCode']
# h_valid_X = h_valid.drop('answerCode', axis=1)
# h_valid_y = h_valid['answerCode']

train_X = train_df.drop(['answerCode', 'assessmentItemID', 'testId', 'Timestamp', 'relative_answered_correctly'], axis=1)
train_y = train_df[['userID', 'answerCode']]
test_X = test_df.drop(['answerCode', 'assessmentItemID', 'testId', 'Timestamp', 'relative_answered_correctly'], axis=1)

# total = pd.concat([total.iloc[:, 0:9], total.iloc[:, 27:]], axis=1)
# test_df = pd.concat([test_df.iloc[:, 0:9], test_df.iloc[:, 27:]], axis=1)

# train_X = total.drop(['userID', 'answerCode', 'assessmentItemID', 'testId', 'Timestamp'], axis=1)
# train_y = total[['userID', 'answerCode']]
# test_X = test_df.drop(['userID', 'answerCode', 'assessmentItemID', 'testId', 'Timestamp'], axis=1)

h_train_X, h_valid_X, h_train_y, h_valid_y = train_test_split(train_X, train_y['answerCode'], test_size=0.1, stratify=train_y['answerCode'], random_state=seed)

print(h_train_X.shape, h_train_y.shape, h_valid_X.shape, h_valid_y.shape)

(2039927, 22) (2039927,) (226659, 22) (226659,)


In [42]:
h_train_X.columns

Index(['userID', 'KnowledgeTag', 'assessmentItemID_last', 'testId_first',
       'testId_last', 'elapsed', 'accuracy_by_assessment', 'accuracy_by_test',
       'accuracy_by_tag', 'accuracy_by_assessment_last',
       'accuracy_by_test_first', 'accuracy_by_test_last',
       'relative_answered_correctly', 'prior_ac_count', 'prior_quest_count',
       'prior_ac_accuracy', 'prior_relative_ac_sum', 'prior_relative_accuracy',
       'prior_assessment_frequency', 'prior_test_frequency',
       'prior_tags_frequency', 'diff_time_btw_tags', 'prev_tag_answer'],
      dtype='object')

In [7]:
h_train_X.columns
for col in h_train_X.columns:
    if 'accuracy_by' in col:
        h_train_X.drop(col, axis=1, inplace=True)
        h_valid_X.drop(col, axis=1, inplace=True)

In [8]:
h_train_X.columns

Index(['KnowledgeTag', 'assessmentItemID_last', 'testId_first', 'testId_last',
       'trend_elapsed', 'seasonal_elapsed', 'resid_elapsed',
       'trend_prior_ac_count', 'seasonal_prior_ac_count',
       'resid_prior_ac_count', 'trend_prior_quest_count',
       'seasonal_prior_quest_count', 'resid_prior_quest_count',
       'trend_prior_ac_accuracy', 'seasonal_prior_ac_accuracy',
       'resid_prior_ac_accuracy', 'trend_prior_relative_ac_sum',
       'seasonal_prior_relative_ac_sum', 'resid_prior_relative_ac_sum',
       'trend_prior_relative_accuracy', 'seasonal_prior_relative_accuracy',
       'resid_prior_relative_accuracy', 'trend_prior_assessment_frequency',
       'seasonal_prior_assessment_frequency',
       'resid_prior_assessment_frequency', 'trend_prior_test_frequency',
       'seasonal_prior_test_frequency', 'resid_prior_test_frequency',
       'trend_prior_tags_frequency', 'seasonal_prior_tags_frequency',
       'resid_prior_tags_frequency', 'trend_diff_time_btw_tags',
   

## Decomposition

In [12]:
# total_decompose_col = defaultdict(lambda: np.zeros(len(total)))
# ignore = ['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp', 'KnowledgeTag', 'assessmentItemID_last', 'testId_first', 'testId_last', 'relative_answered_correctly']
# for key, group in tqdm(total.groupby(by=["userID"]), total=total['userID'].nunique()):
#     indices = group.index
#     for idx, col in enumerate(group.columns):
#         if not (col in ignore or re.findall('trend|seasonal|resid', col)):
#             res = STL(group[col], period=60).fit()
#             total_decompose_col[f'trend_{col}'][indices] = res.trend.values
#             total_decompose_col[f'seasonal_{col}'][indices] = res.seasonal.values
#             total_decompose_col[f'resid_{col}'][indices] = res.resid.values
            
# new_total = pd.concat([total, pd.DataFrame(total_decompose_col)], axis=1)
# new_total.to_csv("sequential_total.csv", index=False)
# new_total

  1%|          | 60/7442 [02:09<3:38:39,  1.78s/it]

## Forward Selection

In [52]:
# 아래 함수는 로지스틱 회귀를 위한 전진선택법 함수 입니다.
def forward_stepwise_regression(x_train, y_train):

    # 변수목록, 선택된 변수 목록, 단계별 모델과 AIC 저장소 정의
    features = list(x_train)
    selected = []
    step_df = pd.DataFrame({ 'step':[], 'feature':[],'aic':[]})

    # 
    for s in tqdm(range(0, len(features))) :
        result =  { 'step':[], 'feature':[],'aic':[]}

        # 변수 목록에서 변수 한개씩 뽑아서 모델에 추가
        for f in features :
            vars = selected + [f]
            x_tr = x_train[vars]
            model = sm.OLS(y_train, x_tr).fit()
            result['step'].append(s+1)
            result['feature'].append(vars)
            result['aic'].append(model.aic)
        
        # 모델별 aic 집계
        temp = pd.DataFrame(result).sort_values('aic').reset_index(drop = True)
        
        # 만약 이전 aic보다 새로운 aic 가 크다면 멈추기
        if step_df['aic'].min() < temp['aic'].min() :
            break
        step_df = pd.concat([step_df, temp], axis = 0).reset_index(drop = True)

        # 선택된 변수 제거
        v = temp.loc[0,'feature'][s]
        features.remove(v)

        selected.append(v)
    
    # 선택된 변수와 step_df 결과 반환
    return selected, step_df

features, result = forward_stepwise_regression(h_train_X, h_train_y)

pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)

 98%|█████████▊| 54/55 [1:34:06<01:44, 104.57s/it]


In [53]:
train_X.columns, len(train_X.columns)
set(h_train_X.columns) - set(features)

{'seasonal_accuracy_by_tag'}

In [24]:
nt = pd.concat([h_train_X.iloc[:, :5], h_train_X.iloc[:, 22:]], axis=1)
nv = pd.concat([h_valid_X.iloc[:, :5], h_valid_X.iloc[:, 22:]], axis=1)
m1 = LinearRegression()
m1.fit(nt, h_train_y)
p1 = m1.predict(nv)

print(m1.coef_)
print(accuracy_score(h_valid_y, np.where(p1 > 0.5, 1, 0)))
print(roc_auc_score(h_valid_y, p1))
print(classification_report(h_valid_y, np.where(p1 > 0.5, 1, 0)))

[-6.30055633e-07  7.02585874e-06  1.16222987e-02 -4.00802945e-04
  4.37596084e-05  1.98934622e-03  2.93536597e-02  2.84672447e-02
  1.05157175e+00  1.05241908e+00  1.03720081e+00 -9.33494039e-01
 -1.19892188e-01 -9.94245981e-02 -2.86054395e-02 -8.15118124e-03
 -2.94443181e-02  5.01969891e-02 -3.51579306e-01 -3.59062332e-01
 -1.59586420e-01 -6.62212069e-02 -6.63421904e-02 -4.21559188e-02
  2.17484395e-02  3.91650231e-02 -3.61184466e-02  3.18352374e-02
  8.55464831e-02  3.47768721e-02 -4.12970541e-02 -7.44496546e-02
  6.53020952e-02 -2.13334563e-02 -1.02240067e-01 -1.09291096e-04
 -3.61078439e-02 -4.25307638e-02  8.64821695e-02  7.69605915e-02
  2.64675108e-02  1.54534984e-01  4.52728791e-02  2.42721498e-02
 -7.78606590e-02 -1.01207995e-01 -9.87393335e-02 -1.21525983e-01
  2.89793094e-03 -2.29129212e-03  1.22888392e-02  8.39047750e-03
  7.29033866e-03  1.16538394e+00  1.44449379e-01  1.37814312e-01]
0.7926910860175749
0.8498385694673957
              precision    recall  f1-score   suppo

In [24]:
m1 = LinearRegression()
m1.fit(h_train_X, h_train_y)
p1 = m1.predict(h_valid_X)

print(accuracy_score(h_valid_y, np.where(p1 > 0.5, 1, 0)))
print(roc_auc_score(h_valid_y, p1))
print(classification_report(h_valid_y, np.where(p1 > 0.5, 1, 0)))

0.792049359451456
0.8492645021231646
              precision    recall  f1-score   support

         0.0       0.75      0.60      0.67    174474
         1.0       0.81      0.89      0.85    330718

    accuracy                           0.79    505192
   macro avg       0.78      0.75      0.76    505192
weighted avg       0.79      0.79      0.79    505192



In [55]:
m2 = LinearRegression()
m2.fit(h_train_X[features], h_train_y)
p2 = m2.predict(h_valid_X[features])

print(accuracy_score(h_valid_y, np.where(p2 > 0.5, 1, 0)))
print(roc_auc_score(h_valid_y, p2))
print(classification_report(h_valid_y, np.where(p2 > 0.5, 1, 0)))

0.7926646933768988
0.8498357590095279
              precision    recall  f1-score   support

         0.0       0.75      0.61      0.67    261710
         1.0       0.81      0.89      0.85    496077

    accuracy                           0.79    757787
   macro avg       0.78      0.75      0.76    757787
weighted avg       0.79      0.79      0.79    757787



## Linear model parameter search

In [None]:
def objective(trial):
    param = {
        'tol' : trial.suggest_uniform('tol' , 1e-6 , 1e-3),
        'C' : trial.suggest_loguniform("C", 1e-4, 1),
        'solver' : trial.suggest_categorical('solver' , ['lbfgs','liblinear']),
        'max_iter' : trial.suggest_int('max_iter', 1000, 10000),
    }
    model = LogisticRegression(**param, random_state=seed)
    # LR_model = model.fit(h_train_X, h_train_y)
    # loss = roc_auc_score(h_valid_y, LR_model.predict_proba(h_valid_X)[:, 1])
    LR_model = model.fit(nt, h_train_y)
    loss = roc_auc_score(h_valid_y, LR_model.predict_proba(nv)[:, 1])
    
    return loss
study_LR = optuna.create_study(direction='maximize', sampler=sampler)
study_LR.optimize(objective, n_trials=1000)

In [3]:
model = LinearRegression()
LR_model = model.fit(h_train_X, h_train_y)
preds = LR_model.predict(h_valid_X)
loss = roc_auc_score(h_valid_y, preds)
loss

0.8162293044025792

In [None]:
def objective(trial):
    param = {
        'tol' : trial.suggest_uniform('tol' , 1e-6 , 1.0),
        'max_iter' : trial.suggest_int('max_iter', 1000, 10000),
        'selection' : trial.suggest_categorical('selection' , ['cyclic','random']),
    }
    model = Lasso(**param, random_state=777)
    Lasso_model = model.fit(h_train_X, h_train_y)
    preds = Lasso_model.predict_proba(h_valid_X)[:, 1]
    # pred_labels = np.rint(preds)
    loss = roc_auc_score(h_valid_y, preds)
    return loss
    
study_Lasso = optuna.create_study(direction='maximize', sampler=sampler)
study_Lasso.optimize(objective, n_trials=1000)

In [20]:
def objective(trial):
    param = {
        'tol' : trial.suggest_uniform('tol' , 1e-6 , 1.0),
        'max_iter' : trial.suggest_int('max_iter', 1000, 10000),
        'selection' : trial.suggest_categorical('selection' , ['cyclic','random']),
        'l1_ratio' : trial.suggest_uniform('l1_ratio' , 1e-6 , 1.0),
        'alpha' : trial.suggest_uniform('alpha' , 1e-6 , 2.0),
    }
    random.seed(777)
    model = ElasticNet(**param, random_state=777)
    ElasticNet_model = model.fit(h_train_X, h_train_y)
    preds = ElasticNet_model.predict(h_valid_X)
    # pred_labels = np.rint(preds)
    loss = roc_auc_score(h_valid_y, preds)
    return loss
study_ElasticNet = optuna.create_study(direction='maximize', sampler=sampler)
study_ElasticNet.optimize(objective, n_trials=1000)

[32m[I 2022-11-28 15:17:54,888][0m A new study created in memory with name: no-name-a821e5ad-559f-4678-808c-de85783bbf88[0m
[32m[I 2022-11-28 15:17:56,186][0m Trial 0 finished with value: 0.7393077021129613 and parameters: {'tol': 0.7269892573300858, 'max_iter': 7917, 'selection': 'random', 'l1_ratio': 0.09337416345597153, 'alpha': 0.15937269153708025}. Best is trial 0 with value: 0.7393077021129613.[0m
[32m[I 2022-11-28 15:17:57,313][0m Trial 1 finished with value: 0.6290944276840383 and parameters: {'tol': 0.5896141638421108, 'max_iter': 4090, 'selection': 'cyclic', 'l1_ratio': 0.6817795980016734, 'alpha': 1.104514075217506}. Best is trial 0 with value: 0.7393077021129613.[0m
[32m[I 2022-11-28 15:17:58,384][0m Trial 2 finished with value: 0.6355215439610826 and parameters: {'tol': 0.26886078902820126, 'max_iter': 4359, 'selection': 'cyclic', 'l1_ratio': 0.39064870280554764, 'alpha': 0.3863256181112427}. Best is trial 0 with value: 0.7393077021129613.[0m
[32m[I 2022-11-28

In [None]:
def objective(trial):
    param = {
        'tol' : trial.suggest_uniform('tol' , 1e-6 , 1.0),
        'max_iter' : trial.suggest_int('max_iter', 1000, 10000),
        'loss' : trial.suggest_categorical('loss' , ['squared_error','huber', 'epsilon_insensitive', 'squared_epsilon_insensitive']),
        'penalty' : 'elasticnet',
        'l1_ratio' : trial.suggest_uniform('l1_ratio' , 1e-6 , 1.0),
        'alpha' : trial.suggest_uniform('alpha' , 1e-6 , 2.0),
        'learning_rate' : trial.suggest_categorical('learning_rate' , ['constant','optimal', 'invscaling', 'adaptive']),
        'eta0' : trial.suggest_uniform('eta0' , 1e-6 , 1.0),
        'power_t' : trial.suggest_uniform('power_t' , 1e-6 , 1.0),
        'early_stopping' : True,
        'validation_fraction' : 0.2,
        'n_iter_no_change' : trial.suggest_int('n_iter_no_change', 5, 50),
    }
    if param['loss'] in ['huber', 'epsilon_insensitive', 'squared_epsilon_insensitive']:
        param['epsilon'] = trial.suggest_uniform('epsilon' , 1e-6 , 1.0)

    model = RANSACRegressor(**param, random_state=777)
    RANSAC_model = model.fit(h_train_X, h_train_y)
    preds = RANSAC_model.predict(h_valid_X)
    pred_labels = np.rint(preds)
    loss = roc_auc_score(h_valid_y, pred_labels)
    return loss
study_RANSAC = optuna.create_study(direction='maximize', sampler=sampler)
study_RANSAC.optimize(objective, n_trials=1000)

In [None]:
def objective(trial):
    param = {
        'nu' : trial.suggest_uniform('nu' , 0.0 , 1.0),
        'max_iter' : trial.suggest_int('max_iter', 1000, 10000),
        'tol' : trial.suggest_uniform('tol' , 1e-6 , 1.0),
        'learning_rate' : trial.suggest_categorical('learning_rate' , ['constant','optimal', 'invscaling', 'adaptive']),
        'eta0' : trial.suggest_uniform('eta0' , 1e-6 , 1.0),
        'power_t' : trial.suggest_uniform('power_t' , 1e-6 , 1.0)
    }
    random.seed(777)
    model = SGDOneClassSVM(**param, random_state=777)
    SGDOneClass_model = model.fit(h_train_X, h_train_y)
    preds = SGDOneClass_model.predict(h_valid_X)
    loss = roc_auc_score(h_valid_y, preds)
    return loss
study_SGDOneClass = optuna.create_study(direction='maximize', sampler=sampler)
study_SGDOneClass.optimize(objective, n_trials=1000)

## Support vector model paramter search

In [None]:
def objective(trial):
    param = {
        'alpha' : trial.suggest_uniform('alpha' , 0.0 , 1.0),
    }
    model = BernoulliNB(**param)
    Bernoulli_model = model.fit(h_train_X, h_train_y)
    preds = Bernoulli_model.predict(h_valid_X)
    pred_labels = np.rint(preds)
    loss = roc_auc_score(h_valid_y, pred_labels)
    return loss
study_Bernoulli = optuna.create_study(direction='maximize', sampler=sampler)
study_Bernoulli.optimize(objective, n_trials=1000)

In [2]:
def objective(trial):
    param = {
        'tol' : trial.suggest_uniform('tol' , 1e-6 , 1.0),
        # 'max_iter' : trial.suggest_int('max_iter', 1000, 10000),
        'kernel' : trial.suggest_categorical('kernel' , ['linear', 'poly', 'rbf', 'sigmoid']),
        'gamma' : trial.suggest_categorical('gamma' , ['scale', 'auto']),
        'tol' : trial.suggest_uniform('tol' , 1e-6 , 1),
        'C' : trial.suggest_uniform('alpha' , 1e-6 , 10),
        'epsilon' : trial.suggest_uniform('epsilon' , 1e-6 , 1.0),
    }
    if param['kernel'] in ['poly', 'sigmoid']:
        if param['kernel'] == 'poly':
            param['degree'] = trial.suggest_int('degree', 1, 10)
        param['coef0'] = trial.suggest_uniform('l1_ratio' , 1e-6 , 1.0)
    
    model = SVR(**param)
    SVR_model = model.fit(h_train_X, h_train_y)
    preds = SVR_model.predict(h_valid_X)
    pred_labels = np.rint(preds)
    loss = roc_auc_score(h_valid_y, pred_labels)
    return loss
study_SVR = optuna.create_study(direction='maximize', sampler=sampler)
study_SVR.optimize(objective, n_trials=1000)

[32m[I 2022-11-26 15:02:01,774][0m A new study created in memory with name: no-name-6c8a18e4-34ed-4118-bdae-4e5a252efcdf[0m


## LGBM parameter serach

In [2]:
import wandb
import lightgbm as lgb
wandb_kwargs = {"project": "lgb-optuna"}
wandbc = WeightsAndBiasesCallback(wandb_kwargs=wandb_kwargs)

@wandbc.track_in_wandb()
def objective(trial):
    param = {
        # 'objective': 'binary', # 이진 분류
        "objective": trial.suggest_categorical("objective", ["binary", "cross_entropy"]),
        'verbose': -1,
        'metric': 'AUC',
        'max_depth': trial.suggest_int('max_depth',3, 15),
        'learning_rate': trial.suggest_loguniform("learning_rate", 0.001, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 100, 10000),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        # 'subsample': trial.suggest_loguniform('subsample', 0.4, 1),
        'lambda_l1' : trial.suggest_loguniform('lambda_l1', 1e-8, 1e-4),
        'lambda_l2' : trial.suggest_loguniform('lambda_l2', 1e-8, 1e-4),
        'path_smooth' : trial.suggest_loguniform('path_smooth', 1e-8, 1e-3),
        'num_leaves' : trial.suggest_int('num_leaves', 30, 200),
        'min_data_in_leaf' : trial.suggest_int('min_data_in_leaf', 10, 100),
        'max_bin' : trial.suggest_int('max_bin', 100, 255),
        'feature_fraction' : trial.suggest_uniform('feature_fraction', 0.5, 0.9),
        'bagging_fraction' : trial.suggest_uniform('bagging_fraction', 0.5, 0.9),
        # 'device' : 'gpu',
        # 'reg_alpha' : None,
    }
    categorical = [0, 1, 2, 3, ]
    model = lgb.LGBMRegressor(**param, categorical_feature=categorical, random_state=seed)
    lgb_model = model.fit(h_train_X, h_train_y, eval_set=[(h_valid_X, h_valid_y)], verbose=0, early_stopping_rounds=25)
    loss = roc_auc_score(h_valid_y, lgb_model.predict(h_valid_X))
    return loss
        
study_lgb = optuna.create_study(direction='maximize', sampler=sampler)
study_lgb.optimize(objective, n_trials=100, callbacks=[wandbc])

  wandbc = WeightsAndBiasesCallback(wandb_kwargs=wandb_kwargs)
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjmkim_[0m. Use [1m`wandb login --relogin`[0m to force relogin


  @wandbc.track_in_wandb()
[32m[I 2022-11-30 17:33:16,309][0m A new study created in memory with name: no-name-351beb0e-f300-4864-962b-78f3a2b6a736[0m




[32m[I 2022-11-30 18:12:55,699][0m Trial 0 finished with value: 0.8621601557256083 and parameters: {'objective': 'cross_entropy', 'max_depth': 3, 'learning_rate': 0.023965198184953764, 'n_estimators': 8369, 'min_child_samples': 93, 'lambda_l1': 8.09013813952634e-06, 'lambda_l2': 1.1857274872193908e-05, 'path_smooth': 2.2183258107863225e-07, 'num_leaves': 140, 'min_data_in_leaf': 18, 'max_bin': 112, 'feature_fraction': 0.7358455013823457, 'bagging_fraction': 0.6373362152887747}. Best is trial 0 with value: 0.8621601557256083.[0m




[32m[I 2022-11-30 18:24:58,977][0m Trial 1 finished with value: 0.8654467835998472 and parameters: {'objective': 'binary', 'max_depth': 11, 'learning_rate': 0.04537017362092559, 'n_estimators': 2761, 'min_child_samples': 40, 'lambda_l1': 7.793138524702794e-08, 'lambda_l2': 5.568912592525814e-08, 'path_smooth': 8.979258351381595e-07, 'num_leaves': 63, 'min_data_in_leaf': 65, 'max_bin': 237, 'feature_fraction': 0.7489355297282327, 'bagging_fraction': 0.6012475776818278}. Best is trial 1 with value: 0.8654467835998472.[0m




[32m[I 2022-11-30 18:46:37,283][0m Trial 2 finished with value: 0.8652564369668044 and parameters: {'objective': 'cross_entropy', 'max_depth': 5, 'learning_rate': 0.03552805397960183, 'n_estimators': 5233, 'min_child_samples': 62, 'lambda_l1': 1.3504606405290654e-06, 'lambda_l2': 1.1304209523340012e-08, 'path_smooth': 4.173364449604953e-06, 'num_leaves': 183, 'min_data_in_leaf': 80, 'max_bin': 119, 'feature_fraction': 0.6183490776349688, 'bagging_fraction': 0.7448094317092934}. Best is trial 1 with value: 0.8654467835998472.[0m




[32m[I 2022-11-30 20:56:06,866][0m Trial 3 finished with value: 0.8663504281708264 and parameters: {'objective': 'binary', 'max_depth': 12, 'learning_rate': 0.003757448986688627, 'n_estimators': 5623, 'min_child_samples': 57, 'lambda_l1': 7.742869968546266e-07, 'lambda_l2': 1.4707533372708972e-05, 'path_smooth': 3.769193132689633e-08, 'num_leaves': 146, 'min_data_in_leaf': 42, 'max_bin': 153, 'feature_fraction': 0.6798074999663026, 'bagging_fraction': 0.5107769041950028}. Best is trial 3 with value: 0.8663504281708264.[0m




[32m[I 2022-11-30 21:43:58,862][0m Trial 4 finished with value: 0.8621089618533742 and parameters: {'objective': 'cross_entropy', 'max_depth': 4, 'learning_rate': 0.008817903457666037, 'n_estimators': 5327, 'min_child_samples': 36, 'lambda_l1': 6.246865795612923e-07, 'lambda_l2': 1.1623519829192828e-08, 'path_smooth': 3.0558674531010326e-08, 'num_leaves': 148, 'min_data_in_leaf': 86, 'max_bin': 166, 'feature_fraction': 0.8395097215390547, 'bagging_fraction': 0.7187164843565222}. Best is trial 3 with value: 0.8663504281708264.[0m




[32m[I 2022-11-30 21:47:41,841][0m Trial 5 finished with value: 0.8604723149828511 and parameters: {'objective': 'cross_entropy', 'max_depth': 4, 'learning_rate': 0.4770637172354527, 'n_estimators': 3429, 'min_child_samples': 90, 'lambda_l1': 1.5395577746201769e-07, 'lambda_l2': 1.6091375151327235e-07, 'path_smooth': 0.00010744916431686771, 'num_leaves': 173, 'min_data_in_leaf': 67, 'max_bin': 148, 'feature_fraction': 0.5117794317165487, 'bagging_fraction': 0.6599109264908691}. Best is trial 3 with value: 0.8663504281708264.[0m




[32m[I 2022-11-30 21:49:20,187][0m Trial 6 finished with value: 0.8571911912008306 and parameters: {'objective': 'binary', 'max_depth': 13, 'learning_rate': 0.3659431202433013, 'n_estimators': 1402, 'min_child_samples': 32, 'lambda_l1': 2.0246746967152848e-07, 'lambda_l2': 4.229005234665855e-05, 'path_smooth': 0.0009949418900135276, 'num_leaves': 161, 'min_data_in_leaf': 13, 'max_bin': 250, 'feature_fraction': 0.8900849833714146, 'bagging_fraction': 0.5810067212290936}. Best is trial 3 with value: 0.8663504281708264.[0m




[32m[I 2022-11-30 21:52:36,408][0m Trial 7 finished with value: 0.8636718817542025 and parameters: {'objective': 'binary', 'max_depth': 14, 'learning_rate': 0.15297555478638752, 'n_estimators': 2534, 'min_child_samples': 6, 'lambda_l1': 6.667380713006627e-06, 'lambda_l2': 1.2529435095128308e-05, 'path_smooth': 2.316717147746947e-06, 'num_leaves': 167, 'min_data_in_leaf': 25, 'max_bin': 212, 'feature_fraction': 0.647150360092577, 'bagging_fraction': 0.712837180422895}. Best is trial 3 with value: 0.8663504281708264.[0m




[32m[I 2022-11-30 22:10:32,553][0m Trial 8 finished with value: 0.8662096078923768 and parameters: {'objective': 'cross_entropy', 'max_depth': 12, 'learning_rate': 0.015686409016513518, 'n_estimators': 1147, 'min_child_samples': 43, 'lambda_l1': 1.2242096024682502e-06, 'lambda_l2': 2.6549112632588423e-08, 'path_smooth': 0.0001851236657325161, 'num_leaves': 118, 'min_data_in_leaf': 65, 'max_bin': 229, 'feature_fraction': 0.5067569163275346, 'bagging_fraction': 0.8081208008262204}. Best is trial 3 with value: 0.8663504281708264.[0m




[32m[I 2022-11-30 22:12:43,605][0m Trial 9 finished with value: 0.8593726641350229 and parameters: {'objective': 'binary', 'max_depth': 4, 'learning_rate': 0.4616040032583131, 'n_estimators': 1156, 'min_child_samples': 28, 'lambda_l1': 1.6997150743496004e-05, 'lambda_l2': 2.558321068797796e-08, 'path_smooth': 2.462786506980637e-08, 'num_leaves': 89, 'min_data_in_leaf': 86, 'max_bin': 110, 'feature_fraction': 0.7544202416604087, 'bagging_fraction': 0.7681060138388102}. Best is trial 3 with value: 0.8663504281708264.[0m




[32m[I 2022-12-01 00:10:03,609][0m Trial 10 finished with value: 0.8602919919955238 and parameters: {'objective': 'binary', 'max_depth': 9, 'learning_rate': 0.0011346718284596584, 'n_estimators': 8225, 'min_child_samples': 67, 'lambda_l1': 1.1701422302987875e-08, 'lambda_l2': 1.514244546950089e-06, 'path_smooth': 1.0221500021102691e-07, 'num_leaves': 40, 'min_data_in_leaf': 41, 'max_bin': 197, 'feature_fraction': 0.6128815889065602, 'bagging_fraction': 0.5019513328033253}. Best is trial 3 with value: 0.8663504281708264.[0m




[32m[I 2022-12-01 01:54:38,681][0m Trial 11 finished with value: 0.8660157189580127 and parameters: {'objective': 'cross_entropy', 'max_depth': 10, 'learning_rate': 0.0029055609921674504, 'n_estimators': 6252, 'min_child_samples': 54, 'lambda_l1': 2.0281502771134802e-06, 'lambda_l2': 1.1905119852415031e-06, 'path_smooth': 4.382195681565747e-05, 'num_leaves': 114, 'min_data_in_leaf': 45, 'max_bin': 153, 'feature_fraction': 0.5109657623647793, 'bagging_fraction': 0.881378823619513}. Best is trial 3 with value: 0.8663504281708264.[0m




[32m[I 2022-12-01 03:08:14,174][0m Trial 12 finished with value: 0.8670172167848542 and parameters: {'objective': 'binary', 'max_depth': 12, 'learning_rate': 0.006303437371629695, 'n_estimators': 6876, 'min_child_samples': 71, 'lambda_l1': 8.986040465602197e-05, 'lambda_l2': 1.9681978581061574e-07, 'path_smooth': 2.5772018880316357e-05, 'num_leaves': 118, 'min_data_in_leaf': 46, 'max_bin': 193, 'feature_fraction': 0.5651388533061754, 'bagging_fraction': 0.8404330925762327}. Best is trial 12 with value: 0.8670172167848542.[0m




[32m[I 2022-12-01 05:14:04,395][0m Trial 13 finished with value: 0.8668999258965717 and parameters: {'objective': 'binary', 'max_depth': 15, 'learning_rate': 0.004221895748020529, 'n_estimators': 6935, 'min_child_samples': 77, 'lambda_l1': 4.750446113986196e-05, 'lambda_l2': 2.973793812356837e-07, 'path_smooth': 8.981452112922886e-06, 'num_leaves': 118, 'min_data_in_leaf': 34, 'max_bin': 186, 'feature_fraction': 0.5738040144973052, 'bagging_fraction': 0.8873814318645048}. Best is trial 12 with value: 0.8670172167848542.[0m




[33m[W 2022-12-01 06:17:35,767][0m Trial 14 failed because of the following error: KeyboardInterrupt()[0m
Traceback (most recent call last):
  File "/opt/conda/envs/dkt/lib/python3.8/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/opt/conda/envs/dkt/lib/python3.8/site-packages/optuna/integration/wandb.py", line 228, in wrapper
    return func(trial)
  File "/tmp/ipykernel_90738/1542862549.py", line 31, in objective
    lgb_model = model.fit(h_train_X, h_train_y, eval_set=[(h_valid_X, h_valid_y)], verbose=0, early_stopping_rounds=25)
  File "/opt/conda/envs/dkt/lib/python3.8/site-packages/lightgbm/sklearn.py", line 895, in fit
    super().fit(X, y, sample_weight=sample_weight, init_score=init_score,
  File "/opt/conda/envs/dkt/lib/python3.8/site-packages/lightgbm/sklearn.py", line 748, in fit
    self._Booster = train(
  File "/opt/conda/envs/dkt/lib/python3.8/site-packages/lightgbm/engine.py", line 292, in train
    booste

KeyboardInterrupt: 

In [4]:
lgbm_param = {'objective': 'binary', 'max_depth': 12, 'learning_rate': 0.006303437371629695, 'n_estimators': 6876, 'min_child_samples': 71, 'lambda_l1': 8.986040465602197e-05, 'lambda_l2': 1.9681978581061574e-07, 'path_smooth': 2.5772018880316357e-05, 'num_leaves': 118, 'min_data_in_leaf': 46, 'max_bin': 193, 'feature_fraction': 0.5651388533061754, 'bagging_fraction': 0.8404330925762327}
categorical = [0, 1, 2, 3, ]
model = lgb.LGBMRegressor(**lgbm_param, categorical_feature=categorical, random_state=seed)
lgb_model = model.fit(h_train_X, h_train_y, eval_set=[(h_valid_X, h_valid_y)], verbose=0, early_stopping_rounds=25)
loss = roc_auc_score(h_valid_y, lgb_model.predict(h_valid_X))
loss

0.8670530333639976

In [5]:
prediction = lgb_model.predict(test_X)
submit = pd.read_csv("../data/sample_submission.csv")
submit['prediction'] = prediction
submit
# submit.to_csv("catclf_ms_data_decompose2.csv", index=False)

ValueError: Number of features of the model must match the input. Model n_features_ is 55 and input n_features is 73

In [10]:
trial = study_lgb.best_trial
trial_params = trial.params
print('Best Trial: score {},\nparams {}'.format(trial.value, trial_params))

Best Trial: score 0.7538142981445614,
params {'objective': 'binary', 'max_depth': 7, 'learning_rate': 0.035569482469021904, 'n_estimators': 4724, 'min_child_samples': 34, 'lambda_l1': 3.940615068410218e-06, 'lambda_l2': 3.706143818501372e-07, 'path_smooth': 4.4257537279128775e-06, 'num_leaves': 70, 'min_data_in_leaf': 43, 'max_bin': 233, 'feature_fraction': 0.8599103595773292, 'bagging_fraction': 0.870477752821887}


In [10]:
import wandb
import lightgbm as lgb
wandb_kwargs = {"project": "lgb-optuna"}
wandbc = WeightsAndBiasesCallback(wandb_kwargs=wandb_kwargs)

@wandbc.track_in_wandb()
def objective(trial):
    param = {
        # 'objective': 'binary', # 이진 분류
        "objective": trial.suggest_categorical("objective", ["binary", "cross_entropy"]),
        'verbose': -1,
        'metric': 'AUC',
        'max_depth': trial.suggest_int('max_depth',3, 15),
        'learning_rate': trial.suggest_loguniform("learning_rate", 0.001, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 100, 10000),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        # 'subsample': trial.suggest_loguniform('subsample', 0.4, 1),
        'lambda_l1' : trial.suggest_loguniform('lambda_l1', 1e-8, 1e-4),
        'lambda_l2' : trial.suggest_loguniform('lambda_l2', 1e-8, 1e-4),
        'path_smooth' : trial.suggest_loguniform('path_smooth', 1e-8, 1e-3),
        'num_leaves' : trial.suggest_int('num_leaves', 30, 200),
        'min_data_in_leaf' : trial.suggest_int('min_data_in_leaf', 10, 100),
        'max_bin' : trial.suggest_int('max_bin', 100, 255),
        'feature_fraction' : trial.suggest_uniform('feature_fraction', 0.5, 0.9),
        'bagging_fraction' : trial.suggest_uniform('bagging_fraction', 0.5, 0.9),
        # 'device' : 'gpu',
        # 'reg_alpha' : None,
    }
    categorical = [0, 1, 2, 3, 4]
    model = lgb.LGBMClassifier(**param, categorical_feature=categorical, random_state=777)
    lgb_model = model.fit(h_train_X, h_train_y, eval_set=[(h_valid_X, h_valid_y)], verbose=0, early_stopping_rounds=25)
    loss = roc_auc_score(h_valid_y, lgb_model.predict_proba(h_valid_X)[:, 1])
    return loss
        
study_lgb = optuna.create_study(direction='maximize', sampler=sampler)
study_lgb.optimize(objective, n_trials=100, callbacks=[wandbc])

  wandbc = WeightsAndBiasesCallback(wandb_kwargs=wandb_kwargs)
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjmkim_[0m. Use [1m`wandb login --relogin`[0m to force relogin


  @wandbc.track_in_wandb()
[32m[I 2022-11-28 16:54:45,994][0m A new study created in memory with name: no-name-19bd460c-e861-4ad1-81f1-c7c4729a7619[0m




[32m[I 2022-11-28 17:07:37,855][0m Trial 0 finished with value: 0.8440518994533951 and parameters: {'objective': 'cross_entropy', 'max_depth': 3, 'learning_rate': 0.023965198184953764, 'n_estimators': 8369, 'min_child_samples': 93, 'lambda_l1': 8.09013813952634e-06, 'lambda_l2': 1.1857274872193908e-05, 'path_smooth': 2.2183258107863225e-07, 'num_leaves': 140, 'min_data_in_leaf': 18, 'max_bin': 112, 'feature_fraction': 0.7358455013823457, 'bagging_fraction': 0.6373362152887747}. Best is trial 0 with value: 0.8440518994533951.[0m




[32m[I 2022-11-28 17:13:52,562][0m Trial 1 finished with value: 0.8495749475932372 and parameters: {'objective': 'binary', 'max_depth': 11, 'learning_rate': 0.04537017362092559, 'n_estimators': 2761, 'min_child_samples': 40, 'lambda_l1': 7.793138524702794e-08, 'lambda_l2': 5.568912592525814e-08, 'path_smooth': 8.979258351381595e-07, 'num_leaves': 63, 'min_data_in_leaf': 65, 'max_bin': 237, 'feature_fraction': 0.7489355297282327, 'bagging_fraction': 0.6012475776818278}. Best is trial 1 with value: 0.8495749475932372.[0m




[32m[I 2022-11-28 17:23:06,011][0m Trial 2 finished with value: 0.8480753612812425 and parameters: {'objective': 'cross_entropy', 'max_depth': 5, 'learning_rate': 0.03552805397960183, 'n_estimators': 5233, 'min_child_samples': 62, 'lambda_l1': 1.3504606405290654e-06, 'lambda_l2': 1.1304209523340012e-08, 'path_smooth': 4.173364449604953e-06, 'num_leaves': 183, 'min_data_in_leaf': 80, 'max_bin': 119, 'feature_fraction': 0.6183490776349688, 'bagging_fraction': 0.7448094317092934}. Best is trial 1 with value: 0.8495749475932372.[0m




[32m[I 2022-11-28 17:42:22,165][0m Trial 3 finished with value: 0.8486375200185176 and parameters: {'objective': 'binary', 'max_depth': 12, 'learning_rate': 0.003757448986688627, 'n_estimators': 5623, 'min_child_samples': 57, 'lambda_l1': 7.742869968546266e-07, 'lambda_l2': 1.4707533372708972e-05, 'path_smooth': 3.769193132689633e-08, 'num_leaves': 146, 'min_data_in_leaf': 42, 'max_bin': 153, 'feature_fraction': 0.6798074999663026, 'bagging_fraction': 0.5107769041950028}. Best is trial 1 with value: 0.8495749475932372.[0m




[32m[I 2022-11-28 17:50:24,560][0m Trial 4 finished with value: 0.8420797901633372 and parameters: {'objective': 'cross_entropy', 'max_depth': 4, 'learning_rate': 0.008817903457666037, 'n_estimators': 5327, 'min_child_samples': 36, 'lambda_l1': 6.246865795612923e-07, 'lambda_l2': 1.1623519829192828e-08, 'path_smooth': 3.0558674531010326e-08, 'num_leaves': 148, 'min_data_in_leaf': 86, 'max_bin': 166, 'feature_fraction': 0.8395097215390547, 'bagging_fraction': 0.7187164843565222}. Best is trial 1 with value: 0.8495749475932372.[0m




[32m[I 2022-11-28 17:51:25,616][0m Trial 5 finished with value: 0.8432185145983493 and parameters: {'objective': 'cross_entropy', 'max_depth': 4, 'learning_rate': 0.4770637172354527, 'n_estimators': 3429, 'min_child_samples': 90, 'lambda_l1': 1.5395577746201769e-07, 'lambda_l2': 1.6091375151327235e-07, 'path_smooth': 0.00010744916431686771, 'num_leaves': 173, 'min_data_in_leaf': 67, 'max_bin': 148, 'feature_fraction': 0.5117794317165487, 'bagging_fraction': 0.6599109264908691}. Best is trial 1 with value: 0.8495749475932372.[0m




[32m[I 2022-11-28 17:51:44,020][0m Trial 6 finished with value: 0.8439205411484013 and parameters: {'objective': 'binary', 'max_depth': 13, 'learning_rate': 0.3659431202433013, 'n_estimators': 1402, 'min_child_samples': 32, 'lambda_l1': 2.0246746967152848e-07, 'lambda_l2': 4.229005234665855e-05, 'path_smooth': 0.0009949418900135276, 'num_leaves': 161, 'min_data_in_leaf': 13, 'max_bin': 250, 'feature_fraction': 0.8900849833714146, 'bagging_fraction': 0.5810067212290936}. Best is trial 1 with value: 0.8495749475932372.[0m




[32m[I 2022-11-28 17:52:24,738][0m Trial 7 finished with value: 0.848400864091315 and parameters: {'objective': 'binary', 'max_depth': 14, 'learning_rate': 0.15297555478638752, 'n_estimators': 2534, 'min_child_samples': 6, 'lambda_l1': 6.667380713006627e-06, 'lambda_l2': 1.2529435095128308e-05, 'path_smooth': 2.316717147746947e-06, 'num_leaves': 167, 'min_data_in_leaf': 25, 'max_bin': 212, 'feature_fraction': 0.647150360092577, 'bagging_fraction': 0.712837180422895}. Best is trial 1 with value: 0.8495749475932372.[0m




[32m[I 2022-11-28 17:54:52,060][0m Trial 8 finished with value: 0.8472837331477524 and parameters: {'objective': 'cross_entropy', 'max_depth': 12, 'learning_rate': 0.015686409016513518, 'n_estimators': 1147, 'min_child_samples': 43, 'lambda_l1': 1.2242096024682502e-06, 'lambda_l2': 2.6549112632588423e-08, 'path_smooth': 0.0001851236657325161, 'num_leaves': 118, 'min_data_in_leaf': 65, 'max_bin': 229, 'feature_fraction': 0.5067569163275346, 'bagging_fraction': 0.8081208008262204}. Best is trial 1 with value: 0.8495749475932372.[0m




[32m[I 2022-11-28 17:55:23,096][0m Trial 9 finished with value: 0.84134796267625 and parameters: {'objective': 'binary', 'max_depth': 4, 'learning_rate': 0.4616040032583131, 'n_estimators': 1156, 'min_child_samples': 28, 'lambda_l1': 1.6997150743496004e-05, 'lambda_l2': 2.558321068797796e-08, 'path_smooth': 2.462786506980637e-08, 'num_leaves': 89, 'min_data_in_leaf': 86, 'max_bin': 110, 'feature_fraction': 0.7544202416604087, 'bagging_fraction': 0.7681060138388102}. Best is trial 1 with value: 0.8495749475932372.[0m




[32m[I 2022-11-28 17:55:34,979][0m Trial 10 finished with value: 0.8199055110774112 and parameters: {'objective': 'binary', 'max_depth': 8, 'learning_rate': 0.0011346718284596584, 'n_estimators': 9701, 'min_child_samples': 73, 'lambda_l1': 1.0847835445978988e-08, 'lambda_l2': 5.896635273178497e-07, 'path_smooth': 2.9793403448656662e-06, 'num_leaves': 36, 'min_data_in_leaf': 43, 'max_bin': 197, 'feature_fraction': 0.7986338607237871, 'bagging_fraction': 0.8966121947838954}. Best is trial 1 with value: 0.8495749475932372.[0m




[32m[I 2022-11-28 18:38:16,765][0m Trial 11 finished with value: 0.8459196631383821 and parameters: {'objective': 'binary', 'max_depth': 10, 'learning_rate': 0.002436367398926566, 'n_estimators': 7062, 'min_child_samples': 57, 'lambda_l1': 2.191094374852511e-08, 'lambda_l2': 2.5153519896171443e-06, 'path_smooth': 2.781119686904175e-07, 'num_leaves': 63, 'min_data_in_leaf': 43, 'max_bin': 154, 'feature_fraction': 0.6655840232133299, 'bagging_fraction': 0.5243481691016288}. Best is trial 1 with value: 0.8495749475932372.[0m




[32m[I 2022-11-28 18:48:46,889][0m Trial 12 finished with value: 0.8488321885896815 and parameters: {'objective': 'binary', 'max_depth': 10, 'learning_rate': 0.08684550425140126, 'n_estimators': 3993, 'min_child_samples': 17, 'lambda_l1': 8.986040465602197e-05, 'lambda_l2': 3.182532034817751e-07, 'path_smooth': 2.5115287454355705e-07, 'num_leaves': 95, 'min_data_in_leaf': 46, 'max_bin': 187, 'feature_fraction': 0.5877278209354841, 'bagging_fraction': 0.5039614866664458}. Best is trial 1 with value: 0.8495749475932372.[0m




[32m[I 2022-11-28 18:54:17,461][0m Trial 13 finished with value: 0.848982223287974 and parameters: {'objective': 'binary', 'max_depth': 9, 'learning_rate': 0.0954492553560932, 'n_estimators': 3344, 'min_child_samples': 13, 'lambda_l1': 4.750446113986196e-05, 'lambda_l2': 1.6310838729102475e-07, 'path_smooth': 3.5407966008900457e-07, 'num_leaves': 81, 'min_data_in_leaf': 100, 'max_bin': 189, 'feature_fraction': 0.5789087899783375, 'bagging_fraction': 0.5826926602087777}. Best is trial 1 with value: 0.8495749475932372.[0m




[32m[I 2022-11-28 19:03:59,151][0m Trial 14 finished with value: 0.8482145999530265 and parameters: {'objective': 'binary', 'max_depth': 7, 'learning_rate': 0.06766795404777863, 'n_estimators': 3120, 'min_child_samples': 7, 'lambda_l1': 6.924601727667131e-08, 'lambda_l2': 1.2042129606287288e-07, 'path_smooth': 1.1045453507941276e-06, 'num_leaves': 54, 'min_data_in_leaf': 99, 'max_bin': 254, 'feature_fraction': 0.5738706987116737, 'bagging_fraction': 0.5966245815568667}. Best is trial 1 with value: 0.8495749475932372.[0m




[32m[I 2022-11-28 19:04:35,574][0m Trial 15 finished with value: 0.8480682797991667 and parameters: {'objective': 'binary', 'max_depth': 15, 'learning_rate': 0.17331067665194327, 'n_estimators': 2587, 'min_child_samples': 20, 'lambda_l1': 9.012502804955756e-05, 'lambda_l2': 9.100646318662002e-08, 'path_smooth': 2.6988963020722614e-05, 'num_leaves': 78, 'min_data_in_leaf': 68, 'max_bin': 225, 'feature_fraction': 0.7344045013741353, 'bagging_fraction': 0.579062010193618}. Best is trial 1 with value: 0.8495749475932372.[0m




[32m[I 2022-11-28 19:04:40,951][0m Trial 16 finished with value: 0.83275511198225 and parameters: {'objective': 'binary', 'max_depth': 10, 'learning_rate': 0.9727961777446633, 'n_estimators': 3964, 'min_child_samples': 45, 'lambda_l1': 5.697924878587713e-08, 'lambda_l2': 1.909319095228515e-06, 'path_smooth': 1.3880738425252598e-05, 'num_leaves': 110, 'min_data_in_leaf': 59, 'max_bin': 205, 'feature_fraction': 0.7875428231499065, 'bagging_fraction': 0.6471122807782793}. Best is trial 1 with value: 0.8495749475932372.[0m




[32m[I 2022-11-28 19:24:00,711][0m Trial 17 finished with value: 0.847698702839841 and parameters: {'objective': 'binary', 'max_depth': 6, 'learning_rate': 0.04257105198501569, 'n_estimators': 6804, 'min_child_samples': 74, 'lambda_l1': 3.3457834103869835e-06, 'lambda_l2': 4.93511265618366e-08, 'path_smooth': 6.408637230570751e-07, 'num_leaves': 49, 'min_data_in_leaf': 99, 'max_bin': 233, 'feature_fraction': 0.5611882921669379, 'bagging_fraction': 0.5559537944244497}. Best is trial 1 with value: 0.8495749475932372.[0m




[32m[I 2022-11-28 19:26:46,048][0m Trial 18 finished with value: 0.8390109910892796 and parameters: {'objective': 'binary', 'max_depth': 8, 'learning_rate': 0.01191034868279986, 'n_estimators': 286, 'min_child_samples': 20, 'lambda_l1': 2.5651726701450347e-05, 'lambda_l2': 4.0309386916773507e-07, 'path_smooth': 8.489457323819251e-08, 'num_leaves': 73, 'min_data_in_leaf': 79, 'max_bin': 176, 'feature_fraction': 0.7049400091953428, 'bagging_fraction': 0.6197977388470718}. Best is trial 1 with value: 0.8495749475932372.[0m




[32m[I 2022-11-28 19:28:01,048][0m Trial 19 finished with value: 0.8473614369432355 and parameters: {'objective': 'binary', 'max_depth': 11, 'learning_rate': 0.14010733530610334, 'n_estimators': 2041, 'min_child_samples': 44, 'lambda_l1': 2.597953598011808e-07, 'lambda_l2': 2.030578917379983e-06, 'path_smooth': 8.408873846691999e-06, 'num_leaves': 33, 'min_data_in_leaf': 75, 'max_bin': 135, 'feature_fraction': 0.6172402307495495, 'bagging_fraction': 0.5500033228672638}. Best is trial 1 with value: 0.8495749475932372.[0m




[32m[I 2022-11-28 19:41:33,278][0m Trial 20 finished with value: 0.8476173336232677 and parameters: {'objective': 'binary', 'max_depth': 9, 'learning_rate': 0.00554954903369397, 'n_estimators': 4477, 'min_child_samples': 14, 'lambda_l1': 4.8303316330941795e-08, 'lambda_l2': 6.384995708584644e-08, 'path_smooth': 1.0585580156113276e-08, 'num_leaves': 106, 'min_data_in_leaf': 27, 'max_bin': 188, 'feature_fraction': 0.8350612648044677, 'bagging_fraction': 0.6711948029254433}. Best is trial 1 with value: 0.8495749475932372.[0m




[32m[I 2022-11-28 19:45:46,007][0m Trial 21 finished with value: 0.8489874781078546 and parameters: {'objective': 'binary', 'max_depth': 10, 'learning_rate': 0.07838533992919806, 'n_estimators': 3866, 'min_child_samples': 22, 'lambda_l1': 9.80662417439568e-05, 'lambda_l2': 3.0667153724766814e-07, 'path_smooth': 1.6093662720631216e-07, 'num_leaves': 98, 'min_data_in_leaf': 51, 'max_bin': 182, 'feature_fraction': 0.5650157358012802, 'bagging_fraction': 0.5006651088227922}. Best is trial 1 with value: 0.8495749475932372.[0m




[32m[I 2022-11-28 19:47:17,140][0m Trial 22 finished with value: 0.8493739669801685 and parameters: {'objective': 'binary', 'max_depth': 11, 'learning_rate': 0.0760705273452212, 'n_estimators': 6289, 'min_child_samples': 27, 'lambda_l1': 3.73903360915764e-05, 'lambda_l2': 2.3333486039297534e-07, 'path_smooth': 7.381943376733496e-07, 'num_leaves': 126, 'min_data_in_leaf': 52, 'max_bin': 215, 'feature_fraction': 0.5400367800842937, 'bagging_fraction': 0.5461018165731338}. Best is trial 1 with value: 0.8495749475932372.[0m




[32m[I 2022-11-28 19:50:56,741][0m Trial 23 finished with value: 0.8496535366313872 and parameters: {'objective': 'binary', 'max_depth': 12, 'learning_rate': 0.03987800276535894, 'n_estimators': 6363, 'min_child_samples': 26, 'lambda_l1': 2.609132153725282e-05, 'lambda_l2': 2.701953961971303e-07, 'path_smooth': 8.987131174128772e-07, 'num_leaves': 125, 'min_data_in_leaf': 53, 'max_bin': 219, 'feature_fraction': 0.5372786182619828, 'bagging_fraction': 0.543113766114749}. Best is trial 23 with value: 0.8496535366313872.[0m




[32m[I 2022-11-28 22:28:31,104][0m Trial 24 finished with value: 0.8501431967671322 and parameters: {'objective': 'binary', 'max_depth': 12, 'learning_rate': 0.02157862041112959, 'n_estimators': 6185, 'min_child_samples': 38, 'lambda_l1': 2.0087962448323342e-05, 'lambda_l2': 6.651942422932263e-07, 'path_smooth': 1.1073929408727372e-06, 'num_leaves': 125, 'min_data_in_leaf': 55, 'max_bin': 240, 'feature_fraction': 0.5360257986299467, 'bagging_fraction': 0.5466449422595056}. Best is trial 24 with value: 0.8501431967671322.[0m




[32m[I 2022-11-29 00:12:30,002][0m Trial 25 finished with value: 0.8499670050543917 and parameters: {'objective': 'binary', 'max_depth': 13, 'learning_rate': 0.021760372754778688, 'n_estimators': 7987, 'min_child_samples': 40, 'lambda_l1': 3.1221842734641867e-06, 'lambda_l2': 3.978957512320673e-06, 'path_smooth': 1.3090564207743374e-06, 'num_leaves': 130, 'min_data_in_leaf': 33, 'max_bin': 241, 'feature_fraction': 0.6334457799222277, 'bagging_fraction': 0.6142408096545948}. Best is trial 24 with value: 0.8501431967671322.[0m




[32m[I 2022-11-29 01:12:29,116][0m Trial 26 finished with value: 0.8501823271792353 and parameters: {'objective': 'cross_entropy', 'max_depth': 14, 'learning_rate': 0.018115530725760505, 'n_estimators': 8243, 'min_child_samples': 34, 'lambda_l1': 1.0520332730357971e-05, 'lambda_l2': 1.0117335081605208e-06, 'path_smooth': 2.0640577162898693e-06, 'num_leaves': 129, 'min_data_in_leaf': 34, 'max_bin': 245, 'feature_fraction': 0.613968577337192, 'bagging_fraction': 0.5395403955865168}. Best is trial 26 with value: 0.8501823271792353.[0m




[32m[I 2022-11-29 01:16:46,549][0m Trial 27 finished with value: 0.8501911479950456 and parameters: {'objective': 'cross_entropy', 'max_depth': 15, 'learning_rate': 0.022894076323413972, 'n_estimators': 7955, 'min_child_samples': 48, 'lambda_l1': 2.9348525927502984e-06, 'lambda_l2': 4.664979552704591e-06, 'path_smooth': 4.918294149790792e-05, 'num_leaves': 135, 'min_data_in_leaf': 35, 'max_bin': 240, 'feature_fraction': 0.6140300603932235, 'bagging_fraction': 0.6860985552627415}. Best is trial 27 with value: 0.8501911479950456.[0m




[32m[I 2022-11-29 05:56:10,527][0m Trial 28 finished with value: 0.8505053170647445 and parameters: {'objective': 'cross_entropy', 'max_depth': 15, 'learning_rate': 0.00725967037321421, 'n_estimators': 9926, 'min_child_samples': 48, 'lambda_l1': 7.470648234536311e-06, 'lambda_l2': 9.90356054623688e-07, 'path_smooth': 4.661176297781611e-05, 'num_leaves': 153, 'min_data_in_leaf': 35, 'max_bin': 246, 'feature_fraction': 0.6077865188346295, 'bagging_fraction': 0.8248617891221075}. Best is trial 28 with value: 0.8505053170647445.[0m




[32m[I 2022-11-29 10:22:53,225][0m Trial 29 finished with value: 0.850342346385377 and parameters: {'objective': 'cross_entropy', 'max_depth': 15, 'learning_rate': 0.007630933121124611, 'n_estimators': 9600, 'min_child_samples': 50, 'lambda_l1': 7.2505948267600335e-06, 'lambda_l2': 5.332730741187984e-06, 'path_smooth': 4.9069891254268494e-05, 'num_leaves': 153, 'min_data_in_leaf': 34, 'max_bin': 255, 'feature_fraction': 0.6044206475427127, 'bagging_fraction': 0.8534789924337343}. Best is trial 28 with value: 0.8505053170647445.[0m




[32m[I 2022-11-29 15:15:38,566][0m Trial 30 finished with value: 0.8505140937570655 and parameters: {'objective': 'cross_entropy', 'max_depth': 15, 'learning_rate': 0.007450979579783284, 'n_estimators': 9895, 'min_child_samples': 51, 'lambda_l1': 3.4483864213103326e-06, 'lambda_l2': 6.195080373476888e-06, 'path_smooth': 5.877571744254479e-05, 'num_leaves': 194, 'min_data_in_leaf': 21, 'max_bin': 255, 'feature_fraction': 0.6808615243907541, 'bagging_fraction': 0.8672122858266406}. Best is trial 30 with value: 0.8505140937570655.[0m




[32m[I 2022-11-29 20:04:57,360][0m Trial 31 finished with value: 0.8504218555438536 and parameters: {'objective': 'cross_entropy', 'max_depth': 15, 'learning_rate': 0.007233489220606737, 'n_estimators': 9958, 'min_child_samples': 51, 'lambda_l1': 3.7997871645450814e-06, 'lambda_l2': 6.126302526980535e-06, 'path_smooth': 5.4322313100750316e-05, 'num_leaves': 199, 'min_data_in_leaf': 18, 'max_bin': 249, 'feature_fraction': 0.7001538502456381, 'bagging_fraction': 0.8691830254556957}. Best is trial 30 with value: 0.8505140937570655.[0m




## CatBoost parameter search

In [8]:
import gc
import torch

In [45]:
def objective(trial):
    param = {
        "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
        # "objective" : "RMSE",
        # "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.4, 1.0),
        "depth": trial.suggest_int("depth", 3, 16),
        # "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        "learning_rate" : trial.suggest_loguniform("learning_rate", 0.0001, 1.0),
        "n_estimators":trial.suggest_int("n_estimators", 1000, 5000),
        "l2_leaf_reg":trial.suggest_float("l2_leaf_reg",1e-8,3e-5),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "max_bin": trial.suggest_int("max_bin", 200, 500),
    }

    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0.01, 100.00)

    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)
    
    cat = CatBoostClassifier(**param, cat_features=[0, 1, 2, 3, 4], random_seed=seed, task_type="GPU")
    cat_model = cat.fit(h_train_X, h_train_y, eval_set=[(h_valid_X, h_valid_y)], verbose=0, early_stopping_rounds=25)    
    preds = cat_model.predict_proba(h_valid_X)
    # pred_labels = np.rint(preds)
    # accuracy = roc_auc_score(h_valid_y, pred_labels)
    accuracy = roc_auc_score(h_valid_y, preds[:, 1])
    torch.cuda.empty_cache()
    gc.collect()
    return accuracy

study_cat = optuna.create_study(direction="maximize",)
study_cat.optimize(objective, n_trials=100)

[32m[I 2022-12-07 02:05:12,559][0m A new study created in memory with name: no-name-f1c893bd-3329-4cc6-b708-e5b374ab2ab7[0m
[33m[W 2022-12-07 02:05:18,949][0m Trial 0 failed because of the following error: CatBoostError('catboost/cuda/cuda_lib/cuda_base.h:281: CUDA error 100: no CUDA-capable device is detected')[0m
Traceback (most recent call last):
  File "/opt/conda/envs/dkt/lib/python3.8/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_106632/138017590.py", line 25, in objective
    cat_model = cat.fit(h_train_X, h_train_y, eval_set=[(h_valid_X, h_valid_y)], verbose=0, early_stopping_rounds=25)
  File "/opt/conda/envs/dkt/lib/python3.8/site-packages/catboost/core.py", line 5128, in fit
    self._fit(X, y, cat_features, text_features, embedding_features, None, sample_weight, None, None, None, None, baseline, use_best_model,
  File "/opt/conda/envs/dkt/lib/python3.8/site-packages/catboost/core.py", line 23

CatBoostError: catboost/cuda/cuda_lib/cuda_base.h:281: CUDA error 100: no CUDA-capable device is detected

In [14]:
random.seed(777)
# cat_params = {'objective': 'Logloss', 'depth': 9, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'learning_rate': 0.024787088576176734, 'n_estimators': 2995, 'l2_leaf_reg': 2.8454741246080777e-05, 'min_child_samples': 45, 'max_bin': 500, 'subsample': 0.618466706693154}
cat_params = {'objective': 'Logloss', 'depth': 8, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'learning_rate': 0.009442590476041634, 'n_estimators': 2540, 'l2_leaf_reg': 1.9627439630996592e-05, 'min_child_samples': 31, 'max_bin': 445, 'subsample': 0.9738440559510598}
cat = CatBoostClassifier(**cat_params, cat_features=[0, 1, 2, 3, 4], task_type="GPU", random_state=777,)
cat_model = cat.fit(h_train_X, h_train_y, eval_set=[(h_valid_X, h_valid_y)], verbose=0, early_stopping_rounds=25)    
preds = cat_model.predict_proba(h_valid_X)
# pred_labels = np.rint(preds)
auc = roc_auc_score(h_valid_y, preds[:, 1])
auc

0.8757004935773576

In [15]:
prediction = cat_model.predict_proba(test_X)[:, 1]

In [16]:
submit = pd.read_csv("../data/sample_submission.csv")
submit['prediction'] = prediction
submit.to_csv("catclf_ms_data_decompose2.csv", index=False)

## ML model Sequential training
### Data - new_train, new_test 

In [23]:
grouped_train_x = h_train_X.groupby(by=['userID'])
grouped_valid_x = h_valid_X.groupby(by=['userID'])
grouped_train_y = h_train_y.groupby(by=['userID'])
grouped_valid_y = h_valid_y.groupby(by=['userID'])
for group_train_x, group_valid_x, group_train_y, group_valid_y in zip(grouped_train_x, grouped_valid_x, grouped_train_y, grouped_valid_y):
    model = LogisticRegression()
    model.fit(group_train_x[1], group_train_y[1]['answerCode'])
    print(f"[TRAIN] roc_auc_score : {roc_auc_score(model.predict(group_train_x[1]), group_train_y[1]['answerCode'])}")
    print(f"[VALID] roc_auc_score : {roc_auc_score(model.predict(group_valid_x[1]), group_valid_y[1]['answerCode'])}")
    break

[TRAIN] roc_auc_score : 0.697259159253574
[VALID] roc_auc_score : 0.6274710293115201


In [None]:
def objective(trial):
    param = {
        'tol' : trial.suggest_uniform('tol' , 1e-6 , 1e-3),
        'C' : trial.suggest_loguniform("C", 1e-3, 1),
        'solver' : trial.suggest_categorical('solver' , ['lbfgs','liblinear']),
    }
    model = LogisticRegression(**param)
    model = model.fit(group_train_x[1], group_train_y[1]['answerCode'])
    loss = roc_auc_score(model.predict(group_valid_x[1]), group_valid_y[1]['answerCode'])
    return loss
study_lgb = optuna.create_study(direction='maximize', sampler=sampler)
study_lgb.optimize(objective, n_trials=1000, callbacks=[wandbc])

In [None]:
import lightgbm as lgb
def objective(trial):
    param = {
        # 'objective': 'binary', # 이진 분류
        "objective": trial.suggest_categorical("objective", ["binary", "cross_entropy"]),
        'verbose': -1,
        'metric': 'AUC',
        'max_depth': trial.suggest_int('max_depth',3, 15),
        'learning_rate': trial.suggest_loguniform("learning_rate", 0.0001, 2.0),
        'n_estimators': trial.suggest_int('n_estimators', 100, 10000),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        # 'subsample': trial.suggest_loguniform('subsample', 0.4, 1),
        'lambda_l1' : trial.suggest_loguniform('lambda_l1', 1e-8, 1e-4),
        'lambda_l2' : trial.suggest_loguniform('lambda_l2', 1e-8, 1e-4),
        'path_smooth' : trial.suggest_loguniform('path_smooth', 1e-8, 1e-3),
        'num_leaves' : trial.suggest_int('num_leaves', 30, 200),
        'min_data_in_leaf' : trial.suggest_int('min_data_in_leaf', 10, 100),
        'max_bin' : trial.suggest_int('max_bin', 100, 255),
        'feature_fraction' : trial.suggest_uniform('feature_fraction', 0.5, 1),
        'bagging_fraction' : trial.suggest_uniform('bagging_fraction', 0.5, 1),
        # 'device' : 'gpu',
        # 'reg_alpha' : None,
    }
    categorical = [0, 1, 2, 3, 4, 5, 10, 11, 13, 14, 15, 16, 17]
    model = lgb.LGBMClassifier(**param, categorical_feature=categorical)
    lgb_model = model.fit(group_train_x[1], group_train_y[1]['answerCode'], eval_set=[(group_valid_x[1], group_valid_y[1]['answerCode'])], verbose=-1, early_stopping_rounds=25)
    loss = roc_auc_score(group_valid_y[1]['answerCode'], lgb_model.predict(group_valid_x[1]))
    return loss
        
study_lgb = optuna.create_study(direction='maximize', sampler=sampler)
study_lgb.optimize(objective, n_trials=10000)

In [33]:
trial = study_lgb.best_trial
trial_params = trial.params
print('Best Trial: score {},\nparams {}'.format(trial.value, trial_params))

Best Trial: score 0.7159695468721335,
params {'objective': 'cross_entropy', 'max_depth': 5, 'learning_rate': 0.1460557874021894, 'n_estimators': 4044, 'min_child_samples': 77, 'lambda_l1': 6.716411565832376e-05, 'lambda_l2': 1.7186698526667583e-08, 'path_smooth': 5.935595604338357e-05, 'num_leaves': 93, 'min_data_in_leaf': 90, 'max_bin': 220, 'feature_fraction': 0.5248617093238126, 'bagging_fraction': 0.8611415978965095}


## Ensemble

In [3]:
test_df = pd.read_csv("test_feature_engineering.csv")
test_df = test_df.loc[test_df['answerCode']==-1]
test_X = test_df.drop(['row_id', 'answerCode', 'assessmentItemID', 'testId', 'Timestamp', 'relative_answered_correctly'], axis=1)

In [4]:
prediction = cat_model.predict(test_X)

In [2]:
# params = {'objective': 'cross_entropy', 'max_depth': 15, 'learning_rate': 0.011829111999395184, 'n_estimators': 9374, 'min_child_samples': 84, 'lambda_l1': 1.1225546503543914e-07, 'lambda_l2': 7.64205903058515e-06, 'path_smooth': 1.8542487799550667e-08, 'num_leaves': 200, 'min_data_in_leaf': 87, 'max_bin': 140, 'feature_fraction': 0.8453296813265502, 'bagging_fraction': 0.7203510445565607}
params = {'objective': 'cross_entropy', 'max_depth': 15, 'learning_rate': 0.00725967037321421, 'n_estimators': 9926, 'min_child_samples': 48, 'lambda_l1': 7.470648234536311e-06, 'lambda_l2': 9.90356054623688e-07, 'path_smooth': 4.661176297781611e-05, 'num_leaves': 153, 'min_data_in_leaf': 35, 'max_bin': 246, 'feature_fraction': 0.6077865188346295, 'bagging_fraction': 0.8248617891221075}
categorical = [0, 1, 2, 3, 4]
model = lgb.LGBMRegressor(**params, categorical_feature=categorical, random_state=777)
lgb_model = model.fit(h_train_X, h_train_y, eval_set=[(h_valid_X, h_valid_y)], verbose=0, early_stopping_rounds=25)
prediction = lgb_model.predict(h_valid_X)
loss = roc_auc_score(h_valid_y, prediction)
loss



0.8504901622752634

In [6]:
prediction = lgb_model.predict(test_X)

In [7]:
submit = pd.read_csv("../data/sample_submission.csv")
submit['prediction'] = prediction
submit.to_csv("lgbmregressor_ms_data.csv", index=False)

In [40]:
import torch
submit = pd.read_csv("../data/sample_submission.csv")
# catreg = pd.read_csv("catregressor_ms_data.csv")
catclf = pd.read_csv("./output/catclf_ms_data.csv")
lightgcn = pd.read_csv("./output/lightgcn_best.csv")
lstmattn = pd.read_csv("./output/lstmattn_best.csv")
lgbm = pd.read_csv("./output/lgbmregressor_ms_data.csv")
lastquery = pd.read_csv("./output/submissionlastquery.csv")
bert = pd.read_csv("./output/output_(3).csv")

lstmattn['prediction'] = torch.nn.Sigmoid()(torch.FloatTensor(lstmattn['prediction']))
ensemble = (0.2 * lightgcn['prediction'] + 0.1 * lstmattn['prediction'] + 0.2 * lgbm['prediction'] + 0.4 * catclf['prediction'] + 0.1 * bert['prediction'])
submit['prediction'] = ensemble
submit.to_csv("trial3.csv", index=False)
ensemble

0      0.528109
1      0.814639
2      0.278716
3      0.755765
4      0.254167
         ...   
739    0.027633
740    0.691986
741    0.835123
742    0.736162
743    0.684592
Name: prediction, Length: 744, dtype: float64

In [8]:
submit = pd.read_csv("../data/sample_submission.csv")
lgbmreg = pd.read_csv("lgbmregressor_ms_data.csv")
catclf = pd.read_csv("catregressor_ms_data.csv")

ensemble = (lgbmreg['prediction'] + catclf['prediction']) / 2
submit['prediction'] = ensemble
submit.to_csv("lgbmreg_ms_data_catclf_ensemble.csv", index=False)
ensemble

0      0.584764
1      0.767946
2      0.238857
3      0.850752
4      0.237645
         ...   
739    0.070194
740    0.821399
741    0.881537
742    0.823934
743    0.631287
Name: prediction, Length: 744, dtype: float64