In [2]:
import pandas as pd
import numpy as np
import datetime as dt
import lightgbm as lgb
import optuna
from tqdm import tqdm

from optuna.integration.wandb import WeightsAndBiasesCallback
from optuna.samplers import TPESampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
import random
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
pd.set_option('mode.chained_assignment',  None)
seed = 777
random.seed(seed)
sampler = TPESampler(seed=seed)

train_df = pd.read_csv("new_train.csv")
test_df = pd.read_csv("new_test.csv")

def extract_datetime(df):
    df['month'] = pd.to_datetime(df['Timestamp']).apply(lambda x : x.month)
    df['day'] = pd.to_datetime(df['Timestamp']).apply(lambda x : x.day)
    df['hour'] = pd.to_datetime(df['Timestamp']).apply(lambda x : x.hour)
    df['minute'] = pd.to_datetime(df['Timestamp']).apply(lambda x : x.minute)
    df['second'] = pd.to_datetime(df['Timestamp']).apply(lambda x : x.second)
    df.drop(['Timestamp'], axis=1, inplace=True)
    return df

def extract_testId(df):
    df['testClass'] = df['testId'].apply(lambda x : int(x[2]))
    df['testCode'] = df['testId'].apply(lambda x : int(x[7:]))
    df.drop(['testId'], axis=1, inplace=True)
    return df

def extract_assessmentItemID(df):
    df['assessmentItemCode'] = df['assessmentItemID'].apply(lambda x : int(x[7:]))
    df.drop(['assessmentItemID'], axis=1, inplace=True)
    return df

train_X = train_df.drop(['answerCode', 'assessmentItemID', 'testId', 'Timestamp'], axis=1)
train_y = train_df[['userID', 'answerCode']]
test_X = test_df.drop(['answerCode', 'assessmentItemID', 'testId', 'Timestamp'], axis=1)

# h_train_X, h_valid_X, h_train_y, h_valid_y = train_test_split(train_X, train_y, test_size=0.3, stratify=train_y['answerCode'], random_state=777)

# print(h_train_X.shape, h_train_y.shape, h_valid_X.shape, h_valid_y.shape)

In [10]:
train_X

Unnamed: 0,userID,KnowledgeTag,assessmentItemCode,testClass,testCode,solvingtime,trend_solvingtime,seasonal_solvingtime,resid_solvingtime,user_solvingtime_avg,user_answer_cumsum_prev,user_answer_cumcount_prev,user_answerRatio_cum_prev,month,day,hour,minute,second
0,0,7224,1,6,1,3,53.914628,-54.685377,3.770749e+00,43.547651,0.0,0,0.000000,3,24,0,17,11
1,0,7225,2,6,1,8,54.018292,-47.133404,1.115112e+00,43.547651,1.0,1,1.000000,3,24,0,17,14
2,0,7225,3,6,1,7,54.122052,-30.645419,-1.647663e+01,43.547651,2.0,2,1.000000,3,24,0,17,22
3,0,7225,4,6,1,7,54.225885,-42.518931,-4.706954e+00,43.547651,3.0,3,1.000000,3,24,0,17,29
4,0,7225,5,6,1,11,54.329777,-57.067813,1.373804e+01,43.547651,4.0,4,1.000000,3,24,0,17,36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2525951,7441,438,5,3,71,60,29.708316,30.291684,1.065814e-14,52.222222,1.0,4,0.250000,6,5,6,50,21
2525952,7441,8836,1,4,165,11,29.708316,-18.708316,0.000000e+00,52.222222,1.0,5,0.200000,8,21,1,6,39
2525953,7441,8836,2,4,165,46,29.708316,16.291684,7.105427e-15,52.222222,2.0,6,0.333333,8,21,1,6,50
2525954,7441,8836,3,4,165,73,29.708316,43.291684,0.000000e+00,52.222222,3.0,7,0.428571,8,21,1,7,36


In [115]:
train = pd.concat([train_X, train_y['answerCode']], axis=1)
train

Unnamed: 0,userID,KnowledgeTag,assessmentItemCode,testClass,testCode,solvingtime,trend_solvingtime,seasonal_solvingtime,resid_solvingtime,user_solvingtime_avg,user_answer_cumsum_prev,user_answer_cumcount_prev,user_answerRatio_cum_prev,month,day,hour,minute,second,answerCode
0,0,7224,1,6,1,3,53.914628,-54.685377,3.770749e+00,43.547651,0.0,0,0.000000,3,24,0,17,11,1
1,0,7225,2,6,1,8,54.018292,-47.133404,1.115112e+00,43.547651,1.0,1,1.000000,3,24,0,17,14,1
2,0,7225,3,6,1,7,54.122052,-30.645419,-1.647663e+01,43.547651,2.0,2,1.000000,3,24,0,17,22,1
3,0,7225,4,6,1,7,54.225885,-42.518931,-4.706954e+00,43.547651,3.0,3,1.000000,3,24,0,17,29,1
4,0,7225,5,6,1,11,54.329777,-57.067813,1.373804e+01,43.547651,4.0,4,1.000000,3,24,0,17,36,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2525951,7441,438,5,3,71,60,29.708316,30.291684,1.065814e-14,52.222222,1.0,4,0.250000,6,5,6,50,21,0
2525952,7441,8836,1,4,165,11,29.708316,-18.708316,0.000000e+00,52.222222,1.0,5,0.200000,8,21,1,6,39,1
2525953,7441,8836,2,4,165,46,29.708316,16.291684,7.105427e-15,52.222222,2.0,6,0.333333,8,21,1,6,50,1
2525954,7441,8836,3,4,165,73,29.708316,43.291684,0.000000e+00,52.222222,3.0,7,0.428571,8,21,1,7,36,1


In [144]:
sequence_x = []
sequence_y = []
window_size = 8
grouped = train.groupby(by=['userID'])
for key, group in tqdm(grouped):
    start, stride, end = 0, 1, window_size
    length = len(group)
    while len(group.iloc[start:end, :]) == window_size and end < length:
        sequence_x.append(np.concatenate([[key], group.iloc[start:end, 1:-1].values.reshape(-1)]))
        sequence_y.append(group.iloc[[end], [-1]].values.reshape(-1))
        start += stride
        end += stride
assert len(sequence_x) == len(sequence_y)

100%|██████████| 7442/7442 [26:03<00:00,  4.76it/s]  


In [149]:
new_train_X = pd.DataFrame(np.array(sequence_x))
new_train_X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,127,128,129,130,131,132,133,134,135,136
0,0.0,7224.0,1.0,6.0,1.0,3.0,53.914628,-54.685377,3.770749e+00,43.547651,...,2.033385e+01,43.547651,6.0,7.0,0.857143,3.0,26.0,5.0,52.0,10.0
1,0.0,7225.0,2.0,6.0,1.0,8.0,54.018292,-47.133404,1.115112e+00,43.547651,...,1.213293e+01,43.547651,7.0,8.0,0.875000,3.0,26.0,5.0,53.0,14.0
2,0.0,7225.0,3.0,6.0,1.0,7.0,54.122052,-30.645419,-1.647663e+01,43.547651,...,-1.628860e+00,43.547651,8.0,9.0,0.888889,3.0,26.0,5.0,53.0,29.0
3,0.0,7225.0,4.0,6.0,1.0,7.0,54.225885,-42.518931,-4.706954e+00,43.547651,...,-2.025602e+00,43.547651,9.0,10.0,0.900000,3.0,26.0,5.0,53.0,48.0
4,0.0,7225.0,5.0,6.0,1.0,11.0,54.329777,-57.067813,1.373804e+01,43.547651,...,-1.124308e+00,43.547651,10.0,11.0,0.909091,3.0,26.0,5.0,53.0,55.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2466415,7440.0,5267.0,4.0,5.0,96.0,25.0,26.213495,-1.213495,-3.552714e-15,33.733333,...,3.552714e-15,33.733333,5.0,10.0,0.500000,10.0,21.0,8.0,31.0,45.0
2466416,7440.0,10328.0,3.0,5.0,96.0,60.0,26.213495,33.786505,1.065814e-14,33.733333,...,0.000000e+00,33.733333,6.0,11.0,0.545455,10.0,21.0,8.0,32.0,46.0
2466417,7440.0,7691.0,1.0,3.0,136.0,47.0,26.213495,20.786505,-3.552714e-15,33.733333,...,-7.105427e-15,33.733333,6.0,12.0,0.500000,10.0,21.0,8.0,33.0,15.0
2466418,7440.0,7690.0,2.0,3.0,136.0,76.0,26.213495,49.786505,0.000000e+00,33.733333,...,0.000000e+00,33.733333,6.0,13.0,0.461538,10.0,21.0,8.0,33.0,17.0


In [151]:
new_train_y = pd.DataFrame(np.array(sequence_y))
new_train_y.columns = ["answerCode"]
new_train_y

Unnamed: 0,answerCode
0,1
1,1
2,1
3,1
4,1
...,...
2466415,0
2466416,0
2466417,0
2466418,0


In [152]:
h_train_X, h_valid_X, h_train_y, h_valid_y = train_test_split(new_train_X, new_train_y, test_size=0.2, stratify=new_train_y['answerCode'], random_state=777)
print(h_train_X.shape, h_train_y.shape, h_valid_X.shape, h_valid_y.shape)

(1973136, 137) (1973136, 1) (493284, 137) (493284, 1)


In [153]:
import wandb
import lightgbm as lgb
wandb_kwargs = {"project": "lgb-sequence-optuna"}
wandbc = WeightsAndBiasesCallback(wandb_kwargs=wandb_kwargs)

@wandbc.track_in_wandb()
def objective(trial):
    param = {
        # 'objective': 'binary', # 이진 분류
        "objective": trial.suggest_categorical("objective", ["binary", "cross_entropy"]),
        'verbose': -1,
        'metric': 'AUC',
        'max_depth': trial.suggest_int('max_depth',3, 15),
        'learning_rate': trial.suggest_loguniform("learning_rate", 0.001, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 100, 10000),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        # 'subsample': trial.suggest_loguniform('subsample', 0.4, 1),
        'lambda_l1' : trial.suggest_loguniform('lambda_l1', 1e-8, 1e-4),
        'lambda_l2' : trial.suggest_loguniform('lambda_l2', 1e-8, 1e-4),
        'path_smooth' : trial.suggest_loguniform('path_smooth', 1e-8, 1e-3),
        'num_leaves' : trial.suggest_int('num_leaves', 30, 200),
        'min_data_in_leaf' : trial.suggest_int('min_data_in_leaf', 10, 100),
        'max_bin' : trial.suggest_int('max_bin', 100, 255),
        'feature_fraction' : trial.suggest_uniform('feature_fraction', 0.5, 0.9),
        'bagging_fraction' : trial.suggest_uniform('bagging_fraction', 0.5, 0.9),
        # 'device' : 'gpu',
        # 'reg_alpha' : None,
    }
    categorical = [0, 1, 2, 3, 4, 5, 10, 11, 13, 14, 15, 16, 17]
    model = lgb.LGBMClassifier(**param)#, categorical_feature=categorical)
    lgb_model = model.fit(h_train_X, h_train_y, eval_set=[(h_valid_X, h_valid_y)], verbose=0, early_stopping_rounds=25)
    loss = roc_auc_score(h_valid_y, lgb_model.predict(h_valid_X))
    return loss
        
study_lgb = optuna.create_study(direction='maximize', sampler=sampler)
study_lgb.optimize(objective, n_trials=100, callbacks=[wandbc])

  wandbc = WeightsAndBiasesCallback(wandb_kwargs=wandb_kwargs)
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjmkim_[0m. Use [1m`wandb login --relogin`[0m to force relogin


  @wandbc.track_in_wandb()
[32m[I 2022-11-23 07:08:50,416][0m A new study created in memory with name: no-name-63fba3b7-0395-4cba-aa1c-5c4ea4794b7c[0m




[32m[I 2022-11-23 07:25:03,880][0m Trial 0 finished with value: 0.6792534921201606 and parameters: {'objective': 'cross_entropy', 'max_depth': 3, 'learning_rate': 0.023965198184953764, 'n_estimators': 8369, 'min_child_samples': 93, 'lambda_l1': 8.09013813952634e-06, 'lambda_l2': 1.1857274872193908e-05, 'path_smooth': 2.2183258107863225e-07, 'num_leaves': 140, 'min_data_in_leaf': 18, 'max_bin': 112, 'feature_fraction': 0.7358455013823457, 'bagging_fraction': 0.6373362152887747}. Best is trial 0 with value: 0.6792534921201606.[0m




[32m[I 2022-11-23 07:32:07,822][0m Trial 1 finished with value: 0.6953713814569009 and parameters: {'objective': 'binary', 'max_depth': 11, 'learning_rate': 0.04537017362092559, 'n_estimators': 2761, 'min_child_samples': 40, 'lambda_l1': 7.793138524702794e-08, 'lambda_l2': 5.568912592525814e-08, 'path_smooth': 8.979258351381595e-07, 'num_leaves': 63, 'min_data_in_leaf': 65, 'max_bin': 237, 'feature_fraction': 0.7489355297282327, 'bagging_fraction': 0.6012475776818278}. Best is trial 1 with value: 0.6953713814569009.[0m




In [17]:
trial = study_lgb.best_trial
trial_params = trial.params
print('Best Trial: score {},\nparams {}'.format(trial.value, trial_params))

Best Trial: score 0.7412035812157612,
params {'objective': 'cross_entropy', 'max_depth': 15, 'learning_rate': 0.02041383826399091, 'n_estimators': 6587, 'min_child_samples': 26, 'lambda_l1': 1.044850570378112e-06, 'lambda_l2': 5.188602499379273e-05, 'path_smooth': 0.0007486531006779142, 'num_leaves': 178, 'min_data_in_leaf': 52, 'max_bin': 107, 'feature_fraction': 0.8534619135004855, 'bagging_fraction': 0.5517526788448244}


In [23]:
grouped_train_x = h_train_X.groupby(by=['userID'])
grouped_valid_x = h_valid_X.groupby(by=['userID'])
grouped_train_y = h_train_y.groupby(by=['userID'])
grouped_valid_y = h_valid_y.groupby(by=['userID'])
for group_train_x, group_valid_x, group_train_y, group_valid_y in zip(grouped_train_x, grouped_valid_x, grouped_train_y, grouped_valid_y):
    model = LogisticRegression()
    model.fit(group_train_x[1], group_train_y[1]['answerCode'])
    print(f"[TRAIN] roc_auc_score : {roc_auc_score(model.predict(group_train_x[1]), group_train_y[1]['answerCode'])}")
    print(f"[VALID] roc_auc_score : {roc_auc_score(model.predict(group_valid_x[1]), group_valid_y[1]['answerCode'])}")
    break

[TRAIN] roc_auc_score : 0.697259159253574
[VALID] roc_auc_score : 0.6274710293115201


In [None]:
def objective(trial):
    param = {
        'tol' : trial.suggest_uniform('tol' , 1e-6 , 1e-3),
        'C' : trial.suggest_loguniform("C", 1e-3, 1),
        'solver' : trial.suggest_categorical('solver' , ['lbfgs','liblinear']),
    }
    model = LogisticRegression(**param)
    model = model.fit(group_train_x[1], group_train_y[1]['answerCode'])
    loss = roc_auc_score(model.predict(group_valid_x[1]), group_valid_y[1]['answerCode'])
    return loss
study_lgb = optuna.create_study(direction='maximize', sampler=sampler)
study_lgb.optimize(objective, n_trials=1000, callbacks=[wandbc])

In [None]:
import lightgbm as lgb
def objective(trial):
    param = {
        # 'objective': 'binary', # 이진 분류
        "objective": trial.suggest_categorical("objective", ["binary", "cross_entropy"]),
        'verbose': -1,
        'metric': 'AUC',
        'max_depth': trial.suggest_int('max_depth',3, 15),
        'learning_rate': trial.suggest_loguniform("learning_rate", 0.0001, 2.0),
        'n_estimators': trial.suggest_int('n_estimators', 100, 10000),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        # 'subsample': trial.suggest_loguniform('subsample', 0.4, 1),
        'lambda_l1' : trial.suggest_loguniform('lambda_l1', 1e-8, 1e-4),
        'lambda_l2' : trial.suggest_loguniform('lambda_l2', 1e-8, 1e-4),
        'path_smooth' : trial.suggest_loguniform('path_smooth', 1e-8, 1e-3),
        'num_leaves' : trial.suggest_int('num_leaves', 30, 200),
        'min_data_in_leaf' : trial.suggest_int('min_data_in_leaf', 10, 100),
        'max_bin' : trial.suggest_int('max_bin', 100, 255),
        'feature_fraction' : trial.suggest_uniform('feature_fraction', 0.5, 1),
        'bagging_fraction' : trial.suggest_uniform('bagging_fraction', 0.5, 1),
        # 'device' : 'gpu',
        # 'reg_alpha' : None,
    }
    categorical = [0, 1, 2, 3, 4, 5, 10, 11, 13, 14, 15, 16, 17]
    model = lgb.LGBMClassifier(**param, categorical_feature=categorical)
    lgb_model = model.fit(group_train_x[1], group_train_y[1]['answerCode'], eval_set=[(group_valid_x[1], group_valid_y[1]['answerCode'])], verbose=-1, early_stopping_rounds=25)
    loss = roc_auc_score(group_valid_y[1]['answerCode'], lgb_model.predict(group_valid_x[1]))
    return loss
        
study_lgb = optuna.create_study(direction='maximize', sampler=sampler)
study_lgb.optimize(objective, n_trials=10000)

In [33]:
trial = study_lgb.best_trial
trial_params = trial.params
print('Best Trial: score {},\nparams {}'.format(trial.value, trial_params))

Best Trial: score 0.7159695468721335,
params {'objective': 'cross_entropy', 'max_depth': 5, 'learning_rate': 0.1460557874021894, 'n_estimators': 4044, 'min_child_samples': 77, 'lambda_l1': 6.716411565832376e-05, 'lambda_l2': 1.7186698526667583e-08, 'path_smooth': 5.935595604338357e-05, 'num_leaves': 93, 'min_data_in_leaf': 90, 'max_bin': 220, 'feature_fraction': 0.5248617093238126, 'bagging_fraction': 0.8611415978965095}
