## dataload

In [34]:
import mlflow
import mlflow.lightgbm

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
import pandas as pd
import os
import random
import warnings
warnings.filterwarnings('ignore')
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import numpy as np
from utils import custom_train_test_split, lgbm_predict, post_slack

SEED = 13

# 1. Data loading: train and test rows share one feature-engineered pickle.
data_dir = '/opt/ml/input/data'  # root directory of the input data
after_fe_path = os.path.join(data_dir, 'after_fe_train_test_cumsum2.pkl')
# NOTE: unpickling can execute arbitrary code; only load pickles produced by a trusted pipeline.
df = pd.read_pickle(after_fe_path)

# Rows are tagged through the `kind` column; split the train rows into train/valid.
train_df = df.loc[df['kind'] == 'train']
train, valid = custom_train_test_split(train_df, ratio=0.7, seed=SEED)

# Test rows. Those whose answerCode is not -1 (per the original note: everything up to
# the second-to-last submission) are reused as extra training data.
test = df.loc[df['kind'] == 'test']
train2 = test.loc[test['answerCode'] != -1]
train = pd.concat([train, train2])  # merge the extra rows into the training set
train.shape, valid.shape, test.shape

# Separate the feature columns from the answerCode label (label kept as a one-column frame).
x_train, y_train = train.drop(columns='answerCode'), train[['answerCode']]
x_valid, y_valid = valid.drop(columns='answerCode'), valid[['answerCode']]
x_train.shape, y_train.shape, x_valid.shape, y_valid.shape

((1845539, 33), (1974, 33), (260114, 33))

((1845539, 32), (1845539, 1), (1974, 32), (1974, 1))

In [35]:
# Show the userID and uidIdx columns of the training features side by side.
x_train.loc[:, ['userID', 'uidIdx']]

Unnamed: 0,userID,uidIdx
1678,2,2
1679,2,2
1680,2,2
1681,2,2
1682,2,2
...,...,...
2526670,7439,7439
2526671,7439,7439
2526672,7439,7439
2526673,7439,7439


In [24]:
# LightGBM training parameters. Only the objective and the seed are set here.
# Overrides that were tried earlier and are currently disabled (kept for reference):
#   max_depth=8, min_data_in_leaf=1000, feature_fraction=0.6 (0.8),
#   bagging_fraction=0.75, max_cat_group=64, boosting="gbdt" (dart),
#   learning_rate=0.01, bagging_freq=5, max_bin=50, num_leaves=80 (40),
#   metric="auc"
params = dict(objective="binary", seed=42)

## 그냥 solvesec_cumsum 사용

In [5]:
# Features used by this experiment (raw solvesec_cumsum variant).
FEATS = [
    'uidIdx', 'assIdx', 'testIdx', 'KnowledgeTag',
    'user_correct_answer', 'user_total_answer',
    'big_category', 'mid_category', 'problem_num',
    'month', 'day', 'dayname', 'hour',
    'user_acc',
    'test_mean', 'test_sum', 'test_std',
    'tag_std', 'tag_mean', 'tag_sum',
    'solvesec_2400', 'time_category',
    'solvesec_cumsum', 'solvecumsum_category',
]

# Columns handed to LightGBM as categorical, converted to their positions inside FEATS.
cat_feats = ['uidIdx', 'assIdx', 'testIdx', 'KnowledgeTag', 'time_category', 'solvecumsum_category']
cat_feats_idx = [pos for pos in range(len(FEATS)) if FEATS[pos] in cat_feats]

# MLflow run name (stamped with today's date) and description (the full feature list).
from datetime import datetime
date = datetime.now().strftime('%m/%d %a')
using_feats = ", ".join(FEATS)
title = f"🌈({date})[LGBM Cumsum + category] 피처: {len(FEATS)}개"
desc = f"사용된 피처({len(FEATS)})\n{using_feats}"
cat_feats_idx, title, desc

([0, 1, 2, 3, 21, 23],
 '🌈(11/29 Tue)[LGBM Cumsum + category] 피처: 24개',
 '사용된 피처(24)\nuidIdx, assIdx, testIdx, KnowledgeTag, user_correct_answer, user_total_answer, big_category, mid_category, problem_num, month, day, dayname, hour, user_acc, test_mean, test_sum, test_std, tag_std, tag_mean, tag_sum, solvesec_2400, time_category, solvesec_cumsum, solvecumsum_category')

In [8]:
# Enable MLflow's LightGBM autologging, then wrap the train/valid features as LightGBM Datasets.
mlflow.lightgbm.autolog()
lgb_x_train = lgb.Dataset(x_train[FEATS], y_train)
lgb_x_valid = lgb.Dataset(x_valid[FEATS], y_valid)

with mlflow.start_run(run_name=title, description=desc) as run:
    model2 = lgb.train(
        params,
        lgb_x_train,
        num_boost_round=3200,
        valid_sets=[lgb_x_valid],
        categorical_feature=cat_feats_idx,
        verbose_eval=100,  # report the validation metric every 100 rounds
        # early_stopping_rounds=200 is intentionally disabled: all 3200 rounds are trained
    )

    # Score the validation split: AUC on raw predictions, accuracy at a 0.5 threshold.
    preds = model2.predict(x_valid[FEATS])
    acc = accuracy_score(y_valid, (preds >= 0.5).astype(int))
    auc = roc_auc_score(y_valid, preds)
    print(f'VALID AUC : {auc} ACC : {acc}\n')

    # Record the validation AUC and the split seed on the run, and keep the run id
    # so another cell can attach further metrics to it.
    mlflow.log_metric("VAL AUC", auc)
    mlflow.log_metric("VAL Set SEED", SEED)
    model2_run_id = run.info.run_id

    # Notify via the project helper, then write the test-set predictions.
    post_slack("done")
    lgbm_predict(test, model2, FEATS, 'LGBM_Baseline_solvetime_Cumsum추가_category_추가.csv')

[LightGBM] [Info] Number of positive: 1208276, number of negative: 637263
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 18557
[LightGBM] [Info] Number of data points in the train set: 1845539, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654701 -> initscore=0.639767
[LightGBM] [Info] Start training from score 0.639767
[100]	valid_0's binary_logloss: 0.56237
[200]	valid_0's binary_logloss: 0.555351
[300]	valid_0's binary_logloss: 0.553151
[400]	valid_0's binary_logloss: 0.551699
[500]	valid_0's binary_logloss: 0.551188
[600]	valid_0's binary_logloss: 0.548887
[700]	valid_0's binary_logloss: 0.549058
[800]	valid_0's binary_logloss: 0.548756
[900]	valid_0's binary_logloss: 0.549501
[1000]	valid_0's binary_logloss: 0.550635
[1100]	valid_0's binary_logloss: 0.551213
[1200]	valid_0's binary_logloss: 0.55135
[1300]	valid_0's binary_logloss: 0.552372
[1400]	

In [9]:
# Write model2's predictions for the test rows to a CSV through the project helper
# (the helper reports the file under output/ in the cell output).
lgbm_predict(test, model2, FEATS, 'LGBM_solvetime_Cumsum추가_category_추가_2400.csv')

writing prediction : output/LGBM_solvetime_Cumsum추가_category_추가_2400.csv


In [14]:
# After submission: attach the resulting LB AUC to the MLflow run that trained model2.
LB_AUC = 0.7905
run_id = model2_run_id

# Re-open the existing run as a context manager so it is always ended again,
# even if logging raises. A bare start_run()/end_run() pair would leave the run
# active on failure, and a later mlflow.start_run() in this kernel would then
# be rejected because a run is already active.
with mlflow.start_run(run_id=run_id) as run:
    print("Active run_id: {}".format(run.info.run_id))
    mlflow.log_metric("LB AUC", LB_AUC)

<ActiveRun: >

Active run_id: 7a9515834a1e430f9255ecdc7946af00


## solvesec_cumsum 2400 사용

In [10]:
# Features used by this experiment (solvesec_cumsum_2400 replaces solvesec_cumsum).
FEATS = [
    'uidIdx', 'assIdx', 'testIdx', 'KnowledgeTag',
    'user_correct_answer', 'user_total_answer',
    'big_category', 'mid_category', 'problem_num',
    'month', 'day', 'dayname', 'hour',
    'user_acc',
    'test_mean', 'test_sum', 'test_std',
    'tag_std', 'tag_mean', 'tag_sum',
    'solvesec_2400', 'time_category',
    'solvesec_cumsum_2400', 'solvecumsum_category',
]

# Columns handed to LightGBM as categorical, converted to their positions inside FEATS.
cat_feats = ['uidIdx', 'assIdx', 'testIdx', 'KnowledgeTag', 'time_category', 'solvecumsum_category']
cat_feats_idx = [pos for pos in range(len(FEATS)) if FEATS[pos] in cat_feats]

# MLflow run name (stamped with today's date) and description (the full feature list).
from datetime import datetime
date = datetime.now().strftime('%m/%d %a')
using_feats = ", ".join(FEATS)
title = f"🌈({date})[LGBM Cumsum 2400기준, solvesec_cumsum_2400] 피처: {len(FEATS)}개"
desc = f"사용된 피처({len(FEATS)})\n{using_feats}"
cat_feats_idx, title, desc

([0, 1, 2, 3, 21, 23],
 '🌈(11/29 Tue)[LGBM Cumsum 2400기준, solvesec_cumsum_2400] 피처: 24개',
 '사용된 피처(24)\nuidIdx, assIdx, testIdx, KnowledgeTag, user_correct_answer, user_total_answer, big_category, mid_category, problem_num, month, day, dayname, hour, user_acc, test_mean, test_sum, test_std, tag_std, tag_mean, tag_sum, solvesec_2400, time_category, solvesec_cumsum_2400, solvecumsum_category')

In [11]:
# Enable MLflow's LightGBM autologging, then wrap the train/valid features as LightGBM Datasets.
mlflow.lightgbm.autolog()
lgb_x_train = lgb.Dataset(x_train[FEATS], y_train)
lgb_x_valid = lgb.Dataset(x_valid[FEATS], y_valid)

with mlflow.start_run(run_name=title, description=desc) as run:
    model3 = lgb.train(
        params,
        lgb_x_train,
        num_boost_round=3200,
        valid_sets=[lgb_x_valid],
        categorical_feature=cat_feats_idx,
        verbose_eval=100,  # report the validation metric every 100 rounds
        # early_stopping_rounds=200 is intentionally disabled: all 3200 rounds are trained
    )

    # Score the validation split: AUC on raw predictions, accuracy at a 0.5 threshold.
    preds = model3.predict(x_valid[FEATS])
    acc = accuracy_score(y_valid, (preds >= 0.5).astype(int))
    auc = roc_auc_score(y_valid, preds)
    print(f'VALID AUC : {auc} ACC : {acc}\n')

    # Record the validation AUC and the split seed on the run, and keep the run id
    # so another cell can attach further metrics to it.
    mlflow.log_metric("VAL AUC", auc)
    mlflow.log_metric("VAL Set SEED", SEED)
    model3_run_id = run.info.run_id

    # Notify via the project helper, then write the test-set predictions.
    post_slack("done")
    lgbm_predict(test, model3, FEATS, 'LGBM_solvetime_Cumsum추가_category_추가_2400_solvesec_cumsum_2400.csv')

[LightGBM] [Info] Number of positive: 1208276, number of negative: 637263
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 18557
[LightGBM] [Info] Number of data points in the train set: 1845539, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654701 -> initscore=0.639767
[LightGBM] [Info] Start training from score 0.639767
[100]	valid_0's binary_logloss: 0.561438
[200]	valid_0's binary_logloss: 0.554626
[300]	valid_0's binary_logloss: 0.552703
[400]	valid_0's binary_logloss: 0.551491
[500]	valid_0's binary_logloss: 0.550737
[600]	valid_0's binary_logloss: 0.549383
[700]	valid_0's binary_logloss: 0.54733
[800]	valid_0's binary_logloss: 0.547293
[900]	valid_0's binary_logloss: 0.547104
[1000]	valid_0's binary_logloss: 0.547631
[1100]	valid_0's binary_logloss: 0.548391
[1200]	valid_0's binary_logloss: 0.549149
[1300]	valid_0's binary_logloss: 0.549715
[1400]

In [15]:
# After submission: attach the resulting LB AUC to the MLflow run that trained model3.
LB_AUC = 0.7988
run_id = model3_run_id

# Re-open the existing run as a context manager so it is always ended again,
# even if logging raises. A bare start_run()/end_run() pair would leave the run
# active on failure, and a later mlflow.start_run() in this kernel would then
# be rejected because a run is already active.
with mlflow.start_run(run_id=run_id) as run:
    print("Active run_id: {}".format(run.info.run_id))
    mlflow.log_metric("LB AUC", LB_AUC)

<ActiveRun: >

Active run_id: b1ea685e9f444aeb80420459cbaa81b3


### early stopping 적용

In [17]:
# Features used by this experiment (same list as the solvesec_cumsum_2400 experiment).
FEATS = [
    'uidIdx', 'assIdx', 'testIdx', 'KnowledgeTag',
    'user_correct_answer', 'user_total_answer',
    'big_category', 'mid_category', 'problem_num',
    'month', 'day', 'dayname', 'hour',
    'user_acc',
    'test_mean', 'test_sum', 'test_std',
    'tag_std', 'tag_mean', 'tag_sum',
    'solvesec_2400', 'time_category',
    'solvesec_cumsum_2400', 'solvecumsum_category',
]

# Columns handed to LightGBM as categorical, converted to their positions inside FEATS.
cat_feats = ['uidIdx', 'assIdx', 'testIdx', 'KnowledgeTag', 'time_category', 'solvecumsum_category']
cat_feats_idx = [pos for pos in range(len(FEATS)) if FEATS[pos] in cat_feats]

# MLflow run name (stamped with today's date) and description (the full feature list).
# NOTE(review): this title is identical to the one used by the non-early-stopping
# solvesec_cumsum_2400 experiment, so both MLflow runs end up with the same run name.
from datetime import datetime
date = datetime.now().strftime('%m/%d %a')
using_feats = ", ".join(FEATS)
title = f"🌈({date})[LGBM Cumsum 2400기준, solvesec_cumsum_2400] 피처: {len(FEATS)}개"
desc = f"사용된 피처({len(FEATS)})\n{using_feats}"
cat_feats_idx, title, desc

# Enable MLflow's LightGBM autologging, then wrap the train/valid features as LightGBM Datasets.
mlflow.lightgbm.autolog()
lgb_x_train = lgb.Dataset(x_train[FEATS], y_train)
lgb_x_valid = lgb.Dataset(x_valid[FEATS], y_valid)

with mlflow.start_run(run_name=title, description=desc) as run:
    model3_1 = lgb.train(
        params,
        lgb_x_train,
        num_boost_round=3200,
        valid_sets=[lgb_x_valid],
        categorical_feature=cat_feats_idx,
        early_stopping_rounds=200,  # stop once the validation score has not improved for 200 rounds
        verbose_eval=100,  # report the validation metric every 100 rounds
    )

    # Score the validation split: AUC on raw predictions, accuracy at a 0.5 threshold.
    preds = model3_1.predict(x_valid[FEATS])
    acc = accuracy_score(y_valid, (preds >= 0.5).astype(int))
    auc = roc_auc_score(y_valid, preds)
    print(f'VALID AUC : {auc} ACC : {acc}\n')

    # Record the validation AUC and the split seed on the run, and keep the run id
    # so another cell can attach further metrics to it.
    mlflow.log_metric("VAL AUC", auc)
    mlflow.log_metric("VAL Set SEED", SEED)
    model3_1_run_id = run.info.run_id

    # Notify via the project helper, then write the test-set predictions.
    post_slack("done")
    lgbm_predict(test, model3_1, FEATS, 'LGBM_solvetime_Cumsum추가_category_추가_2400_solvesec_cumsum_2400_early.csv')

([0, 1, 2, 3, 21, 23],
 '🌈(11/29 Tue)[LGBM Cumsum 2400기준, solvesec_cumsum_2400] 피처: 24개',
 '사용된 피처(24)\nuidIdx, assIdx, testIdx, KnowledgeTag, user_correct_answer, user_total_answer, big_category, mid_category, problem_num, month, day, dayname, hour, user_acc, test_mean, test_sum, test_std, tag_std, tag_mean, tag_sum, solvesec_2400, time_category, solvesec_cumsum_2400, solvecumsum_category')

[LightGBM] [Info] Number of positive: 1208276, number of negative: 637263
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 18557
[LightGBM] [Info] Number of data points in the train set: 1845539, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654701 -> initscore=0.639767
[LightGBM] [Info] Start training from score 0.639767
Training until validation scores don't improve for 200 rounds
[100]	valid_0's binary_logloss: 0.561438
[200]	valid_0's binary_logloss: 0.554626
[300]	valid_0's binary_logloss: 0.552703
[400]	valid_0's binary_logloss: 0.551491
[500]	valid_0's binary_logloss: 0.550737
[600]	valid_0's binary_logloss: 0.549383
[700]	valid_0's binary_logloss: 0.54733
[800]	valid_0's binary_logloss: 0.547293
[900]	valid_0's binary_logloss: 0.547104
Early stopping, best iteration is:
[732]	valid_0's binary_logloss: 0.546977
VALID AUC : 0.7946950246054083 ACC :

In [None]:
# After submission: attach the resulting LB AUC to an MLflow run.
# NOTE(review): this cell sits under the early-stopping experiment (model3_1) but still
# targets model3_run_id with the score already logged for model3; it looks like an
# unedited copy. Confirm whether model3_1_run_id and that submission's score were intended.
LB_AUC = 0.7988
run_id = model3_run_id

# Re-open the existing run as a context manager so it is always ended again,
# even if logging raises. A bare start_run()/end_run() pair would leave the run
# active on failure, and a later mlflow.start_run() in this kernel would then
# be rejected because a run is already active.
with mlflow.start_run(run_id=run_id) as run:
    print("Active run_id: {}".format(run.info.run_id))
    mlflow.log_metric("LB AUC", LB_AUC)

## solvesec_cumsum 2401 사용

In [12]:
# Features used by this experiment (solvesec_cumsum_2401 as the cumulative-time column).
FEATS = [
    'uidIdx', 'assIdx', 'testIdx', 'KnowledgeTag',
    'user_correct_answer', 'user_total_answer',
    'big_category', 'mid_category', 'problem_num',
    'month', 'day', 'dayname', 'hour',
    'user_acc',
    'test_mean', 'test_sum', 'test_std',
    'tag_std', 'tag_mean', 'tag_sum',
    'solvesec_2400', 'time_category',
    'solvesec_cumsum_2401', 'solvecumsum_category',
]

# Columns handed to LightGBM as categorical, converted to their positions inside FEATS.
cat_feats = ['uidIdx', 'assIdx', 'testIdx', 'KnowledgeTag', 'time_category', 'solvecumsum_category']
cat_feats_idx = [pos for pos in range(len(FEATS)) if FEATS[pos] in cat_feats]

# MLflow run name (stamped with today's date) and description (the full feature list).
from datetime import datetime
date = datetime.now().strftime('%m/%d %a')
using_feats = ", ".join(FEATS)
title = f"🌈({date})[LGBM solvesec_cumsum_2401] 피처: {len(FEATS)}개"
desc = f"사용된 피처({len(FEATS)})\n{using_feats}"
cat_feats_idx, title, desc

([0, 1, 2, 3, 21, 23],
 '🌈(11/29 Tue)[LGBM solvesec_cumsum_2401] 피처: 24개',
 '사용된 피처(24)\nuidIdx, assIdx, testIdx, KnowledgeTag, user_correct_answer, user_total_answer, big_category, mid_category, problem_num, month, day, dayname, hour, user_acc, test_mean, test_sum, test_std, tag_std, tag_mean, tag_sum, solvesec_2400, time_category, solvesec_cumsum_2401, solvecumsum_category')

In [13]:
# Enable MLflow's LightGBM autologging, then wrap the train/valid features as LightGBM Datasets.
mlflow.lightgbm.autolog()
lgb_x_train = lgb.Dataset(x_train[FEATS], y_train)
lgb_x_valid = lgb.Dataset(x_valid[FEATS], y_valid)

with mlflow.start_run(run_name=title, description=desc) as run:
    model4 = lgb.train(
        params,
        lgb_x_train,
        num_boost_round=3200,
        valid_sets=[lgb_x_valid],
        categorical_feature=cat_feats_idx,
        verbose_eval=100,  # report the validation metric every 100 rounds
        # early_stopping_rounds=200 is intentionally disabled: all 3200 rounds are trained
    )

    # Score the validation split: AUC on raw predictions, accuracy at a 0.5 threshold.
    preds = model4.predict(x_valid[FEATS])
    acc = accuracy_score(y_valid, (preds >= 0.5).astype(int))
    auc = roc_auc_score(y_valid, preds)
    print(f'VALID AUC : {auc} ACC : {acc}\n')

    # Record the validation AUC and the split seed on the run, and keep the run id
    # so another cell can attach further metrics to it.
    mlflow.log_metric("VAL AUC", auc)
    mlflow.log_metric("VAL Set SEED", SEED)
    model4_run_id = run.info.run_id

    # Notify via the project helper, then write the test-set predictions.
    post_slack("done")
    lgbm_predict(test, model4, FEATS, 'LGBM_solvetime_Cumsum추가_category_추가_2400_solvesec_cumsum_2401.csv')

[LightGBM] [Info] Number of positive: 1208276, number of negative: 637263
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 18557
[LightGBM] [Info] Number of data points in the train set: 1845539, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654701 -> initscore=0.639767
[LightGBM] [Info] Start training from score 0.639767
[100]	valid_0's binary_logloss: 0.561446
[200]	valid_0's binary_logloss: 0.554137
[300]	valid_0's binary_logloss: 0.552903
[400]	valid_0's binary_logloss: 0.551544
[500]	valid_0's binary_logloss: 0.5494
[600]	valid_0's binary_logloss: 0.548209
[700]	valid_0's binary_logloss: 0.547312
[800]	valid_0's binary_logloss: 0.547647
[900]	valid_0's binary_logloss: 0.548485
[1000]	valid_0's binary_logloss: 0.549564
[1100]	valid_0's binary_logloss: 0.550332
[1200]	valid_0's binary_logloss: 0.551066
[1300]	valid_0's binary_logloss: 0.551107
[1400]	

## 3600 기존 최고 성능 - 8012 재현 (수정된 부분 허용할만한지)

In [37]:
# Features used by this experiment (solvesec_3600 together with the raw solvesec_cumsum).
FEATS = [
    'uidIdx', 'assIdx', 'testIdx', 'KnowledgeTag',
    'user_correct_answer', 'user_total_answer',
    'big_category', 'mid_category', 'problem_num',
    'month', 'day', 'dayname', 'hour',
    'user_acc',
    'test_mean', 'test_sum', 'test_std',
    'tag_std', 'tag_mean', 'tag_sum',
    'solvesec_3600', 'time_category',
    'solvesec_cumsum', 'solvecumsum_category',
]

# Columns handed to LightGBM as categorical, converted to their positions inside FEATS.
cat_feats = ['uidIdx', 'assIdx', 'testIdx', 'KnowledgeTag', 'time_category', 'solvecumsum_category']
cat_feats_idx = [pos for pos in range(len(FEATS)) if FEATS[pos] in cat_feats]

# MLflow run name (stamped with today's date) and description (the full feature list).
from datetime import datetime
date = datetime.now().strftime('%m/%d %a')
using_feats = ", ".join(FEATS)
title = f"🌈({date})[LGBM 기존 8012 재현] 피처: {len(FEATS)}개"
desc = f"사용된 피처({len(FEATS)})\n{using_feats}"
cat_feats_idx, title, desc

# Enable MLflow's LightGBM autologging, then wrap the train/valid features as LightGBM Datasets.
mlflow.lightgbm.autolog()
lgb_x_train = lgb.Dataset(x_train[FEATS], y_train)
lgb_x_valid = lgb.Dataset(x_valid[FEATS], y_valid)

with mlflow.start_run(run_name=title, description=desc) as run:
    model5 = lgb.train(
        params,
        lgb_x_train,
        num_boost_round=3200,
        valid_sets=[lgb_x_valid],
        categorical_feature=cat_feats_idx,
        verbose_eval=100,  # report the validation metric every 100 rounds
        # early_stopping_rounds=200 is intentionally disabled: all 3200 rounds are trained
    )

    # Score the validation split: AUC on raw predictions, accuracy at a 0.5 threshold.
    preds = model5.predict(x_valid[FEATS])
    acc = accuracy_score(y_valid, (preds >= 0.5).astype(int))
    auc = roc_auc_score(y_valid, preds)
    print(f'VALID AUC : {auc} ACC : {acc}\n')

    # Record the validation AUC and the split seed on the run, and keep the run id
    # so another cell can attach further metrics to it.
    mlflow.log_metric("VAL AUC", auc)
    mlflow.log_metric("VAL Set SEED", SEED)
    model5_run_id = run.info.run_id

    # Notify via the project helper. The test-set prediction for this model is
    # written from a separate cell, so no lgbm_predict call is made here.
    post_slack("done")

([0, 1, 2, 3, 21, 23],
 '🌈(11/30 Wed)[LGBM 기존 8012 재현] 피처: 24개',
 '사용된 피처(24)\nuidIdx, assIdx, testIdx, KnowledgeTag, user_correct_answer, user_total_answer, big_category, mid_category, problem_num, month, day, dayname, hour, user_acc, test_mean, test_sum, test_std, tag_std, tag_mean, tag_sum, solvesec_3600, time_category, solvesec_cumsum, solvecumsum_category')

[LightGBM] [Info] Number of positive: 1208276, number of negative: 637263
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 19042
[LightGBM] [Info] Number of data points in the train set: 1845539, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654701 -> initscore=0.639767
[LightGBM] [Info] Start training from score 0.639767
[100]	valid_0's binary_logloss: 0.560045
[200]	valid_0's binary_logloss: 0.553273
[300]	valid_0's binary_logloss: 0.551615
[400]	valid_0's binary_logloss: 0.549948
[500]	valid_0's binary_logloss: 0.549759
[600]	valid_0's binary_logloss: 0.549908
[700]	valid_0's binary_logloss: 0.549459
[800]	valid_0's binary_logloss: 0.549514
[900]	valid_0's binary_logloss: 0.549613
[1000]	valid_0's binary_logloss: 0.549223
[1100]	valid_0's binary_logloss: 0.549293
[1200]	valid_0's binary_logloss: 0.549537
[1300]	valid_0's binary_logloss: 0.549861
[1400

In [38]:
# Write model5's predictions for the test rows to a CSV through the project helper
# (the helper reports the file under output/ in the cell output).
lgbm_predict(test, model5, FEATS, 'LGBM_기존8012_재현성테스트_train_test를합쳐서_한번에fe함수통과하도록_수정.csv')

writing prediction : output/LGBM_기존8012_재현성테스트_train_test를합쳐서_한번에fe함수통과하도록_수정.csv


In [39]:
# After submission: attach the resulting LB AUC to the MLflow run that trained model5.
LB_AUC = 0.8107
run_id = model5_run_id

# Re-open the existing run as a context manager so it is always ended again,
# even if logging raises. A bare start_run()/end_run() pair would leave the run
# active on failure, and a later mlflow.start_run() in this kernel would then
# be rejected because a run is already active.
with mlflow.start_run(run_id=run_id) as run:
    print("Active run_id: {}".format(run.info.run_id))
    mlflow.log_metric("LB AUC", LB_AUC)

<ActiveRun: >

Active run_id: 5cb8c6ce080041888bfb1c70d67c2ac5
