In [1]:
!ls ../../data

sample_submission.csv  test_data.csv  train_data.csv


In [2]:
!conda env list

# conda environments:
#
base                     /opt/conda
dkt                      /opt/conda/envs/dkt
jupy                  *  /opt/conda/envs/jupy



In [3]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
pd.set_option('display.min_rows', 500)
import warnings
warnings.filterwarnings('ignore')

# Directory that holds the competition CSV files, relative to this notebook.
# NOTE(review): `path` already ends with '/', so the f-strings below expand to
# '../../data//train_data.csv' (double slash) -- consider dropping one of the slashes.
path='../../data/'
# Raw train / test tables exactly as stored on disk.
train = pd.read_csv(f"{path}/train_data.csv")
test = pd.read_csv(f"{path}/test_data.csv")

In [4]:
# train과 test 데이터셋은 사용자 별로 묶어서 분리를 해주어야함
random.seed(42)
def custom_train_test_split(df, ratio=0.7, split=True):
    """Split an interaction log into train / held-out sets without sharing users.

    Users are shuffled with the global ``random`` module (seed it beforehand for a
    reproducible split) and added to the training set one at a time until adding
    the next user would push the training row count past ``ratio * len(df)``.
    Every remaining user goes to the held-out set, which is then reduced to the
    last row of each of those users.

    Args:
        df: DataFrame with a ``userID`` column. Rows of a user are expected to be
            in chronological order so that "last row" means "latest interaction".
        ratio: Upper bound on the fraction of rows assigned to the training set.
        split: Unused; kept only so existing callers that pass it keep working.

    Returns:
        Tuple ``(train, test)``: ``train`` holds every row of the selected users,
        ``test`` holds exactly one row (the last one) per remaining user.
    """
    # (user_id, number of rows) pairs; value_counts() is computed only once.
    user_counts = df['userID'].value_counts()
    users = list(zip(user_counts.index, user_counts))
    random.shuffle(users)

    max_train_data_len = ratio * len(df)  # row budget for the training set
    sum_of_train_data = 0
    user_ids = []

    # Take whole users until the next one would exceed the row budget.
    for user_id, count in users:
        sum_of_train_data += count
        if max_train_data_len < sum_of_train_data:
            break
        user_ids.append(user_id)

    # Row-level membership mask: a user is either entirely in train or entirely in test.
    in_train = df['userID'].isin(user_ids)
    train = df[in_train]
    test = df[~in_train]

    # Keep only the final row of every held-out user.  duplicated(keep='last')
    # marks all but the last occurrence, so this yields one row per user even
    # when a user's rows are not stored contiguously (comparing with shift(-1)
    # would keep the end of every contiguous run instead).
    test = test[~test['userID'].duplicated(keep='last')]
    return train, test

## Data Load

In [5]:
# Arbitrary integer code for each weekday name (not in calendar order).
day_dict = {'Tuesday': 0,
 'Thursday': 1,
 'Monday': 2,
 'Saturday': 3,
 'Friday': 4,
 'Wednesday': 5,
 'Sunday': 6}
def feature_engineering(df):
    """Return a copy of ``df`` enriched with user-history, time and aggregate features.

    The input is left untouched.  Rows are sorted by ``userID`` and ``Timestamp``
    and the returned frame has a fresh 0..n-1 index (a side effect of the merges).

    Added columns:
        user_correct_answer: correct answers the user gave *before* this row
            (NaN on the user's first row).
        user_total_answer: number of rows the user had before this one.
        user_acc: ratio of the two columns above (NaN on the user's first row).
        month, day, hour, dayname: parts of ``Timestamp``; ``dayname`` is the
            weekday name mapped through ``day_dict``.
        big_category: third character of ``testId`` as an int.
        problem_num: last three characters of ``assessmentItemID`` as an int.
        mid_category: last three characters of ``testId`` as an int.
        test_mean / test_std / test_sum: mean, std and sum of ``answerCode`` per ``testId``.
        tag_mean / tag_std / tag_sum: the same statistics per ``KnowledgeTag``.

    NOTE(review): the per-test and per-tag statistics are computed over every row
    of ``df``, including the row being described, so they leak the target of that
    row -- confirm this is intended before using them for validation.

    Args:
        df: DataFrame with columns ``userID``, ``assessmentItemID``, ``testId``,
            ``answerCode``, ``Timestamp`` and ``KnowledgeTag``.

    Returns:
        A new DataFrame containing the original columns plus the features above.
    """
    # Sort so that each user's rows form one chronological sequence.
    df2 = df.sort_values(by=['userID', 'Timestamp'])

    # Running per-user statistics; shift(1) excludes the current row's own answer.
    df2['user_correct_answer'] = df2.groupby('userID')['answerCode'].transform(lambda x: x.cumsum().shift(1))
    df2['user_total_answer'] = df2.groupby('userID')['answerCode'].cumcount()
    df2['user_acc'] = df2['user_correct_answer']/df2['user_total_answer']

    # Parse the timestamp column once and reuse it for every date part.
    timestamps = pd.to_datetime(df2['Timestamp'])
    df2['month'] = timestamps.dt.month
    df2['day'] = timestamps.dt.day
    df2['hour'] = timestamps.dt.hour
    df2['dayname'] = timestamps.dt.day_name().map(day_dict)

    # Identifier-derived categorical features.
    df2['big_category'] = df2.testId.map(lambda x: x[2]).astype(int)
    df2['problem_num'] = df2.assessmentItemID.map(lambda x: int(x[-3:]))
    df2['mid_category'] = df2.testId.map(lambda x: int(x[-3:]))

    # Global answer statistics per test and per knowledge tag.
    correct_t = df2.groupby(['testId'])['answerCode'].agg(['mean', 'std', 'sum'])
    correct_t.columns = ["test_mean", "test_std", 'test_sum']
    correct_k = df2.groupby(['KnowledgeTag'])['answerCode'].agg(['mean', 'std', 'sum'])
    correct_k.columns = ["tag_mean", 'tag_std', 'tag_sum']

    # Left merges keep every interaction row and its order.
    df2 = pd.merge(df2, correct_t, on=['testId'], how="left")
    df2 = pd.merge(df2, correct_k, on=['KnowledgeTag'], how="left")

    return df2

In [6]:
df = feature_engineering(train)
df.head()

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,month,...,dayname,big_category,problem_num,mid_category,test_mean,test_std,test_sum,tag_mean,tag_std,tag_sum
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,,0,,3,...,0,6,1,1,0.947683,0.222749,1268,0.955022,0.20741,637
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,1.0,1,1.0,3,...,0,6,2,1,0.947683,0.222749,1268,0.913187,0.281603,3040
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,2.0,2,1.0,3,...,0,6,3,1,0.947683,0.222749,1268,0.913187,0.281603,3040
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,3.0,3,1.0,3,...,0,6,4,1,0.947683,0.222749,1268,0.913187,0.281603,3040
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,4.0,4,1.0,3,...,0,6,5,1,0.947683,0.222749,1268,0.913187,0.281603,3040


In [7]:
df.shape

(2266586, 22)

In [8]:
df[:3]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,month,...,dayname,big_category,problem_num,mid_category,test_mean,test_std,test_sum,tag_mean,tag_std,tag_sum
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,,0,,3,...,0,6,1,1,0.947683,0.222749,1268,0.955022,0.20741,637
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,1.0,1,1.0,3,...,0,6,2,1,0.947683,0.222749,1268,0.913187,0.281603,3040
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,2.0,2,1.0,3,...,0,6,3,1,0.947683,0.222749,1268,0.913187,0.281603,3040


## AutoViz

자동 시각화

https://github.com/AutoViML/AutoViz

In [9]:
# !pip install -q autoviz

# %matplotlib inline

# from autoviz.AutoViz_Class import AutoViz_Class
# plt.style.use("classic")

# AV = AutoViz_Class()

# AV.AutoViz(
#     filename='',
#     dfte=context_train,
#     depVar='answerCode',
#     verbose=2,
#     max_rows_analyzed=context_train.shape[0],
#     max_cols_analyzed=context_train.shape[1],
# )

# context_train.shape

## LazyPredict
AutoML with scikit-learn

https://lazypredict.readthedocs.io/en/latest/

자동으로 베스트 모델 찾아주는 패키지

In [10]:
# !pip install scikit-learn==0.23.1
# !pip install scipy==1.7.0
# !pip install lazypredict -q

### 데이터 준비

In [11]:
# Split by user so that no user appears in both sets.
# The split shuffles users with the global `random` state, so re-seed right here:
# otherwise re-running this cell on its own produces a different split each time.
random.seed(42)
train, test = custom_train_test_split(df)

# Separate the target column from the features.
y_train = train['answerCode']
train = train.drop(['answerCode'], axis=1)

y_test = test['answerCode']
test = test.drop(['answerCode'], axis=1)

In [12]:
train.shape, y_train.shape, test.shape, y_test.shape

((1586157, 21), (1586157,), (2007, 21), (2007,))

In [13]:
train[:3]

Unnamed: 0,userID,assessmentItemID,testId,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,month,day,...,dayname,big_category,problem_num,mid_category,test_mean,test_std,test_sum,tag_mean,tag_std,tag_sum
745,1,A040013001,A040000013,2020-01-06 08:40:43,2048,,0,,1,6,...,2,4,1,13,0.595472,0.490986,789,0.6169,0.486297,971
746,1,A040013002,A040000013,2020-01-06 08:43:46,2048,1.0,1,1.0,1,6,...,2,4,2,13,0.595472,0.490986,789,0.6169,0.486297,971
747,1,A040013003,A040000013,2020-01-06 08:44:29,2047,2.0,2,1.0,1,6,...,2,4,3,13,0.595472,0.490986,789,0.570962,0.495073,1050


### 모델 성능 자동비교

In [14]:
import lightgbm as lgb
import catboost as cb
import lazypredict
from lazypredict.Supervised import LazyRegressor, LazyClassifier

# Names of the built-in lazypredict classifiers to keep in the comparison.
use_model=[
    'AdaBoostClassifier',
    'CategoricalNB',
    'RandomForestClassifier',
    'RidgeClassifierCV',
    'XGBClassifier',
    'LGBMClassifier']
# Extra (name, estimator class) pairs appended to the built-in ones.
add_model=[
    ('CatBoostClassifier',cb.CatBoostClassifier),
]

# Keep only the whitelisted built-in (name, class) pairs, then append the extras.
all_model = [i for i in lazypredict.Supervised.CLASSIFIERS if i[0] in use_model] + add_model

In [15]:
lazypredict.Supervised.CLASSIFIERS = all_model

In [16]:
lazypredict.Supervised.CLASSIFIERS

[('AdaBoostClassifier', sklearn.ensemble._weight_boosting.AdaBoostClassifier),
 ('CategoricalNB', sklearn.naive_bayes.CategoricalNB),
 ('RandomForestClassifier', sklearn.ensemble._forest.RandomForestClassifier),
 ('RidgeClassifierCV', sklearn.linear_model._ridge.RidgeClassifierCV),
 ('XGBClassifier', xgboost.sklearn.XGBClassifier),
 ('LGBMClassifier', lightgbm.sklearn.LGBMClassifier),
 ('CatBoostClassifier', catboost.core.CatBoostClassifier)]

In [17]:
# Feature sets fed to the models.
# base_feats: the smaller set used for the "LGBM Baseline" comparison run.
base_feats = ['KnowledgeTag', 'user_correct_answer', 'user_total_answer', 
         'user_acc', 'test_mean', 'test_sum', 'tag_mean','tag_sum']

# FEATS: base_feats plus the category, problem-number, date/time and std features
# created by feature_engineering.
FEATS = ['KnowledgeTag', 'user_correct_answer', 'user_total_answer',
         'big_category',
         'mid_category',
         'problem_num',
         'month','day','dayname','hour',
         'user_acc',
         'test_mean',
         'test_sum',
         'test_std', 'tag_std',
         'tag_mean',
         'tag_sum',
        ]

#### LGBM Baseline의 FE

In [18]:
# Fit the registered classifiers on the baseline feature subset and display the
# resulting score table.
# NOTE(review): the recorded table lists 6 models although 7 are registered
# (CategoricalNB is missing) -- presumably it failed and was skipped; verify.
clf = LazyClassifier(verbose=0, predictions=True)
models, predictions = clf.fit(train[base_feats], test[base_feats], y_train, y_test)
models

 86%|███████████████████████████████████████▍      | 6/7 [08:02<00:56, 56.19s/it]

Learning rate set to 0.239597
0:	learn: 0.6395435	total: 154ms	remaining: 2m 33s
1:	learn: 0.6094453	total: 241ms	remaining: 2m
2:	learn: 0.5916891	total: 327ms	remaining: 1m 48s
3:	learn: 0.5819839	total: 403ms	remaining: 1m 40s
4:	learn: 0.5759493	total: 471ms	remaining: 1m 33s
5:	learn: 0.5725623	total: 533ms	remaining: 1m 28s
6:	learn: 0.5701988	total: 595ms	remaining: 1m 24s
7:	learn: 0.5686476	total: 654ms	remaining: 1m 21s
8:	learn: 0.5676746	total: 722ms	remaining: 1m 19s
9:	learn: 0.5669838	total: 791ms	remaining: 1m 18s
10:	learn: 0.5664228	total: 851ms	remaining: 1m 16s
11:	learn: 0.5659762	total: 922ms	remaining: 1m 15s
12:	learn: 0.5656077	total: 983ms	remaining: 1m 14s
13:	learn: 0.5653542	total: 1.09s	remaining: 1m 17s
14:	learn: 0.5651386	total: 1.19s	remaining: 1m 18s
15:	learn: 0.5649822	total: 1.29s	remaining: 1m 19s
16:	learn: 0.5648013	total: 1.35s	remaining: 1m 18s
17:	learn: 0.5646034	total: 1.43s	remaining: 1m 17s
18:	learn: 0.5644643	total: 1.49s	remaining: 1m 

161:	learn: 0.5581782	total: 11.2s	remaining: 58s
162:	learn: 0.5581584	total: 11.3s	remaining: 57.9s
163:	learn: 0.5581355	total: 11.4s	remaining: 58s
164:	learn: 0.5581092	total: 11.5s	remaining: 58s
165:	learn: 0.5580812	total: 11.5s	remaining: 57.9s
166:	learn: 0.5580672	total: 11.6s	remaining: 57.8s
167:	learn: 0.5580461	total: 11.7s	remaining: 57.7s
168:	learn: 0.5580209	total: 11.7s	remaining: 57.6s
169:	learn: 0.5579946	total: 11.8s	remaining: 57.6s
170:	learn: 0.5579689	total: 11.8s	remaining: 57.4s
171:	learn: 0.5579462	total: 11.9s	remaining: 57.4s
172:	learn: 0.5579254	total: 12s	remaining: 57.3s
173:	learn: 0.5579060	total: 12s	remaining: 57.2s
174:	learn: 0.5578848	total: 12.1s	remaining: 57.1s
175:	learn: 0.5578492	total: 12.2s	remaining: 57s
176:	learn: 0.5578370	total: 12.2s	remaining: 56.9s
177:	learn: 0.5578065	total: 12.3s	remaining: 56.8s
178:	learn: 0.5577837	total: 12.4s	remaining: 56.8s
179:	learn: 0.5577658	total: 12.5s	remaining: 56.8s
180:	learn: 0.5577406	to

322:	learn: 0.5550144	total: 22.5s	remaining: 47.2s
323:	learn: 0.5549833	total: 22.6s	remaining: 47.1s
324:	learn: 0.5549717	total: 22.7s	remaining: 47.1s
325:	learn: 0.5549524	total: 22.7s	remaining: 47s
326:	learn: 0.5549360	total: 22.8s	remaining: 46.9s
327:	learn: 0.5549210	total: 22.9s	remaining: 46.8s
328:	learn: 0.5549083	total: 22.9s	remaining: 46.8s
329:	learn: 0.5548942	total: 23s	remaining: 46.7s
330:	learn: 0.5548761	total: 23.1s	remaining: 46.6s
331:	learn: 0.5548598	total: 23.1s	remaining: 46.5s
332:	learn: 0.5548340	total: 23.2s	remaining: 46.5s
333:	learn: 0.5548183	total: 23.2s	remaining: 46.4s
334:	learn: 0.5548056	total: 23.3s	remaining: 46.3s
335:	learn: 0.5547943	total: 23.4s	remaining: 46.2s
336:	learn: 0.5547792	total: 23.4s	remaining: 46.1s
337:	learn: 0.5547615	total: 23.5s	remaining: 46.1s
338:	learn: 0.5547481	total: 23.6s	remaining: 46s
339:	learn: 0.5547321	total: 23.7s	remaining: 46s
340:	learn: 0.5547157	total: 23.8s	remaining: 46s
341:	learn: 0.5547018	

481:	learn: 0.5526132	total: 33.7s	remaining: 36.2s
482:	learn: 0.5526009	total: 33.8s	remaining: 36.2s
483:	learn: 0.5525855	total: 33.9s	remaining: 36.1s
484:	learn: 0.5525756	total: 33.9s	remaining: 36s
485:	learn: 0.5525647	total: 34s	remaining: 35.9s
486:	learn: 0.5525445	total: 34s	remaining: 35.9s
487:	learn: 0.5525290	total: 34.1s	remaining: 35.8s
488:	learn: 0.5525122	total: 34.2s	remaining: 35.7s
489:	learn: 0.5524929	total: 34.3s	remaining: 35.7s
490:	learn: 0.5524804	total: 34.4s	remaining: 35.6s
491:	learn: 0.5524682	total: 34.5s	remaining: 35.6s
492:	learn: 0.5524599	total: 34.5s	remaining: 35.5s
493:	learn: 0.5524487	total: 34.6s	remaining: 35.4s
494:	learn: 0.5524410	total: 34.7s	remaining: 35.4s
495:	learn: 0.5524252	total: 34.8s	remaining: 35.3s
496:	learn: 0.5524184	total: 34.8s	remaining: 35.2s
497:	learn: 0.5523980	total: 34.9s	remaining: 35.2s
498:	learn: 0.5523859	total: 34.9s	remaining: 35.1s
499:	learn: 0.5523706	total: 35s	remaining: 35s
500:	learn: 0.5523566	

642:	learn: 0.5505578	total: 45s	remaining: 25s
643:	learn: 0.5505409	total: 45.1s	remaining: 24.9s
644:	learn: 0.5505221	total: 45.1s	remaining: 24.8s
645:	learn: 0.5505089	total: 45.2s	remaining: 24.8s
646:	learn: 0.5504974	total: 45.3s	remaining: 24.7s
647:	learn: 0.5504887	total: 45.3s	remaining: 24.6s
648:	learn: 0.5504782	total: 45.4s	remaining: 24.6s
649:	learn: 0.5504653	total: 45.5s	remaining: 24.5s
650:	learn: 0.5504564	total: 45.6s	remaining: 24.4s
651:	learn: 0.5504348	total: 45.6s	remaining: 24.4s
652:	learn: 0.5504174	total: 45.7s	remaining: 24.3s
653:	learn: 0.5504076	total: 45.8s	remaining: 24.2s
654:	learn: 0.5503938	total: 45.9s	remaining: 24.2s
655:	learn: 0.5503863	total: 46s	remaining: 24.1s
656:	learn: 0.5503765	total: 46s	remaining: 24s
657:	learn: 0.5503588	total: 46.1s	remaining: 24s
658:	learn: 0.5503522	total: 46.2s	remaining: 23.9s
659:	learn: 0.5503433	total: 46.3s	remaining: 23.8s
660:	learn: 0.5503365	total: 46.3s	remaining: 23.8s
661:	learn: 0.5503251	to

802:	learn: 0.5486348	total: 56.7s	remaining: 13.9s
803:	learn: 0.5486267	total: 56.8s	remaining: 13.8s
804:	learn: 0.5486124	total: 56.9s	remaining: 13.8s
805:	learn: 0.5486023	total: 56.9s	remaining: 13.7s
806:	learn: 0.5485896	total: 57s	remaining: 13.6s
807:	learn: 0.5485807	total: 57.1s	remaining: 13.6s
808:	learn: 0.5485694	total: 57.1s	remaining: 13.5s
809:	learn: 0.5485524	total: 57.2s	remaining: 13.4s
810:	learn: 0.5485432	total: 57.3s	remaining: 13.3s
811:	learn: 0.5485374	total: 57.3s	remaining: 13.3s
812:	learn: 0.5485285	total: 57.4s	remaining: 13.2s
813:	learn: 0.5485148	total: 57.5s	remaining: 13.1s
814:	learn: 0.5485072	total: 57.6s	remaining: 13.1s
815:	learn: 0.5484991	total: 57.7s	remaining: 13s
816:	learn: 0.5484889	total: 57.7s	remaining: 12.9s
817:	learn: 0.5484810	total: 57.8s	remaining: 12.9s
818:	learn: 0.5484673	total: 57.9s	remaining: 12.8s
819:	learn: 0.5484574	total: 57.9s	remaining: 12.7s
820:	learn: 0.5484446	total: 58s	remaining: 12.6s
821:	learn: 0.5484

962:	learn: 0.5469304	total: 1m 8s	remaining: 2.63s
963:	learn: 0.5469157	total: 1m 8s	remaining: 2.56s
964:	learn: 0.5469081	total: 1m 8s	remaining: 2.48s
965:	learn: 0.5468955	total: 1m 8s	remaining: 2.41s
966:	learn: 0.5468821	total: 1m 8s	remaining: 2.34s
967:	learn: 0.5468715	total: 1m 8s	remaining: 2.27s
968:	learn: 0.5468578	total: 1m 8s	remaining: 2.2s
969:	learn: 0.5468477	total: 1m 8s	remaining: 2.13s
970:	learn: 0.5468330	total: 1m 8s	remaining: 2.06s
971:	learn: 0.5468223	total: 1m 8s	remaining: 1.99s
972:	learn: 0.5468160	total: 1m 9s	remaining: 1.92s
973:	learn: 0.5468062	total: 1m 9s	remaining: 1.84s
974:	learn: 0.5467955	total: 1m 9s	remaining: 1.77s
975:	learn: 0.5467829	total: 1m 9s	remaining: 1.7s
976:	learn: 0.5467702	total: 1m 9s	remaining: 1.63s
977:	learn: 0.5467586	total: 1m 9s	remaining: 1.56s
978:	learn: 0.5467468	total: 1m 9s	remaining: 1.49s
979:	learn: 0.5467361	total: 1m 9s	remaining: 1.42s
980:	learn: 0.5467271	total: 1m 9s	remaining: 1.35s
981:	learn: 0.

100%|██████████████████████████████████████████████| 7/7 [09:15<00:00, 79.34s/it]

997:	learn: 0.5465412	total: 1m 10s	remaining: 142ms
998:	learn: 0.5465307	total: 1m 10s	remaining: 70.9ms
999:	learn: 0.5465202	total: 1m 10s	remaining: 0us





Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CatBoostClassifier,0.61,0.62,0.62,0.59,72.39
LGBMClassifier,0.61,0.61,0.61,0.59,3.03
XGBClassifier,0.61,0.61,0.61,0.59,29.29
AdaBoostClassifier,0.6,0.61,0.61,0.58,74.89
RidgeClassifierCV,0.59,0.6,0.6,0.56,2.16
RandomForestClassifier,0.59,0.59,0.59,0.58,372.86


#### 제출 AUC 0.7433의 FE

In [19]:
# Same comparison as the baseline run, but on the full FEATS list.
# NOTE(review): CategoricalNB is again absent from the recorded table --
# presumably it failed and was skipped; verify.
clf = LazyClassifier(verbose=0, predictions=True)
models, predictions = clf.fit(train[FEATS], test[FEATS], y_train, y_test)
models

 86%|███████████████████████████████████████▍      | 6/7 [11:42<01:21, 81.39s/it]

Learning rate set to 0.239597
0:	learn: 0.6368037	total: 80.9ms	remaining: 1m 20s
1:	learn: 0.6046606	total: 160ms	remaining: 1m 20s
2:	learn: 0.5866261	total: 240ms	remaining: 1m 19s
3:	learn: 0.5755460	total: 316ms	remaining: 1m 18s
4:	learn: 0.5690109	total: 398ms	remaining: 1m 19s
5:	learn: 0.5647226	total: 466ms	remaining: 1m 17s
6:	learn: 0.5620034	total: 544ms	remaining: 1m 17s
7:	learn: 0.5601621	total: 623ms	remaining: 1m 17s
8:	learn: 0.5586911	total: 701ms	remaining: 1m 17s
9:	learn: 0.5576189	total: 774ms	remaining: 1m 16s
10:	learn: 0.5567558	total: 848ms	remaining: 1m 16s
11:	learn: 0.5561462	total: 921ms	remaining: 1m 15s
12:	learn: 0.5556293	total: 993ms	remaining: 1m 15s
13:	learn: 0.5551990	total: 1.08s	remaining: 1m 15s
14:	learn: 0.5547959	total: 1.14s	remaining: 1m 15s
15:	learn: 0.5544936	total: 1.21s	remaining: 1m 14s
16:	learn: 0.5541970	total: 1.29s	remaining: 1m 14s
17:	learn: 0.5539670	total: 1.36s	remaining: 1m 14s
18:	learn: 0.5537727	total: 1.44s	remaining

160:	learn: 0.5425476	total: 12.2s	remaining: 1m 3s
161:	learn: 0.5425023	total: 12.3s	remaining: 1m 3s
162:	learn: 0.5424472	total: 12.4s	remaining: 1m 3s
163:	learn: 0.5424134	total: 12.5s	remaining: 1m 3s
164:	learn: 0.5423692	total: 12.6s	remaining: 1m 3s
165:	learn: 0.5423396	total: 12.6s	remaining: 1m 3s
166:	learn: 0.5422998	total: 12.7s	remaining: 1m 3s
167:	learn: 0.5422678	total: 12.8s	remaining: 1m 3s
168:	learn: 0.5422333	total: 12.9s	remaining: 1m 3s
169:	learn: 0.5422045	total: 12.9s	remaining: 1m 3s
170:	learn: 0.5421740	total: 13s	remaining: 1m 3s
171:	learn: 0.5421430	total: 13.1s	remaining: 1m 3s
172:	learn: 0.5421062	total: 13.2s	remaining: 1m 3s
173:	learn: 0.5420662	total: 13.3s	remaining: 1m 2s
174:	learn: 0.5420111	total: 13.3s	remaining: 1m 2s
175:	learn: 0.5419367	total: 13.4s	remaining: 1m 2s
176:	learn: 0.5419088	total: 13.5s	remaining: 1m 2s
177:	learn: 0.5418479	total: 13.6s	remaining: 1m 2s
178:	learn: 0.5417928	total: 13.7s	remaining: 1m 2s
179:	learn: 0.

322:	learn: 0.5363619	total: 24.8s	remaining: 52.1s
323:	learn: 0.5363252	total: 24.9s	remaining: 52s
324:	learn: 0.5362972	total: 25s	remaining: 51.9s
325:	learn: 0.5362661	total: 25.1s	remaining: 51.8s
326:	learn: 0.5362307	total: 25.1s	remaining: 51.7s
327:	learn: 0.5362105	total: 25.2s	remaining: 51.7s
328:	learn: 0.5361759	total: 25.3s	remaining: 51.6s
329:	learn: 0.5361486	total: 25.4s	remaining: 51.5s
330:	learn: 0.5361197	total: 25.5s	remaining: 51.5s
331:	learn: 0.5360655	total: 25.5s	remaining: 51.4s
332:	learn: 0.5360157	total: 25.6s	remaining: 51.3s
333:	learn: 0.5359961	total: 25.7s	remaining: 51.2s
334:	learn: 0.5359520	total: 25.8s	remaining: 51.2s
335:	learn: 0.5359221	total: 25.9s	remaining: 51.1s
336:	learn: 0.5358942	total: 25.9s	remaining: 51s
337:	learn: 0.5358683	total: 26s	remaining: 50.9s
338:	learn: 0.5358527	total: 26.1s	remaining: 50.8s
339:	learn: 0.5358193	total: 26.1s	remaining: 50.8s
340:	learn: 0.5357622	total: 26.2s	remaining: 50.7s
341:	learn: 0.535731

481:	learn: 0.5317337	total: 37.2s	remaining: 40s
482:	learn: 0.5317039	total: 37.3s	remaining: 39.9s
483:	learn: 0.5316786	total: 37.3s	remaining: 39.8s
484:	learn: 0.5316571	total: 37.4s	remaining: 39.7s
485:	learn: 0.5316347	total: 37.5s	remaining: 39.6s
486:	learn: 0.5316012	total: 37.6s	remaining: 39.6s
487:	learn: 0.5315794	total: 37.6s	remaining: 39.5s
488:	learn: 0.5315519	total: 37.7s	remaining: 39.4s
489:	learn: 0.5314816	total: 37.8s	remaining: 39.4s
490:	learn: 0.5314497	total: 37.9s	remaining: 39.3s
491:	learn: 0.5314308	total: 38s	remaining: 39.2s
492:	learn: 0.5314128	total: 38.1s	remaining: 39.1s
493:	learn: 0.5313568	total: 38.1s	remaining: 39.1s
494:	learn: 0.5313306	total: 38.2s	remaining: 39s
495:	learn: 0.5313089	total: 38.3s	remaining: 38.9s
496:	learn: 0.5312913	total: 38.4s	remaining: 38.8s
497:	learn: 0.5312682	total: 38.4s	remaining: 38.8s
498:	learn: 0.5312453	total: 38.5s	remaining: 38.7s
499:	learn: 0.5311955	total: 38.6s	remaining: 38.6s
500:	learn: 0.5311

640:	learn: 0.5275189	total: 49.6s	remaining: 27.8s
641:	learn: 0.5275000	total: 49.6s	remaining: 27.7s
642:	learn: 0.5274712	total: 49.7s	remaining: 27.6s
643:	learn: 0.5274570	total: 49.8s	remaining: 27.5s
644:	learn: 0.5274409	total: 49.9s	remaining: 27.4s
645:	learn: 0.5274072	total: 49.9s	remaining: 27.4s
646:	learn: 0.5273920	total: 50s	remaining: 27.3s
647:	learn: 0.5273638	total: 50.1s	remaining: 27.2s
648:	learn: 0.5272910	total: 50.2s	remaining: 27.2s
649:	learn: 0.5272598	total: 50.3s	remaining: 27.1s
650:	learn: 0.5272458	total: 50.4s	remaining: 27s
651:	learn: 0.5272210	total: 50.4s	remaining: 26.9s
652:	learn: 0.5271992	total: 50.5s	remaining: 26.9s
653:	learn: 0.5271611	total: 50.6s	remaining: 26.8s
654:	learn: 0.5271420	total: 50.7s	remaining: 26.7s
655:	learn: 0.5270892	total: 50.8s	remaining: 26.6s
656:	learn: 0.5270729	total: 50.8s	remaining: 26.5s
657:	learn: 0.5270481	total: 50.9s	remaining: 26.5s
658:	learn: 0.5270294	total: 51s	remaining: 26.4s
659:	learn: 0.5269

802:	learn: 0.5238243	total: 1m 2s	remaining: 15.2s
803:	learn: 0.5238062	total: 1m 2s	remaining: 15.2s
804:	learn: 0.5237852	total: 1m 2s	remaining: 15.1s
805:	learn: 0.5237495	total: 1m 2s	remaining: 15s
806:	learn: 0.5237357	total: 1m 2s	remaining: 14.9s
807:	learn: 0.5237227	total: 1m 2s	remaining: 14.9s
808:	learn: 0.5236913	total: 1m 2s	remaining: 14.8s
809:	learn: 0.5236717	total: 1m 2s	remaining: 14.7s
810:	learn: 0.5236541	total: 1m 2s	remaining: 14.6s
811:	learn: 0.5236476	total: 1m 2s	remaining: 14.5s
812:	learn: 0.5236169	total: 1m 2s	remaining: 14.5s
813:	learn: 0.5236028	total: 1m 2s	remaining: 14.4s
814:	learn: 0.5235793	total: 1m 3s	remaining: 14.3s
815:	learn: 0.5235620	total: 1m 3s	remaining: 14.2s
816:	learn: 0.5235439	total: 1m 3s	remaining: 14.2s
817:	learn: 0.5235186	total: 1m 3s	remaining: 14.1s
818:	learn: 0.5234908	total: 1m 3s	remaining: 14s
819:	learn: 0.5234703	total: 1m 3s	remaining: 13.9s
820:	learn: 0.5234560	total: 1m 3s	remaining: 13.8s
821:	learn: 0.52

961:	learn: 0.5206204	total: 1m 14s	remaining: 2.95s
962:	learn: 0.5206065	total: 1m 14s	remaining: 2.87s
963:	learn: 0.5205923	total: 1m 14s	remaining: 2.79s
964:	learn: 0.5205734	total: 1m 14s	remaining: 2.71s
965:	learn: 0.5205537	total: 1m 14s	remaining: 2.64s
966:	learn: 0.5205409	total: 1m 14s	remaining: 2.56s
967:	learn: 0.5205219	total: 1m 15s	remaining: 2.48s
968:	learn: 0.5205076	total: 1m 15s	remaining: 2.4s
969:	learn: 0.5204921	total: 1m 15s	remaining: 2.33s
970:	learn: 0.5204698	total: 1m 15s	remaining: 2.25s
971:	learn: 0.5204553	total: 1m 15s	remaining: 2.17s
972:	learn: 0.5204259	total: 1m 15s	remaining: 2.09s
973:	learn: 0.5203871	total: 1m 15s	remaining: 2.02s
974:	learn: 0.5203685	total: 1m 15s	remaining: 1.94s
975:	learn: 0.5203447	total: 1m 15s	remaining: 1.86s
976:	learn: 0.5203239	total: 1m 15s	remaining: 1.78s
977:	learn: 0.5202927	total: 1m 15s	remaining: 1.71s
978:	learn: 0.5202518	total: 1m 15s	remaining: 1.63s
979:	learn: 0.5202215	total: 1m 16s	remaining: 

100%|█████████████████████████████████████████████| 7/7 [13:02<00:00, 111.75s/it]

997:	learn: 0.5198518	total: 1m 17s	remaining: 155ms
998:	learn: 0.5198384	total: 1m 17s	remaining: 77.6ms
999:	learn: 0.5198189	total: 1m 17s	remaining: 0us





Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
RandomForestClassifier,0.66,0.66,0.66,0.66,530.59
XGBClassifier,0.64,0.64,0.64,0.64,43.18
CatBoostClassifier,0.64,0.64,0.64,0.64,79.85
LGBMClassifier,0.64,0.64,0.64,0.64,4.79
RidgeClassifierCV,0.63,0.63,0.63,0.62,4.46
AdaBoostClassifier,0.63,0.63,0.63,0.62,117.68


## 하이퍼 파라미터 최적화

### 베이지안 최적화

#### HyperOpt를 통한 최적화 예시

<img src=https://blog.kakaocdn.net/dn/mjYYx/btrCebrvgeD/7wRIsVYkCpFMcxHX4TgxB1/img.png width=1200>

**구성 요소 3가지**
1. Search Space : 입력값 범위. x와 y의 입력값 범위를 준다.
2. 목적 함수 : 블랙박스 함수. 내부가 어떻게 돌아가는지 모르고 입력값에 대한 반환값을 알 수 있는 함수
3. 목적 함수 반환 최소값 유추 예시 -> **fmin**함수가 핵심
-> objective_func함수에 search_space 범위의 입력값을 넣었을 때, max_evals번 반복하며 베이지안 최적화로 최소값 유추하는 예시.

    여기선 5번 반복하며 최소값 64.080 찾음

<img src=https://blog.kakaocdn.net/dn/bykjOu/btrCiQOCfIV/flkFRV5K2U8XFNLSn6aGO1/img.png width=1200>

In [6]:
data['X_train'].shape
data['y_train'].shape

(245436, 11)

(245436,)

In [7]:
# Work on the first 100 rows only.
# NOTE(review): `data` is not defined in the visible part of this notebook, so
# this cell depends on state created elsewhere -- it will fail on a fresh kernel
# unless `data` is loaded first.  It also rebinds `y_train` from the earlier
# LazyPredict section.
X_train = data['X_train'].iloc[:100,:]
y_train = data['y_train'][:100]

X_train.shape, y_train.shape

((100, 11), (100,))

In [8]:
from hyperopt import hp
# max_depth: 5 to 20 in steps of 1; min_child_weight: 1 to 2 in steps of 1.
# colsample_bytree: random value in [0.5, 1]; learning_rate: random value in [0.01, 0.2].
# hp.uniform(label, low, high): draws a float uniformly (not normally) distributed between low and high.
# hp.quniform(label, low, high, q): draws round(uniform(low, high) / q) * q, i.e. a value on a grid
# with step q; the result is still a float, so cast to int where an integer is required.
xgb_search_space = {
    'max_depth' : hp.quniform('max_depth', 5, 20, 1),
    'min_child_weight': hp.quniform('min_child_weight', 1, 2, 1), # minimum sum of instance weight required in a child node
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1) # fraction of features sampled for each tree
                    }

In [30]:
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
from xgboost import XGBClassifier, XGBRegressor
from hyperopt import STATUS_OK, fmin, tpe, Trials

### XGBRegressor을 이용한 최적화 실행

In [15]:
def objective_func(search_space):
    """Objective for hyperopt: mean 5-fold RMSE of an XGBRegressor.

    Args:
        search_space: dict of sampled hyperparameters with keys ``max_depth``,
            ``min_child_weight``, ``learning_rate`` and ``colsample_bytree``.

    Returns:
        dict with ``loss`` (RMSE averaged over the folds) and ``status``
        (``STATUS_OK``), the format ``fmin`` expects.

    Uses the notebook-level ``X_train`` / ``y_train`` as training data.
    """
    # The sampled values arrive as floats; cast the integer-valued ones explicitly.
    model_params = dict(
        n_estimators=10,
        max_depth=int(search_space['max_depth']),
        min_child_weight=int(search_space['min_child_weight']),
        learning_rate=search_space['learning_rate'],
        colsample_bytree=search_space['colsample_bytree'],
        eval_metric='logloss',
    )
    regressor = XGBRegressor(**model_params)

    # One negated MSE per fold (cv=5 -> five values).
    fold_neg_mse = cross_val_score(
        regressor, X_train, y_train,
        scoring='neg_mean_squared_error', cv=5, verbose=0,
    )
    fold_rmse = np.sqrt(-1 * fold_neg_mse)
    return {'loss': np.mean(fold_rmse), 'status': STATUS_OK}

# Container passed to fmin via `trials=`; its `.trials` list is inspected below.
trial_val = Trials()

# Minimise objective_func over xgb_search_space with the TPE algorithm for 50 evaluations.
# NOTE(review): `rstate` is commented out, so the search is presumably not
# reproducible from run to run -- confirm and re-enable a seed if needed.
best = fmin(fn=objective_func, space=xgb_search_space, algo=tpe.suggest,
            max_evals=50, trials=trial_val#, rstate=np.random.default_rng(seed=9)
           )
print(f"best: {best}")

100%|██████████| 50/50 [04:43<00:00,  5.68s/trial, best loss: 2.7560139291231875]
best: {'colsample_bytree': 0.5427759431970955, 'learning_rate': 0.18105193338813505, 'max_depth': 5.0, 'min_child_weight': 1.0}


In [33]:
## 50번 시도한 하이퍼파라미터와 성능 기록
len(trial_val.trials)

50

In [16]:
# best일때의 하이퍼파라미터 출력
print(f"colsample_bytree: {best['colsample_bytree']:.4f}, learning_rate: {best['learning_rate']:.4f}, \
max_depth: {int(best['max_depth'])}, min_child_weight: {int(best['min_child_weight'])}")

colsample_bytree: 0.5428, learning_rate: 0.1811, max_depth: 5, min_child_weight: 1


## KFOLD 예시

In [46]:
data.keys()

dict_keys(['train', 'test', 'field_dims', 'users', 'books', 'sub', 'idx2user', 'idx2isbn', 'user2idx', 'isbn2idx', 'X_train', 'X_valid', 'y_train', 'y_valid'])

In [61]:
## The goal is only to exercise the code, so take a small sample: 100 training rows, 10 validation rows.
X_train_df = data['X_train'].iloc[:100,:]
y_train_df = data['y_train'][:100]
X_test = data['X_valid'].iloc[:10,:]
y_test = data['y_valid'][:10]
# Report the shapes of the objects created in this cell (not the X_train / y_train
# globals left over from the hyperparameter-search section).
X_train_df.shape, y_train_df.shape, X_test.shape, y_test.shape

((100, 11), (100,), (10, 11), (10,))

In [56]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Create the scikit-learn classifiers: decision tree, random forest, logistic regression.
dt_clf = DecisionTreeClassifier(random_state=11)
rf_clf = RandomForestClassifier(random_state=11)
lr_clf = LogisticRegression(solver='liblinear')

# All three models are fit on the sample prepared for this section
# (X_train_df / y_train_df) instead of the X_train / y_train globals that
# happen to be left over from the hyperparameter-search section.

# DecisionTreeClassifier: fit / predict / evaluate
dt_clf.fit(X_train_df, y_train_df)
dt_pred = dt_clf.predict(X_test)
print(f"DecisionTreeClassifier 정확도 : {accuracy_score(y_test, dt_pred):.2%}")

# RandomForestClassifier: fit / predict / evaluate (label typo "정화도" corrected to "정확도")
rf_clf.fit(X_train_df, y_train_df)
rf_pred = rf_clf.predict(X_test)
print(f"RandomForestClassifier 정확도 : {accuracy_score(y_test, rf_pred):.2%}")

# LogisticRegression: fit / predict / evaluate
lr_clf.fit(X_train_df, y_train_df)
lr_pred = lr_clf.predict(X_test)
print(f"LogisticRegression  정확도  : {accuracy_score(y_test, lr_pred):.2%}")

DecisionTreeClassifier(random_state=11)

DecisionTreeClassifier 정확도 : 0.00%


RandomForestClassifier(random_state=11)

RandomForestClassifier 정화도 : 10.00%


LogisticRegression(solver='liblinear')

LogisticRegression  정확도  : 10.00%


### KFold 예시

In [62]:
def exec_kfold(clf, folds=5):
    """Run plain K-fold cross-validation for ``clf`` and print per-fold accuracy.

    Reads the notebook-level ``X_train_df`` / ``y_train_df``; the estimator is
    re-fit in place on every fold.

    Args:
        clf: estimator exposing ``fit`` and ``predict``.
        folds: number of folds passed to ``KFold``.
    """
    splitter = KFold(n_splits=folds)
    fold_accuracies = []  # one accuracy value per fold
    print(f">>> {clf} 모델")

    fold_no = 0
    for fit_rows, eval_rows in splitter.split(X_train_df):
        fold_no += 1
        # Slice the underlying arrays into this fold's fit / evaluation parts.
        fold_X_fit = X_train_df.values[fit_rows]
        fold_X_eval = X_train_df.values[eval_rows]
        fold_y_fit = y_train_df.values[fit_rows]
        fold_y_eval = y_train_df.values[eval_rows]

        # Fit, predict and score this fold.
        clf.fit(fold_X_fit, fold_y_fit)
        fold_acc = accuracy_score(fold_y_eval, clf.predict(fold_X_eval))
        fold_accuracies.append(fold_acc)
        print(f"  교차 검증{fold_no} 정확도: {fold_acc:.2%}")

    print(f"  평균 검증 정확도: {np.mean(fold_accuracies):.2%}\n")
# DecisionTreeClassifier -> exec_kfold 호출
exec_kfold(dt_clf, folds=5)
exec_kfold(rf_clf, folds=5)
exec_kfold(lr_clf, folds=5)

>>> DecisionTreeClassifier(random_state=11) 모델
  교차 검증1 정확도: 5.00%
  교차 검증2 정확도: 10.00%
  교차 검증3 정확도: 25.00%
  교차 검증4 정확도: 20.00%
  교차 검증5 정확도: 20.00%
  평균 검증 정확도: 16.00%

>>> RandomForestClassifier(random_state=11) 모델
  교차 검증1 정확도: 10.00%
  교차 검증2 정확도: 20.00%
  교차 검증3 정확도: 20.00%
  교차 검증4 정확도: 20.00%
  교차 검증5 정확도: 20.00%
  평균 검증 정확도: 18.00%

>>> LogisticRegression(solver='liblinear') 모델
  교차 검증1 정확도: 5.00%
  교차 검증2 정확도: 20.00%
  교차 검증3 정확도: 30.00%
  교차 검증4 정확도: 30.00%
  교차 검증5 정확도: 20.00%
  평균 검증 정확도: 21.00%



### StratifiedKfold 예시

In [64]:
def exec_kfold(clf, folds=5):
    """Run stratified K-fold cross-validation for ``clf`` and print per-fold accuracy.

    Same name as the plain-KFold helper defined earlier in this notebook, which
    this definition replaces once the cell has run.  Folds are built with
    ``StratifiedKFold`` from the notebook-level ``X_train_df`` / ``y_train_df``.

    Args:
        clf: estimator exposing ``fit`` and ``predict``; re-fit in place per fold.
        folds: number of folds passed to ``StratifiedKFold``.
    """
    stratified_splitter = StratifiedKFold(n_splits=folds)
    accuracies = []  # one accuracy value per fold
    print(f">>> {clf} 모델")

    fold_indices = stratified_splitter.split(X_train_df, y_train_df)
    for fold_no, (fit_idx, val_idx) in enumerate(fold_indices, start=1):
        # Fit on this fold's training part ...
        clf.fit(X_train_df.values[fit_idx], y_train_df.values[fit_idx])
        # ... and score on the part that was held out.
        val_pred = clf.predict(X_train_df.values[val_idx])
        fold_acc = accuracy_score(y_train_df.values[val_idx], val_pred)
        accuracies.append(fold_acc)
        print(f"  교차 검증{fold_no} 정확도: {fold_acc:.2%}")

    print(f"  평균 검증 정확도: {np.mean(accuracies):.2%}\n")
# DecisionTreeClassifier -> exec_kfold 호출
exec_kfold(dt_clf, folds=5)
exec_kfold(rf_clf, folds=5)
exec_kfold(lr_clf, folds=5)

>>> DecisionTreeClassifier(random_state=11) 모델
  교차 검증1 정확도: 15.00%
  교차 검증2 정확도: 15.00%
  교차 검증3 정확도: 10.00%
  교차 검증4 정확도: 30.00%
  교차 검증5 정확도: 20.00%
  평균 검증 정확도: 18.00%

>>> RandomForestClassifier(random_state=11) 모델
  교차 검증1 정확도: 15.00%
  교차 검증2 정확도: 20.00%
  교차 검증3 정확도: 30.00%
  교차 검증4 정확도: 15.00%
  교차 검증5 정확도: 30.00%
  평균 검증 정확도: 22.00%

>>> LogisticRegression(solver='liblinear') 모델
  교차 검증1 정확도: 5.00%
  교차 검증2 정확도: 10.00%
  교차 검증3 정확도: 20.00%
  교차 검증4 정확도: 30.00%
  교차 검증5 정확도: 25.00%
  평균 검증 정확도: 18.00%

