<a href="https://colab.research.google.com/github/YunSeoHwan/DNN_Study/blob/main/optimization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Import**

In [12]:
!pip install bayesian-optimization

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bayesian-optimization
  Downloading bayesian_optimization-1.4.2-py3-none-any.whl (17 kB)
Collecting colorama>=0.4.6
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: colorama, bayesian-optimization
Successfully installed bayesian-optimization-1.4.2 colorama-0.4.6


In [52]:
import pandas as pd
import random
import os
import numpy as np
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

from bayes_opt import BayesianOptimization
from sklearn.metrics import make_scorer, roc_auc_score
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, mean_squared_error, mean_absolute_error, r2_score

In [4]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(37) # Seed 고정

# **Data Load**

In [30]:
train_df = pd.read_csv('/content/train.csv')
test_df = pd.read_csv('/content/test.csv')

In [38]:
train_x = train_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'])

# 정답 target
train_y = train_df['Y_Class']

test_x = test_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])

# **Data Pre-processing**

In [40]:
# 결측치 0으로
train_x = train_x.fillna(0)
test_x = test_x.fillna(0)

# split data
x_train, x_test, y_train, y_test = train_test_split(train_x, train_y, test_size = 0.25, random_state = 0)

In [42]:
# qualitative to quantitative
# 이름을 숫자로 변환
qual_col = ['LINE', 'PRODUCT_CODE']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(x_train[i])
    l = LabelEncoder()
    l = l.fit(test_x[i])

    x_train[i] = le.transform(x_train[i])
    test_x[i] = l.transform(test_x[i])

    for label in np.unique(x_test[i]):
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    x_test[i] = le.transform(x_test[i])

print('Done.')

Done.


Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,5,2,2.0,94.0,0.0,45.0,10.0,0.0,51.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4,2,2.0,93.0,0.0,45.0,11.0,0.0,45.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4,2,2.0,95.0,0.0,45.0,11.0,0.0,45.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# **Optimization**

In [58]:
# 탐색 대상 함수 (XGBClassifier)
def XGB_cv(max_depth,learning_rate, n_estimators, gamma
            ,min_child_weight, subsample
            ,colsample_bytree, silent=True, nthread=-1):

    # 모델 정의
    model = XGBClassifier(max_depth=int(max_depth),
                            learning_rate=learning_rate,
                            n_estimators=int(n_estimators),
                            gamma=gamma,
                            min_child_weight=min_child_weight,
                            subsample=subsample,
                            colsample_bytree=colsample_bytree, 
                            nthread=nthread
                            )
    # 모델 훈련
    model.fit(x_train, y_train)

    # 예측값 출력
    pred = model.predict(x_test)

    # 예측률
    result = accuracy_score(y_test, pred)

    # 오차 최적화로 사용할 metric 반환
    return result

# 실험해보고자하는 hyperparameter 집합
pbounds = {'max_depth': (3, 7),
            'learning_rate': (0.01, 0.1),
            'n_estimators': (80, 200),
            'gamma': (0, 100),
            'min_child_weight': (0, 3),
            'subsample': (0.5, 1),
            'colsample_bytree' :(0.2, 1)
            }

# Bayesian optimization 객체 생성
# f : 탐색 대상 함수, pbounds : hyperparameter 집합
# verbose = 2 항상 출력, verbose = 1 최댓값일 때 출력, verbose = 0 출력 안함
# random_state : Bayesian Optimization 상의 랜덤성이 존재하는 부분을 통제 
bo=BayesianOptimization(f=XGB_cv, pbounds=pbounds, verbose=2, random_state=37)    

# 메소드를 이용해 최대화 과정 수행
# init_points :  초기 Random Search 갯수
# n_iter : 반복 횟수 (몇개의 입력값-함숫값 점들을 확인할지! 많을 수록 정확한 값을 얻을 수 있다.)
# acq : Acquisition Function들 중 Expected Improvement(EI) 를 사용
# xi : exploration 강도 (기본값은 0.0)
bo.maximize(init_points=2, n_iter=10, acq='ei', xi=0.01)

# ‘iter’는 반복 회차, ‘target’은 목적 함수의 값, 나머지는 입력값을 나타냅니다. 
# 현재 회차 이전까지 조사된 함숫값들과 비교하여, 현재 회차에 최댓값이 얻어진 경우, 
# bayesian-optimization 라이브러리는 이를 자동으로 다른 색 글자로 표시하는 것을 확인할 수 있습니다

# 찾은 파라미터 값 확인
print(bo.max)

|   iter    |  target   | colsam... |   gamma   | learni... | max_depth | min_ch... | n_esti... | subsample |
-------------------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m0.6867   [0m | [0m0.9556   [0m | [0m46.41    [0m | [0m0.02735  [0m | [0m5.328    [0m | [0m1.86     [0m | [0m162.1    [0m | [0m0.5517   [0m |
| [0m2        [0m | [0m0.6867   [0m | [0m0.7964   [0m | [0m28.2     [0m | [0m0.07781  [0m | [0m6.171    [0m | [0m1.882    [0m | [0m133.2    [0m | [0m0.9817   [0m |
| [0m3        [0m | [0m0.6867   [0m | [0m0.2596   [0m | [0m99.11    [0m | [0m0.0369   [0m | [0m4.96     [0m | [0m1.736    [0m | [0m80.7     [0m | [0m0.5536   [0m |
| [0m4        [0m | [0m0.6867   [0m | [0m0.9507   [0m | [0m98.75    [0m | [0m0.01618  [0m | [0m3.336    [0m | [0m2.204    [0m | [0m199.3    [0m | [0m0.9189   [0m |
| [95m5        [0m | [95m0.78     [0m | [95m0.5

# **Training & Inference**

In [70]:
xgb = XGBClassifier(random_state=37, colsample_bytree=0.3, max_depth=6, 
                    learning_rate=0.04, gamma=0.9, subsample=0.6870832907371096, 
                    min_child_weight=0.8, n_estimators=90).fit(x_train, y_train)

print("훈련 세트 정확도: {:.3f}".format(xgb.score(x_train, y_train)))
print("테스트 세트 정확도: {:.3f}".format(xgb.score(x_test, y_test)))
preds = xgb.predict(test_x)

# preds = xgb.predict(test_x)
# t = GB.predict(train_x)
# t = xtree.predict(train_x)
print('Done.')

훈련 세트 정확도: 0.984
테스트 세트 정확도: 0.807
Done.


# **Submit**

In [71]:
submit = pd.read_csv('/content/sample_submission.csv')
submit['Y_Class'] = preds
submit.to_csv('/content/baseline_submission12.csv', index=False)