# 부스팅 모형 (Boosting)

- Weak Learner: 동전 던지기보다 조금 더 잘 예측하는 모형
- Boosting: Weak Learner를 앙상블로 결합시켜 강한 예측 모형을 개발하는 방법론

![](https://upload.wikimedia.org/wikipedia/commons/b/b5/Ensemble_Boosting.svg)

## 환경설정

In [1]:
import pandas as pd
import numpy as np

from sklearn import preprocessing # 전처리

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score

from sklearn.ensemble import GradientBoostingRegressor

## 데이터셋

In [3]:
# Load the breast-cancer table from the project-relative data directory.
cancer_df = pd.read_csv('data/breast_cancer.csv')

# Target: select the column as a 1-D Series (single brackets). Selecting it
# as a one-column DataFrame (double brackets) makes LabelEncoder emit a
# DataConversionWarning ("A column-vector y was passed when a 1d array was
# expected") and forces a later ravel() workaround.
y = cancer_df['diagnosis']
# Features: every column from 'radius_mean' through 'fractal_dimension_worst'
# (label-based slice, both ends inclusive).
X = cancer_df.loc[:, 'radius_mean':'fractal_dimension_worst']

# Encode the diagnosis labels as integer codes; the result is a 1-D ndarray.
le = preprocessing.LabelEncoder()
y = le.fit_transform(y)

# 80/20 split, stratified on the encoded target so both parts keep the same
# class balance; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=1
)
# y_train / y_test are already 1-D here, so no np.ravel() is needed.

  y = column_or_1d(y, warn=True)


## 기계학습 - CV


In [14]:
# Gradient-boosted regression ensemble: 100 boosting stages, each tree limited
# to depth 1, with a fixed seed. It is fit on the label-encoded diagnosis
# codes, so predict() returns continuous scores rather than class labels.
gbt = GradientBoostingRegressor(
    random_state=777,
    max_depth=1,
    n_estimators=100,
)
# Both names refer to the same estimator object.
clf_xgb = gbt

clf_xgb.fit(X_train, y_train)

GradientBoostingRegressor(n_estimators=1000, random_state=777)

## 예측 성능

In [15]:
# Continuous regression scores for the held-out test split.
y_pred = clf_xgb.predict(X_test)
preds_1d = y_pred.flatten()  # make sure the scores are a flat 1-D array

# Binarize the scores: 1 when the score is strictly greater than 0.2, else 0.
pred_class = np.where(preds_1d > 0.2, 1, 0)

# The score is computed on y_test, so it is labeled as a test metric (the
# previous "Train" label was wrong).
# NOTE(review): average='micro' on a single-label problem coincides with plain
# accuracy per the scikit-learn docs; use average='binary' if the
# positive-class F1 is what should be reported.
print('Test F1: {:.3f}'.format(f1_score(y_test, pred_class, average = 'micro')))

Train F1: 0.930


# 번외
## cutoff 값 정하기

In [7]:
from sklearn.metrics import f1_score

def choose_cutoff(y_true, y_pred, metric=f1_score):
    """Grid-search the decision threshold that maximizes ``metric``.

    Thresholds 0.00, 0.01, ..., 0.99 are tried in ascending order. For each
    one, a score is mapped to class 1 when it is strictly greater than the
    threshold and to class 0 otherwise.

    Args:
        y_true: True labels; passed unchanged to ``metric``.
        y_pred: Continuous scores. Any array-like is accepted (list, tuple,
            pandas Series, ndarray); it is converted with ``np.asarray``.
        metric: Callable invoked as ``metric(y_true, y_pred_binary)``; a
            larger return value is treated as better.

    Returns:
        Tuple ``(best_cutoff, best_score)``. ``best_cutoff`` is an integer
        percentage in ``[0, 99]`` (divide by 100 to get the threshold).
        On ties the lowest threshold wins, and ``(0, 0)`` is returned when no
        threshold scores above 0.
    """
    # Coerce once so plain Python sequences support the element-wise
    # comparison below (a list would raise TypeError on `list > float`).
    scores = np.asarray(y_pred)

    best_cutoff = 0
    best_score = 0
    for cutoff in range(0, 100):
        y_pred_cutoff = (scores > cutoff / 100).astype(int)
        score = metric(y_true, y_pred_cutoff)
        # Strict '>' keeps the first (lowest) threshold among equal scores.
        if score > best_score:
            best_cutoff = cutoff
            best_score = score
    return best_cutoff, best_score

# Toy inputs for a quick manual check:
#   y_true = [0, 0, 1, 1]
#   y_pred = [0.1, 0.3, 0.7, 0.9]
# NOTE(review): the threshold is tuned on the test split here, so the
# reported score is optimistic; consider tuning on a validation split.
cutoff, score = choose_cutoff(y_true=y_test, y_pred=y_pred)
# choose_cutoff returns the cutoff as an integer percentage; rescale to [0, 1).
print(f"Best cutoff: {cutoff/100}", f"Best score: {score}", sep="\n")


Best cutoff: 0.2
Best score: 0.9534883720930233
