<a href="https://colab.research.google.com/github/bob8dod/ML-studying/blob/main/2021ML/9.%20Ansemble(Majority%20Voting%2C%20Bagging%2C%20Boosting).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

In [None]:
bank_df = pd.read_csv('UniversalBank.csv')
bank_df.head(3)

Unnamed: 0,ID,Age,Experience,Income,ZIPCode,Family,CCAvg,Education,Mortgage,PersonalLoan,SecuritiesAccount,CDAccount,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0


In [None]:
x = bank_df.drop(['ID','ZIPCode','PersonalLoan'], axis=1)
y = bank_df['PersonalLoan']

# Majority Voting (with Cross Validation, GridSearchCV) -> voting='soft'

In [None]:
#학습에 사용되는 모델 개별 정의
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

logistic = LogisticRegression(solver='liblinear', penalty='l2', C=0.001, random_state=1)
knn = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')
tree = DecisionTreeClassifier(max_depth=None, criterion='entropy', random_state=1)

In [None]:
#해당 모델들을 이용하는 Majority Voting 정의
from sklearn.ensemble import VotingClassifier
voting_estimators = [('logistic',logistic),('tree',tree),('knn',knn)]
voting = VotingClassifier(estimators=voting_estimators, voting='soft')

In [None]:
#voting을 쓰는 것이 옳은지 판단. (교차검증이용) -> 각각에 대하여 성능확인
from sklearn.model_selection import cross_val_score
clf_labels = ['DecisionTreeClassifier','KNeighborsClassifier', 'LogisticRegression', 'Majority voting']
all_clf = [tree, knn, logistic, voting]
for clf, label in zip(all_clf, clf_labels):
    scores = cross_val_score(estimator=clf, X=x, y=y, cv=10, scoring='roc_auc')
    print('[%s] ROC_AUC : %.4f  (+/-%.4f)' %(label, scores.mean(), scores.std()))

[DecisionTreeClassifier] ROC_AUC : 0.9489  (+/-0.0194)
[KNeighborsClassifier] ROC_AUC : 0.8803  (+/-0.0247)
[LogisticRegression] ROC_AUC : 0.9301  (+/-0.0129)
[Majority voting] ROC_AUC : 0.9800  (+/-0.0097)


In [None]:
#voting이 best모델이므로 GridSearchCV를 통한 최적의 모델 및 파라미터 확인
from sklearn.model_selection import GridSearchCV
params = {'logistic__C':[0.001,0.1,100.0], #개별 모델에 대한 파라미터 설정
          'tree__max_depth':[1,3,5],
          'knn__n_neighbors':[1,3,5]}
gs_cv = GridSearchCV(estimator = voting, param_grid=params, cv=10, scoring='roc_auc')
gs_cv.fit(x, y)

# for r, _ in enumerate(gs_cv.cv_results_['mean_test_score']): #각 경우의 수에 대한 성능 확인
#     print("%0.3f +/- %0.3f %r"
#           % (gs_cv.cv_results_['mean_test_score'][r], 
#              gs_cv.cv_results_['std_test_score'][r] / 2.0, 
#              gs_cv.cv_results_['params'][r]))
    
print(gs_cv. best_params_) #결과의 최적의 파라미터
print(gs_cv.best_score_) #결과의 최적의 성능 점수

{'knn__n_neighbors': 5, 'logistic__C': 100.0, 'tree__max_depth': 5}
0.986910029498525


# Bagging -> n_estimator

In [None]:
#bagging에 사용될 Classifier 선언 - Tree
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(max_depth=None, criterion='entropy',random_state=1)

In [None]:
#해당 모델을 이용하는 Bagging 정의 (tree를 반복적으로 사용하는 것임)
from sklearn.ensemble import BaggingClassifier
bagging = BaggingClassifier(base_estimator=tree, n_estimators=100, max_samples=1.0,
                            bootstrap=True, bootstrap_features= False,n_jobs=1, random_state=1)

In [None]:
#ensemble을 사용할지 기존모델인 DecsionTree를 사용할지 비교 (평가)
from sklearn.model_selection import cross_val_score
clv_label = ['DecisionTree','Bagging']
all_clv = [tree, bagging]
for clv, label in zip(all_clv, clv_label):
    score = cross_val_score(estimator=clv, X=x, y=y, scoring='roc_auc',cv=10)
    print('[%s] ROC_AUC: %.4f' %(label, score.mean()))

[DecisionTree] ROC_AUC: 0.9489
[Bagging] ROC_AUC: 0.9972


In [None]:
bagging.fit(x_train,y_train)
bagging.score(x_test,y_test)

0.982

In [None]:
#Bagging이 더 성능이 좋기 때문에 Bagging에서 Tree의 파리마터를 자동으로 선택
from sklearn.model_selection import GridSearchCV
params = { 'base_estimator__max_depth':[None,1,3,5], 'base_estimator__criterion':['entropy', 'gini']} 
#bagging.get_params().keys()를 통해서 사용해야할 이름 (base_estimator__criterion,,,이런거  확인)
grid = GridSearchCV(estimator=bagging, param_grid=params,cv=10, scoring='roc_auc')
grid.fit(x, y)
print(grid.best_params_)
print(grid.best_score_)
best_model = grid.best_estimator_ # 가장 좋은 성능을 낸 파라미터를 이용하여 새로운 모델로 정의

{'base_estimator__criterion': 'gini', 'base_estimator__max_depth': 5}
0.9976470317109143


# AdaBoost -> n_estimators,learning_rate

In [None]:
#boost에 사용될 모델 정의 (max_depth=1)
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(max_depth=None, criterion='entropy',random_state=1) #max_depth=1 인게 voting이랑 다름

In [None]:
from sklearn.ensemble import AdaBoostClassifier
adaboost = AdaBoostClassifier(base_estimator=tree, n_estimators=100,
                              learning_rate = 0.1, random_state=1) 

In [None]:
#Ensemble사용과 Tree사용 비교
from sklearn.model_selection import cross_val_score
clf_labels = ['Decision Tree','AdaBoost']
clf_all = [tree, adaboost]
for clf, label in zip(clf_all, clf_labels):
    scores = cross_val_score(estimator=clf, X=x, y=y, cv=10, scoring='roc_auc')
    print('[%s] ROC_AUC : %.4f' %(label, scores.mean()))

[Decision Tree] ROC_AUC : 0.9489
[AdaBoost] ROC_AUC : 0.9479


In [None]:
adaboost.get_params().keys()

dict_keys(['algorithm', 'base_estimator__ccp_alpha', 'base_estimator__class_weight', 'base_estimator__criterion', 'base_estimator__max_depth', 'base_estimator__max_features', 'base_estimator__max_leaf_nodes', 'base_estimator__min_impurity_decrease', 'base_estimator__min_samples_leaf', 'base_estimator__min_samples_split', 'base_estimator__min_weight_fraction_leaf', 'base_estimator__random_state', 'base_estimator__splitter', 'base_estimator', 'learning_rate', 'n_estimators', 'random_state'])

In [None]:
#파리미터 자동 설정으로 성능 올리기
from sklearn.model_selection import GridSearchCV
params = {'base_estimator__criterion':['entropy', 'gini'], 'base_estimator__max_depth':[None,1,3,5]} 
grid = GridSearchCV(estimator=adaboost, param_grid=params, scoring='roc_auc', cv=10)
grid.fit(x,y)
print(grid.best_params_)
print(grid.best_score_)

{'base_estimator__criterion': 'entropy', 'base_estimator__max_depth': 3}
0.9968058628318582


# GBM(Gradient Bootsting) -> n_estimators, learning_rate

In [None]:
#Tree를 이용하거나 그러진않음
from sklearn.ensemble import GradientBoostingClassifier
gbm = GradientBoostingClassifier(n_estimators=500, learning_rate=0.5,random_state=1) 
#learning_rate가 영향을 많이 미침

In [None]:
gbm.get_params().keys()

dict_keys(['ccp_alpha', 'criterion', 'init', 'learning_rate', 'loss', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_iter_no_change', 'random_state', 'subsample', 'tol', 'validation_fraction', 'verbose', 'warm_start'])

In [None]:
#파리미터 자동 설정으로 성능 올리기
from sklearn.model_selection import GridSearchCV
params = {'learning_rate':[0.8,0.5,0.3], 'max_depth':[None,1,3,5]} 
grid = GridSearchCV(estimator=gbm, param_grid=params, scoring='roc_auc', cv=10)
grid.fit(x,y)
print(grid.best_params_)
print(grid.best_score_)

{'learning_rate': 0.3, 'max_depth': 5}
0.9977599557522124


In [None]:
best_model.score(x_test,y_test)

0.986

# Light GBM -> n_estimators, learning_rate

In [None]:
from lightgbm import LGBMClassifier
lgbm = LGBMClassifier(n_estimator=100, learning_rate=0.5, random_state=1)
lgbm.fit(x,y)

LGBMClassifier(learning_rate=0.5, n_estimator=100, random_state=1)

In [None]:
#파리미터 자동 설정으로 성능 올리기
from sklearn.model_selection import GridSearchCV
params = {'learning_rate':[0.8,0.5,0.3], 'max_depth':[None,1,3,5]} 
grid = GridSearchCV(estimator=lgbm, param_grid=params, scoring='roc_auc', cv=10)
grid.fit(x,y)
print(grid.best_params_)
print(grid.best_score_)

{'learning_rate': 0.3, 'max_depth': None}
0.9981102507374631
