<a href="https://colab.research.google.com/github/aettikang/bigdata_analysis_basic/blob/main/Boosting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 필요한 파이썬 라이브러리 불러오기
import numpy as np
import pandas as pd

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (accuracy_score, log_loss, classification_report)
import xgboost

In [None]:
# Read the HR employee-attrition CSV directly from GitHub and preview the first rows.
attrition_url = 'https://raw.githubusercontent.com/aettikang/bigdata_analysis_basic/main/HR-Employee-Attrition.csv'
attrition = pd.read_csv(attrition_url)
attrition.head()

In [None]:
# Print a summary of the loaded frame (columns, non-null counts, dtypes)
# to see which columns are categorical and whether any values are missing.
attrition.info()

In [None]:
# One-hot encode the categorical predictors (pd.get_dummies is the preferred tool).
# The target column is set aside before encoding and re-attached untouched:
# encoding the whole frame would split a text-valued 'Attrition' into
# 'Attrition_*' dummy columns, which would leak the label into the features
# and make the later attrition_cat_dummies['Attrition'] lookup fail.
# When 'Attrition' is already numeric the resulting columns are the same as
# before; only the position of the target column changes (it is now last).
attrition_cat_dummies = (
    pd.get_dummies(attrition.drop(columns=['Attrition']))
    .assign(Attrition=attrition['Attrition'])
)
attrition_cat_dummies.head(3)

In [5]:
# Every column except the target is a predictor.
target_column = "Attrition"
feature_columns = attrition_cat_dummies.columns.difference([target_column]).tolist()

# Feature matrix and label vector (labels stored as a pandas categorical).
X = attrition_cat_dummies.loc[:, feature_columns]
y = attrition_cat_dummies[target_column].astype('category')

In [7]:
from sklearn.model_selection import train_test_split

# Keep 80% of the rows for training and hold out the rest as the test set;
# the fixed random_state makes the split reproducible.
split_options = {'train_size': 0.80, 'random_state': 0}
train_X, test_X, train_y, test_y = train_test_split(X, y, **split_options)

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for each GradientBoostingClassifier hyperparameter.
# Named gb_param_grid (not gb_params) so it cannot be confused with, or
# accidentally passed in place of, the single-valued gb_params dict that
# configures the final model further down the notebook.
gb_param_grid = {
    'n_estimators': [1000, 1500],
    'max_depth': [4, 6],
    'min_samples_leaf': [2, 4],
    'max_features': [0.7, 0.9],
    'learning_rate': [0.25, 0.3],
}

# Create the gradient boosting classifier and run a 3-fold grid search over
# every combination (2**5 = 32 candidates x 3 folds) using all available cores.
# NOTE: with 1000-1500 trees per fit this cell is slow to run.
gb_clf = GradientBoostingClassifier(random_state=0)
grid_cv = GridSearchCV(gb_clf, param_grid=gb_param_grid, cv=3, n_jobs=-1)
grid_cv.fit(train_X, train_y)

# Report the best parameter combination and its mean cross-validated score
# (the printed labels read "best hyperparameters" / "best prediction accuracy").
print('최적 하이퍼 파라미터: ', grid_cv.best_params_)
print('최고 예측 정확도: {:.4f}'.format(grid_cv.best_score_))

In [9]:
# Constructor arguments for the final gradient boosting model.
seed = 0
gb_params = dict(
    n_estimators=1000,
    max_features=0.9,
    learning_rate=0.3,
    max_depth=6,
    min_samples_leaf=4,
    random_state=seed,  # same seed everywhere for reproducible fits
    verbose=0,
)

In [10]:
# Train the classifier on the training split (fit returns the fitted estimator),
# then predict labels for the held-out test split.
gb = GradientBoostingClassifier(**gb_params).fit(train_X, train_y)
gb_predictions = gb.predict(test_X)

In [None]:
# Overall accuracy on the test split, followed by the per-class report.
gb_accuracy = accuracy_score(test_y, gb_predictions)
print("Accuracy score: {}".format(gb_accuracy))

gb_report = classification_report(test_y, gb_predictions)
print(gb_report)

In [None]:
# Pair each feature name with the fitted model's importance score,
# then keep the five highest-scoring features.
imp = gb.feature_importances_
importance_table = pd.DataFrame({'Feature': feature_columns, 'Importance': imp})
df = importance_table.sort_values(by='Importance', ascending=False).head(5)
df

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

sns.barplot(x='Importance', y='Feature', data=df);