## 랜덤 포레스트 - 그리드 검색

In [9]:
# 랜덤 포레스트 분류기 - 그리드 검색
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd



In [10]:
# 데이터 가져오기
hrattr = pd.read_csv('./Data/HR_Employee_Attrition.csv')

In [12]:
# 데이터 컬럼 및 데이터 타입 확인
hrattr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

In [43]:
# 종속변수 : Attrition
# Attrition의 Yes, No -> 1, 0 으로 변환
hrattr['Attrition_ind'] = 0
hrattr.loc[hrattr['Attrition'] == 'Yes', 'Attrition_ind'] = 1

In [44]:
# 연속형변수와, 범주형변수를 구분하여 정의
discrete_columns = list()
continuous_columns = list()
dtypes = list(hrattr.dtypes)
for dtype in enumerate(dtypes):
    if dtype[1] == 'object':
        discrete_columns.append(hrattr.columns[dtype[0]])
    elif dtype[1] == 'int64':
        continuous_columns.append(hrattr.columns[dtype[0]])
discrete_columns = sorted(discrete_columns)
continuous_columns = sorted(continuous_columns)

In [45]:
# object -> dummy로 변환
dummies_list = list()
for col in discrete_columns:
    dummies_list.append(pd.get_dummies(hrattr[col], prefix = col))

In [46]:
# discrete_df 만들기
discrete_df = pd.concat(dummies_list, axis = 1)

In [47]:
# continuous_df 만들기
continuous_df = hrattr[continuous_columns]

In [48]:
# 통합 데이터 프레임 만들기
hrattr_new = pd.concat([discrete_df, continuous_df], axis = 1)

In [49]:
# 데이터 확인
hrattr_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 58 columns):
 #   Column                             Non-Null Count  Dtype
---  ------                             --------------  -----
 0   Attrition_No                       1470 non-null   uint8
 1   Attrition_Yes                      1470 non-null   uint8
 2   BusinessTravel_Non-Travel          1470 non-null   uint8
 3   BusinessTravel_Travel_Frequently   1470 non-null   uint8
 4   BusinessTravel_Travel_Rarely       1470 non-null   uint8
 5   Department_Human Resources         1470 non-null   uint8
 6   Department_Research & Development  1470 non-null   uint8
 7   Department_Sales                   1470 non-null   uint8
 8   EducationField_Human Resources     1470 non-null   uint8
 9   EducationField_Life Sciences       1470 non-null   uint8
 10  EducationField_Marketing           1470 non-null   uint8
 11  EducationField_Medical             1470 non-null   uint8
 12  EducationField_Other

In [50]:
# 필요없는 컬럼 제거하기
remove_cols = ['Attrition_No', 'Attrition_Yes', 'EmployeeCount', 'EmployeeNumber', 'Over18_Y', 'StandardHours']
hrattr_new.drop(remove_cols, axis = 1, inplace = True)

In [51]:
# 데이터 프레임 재확인
hrattr_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 52 columns):
 #   Column                             Non-Null Count  Dtype
---  ------                             --------------  -----
 0   BusinessTravel_Non-Travel          1470 non-null   uint8
 1   BusinessTravel_Travel_Frequently   1470 non-null   uint8
 2   BusinessTravel_Travel_Rarely       1470 non-null   uint8
 3   Department_Human Resources         1470 non-null   uint8
 4   Department_Research & Development  1470 non-null   uint8
 5   Department_Sales                   1470 non-null   uint8
 6   EducationField_Human Resources     1470 non-null   uint8
 7   EducationField_Life Sciences       1470 non-null   uint8
 8   EducationField_Marketing           1470 non-null   uint8
 9   EducationField_Medical             1470 non-null   uint8
 10  EducationField_Other               1470 non-null   uint8
 11  EducationField_Technical Degree    1470 non-null   uint8
 12  Gender_Female       

In [52]:
# train, test 데이터 나누기
x_train, x_test, y_train, y_test = train_test_split(hrattr_new.drop('Attrition_ind', axis = 1), hrattr_new['Attrition_ind'])

In [80]:
# pipeline
pipeline = Pipeline([
    ('clf', RandomForestClassifier(criterion = 'gini', random_state = 42, class_weight = { 0 : 0.3, 1 : 0.7 }))
])

In [81]:
# parameter 설정
parameters = {
    'clf__n_estimators' : (2000, 3000, 4000),
    'clf__max_depth'    : (5, 15, 30),
    'clf__min_samples_split' : (2,3),
    'clf__min_samples_leaf' : (1,2)
}

In [82]:
# grid_search
grid_search = GridSearchCV(
    pipeline,
    parameters,
    n_jobs = -1,
    cv = 5,
    verbose = 1,
    scoring = 'accuracy' 
)

In [83]:
# grid_search 학습
grid_search.fit(x_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   48.3s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:  4.7min finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('clf',
                                        RandomForestClassifier(class_weight={0: 0.3,
                                                                             1: 0.7},
                                                               random_state=42))]),
             n_jobs=-1,
             param_grid={'clf__max_depth': (5, 15, 30),
                         'clf__min_samples_leaf': (1, 2),
                         'clf__min_samples_split': (2, 3),
                         'clf__n_estimators': (2000, 3000, 4000)},
             scoring='accuracy', verbose=1)

In [84]:
# best score - 학습
print("%0.3f" % grid_search.best_score_)

0.857


In [85]:
# best_parameters - 학습
grid_search.best_estimator_.get_params()

{'memory': None,
 'steps': [('clf',
   RandomForestClassifier(class_weight={0: 0.3, 1: 0.7}, max_depth=5,
                          n_estimators=2000, random_state=42))],
 'verbose': False,
 'clf': RandomForestClassifier(class_weight={0: 0.3, 1: 0.7}, max_depth=5,
                        n_estimators=2000, random_state=42),
 'clf__bootstrap': True,
 'clf__ccp_alpha': 0.0,
 'clf__class_weight': {0: 0.3, 1: 0.7},
 'clf__criterion': 'gini',
 'clf__max_depth': 5,
 'clf__max_features': 'auto',
 'clf__max_leaf_nodes': None,
 'clf__max_samples': None,
 'clf__min_impurity_decrease': 0.0,
 'clf__min_impurity_split': None,
 'clf__min_samples_leaf': 1,
 'clf__min_samples_split': 2,
 'clf__min_weight_fraction_leaf': 0.0,
 'clf__n_estimators': 2000,
 'clf__n_jobs': None,
 'clf__oob_score': False,
 'clf__random_state': 42,
 'clf__verbose': 0,
 'clf__warm_start': False}

In [86]:
# 예측값 - 테스트
prediction = grid_search.predict(x_test)

In [87]:
# 정확도 - 테스트
round(accuracy_score(y_test, prediction), 3)

0.883

In [88]:
# classification_report - 테스트
print(classification_report(y_test, prediction))

              precision    recall  f1-score   support

           0       0.90      0.98      0.94       317
           1       0.68      0.29      0.41        51

    accuracy                           0.88       368
   macro avg       0.79      0.64      0.67       368
weighted avg       0.87      0.88      0.86       368



In [89]:
# confusion matrix - 테스트
pd.crosstab(y_test,prediction, rownames = ['Actual'], colnames = ['Predicted'])

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,310,7
1,36,15
