# 랜덤포레스트
- 의사결정 트리
    - 높은 분산, 낮은 편향 => 모델의 안정성 떨어짐
- 랜덤포레스트
    - 의사결정 트리를 평균화 => 모델의 분산을 안정화
   
- 배깅 : 전체 테이블에서 관측값(행)의 일부만 표본 추출하고, 열(변수)은 전부 사용
- 랜덤포레스트 : 관측값(행)과 열(변수)을 모두 일부만 추출 => 서로 상관관계가 낮은 개별 트리 생성

- 의사결정 트리의 앙상블 과정으로 인해 랜덤포레스트는 개별 변수에 관한 해석이 힘듦 => 변수의 통계적 유의성을 판단할 수 없음
- 따라서, 변수의 중요도만 제공됨

In [1]:
# 모듈 가져오기
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
# Load the German credit dataset from a local CSV file
credit_data = pd.read_csv('./Data/german_credit_new.csv')

In [4]:
# Check the shape (rows, columns) of the loaded data
credit_data.shape

(1000, 21)

In [6]:
# List the column names of the dataset
credit_data.columns

Index(['Status of existing checking account', 'Duration in month',
       'Credit history', 'Purpose', 'Credit amount', 'Savings account/bonds',
       'Present employment since',
       'Installment rate in percentage of disposable income',
       'Personal status and sex', 'Other debtors / guarantors',
       'Present residence since', 'Property', 'Age in years',
       'Other installment plans', 'Housing',
       'Number of existing credits at this bank', 'Job',
       'Number of people being liable to provide maintenance for', 'Telephone',
       'foreign worker', 'class'],
      dtype='object')

In [8]:
# Shift the target labels in 'class' down by one (1, 2 -> 0, 1).
# NOTE: re-running this cell subtracts one again, so run it once per data load.
credit_data['class'] = credit_data['class'].sub(1)

In [12]:
# Split the columns into numeric ("continuous") and non-numeric ("discrete") groups.
# select_dtypes replaces the positional `dtypes[i]` lookup (deprecated on a
# label-indexed Series) and the exact 'int64' / 'object' comparisons, which
# silently skipped columns with any other dtype such as float64 or category.
# Column order is preserved, and the integer target 'class' stays in the
# numeric group so it is carried into the combined frame built later.
dtypes = credit_data.dtypes
continuous_columns = credit_data.select_dtypes(include='number').columns.tolist()
discrete_columns = credit_data.select_dtypes(exclude='number').columns.tolist()

In [13]:
# Print both column lists to verify the split
print(continuous_columns)
print(discrete_columns)

['Duration in month', 'Credit amount', 'Installment rate in percentage of disposable income', 'Present residence since', 'Age in years', 'Number of existing credits at this bank', 'Number of people being liable to provide maintenance for', 'class']
['Status of existing checking account', 'Credit history', 'Purpose', 'Savings account/bonds', 'Present employment since', 'Personal status and sex', 'Other debtors / guarantors', 'Property', 'Other installment plans', 'Housing', 'Job', 'Telephone', 'foreign worker']


In [15]:
# Nominal columns must be converted to dummy (one-hot) variables;
# each encoded frame is prefixed with the name of its source column.
dummies_list = [
    pd.get_dummies(credit_data[name], prefix=name)
    for name in discrete_columns
]

In [17]:
# Combine the per-column dummy frames side by side into a single DataFrame
dummies_df = pd.concat(dummies_list, axis = 1)

In [18]:
# Build the combined analysis frame: the continuous columns followed by the dummy columns
credit_data_new = pd.concat([credit_data[continuous_columns], dummies_df], axis = 1)

In [19]:
# Display the combined DataFrame
credit_data_new

Unnamed: 0,Duration in month,Credit amount,Installment rate in percentage of disposable income,Present residence since,Age in years,Number of existing credits at this bank,Number of people being liable to provide maintenance for,class,Status of existing checking account_A11,Status of existing checking account_A12,...,Housing_A152,Housing_A153,Job_A171,Job_A172,Job_A173,Job_A174,Telephone_A191,Telephone_A192,foreign worker_A201,foreign worker_A202
0,6,1169,4,4,67,2,1,0,1,0,...,1,0,0,0,1,0,0,1,1,0
1,48,5951,2,2,22,1,1,1,0,1,...,1,0,0,0,1,0,1,0,1,0
2,12,2096,2,3,49,1,2,0,0,0,...,1,0,0,1,0,0,1,0,1,0
3,42,7882,2,4,45,1,2,0,1,0,...,0,1,0,0,1,0,1,0,1,0
4,24,4870,3,4,53,2,2,1,1,0,...,0,1,0,0,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,12,1736,3,4,31,1,1,0,0,0,...,1,0,0,1,0,0,1,0,1,0
996,30,3857,4,4,40,1,1,0,1,0,...,1,0,0,0,0,1,0,1,1,0
997,12,804,4,4,38,1,1,0,0,0,...,1,0,0,0,1,0,1,0,1,0
998,45,1845,4,4,23,1,1,1,1,0,...,0,1,0,0,1,0,0,1,1,0


In [41]:
# Split into train and test sets: 30% of the rows are held out for testing,
# with a fixed seed so the split is repeatable.
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

features = credit_data_new.drop(columns='class')
target = credit_data_new['class']
x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.3, random_state=42
)

In [42]:
# Check the shapes of the train/test feature and label splits
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((700, 61), (300, 61), (700,), (300,))

## 랜덤 포레스트 머신 러닝 모델

- 트리 개수 : 1000개
- 분할 기준 : 지니계수
- 각 트리의 최대 깊이 : 100
- 노드를 분할하는 데 필요한 최소 관측값 수 (min_samples_split) : 3
- 단말(리프) 노드의 최소 관측값 수 (min_samples_leaf) : 2

In [50]:
# Configure the random forest classifier, then fit it on the training split
rf_fit = RandomForestClassifier(
    n_estimators=1000,      # number of trees in the forest
    criterion='gini',       # split quality measure
    max_depth=100,          # maximum depth of each tree
    min_samples_split=3,    # minimum samples required to split a node
    min_samples_leaf=2,     # minimum samples required at a leaf node
    random_state=43,        # fixed seed for reproducible results
)
rf_fit.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=100, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=3,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=43, verbose=0,
                       warm_start=False)

In [51]:
# Predict labels for the training set
y_pred = rf_fit.predict(x_train)

In [52]:
# Check that the actual and predicted training labels have matching shapes
y_train.shape, y_pred.shape

((700,), (700,))

In [53]:
# Confusion matrix - train (y_pred was predicted from x_train, so this is not a validation result)
pd.crosstab(y_train, y_pred, rownames = ['Actual'], colnames = ['Predicted'])

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,491,0
1,28,181


In [54]:
# Predict labels for the held-out test set
y_pred_test = rf_fit.predict(x_test)

In [55]:
# Confusion matrix - test: actual labels (rows) against predicted labels (columns)
pd.crosstab(y_test, y_pred_test, rownames = ['Actual'], colnames = ['Predicted'])

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,197,12
1,59,32


In [57]:
# Accuracy - train
accuracy_score(y_train, y_pred)

0.96

In [58]:
# Accuracy - test
accuracy_score(y_test, y_pred_test)

0.7633333333333333

## 그리드 검색
- 트리개수 : (1000, 2000, 3000)
- 최대 깊이 : (100, 200, 300)
- 분할당 최소 표본 : (2, 3)
- 단말 노드의 최소 표본 : (1,2)
        