In [215]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV


from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

%matplotlib inline

### data load

In [95]:
titanic_df = pd.read_csv('../data/titanic_train.csv')
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [96]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


## 전체 흐름
### 1. 데이터 전처리
#### 결측치 처리
- Age >> 평균  
- Cabin : N(모르니깐 N으로 정의)  
- Embarked : N(모르니깐 N으로 정의)  

#### 컬럼 drop
- PassengerID
- Name
- Ticket

#### label encoding - 범주형
- Sex
- Cabin
- Embarked

#### 추가 고민사항
- Age, Fare(수치형 데이터)
    - 그대로 사용할지
    - 스케일링
    - 비닝


### 2. 데이터 분할


### 3. 학습
#### 교차 검증

#### 파라미터 튜닝 + 교차검증
- GridSearchCV

### 4. feature engineering(정확도 높이기)
- 속성 선택(추가 or 삭제 등등)
- 파라미터 지정

### 5. 하이퍼 파라미터 튜닝
- 모델 : RF, LR
- 스케일링


In [97]:
titanic_df.fillna({'Age' : titanic_df['Age'].mean()}, inplace = True)

titanic_df.fillna({'Cabin' : 'N'}, inplace=True)

titanic_df.fillna({'Embarked' : 'N'}, inplace = True)

titanic_df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [98]:
titanic_df.drop(['PassengerId', 'Name', 'Ticket'], axis= 1, inplace = True)
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,N,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,N,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,N,S


In [99]:
# cabin 컬럼은 범주이 너무 많으니 왼쪽 한글자만 사용
titanic_df['Cabin'].unique()

titanic_df['Cabin'] = titanic_df['Cabin'].str[:1]
titanic_df.head(2)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,N,S
1,1,1,female,38.0,1,0,71.2833,C,C


In [100]:
col_names = ['Cabin', 'Sex', 'Embarked']

for i in col_names:
    le = LabelEncoder()
    le.fit(titanic_df[i])
    titanic_df[i] = le.transform(titanic_df[i])

titanic_df.head(2)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,1,22.0,1,0,7.25,7,3
1,1,1,0,38.0,1,0,71.2833,2,0


In [101]:
X_titanic_df = titanic_df.drop('Survived', axis = 1)
y_titanic_df = titanic_df['Survived']

X_train, X_test, y_train, y_test \
= train_test_split(X_titanic_df
                , y_titanic_df
                , test_size=0.2
                , random_state=11)


In [102]:
dt_clf = DecisionTreeClassifier(random_state=11)
rf_clf = RandomForestClassifier(random_state=11)
lf_clf = LogisticRegression(solver = 'liblinear')
dt_clf, rf_clf, lf_clf

(DecisionTreeClassifier(random_state=11),
 RandomForestClassifier(random_state=11),
 LogisticRegression(solver='liblinear'))

In [103]:
dt_clf.fit(X_train, y_train)
rf_clf.fit(X_train, y_train)
lf_clf.fit(X_train, y_train)

dt_pred = dt_clf.predict(X_test)
rf_pred = rf_clf.predict(X_test)
lf_pred = lf_clf.predict(X_test)

print(f'의사결정트리 : {accuracy_score(y_test, dt_pred)}')
print(f'랜덤포레스트 : {accuracy_score(y_test, rf_pred)}')
print(f'선형회귀 : {accuracy_score(y_test, lf_pred)}')

의사결정트리 : 0.7877094972067039
랜덤포레스트 : 0.8547486033519553
선형회귀 : 0.8659217877094972


In [104]:
dt_clf_cv = DecisionTreeClassifier(random_state=11)
rf_clf_cv = RandomForestClassifier(random_state=11)
lf_clf_cv = LogisticRegression(solver = 'liblinear')
dt_clf_cv, rf_clf_cv, lf_clf_cv

(DecisionTreeClassifier(random_state=11),
 RandomForestClassifier(random_state=11),
 LogisticRegression(solver='liblinear'))

In [249]:
kfold = KFold(n_splits=5) #stratified 옵션도 넣어보기

scores = []

for train_index, test_index in kfold.split(X_titanic_df):
    X_train, X_test = \
    X_titanic_df.values[train_index]\
    , X_titanic_df.values[test_index]

    y_train, y_test =\
    y_titanic_df.values[train_index]\
    , y_titanic_df.values[test_index]

    dt_clf_cv.fit(X_train, y_train)
    pred = dt_clf_cv.predict(X_test)
    accuracy = float(np.round(accuracy_score(y_test, pred),5))
    scores.append(accuracy)

print(f'리스트 : {scores}')
print(f'평균 : {np.mean(scores)}')

리스트 : [0.75419, 0.7809, 0.78652, 0.76966, 0.82022]
평균 : 0.782298


In [246]:
s_kfold = StratifiedKFold(n_splits=5) #stratified 옵션도 넣어보기

s_scores = []

for train_index, test_index in s_kfold.split(X_titanic_df, y_titanic_df):
    X_train, X_test = \
    X_titanic_df.values[train_index]\
    , X_titanic_df.values[test_index]

    y_train, y_test =\
    y_titanic_df.values[train_index]\
    , y_titanic_df.values[test_index]

    dt_clf_cv.fit(X_train, y_train)
    pred = dt_clf_cv.predict(X_test)
    accuracy = float(np.round(accuracy_score(y_test, pred),5))
    s_scores.append(accuracy)

print(f'리스트 : {s_scores}')
print(f'평균 : {np.mean(s_scores)}')

리스트 : [0.74302, 0.77528, 0.79213, 0.78652, 0.8427]
평균 : 0.78793


In [182]:
# 함수로 만들기

def best_model(X_df, y_df, n):
    dt_scores = []
    rf_scores = []
    lf_scores = []
    dt_clf_cv = DecisionTreeClassifier(random_state=11)
    rf_clf_cv = RandomForestClassifier(random_state=11)
    lf_clf_cv = LogisticRegression(solver = 'liblinear')

    s_kfold = StratifiedKFold(n_splits=n)

    for train_index, test_index in s_kfold.split(X_df, y_df):

        X_train, X_test = X_df.values[train_index], X_df.values[test_index]
        y_train, y_test = y_df.values[train_index], y_df.values[test_index]

        dt_clf_cv.fit(X_train, y_train)
        dt_pred = dt_clf_cv.predict(X_test)
        accuracy = np.round(accuracy_score(y_test, dt_pred),5)
        dt_scores.append(accuracy)

        rf_clf_cv.fit(X_train, y_train)
        rf_pred = rf_clf_cv.predict(X_test)
        accuracy = np.round(accuracy_score(y_test, rf_pred),5)
        rf_scores.append(accuracy)

        lf_clf_cv.fit(X_train, y_train)
        lf_pred = lf_clf_cv.predict(X_test)
        accuracy = np.round(accuracy_score(y_test, lf_pred),5)
        lf_scores.append(accuracy)

    return print(
        f'''
▶ 의사결정트리 모델 성능
{'*'*50}
리스트 : {dt_scores}
평균 : {np.mean(dt_scores)}

▶ 랜덤 포레스트 모델 성능
{'*'*50}
리스트 : {rf_scores}
평균 : {np.mean(rf_scores)}

▶ 선형 모델 성능
{'*'*50}
리스트 : {lf_scores}
평균 : {np.mean(lf_scores)}
'''
    )

best_model(X_titanic_df, y_titanic_df, 10)



▶ 의사결정트리 모델 성능
**************************************************
리스트 : [np.float64(0.73333), np.float64(0.78652), np.float64(0.68539), np.float64(0.75281), np.float64(0.82022), np.float64(0.79775), np.float64(0.78652), np.float64(0.75281), np.float64(0.88764), np.float64(0.82022)]
평균 : 0.782321

▶ 랜덤 포레스트 모델 성능
**************************************************
리스트 : [np.float64(0.73333), np.float64(0.77528), np.float64(0.76404), np.float64(0.86517), np.float64(0.8427), np.float64(0.8427), np.float64(0.79775), np.float64(0.77528), np.float64(0.91011), np.float64(0.8427)]
평균 : 0.814906

▶ 선형 모델 성능
**************************************************
리스트 : [np.float64(0.8), np.float64(0.79775), np.float64(0.76404), np.float64(0.82022), np.float64(0.78652), np.float64(0.76404), np.float64(0.79775), np.float64(0.76404), np.float64(0.83146), np.float64(0.80899)]
평균 : 0.793481



In [154]:
X_titanic_df.shape[:1]

(891,)

In [84]:
dt_clf_gcv = DecisionTreeClassifier(random_state=11)
rf_clf_gcv = RandomForestClassifier(random_state=11)
lf_clf_gcv = LogisticRegression(solver = 'liblinear')
dt_clf_gcv, rf_clf_gcv, lf_clf_gcv

(DecisionTreeClassifier(random_state=11),
 RandomForestClassifier(random_state=11),
 LogisticRegression(solver='liblinear'))

In [184]:
# DecisionTree parameter, RandomForest
parameters = {
            'max_depth' : [1,2,3]
            , 'min_samples_split' : [2,3]
            }

grid_dt_clf = GridSearchCV(dt_clf_gcv
                    , param_grid=parameters
                    , scoring='accuracy'
                    , cv = 5
                    , refit = True)

grid_dt_clf.fit(X_train, y_train)

pd.DataFrame(grid_dt_clf.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001199,0.000402,0.000804,0.0004022423,1,2,"{'max_depth': 1, 'min_samples_split': 2}",0.804196,0.762238,0.825175,0.774648,0.746479,0.782547,0.028501,3
1,0.001199,0.000399,0.001201,0.0004001646,1,3,"{'max_depth': 1, 'min_samples_split': 3}",0.804196,0.762238,0.825175,0.774648,0.746479,0.782547,0.028501,3
2,0.0014,0.00049,0.001,5.722046e-07,2,2,"{'max_depth': 2, 'min_samples_split': 2}",0.748252,0.762238,0.825175,0.774648,0.746479,0.771358,0.028786,5
3,0.001199,0.0004,0.001,7.168434e-07,2,3,"{'max_depth': 2, 'min_samples_split': 3}",0.748252,0.762238,0.825175,0.774648,0.746479,0.771358,0.028786,5
4,0.001394,0.000483,0.000406,0.0004968534,3,2,"{'max_depth': 3, 'min_samples_split': 2}",0.797203,0.804196,0.818182,0.795775,0.774648,0.798001,0.014118,1
5,0.001191,0.000405,0.000598,0.0004883128,3,3,"{'max_depth': 3, 'min_samples_split': 3}",0.797203,0.804196,0.818182,0.795775,0.774648,0.798001,0.014118,1


In [185]:
best_dt_clf = grid_dt_clf.best_estimator_
best_dt_pred = best_dt_clf.predict(X_test)
accuracy_score(y_test, best_dt_pred)

0.8426966292134831

GridSearchCV 의 인자들

estimator : 보통 알고리즘을 객체로 만들어 넣어준다.

param_grid : 튜닝을 위한 대상 파라미터, 사용될 파라미터를 딕셔너리 형태로 넣어준다.

scoring : 예측 성능을 측정할 평가 방법을 넣는다. 분류 알고리즘일 때는, 'accuracy', 'f1', 회귀 알고리즘일 때는 'neg_mean_squared_error', 'r2' 등을 넣을 수 있다.

cv : 교차 검증에서 몇개로 분할되는지 지정한다.(정수로 넣어주면 K겹 교차검증이 되고, KFold(k) 이런식으로 넣어주어도 무방 // default 값은 cv=3)

refit : True로 하면 최적의 하이퍼 파라미터를 찾아서 estimator를 재학습시킨다. (default 값이 True임)

In [191]:
# 정확성을 높혀보자
parameters = {
            'max_depth' : [None, 3, 5, 10, 15] # 깊이를 다양하게 설정함
            , 'min_samples_split' : [2,3,5,10] # 최소 분할 샘플 수
            , 'min_samples_leaf' : [1,2,4,6] # 최소 노드 샘플 수
            }

grid_dt_clf = GridSearchCV(dt_clf_gcv
                    , param_grid=parameters
                    , scoring='accuracy'
                    , cv = 5
                    , refit = True)

grid_dt_clf.fit(X_train, y_train)

best_dt_clf = grid_dt_clf.best_estimator_
best_dt_pred = best_dt_clf.predict(X_test)
accuracy_score(y_test, best_dt_pred)

0.8539325842696629

In [192]:
# 정확성을 높혀보자
parameters = {
            'max_depth' : [None, 3, 5, 10, 15, 20] # 깊이를 다양하게 설정함
            , 'min_samples_split' : [2,3,5,10] # 최소 분할 샘플 수
            , 'min_samples_leaf' : [1,2,4,6,8,10] # 최소 노드 샘플 수
            }

grid_dt_clf = GridSearchCV(dt_clf_gcv
                    , param_grid=parameters
                    , scoring='accuracy'
                    , cv = 5
                    , refit = True)

grid_dt_clf.fit(X_train, y_train)

best_dt_clf = grid_dt_clf.best_estimator_
best_dt_pred = best_dt_clf.predict(X_test)
accuracy_score(y_test, best_dt_pred)

0.8651685393258427

In [194]:
# 정확성을 높혀보자
parameters = {
            'max_depth' : [None, 3, 5, 10, 15, 20, 100] # 깊이를 다양하게 설정함
            , 'min_samples_split' : [2,3,5,10] # 최소 분할 샘플 수
            , 'min_samples_leaf' : [1,2,4,6,8,10,12] # 최소 노드 샘플 수
            }

grid_dt_clf = GridSearchCV(dt_clf_gcv
                    , param_grid=parameters
                    , scoring='accuracy'
                    , cv = 5
                    , refit = True)

grid_dt_clf.fit(X_train, y_train)

best_dt_clf = grid_dt_clf.best_estimator_
best_dt_pred = best_dt_clf.predict(X_test)
accuracy_score(y_test, best_dt_pred)

0.8651685393258427

criterion : 분할 성능 측정 기능

min_samples_split : 노드를 분할하기 위한 최소한의 샘플 데이터수로, 과적합을 제어하는데 주로 사용함. 작게 설정할 수록 분할 노드가 많아져 과적합 가능성이 높아짐.

max_depth : 트리의 최대 깊이, 깊이가 깊어지면 과적합될 수 있음.

max_features : 최적의 분할을 위해 고려할 최대 feature 개수 (default = None : 데이터 세트의 모든 피처를 사용)

min_samples_leaf : 리프노드가 되기 위해 필요한 최소한의 샘플 데이터수 (과적합 제어 용도), 작게 설정 필요

max_leaf_nodes : 리프노드의 최대 개수

In [242]:
# 정확성을 높혀보자
dt_parameters = {
            # 'criterion' : ['gini', 'entropy'],
            'max_depth' : [None, 3, 5, 10, 15] # 깊이를 다양하게 설정함
            , 'max_leaf_nodes' : [None, 2,3,4,5,6]
            , 'min_samples_split' : [2,3,5,6] # 최소 분할 샘플 수
            , 'min_samples_leaf' : [1,2,4,6,8] # 최소 노드 샘플 수
            ,'max_features' : [None,'sqrt','log2',3,4,5]
            }

rf_parameters = {
    'n_estimators': [100, 200],  # 랜덤 포레스트에 추가
    'max_depth': [None, 10, 15],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 4],
    'max_features': ['sqrt', 'log2']
}


grid_dt_clf = GridSearchCV(dt_clf_gcv
                    , param_grid=dt_parameters
                    , scoring='accuracy'
                    , cv = 5
                    , n_jobs = -1 # CPU의 병렬 처리를 활성화 함
                    , refit = True)


grid_dt_clf.fit(X_train, y_train)

best_dt_clf = grid_dt_clf.best_estimator_
best_dt_pred = best_dt_clf.predict(X_test)
print(f'''
▶ 의사결정 모델      
{grid_dt_clf.best_estimator_}
{accuracy_score(y_test, best_dt_pred)}
      '''
)

grid_rf_clf = GridSearchCV(rf_clf_gcv
                    , param_grid=rf_parameters
                    , scoring='accuracy'
                    , cv = 5
                    , n_jobs = -1 # CPU의 병렬 처리를 활성화 함
                    , refit = True)

grid_rf_clf.fit(X_train, y_train)

best_rf_clf = grid_rf_clf.best_estimator_
best_rf_pred = best_rf_clf.predict(X_test)
print(f'''

▶ 랜덤포레스트 모델
{grid_rf_clf.best_estimator_}      
{accuracy_score(y_test, best_rf_pred)}
      '''
)


▶ 의사결정 모델      
DecisionTreeClassifier(min_samples_leaf=8, random_state=11)
0.8651685393258427
      


▶ 랜덤포레스트 모델
RandomForestClassifier(max_depth=10, max_features='log2', min_samples_split=5,
                       n_estimators=200, random_state=11)      
0.8764044943820225
      
