In [6]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
%matplotlib inline

In [7]:
titanic_df = pd.read_csv(r"titanic_train.csv")
titanic_df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [8]:
def encode_features(dataDF):
    features = ['Cabin', 'Sex', 'Embarked']
    for feature in features:
        le = preprocessing.LabelEncoder()
        le = le.fit(dataDF[feature].astype(str))
        dataDF[feature] = le.transform(dataDF[feature].astype(str))
        print(id(le))

    return dataDF


In [9]:
titanic_df = encode_features(titanic_df)
titanic_df.head()

2742111764936
2742108450504
2742018125640


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,147,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,81,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,147,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,55,2
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,147,2


In [10]:
def fillna(df):
    df['Age'].fillna(df['Age'].mean(), inplace=True)
    df['Cabin'].fillna('N', inplace=True)
    df['Embarked'].fillna('N', inplace=True)
    df['Fare'].fillna(0, inplace = True)
    return df

def drop_features(df) :
    df.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)
    return df

def format_features(df):
    df['Cabin'] = df['Cabin'].str[:1]
    features = ["Cabin", "Sex", "Embarked"]
    for feature in features:
        le = preprocessing.LabelEncoder()
        le = le.fit(df[feature].astype(str))
        df[feature] = le.transform(df[feature].astype(str))
    return df

def transform_features(df) :
    df = fillna(df)
    df = drop_features(df)
    df = format_features(df)
    return df

In [11]:
titanic_df = pd.read_csv("titanic_train.csv")
y_titanic_df = titanic_df["Survived"]
X_titanic_df = titanic_df.drop("Survived", axis=1)

X_titanic_df = transform_features(X_titanic_df)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_titanic_df, y_titanic_df, test_size=0.2, random_state=11)

In [13]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [14]:
dt_clf = DecisionTreeClassifier(random_state=11)
rf_clf = RandomForestClassifier(random_state=11)
lr_clf = LogisticRegression()

dt_clf.fit(X_train, y_train)
dt_pred = dt_clf.predict(X_test)
print("DecisionTreeClassifier 정확도 : {0:.4f}".format(accuracy_score(y_test, dt_pred)))

rf_clf.fit(X_train, y_train)
rf_pred = rf_clf.predict(X_test)
print("RandomForestClassifier 정확도 : {0:.4f}".format(accuracy_score(y_test, rf_pred)))

lr_clf.fit(X_train, y_train)
lr_pred = lr_clf.predict(X_test)
print("LogisticRegression 정확도 : {0:.4f}".format(accuracy_score(y_test, lr_pred)))


DecisionTreeClassifier 정확도 : 0.7877
RandomForestClassifier 정확도 : 0.8547
LogisticRegression 정확도 : 0.8492


In [32]:
from sklearn.model_selection import KFold

In [43]:
# dt_clf = DecisionTreeClassifier(random_state = 156)
# kfold = KFold(n_)
# cv_accuracy = []
# print("타이타닉 데이터 세트 크기 : ", titanic_df.shape[0])

In [42]:
# n_iter = 0
# for train_index, test_index in kfold.split(titanic_df) :
#     X_train, X_test = X_titanic_df[train_index], X_titanic_df[test_index]
#     y_train, y_test = y_titanic_df[train_index], y_titanic_df[test_index]
    
#     dt_clf.fit(X_train, y_train)
#     pred = dt_clf.predict(X_test)
#     n_iter += 1
    
#     accuracy = np.round(accuracy_score(y_test, pred), 4)
#     train_size = X_train.shape[0]
#     test_size = X_test.shape[0]
    
#     print("\n#{0} 교차 검증 정확도 : {1}, 학습 데이터 크키 : {2}, 검증 데이터 크기 : {3}".format(n_iter, accuracy, train_size, test_size))
#     print("#{0} 검증 세트 인덱스 : {1}".format(n_iter, test_index))
#     cv_accuracy.append(accuracy)
# print("\n## 평균 검증 정확도 : ", np.mean(cv_accuracy))

In [55]:
def exec_kfold(clf, folds=5):
    kfold = KFold(n_splits=folds)
    scores = []
    
    for iter_count, (train_index, test_index) in enumerate(kfold.split(X_titanic_df)):
        x_train, x_test = X_titanic_df.values[train_index], X_titanic_df.values[test_index]
        y_train, y_test = y_titanic_df.values[train_index], y_titanic_df.values[test_index]
        
        clf.fit(x_train, y_train)
        predictions = clf.predict(x_test)
        accuracy = accuracy_score(y_test, predictions)
        scores.append(accuracy)
        
        print("교차 검증{0} 정확도 : {1:.4f}".format(iter_count, accuracy))
        
    mean_score = np.mean(scores)
    print("평균 정확도 : {0:.4f}".format(mean_score))

exec_kfold(dt_clf, folds=5)

교차 검증0 정확도 : 0.7542
교차 검증1 정확도 : 0.7640
교차 검증2 정확도 : 0.7865
교차 검증3 정확도 : 0.7584
교차 검증4 정확도 : 0.8371
평균 정확도 : 0.7801


In [67]:
from sklearn.model_selection import cross_val_score, cross_validate
def exec_cvs (clf, cvn=5):
    scores = cross_val_score(clf, x_titanic_df, y_titanic_df, scoring = 'accuracy', cv=cvn)
    mean_score = np.mean(scores)
    return print(np.round(scores, 4)), print(np.round(np.mean(scores), 4))


In [68]:
exec_cvs(dt_clf, cvn = 5)

[0.7486 0.7584 0.809  0.7528 0.8034]
0.7744


(None, None)

In [69]:
# 결과값이 약간 다를수 있다.
scores = cross_val_score(dt_clf, X_titanic_df, y_titanic_df, cv=5)
for iter_count, accuracy in enumerate(scores):
    print("교차 검증 {0} 정확도 : {1:.4f}".format(iter_count, accuracy))
print("평균 정확도 : {0:.4f}".format(np.mean(scores)))

교차 검증 0 정확도 : 0.7486
교차 검증 1 정확도 : 0.7697
교차 검증 2 정확도 : 0.7978
교차 검증 3 정확도 : 0.7809
교차 검증 4 정확도 : 0.8202
평균 정확도 : 0.7834


In [74]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

parameters = {'max_depth' : [2, 3, 5, 10],
             'min_samples_split' :  [2, 3, 5], 'min_samples_leaf' : [1, 5, 8]}

grid_dclf = GridSearchCV(dt_clf, param_grid = parameters, scoring='accuracy', cv = 5)
grid_dclf.fit(X_train, y_train)

print("GridSearchCV 최적 하이퍼 파라미터 : ", grid_dclf.best_params_)
print("GridSearchCV 최고 정확도 : {0:.4f}".format(grid_dclf.best_score_))
best_dclf = grid_dclf.best_estimator_

dpredictions = best_dclf.predict(X_test)
accuracy = accuracy_score(y_test, dpredictions)
precision = precision_score(y_test, dpredictions)
recall = recall_score(y_test, dpredictions)

print("테스트 세트에서의 DecisionTreeClassifier 정확도 : {0:.4f}".format(accuracy))
print("테스트 세트에서의 DecisionTreeClassifier 정밀도 : {0:.4f}".format(precision))
print("테스트 세트에서의 DecisionTreeClassifier 재현율 : {0:.4f}".format(recall))

GridSearchCV 최적 하이퍼 파라미터 :  {'max_depth': 3, 'min_samples_leaf': 5, 'min_samples_split': 2}
GridSearchCV 최고 정확도 : 0.7992
테스트 세트에서의 DecisionTreeClassifier 정확도 : 0.8715
테스트 세트에서의 DecisionTreeClassifier 정밀도 : 0.8393
테스트 세트에서의 DecisionTreeClassifier 재현율 : 0.7705


# 모델 평가

## 분류모델 지표의 의미와 계산

metric : 메트릭 
- 어떤 성능 또는 활동에 대한 정보를 나타내는 숫자 (학습 완료된 모델의 성능이 얼마나 되는지 숫자로 표현)

분류 : accuracy(정확도), precision(정밀도), recall(재현율)





In [3]:
import graphviz

In [2]:
!pip install graphviz

Collecting graphviz
  Using cached graphviz-0.16-py2.py3-none-any.whl (19 kB)
Installing collected packages: graphviz
Successfully installed graphviz-0.16
