## 1. Load Data & Data split

In [2]:
import pandas as pd
import numpy as np
import matplotlib as plt

In [3]:
# Read the pickled Titanic dataset from its relative path.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
dataset = pd.read_pickle('dataset/titanic_dataset')
# Work on an independent deep copy so the loaded `dataset` object stays unmodified.
data = dataset.copy(deep=True)

In [4]:
# import library for data split
from sklearn.model_selection import train_test_split

# Divide the data into the target y ('Survived') and the feature matrix X.
# Select/drop is used instead of data.pop(): pop removes the column from `data`
# in place, so running this cell a second time would raise KeyError.
# As a consequence `data` keeps its 'Survived' column and X is a separate frame.
y = data['Survived']
X = data.drop(columns='Survived')

# Hold out 20% of the rows as the test set; random_state pins the shuffle
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(712, 6) (179, 6) (712,) (179,)


## 2. Build model

In [5]:
# import library for model
from sklearn.ensemble import RandomForestClassifier

### 2.1. Base model

In [33]:
# Baseline: a random forest whose trees are capped at depth 2, with a fixed seed.
# fit() hands back the fitted estimator, so construction and training are chained;
# the bare `clf` on the last line keeps the estimator repr as the cell output.
clf = RandomForestClassifier(max_depth=2, random_state=0).fit(X_train, y_train)
clf

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=2, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [34]:
# Mean accuracy of the depth-2 baseline on the held-out test split.
clf.score(X=X_test, y=y_test)

0.7877094972067039

### 2.2. Change max_depth to None (no depth limit)

In [35]:
# Same forest as the baseline, but max_depth=None removes the depth cap.
clf = RandomForestClassifier(
    max_depth=None,
    random_state=0,
)
clf.fit(X=X_train, y=y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [36]:
# Mean accuracy of the uncapped forest on the held-out test split.
clf.score(X=X_test, y=y_test)

0.8324022346368715

## 3. Data normalization (MinMax)

In [37]:
from sklearn.preprocessing import MinMaxScaler

In [38]:
# Rescale every feature column with MinMaxScaler (default target range [0, 1]).
# fit_transform learns each column's min/max and applies the scaling in one call,
# so X_scaler is never temporarily bound to the scaler object itself.
# NOTE(review): the scaler is fitted on all rows of X, including rows that end up
# in the test split - consider fitting on the training rows only.
scaler = MinMaxScaler()
# Re-wrap the returned array in a DataFrame, reusing X's column labels and index.
X_scaler = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)
X_scaler.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,1.0,1.0,0.271174,0.125,0.0,0.014151
1,0.0,0.0,0.472229,0.125,0.0,0.139136
2,1.0,0.0,0.321438,0.0,0.0,0.015469
3,0.0,0.0,0.434531,0.125,0.0,0.103644
4,1.0,1.0,0.434531,0.0,0.0,0.015713


In [39]:
# Split the MinMax-scaled features (X_scaler) rather than the raw X, so that the
# model trained afterwards actually sees the scaled data.
# test_size and random_state match the first split, so the same rows are assigned
# to the train and test sets.
X_train, X_test, y_train, y_test = train_test_split(X_scaler, y, test_size=0.2, random_state=0)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(712, 6) (179, 6) (712,) (179,)


## 4. Build model & MinMax scaler

In [40]:
# Forest without a depth cap, fitted on the current X_train / y_train.
# fit() hands back the fitted estimator; the bare `clf` keeps its repr as the cell output.
clf = RandomForestClassifier(max_depth=None, random_state=0).fit(X_train, y_train)
clf

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [41]:
# Mean accuracy on the current held-out test split.
clf.score(X=X_test, y=y_test)

0.8324022346368715

## 5. k-fold cross validation

In [6]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

In [10]:
# Forest configuration evaluated by the cross-validation below (no depth cap, fixed seed).
# NOTE(review): cross_val_score fits its own clones of the estimator, so this fit
# on X_train looks unnecessary for the CV scores - confirm before removing.
clf = RandomForestClassifier(random_state=0, max_depth=None)
clf.fit(X=X_train, y=y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [11]:
# 10-fold cross-validation over the full X / y.
# shuffle=False is the KFold default, spelled out here: folds are consecutive row blocks.
kfold = KFold(n_splits=10, shuffle=False)
# One accuracy score per fold.
result = cross_val_score(estimator=clf, X=X, y=y, cv=kfold)
print(result)

[0.72222222 0.7752809  0.75280899 0.84269663 0.86516854 0.82022472
 0.79775281 0.78651685 0.84269663 0.84269663]


In [12]:
# Average accuracy across the folds. Using the array's own mean removes the
# hard-coded divisor 10, which would give a wrong average if the fold count changed.
result.mean()

0.8048064918851434