# 타이타닉 모델 만들기

In [47]:
import pandas as pd 
import joblib
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [73]:
# Training set: load the Titanic training split from CSV.
# The preview shows a Survived label column followed by eight feature columns
# (Pclass, Sex, Age, SibSp, Parch, Fare, Cabin, Embarked).
df_train = pd.read_csv('../static/data/classification/titanic_train.csv')
# Display the first three rows as a sanity check.
df_train.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,0,24.0,0,0,8.85,7,3
1,0,3,1,29.699118,8,2,69.55,7,3
2,1,2,0,30.0,0,0,12.35,7,2


In [74]:
# Split the training frame into NumPy arrays:
#   features -> every column after the first
#   label    -> the first column (Survived)
X_train, y_train = df_train.iloc[:, 1:].values, df_train.iloc[:, 0].values

# Print both shapes, then display the feature matrix as the cell result.
print(X_train.shape, y_train.shape)
X_train

(668, 8) (668,)


array([[ 3.        ,  0.        , 24.        , ...,  8.85      ,
         7.        ,  3.        ],
       [ 3.        ,  1.        , 29.69911765, ..., 69.55      ,
         7.        ,  3.        ],
       [ 2.        ,  0.        , 30.        , ..., 12.35      ,
         7.        ,  2.        ],
       ...,
       [ 3.        ,  0.        , 17.        , ...,  7.925     ,
         7.        ,  3.        ],
       [ 1.        ,  1.        , 46.        , ..., 61.175     ,
         4.        ,  3.        ],
       [ 3.        ,  1.        ,  9.        , ..., 46.9       ,
         7.        ,  3.        ]])

In [100]:
# Display the feature columns only (everything after the first, label, column).
df_train.iloc[:, 1:]

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,0,24.000000,0,0,8.8500,7,3
1,3,1,29.699118,8,2,69.5500,7,3
2,2,0,30.000000,0,0,12.3500,7,2
3,3,1,16.000000,4,1,39.6875,7,3
4,1,0,36.000000,0,2,71.0000,1,3
...,...,...,...,...,...,...,...,...
663,1,0,32.000000,0,0,76.2917,3,0
664,2,1,34.000000,0,0,13.0000,7,3
665,3,0,17.000000,4,2,7.9250,7,3
666,1,1,46.000000,1,0,61.1750,4,3


In [76]:
# Test set: load the Titanic test split from CSV.
# The preview shows the same column layout as the training split.
df_test = pd.read_csv('../static/data/classification/titanic_test.csv')
# Display the first three rows as a sanity check.
df_test.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,0,20.0,0,0,8.6625,7,3
1,0,3,1,22.0,0,0,7.8958,7,3
2,0,3,1,33.0,0,0,7.8958,7,0


In [77]:
# Split the test frame the same way as the training frame:
# features are every column after the first, the label is the first column.
X_test, y_test = df_test.iloc[:, 1:].values, df_test.iloc[:, 0].values

# Display both shapes as the cell result.
X_test.shape, y_test.shape

((223, 8), (223,))

In [78]:
# Normalisation model: fit a MinMaxScaler on the training features only.
# Fit on X_train, the same NumPy array type that every later transform() call
# receives, so the scaler is not bound to DataFrame column names that the
# ndarray inputs lack (scikit-learn >= 1.0 warns about that mismatch on each
# transform call). The learned min/max values are the same either way.
scaler = MinMaxScaler()
scaler.fit(X_train)

MinMaxScaler()

In [79]:
# Apply the scaler fitted on the training features to both splits
# (train first, then test).
X_train_scaled, X_test_scaled = (
    scaler.transform(features) for features in (X_train, X_test)
)

# Display both shapes as the cell result.
X_train_scaled.shape, X_test_scaled.shape

((668, 8), (223, 8))

In [80]:
# Persist the fitted scaler so the same normalisation can be re-applied later
# (it is reloaded from this path further down in the notebook).
joblib.dump(scaler, '../static/model/titanic_scaler.pkl')

['../static/model/titanic_scaler.pkl']

In [81]:
# 1. Logistic regression
# Create the estimator with default settings; it is tuned by grid search in the next cell.
lr_clf = LogisticRegression()
# Display the default hyperparameters to see what can be tuned.
lr_clf.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [82]:
# Tune the logistic-regression C parameter with a 5-fold, accuracy-scored grid search.
# The commented-out earlier candidate grids were [0.1, 1, 5, 10] and
# [0.05, 0.08, 0.1, 0.2, 0.5]; the grid below is the one in use.
params = dict(C=[0.01, 0.03, 0.05, 0.07])
grid_cv = GridSearchCV(
    estimator=lr_clf,
    param_grid=params,
    cv=5,
    scoring='accuracy',
).fit(X_train_scaled, y_train)

# Report the best mean cross-validation accuracy and the parameters behind it.
print('최고 평균 정확도: {:.4f}'.format(grid_cv.best_score_))
print('최적 파라미터:', grid_cv.best_params_)

# Keep the best estimator and display its accuracy on the test split.
best_lr = grid_cv.best_estimator_
pred = best_lr.predict(X_test_scaled)
accuracy_score(y_true=y_test, y_pred=pred)

최고 평균 정확도: 0.8174
최적 파라미터: {'C': 0.05}


0.8026905829596412

In [83]:
# Persist the tuned logistic-regression model.
joblib.dump(best_lr, '../static/model/titanic_lr.pkl')

['../static/model/titanic_lr.pkl']

In [84]:
# 2. SVM
# Create the support-vector classifier with default settings; it is tuned by
# grid search in the next cell.
sv_clf = SVC()
# Display the default hyperparameters to see what can be tuned.
sv_clf.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [85]:
# Tune the SVC C parameter with a 5-fold, accuracy-scored grid search.
# The commented-out earlier candidate grids were [0.1, 1, 5, 7, 10] and
# [8, 10, 12, 15, 20]; the grid below is the one in use.
params = dict(C=[13, 14, 15, 16, 17, 18])
grid_cv = GridSearchCV(
    estimator=sv_clf,
    param_grid=params,
    cv=5,
    scoring='accuracy',
).fit(X_train_scaled, y_train)

# Report the best mean cross-validation accuracy and the parameters behind it.
print('최고 평균 정확도: {:.4f}'.format(grid_cv.best_score_))
print('최적 파라미터:', grid_cv.best_params_)

# Keep the best estimator and display its accuracy on the test split.
best_sv = grid_cv.best_estimator_
pred = best_sv.predict(X_test_scaled)
accuracy_score(y_true=y_test, y_pred=pred)

최고 평균 정확도: 0.8309
최적 파라미터: {'C': 17}


0.8071748878923767

In [86]:
# Persist the tuned SVM. This must dump best_sv (not best_lr) so that
# titanic_sv.pkl really contains the support-vector model.
joblib.dump(best_sv, '../static/model/titanic_sv.pkl')

['../static/model/titanic_sv.pkl']

In [87]:
# 3. Random Forest
# Create the forest with default settings; it is tuned by grid search in the next cell.
# NOTE(review): random_state is left at None (see the parameter dump below), so
# repeated runs may not reproduce the same model; consider fixing a seed if
# reproducible results are wanted.
rf_clf = RandomForestClassifier()
# Display the default hyperparameters to see what can be tuned.
rf_clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [88]:
# Tune max_depth and min_samples_split of the random forest with a 5-fold,
# accuracy-scored grid search.
# The commented-out earlier candidates were max_depth [4, 6, 8, 10] and
# min_samples_split [3, 4, 5, 6]; the grid below is the one in use.
params = dict(
    max_depth=[7, 8, 9],
    min_samples_split=[2, 3, 4],
)
grid_cv = GridSearchCV(
    estimator=rf_clf,
    param_grid=params,
    cv=5,
    scoring='accuracy',
).fit(X_train_scaled, y_train)

# Report the best mean cross-validation accuracy and the parameters behind it.
print('최고 평균 정확도: {:.4f}'.format(grid_cv.best_score_))
print('최적 파라미터:', grid_cv.best_params_)

# Keep the best estimator and display its accuracy on the test split.
best_rf = grid_cv.best_estimator_
pred = best_rf.predict(X_test_scaled)
accuracy_score(y_true=y_test, y_pred=pred)

최고 평균 정확도: 0.8309
최적 파라미터: {'max_depth': 8, 'min_samples_split': 2}


0.8295964125560538

In [89]:
# Persist the tuned random-forest model.
joblib.dump(best_rf, '../static/model/titanic_rf.pkl')

['../static/model/titanic_rf.pkl']

In [90]:
# Try it out: pick one row of the test set by position.
index = 100
# Display that row's feature values (every column after the label) as a 1-D array.
df_test.iloc[index, 1:].values

array([ 2. ,  0. , 29. ,  0. ,  0. , 10.5,  5. ,  3. ])

In [91]:
# A single row comes back as a 1-D array, so it must be reshaped to 2-D
# (one sample x n features) before it is passed to the scaler and the models.
test_data = (df_test.iloc[index, 1:].values).reshape(1,-1)
# The first column of the same row is its label.
label = df_test.iloc[index, 0]
# Display the reshaped sample together with its label.
test_data, label

(array([[ 2. ,  0. , 29. ,  0. ,  0. , 10.5,  5. ,  3. ]]), 1)

In [92]:
# Normalisation: reload the scaler from the path it was dumped to earlier in
# this notebook.
# NOTE: joblib.load unpickles the file, so only load artefacts from a trusted source.
new_scaler = joblib.load('../static/model/titanic_scaler.pkl')

In [93]:
# Normalise the single test sample with the reloaded scaler.
test_scaled = new_scaler.transform(test_data)

In [94]:
# Predict the scaled sample with each tuned classifier still held in memory,
# in the order logistic regression, SVM, random forest.
pred_lr, pred_sv, pred_rf = (
    clf.predict(test_scaled) for clf in (best_lr, best_sv, best_rf)
)

In [95]:
# Display the true label next to the three models' predictions for the sample.
label, pred_lr[0], pred_sv[0], pred_rf[0]

(1, 1, 1, 1)

In [96]:
# Feature columns (everything after the label) of the selected test row.
tmp = df_test.iloc[index, 1:]
# Print the type of the row slice; the output shows it is a pandas Series.
print(type(tmp))

<class 'pandas.core.series.Series'>


In [97]:
# Server -> Client: extract a dictionary mapping each feature name to its value
# for the selected row.
dic = tmp.to_dict()
# Display the dictionary; the output shows every value as a float.
dic

{'Pclass': 2.0,
 'Sex': 0.0,
 'Age': 29.0,
 'SibSp': 0.0,
 'Parch': 0.0,
 'Fare': 10.5,
 'Cabin': 5.0,
 'Embarked': 3.0}