In [3]:
import os
import numpy as np

import pandas as pd
pd.set_option('display.max_row', 500)
pd.set_option('display.max_columns', 30)

from pandas import DataFrame

from collections import Counter

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import warnings
warnings.filterwarnings(action='ignore')

from tqdm import tqdm, tqdm_notebook

In [12]:
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression

import lightgbm as lgbm  # lightgbm 부스팅 알고리즘 사용
from lightgbm import LGBMClassifier

from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

In [4]:
train = pd.read_csv("train.csv")
print(train.shape)
train.head()

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
train.loc[train.isnull()['Age'],'Age'] = train['Age'].mean()

train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [8]:
train2 = train.drop(columns=['Cabin']).dropna(axis=0)

train2.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


# 데이터 마이닝 EDA

## 시각화, 시간변수 처리, 타겟변수 변환, 불균형 처리, 특징 선택

In [9]:
train2['Age_band']=0
train2.loc[train2['Age']<=16,'Age_band']=0
train2.loc[(train2['Age']>16)&(train2['Age']<=32),'Age_band']=1
train2.loc[(train2['Age']>32)&(train2['Age']<=48),'Age_band']=2
train2.loc[(train2['Age']>48)&(train2['Age']<=64),'Age_band']=3
train2.loc[train2['Age']>64,'Age_band']=4

In [10]:
train2['Fare_Range']=pd.qcut(train2['Fare'],4) #4개의 범위로 분리
train2.groupby(['Fare_Range'])['Survived'].mean().to_frame().style.background_gradient(cmap='summer_r')

Unnamed: 0_level_0,Survived
Fare_Range,Unnamed: 1_level_1
"(-0.001, 7.896]",0.197309
"(7.896, 14.454]",0.303571
"(14.454, 31.0]",0.454955
"(31.0, 512.329]",0.577273


In [11]:
train2['Fare_cat']=0
train2.loc[train2['Fare']<=7.91,'Fare_cat']=0
train2.loc[(train2['Fare']>7.91)&(train2['Fare']<=14.454),'Fare_cat']=1
train2.loc[(train2['Fare']>14.454)&(train2['Fare']<=31),'Fare_cat']=2
train2.loc[(train2['Fare']>31)&(train2['Fare']<=513),'Fare_cat']=3

In [13]:
train2.drop(['Name','Age','Ticket','Fare','Fare_Range','PassengerId'],axis=1,inplace=True) 

In [14]:
train2

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Embarked,Age_band,Fare_cat
0,0,3,male,1,0,S,1,0
1,1,1,female,1,0,C,2,3
2,1,3,female,0,0,S,1,1
3,1,1,female,1,0,S,2,3
4,0,3,male,0,0,S,2,1
...,...,...,...,...,...,...,...,...
886,0,2,male,0,0,S,1,1
887,1,1,female,0,0,S,1,2
888,0,3,female,1,2,S,1,2
889,1,1,male,0,0,C,1,2


In [63]:
dummy1 = pd.get_dummies(train2['Embarked'])

dummy2 = pd.get_dummies(train2['Sex'])

train3 = pd.concat([train2, dummy1], axis = 1)

train4 = pd.concat([train3, dummy2], axis = 1)

del train4['Sex']

del train4['Embarked']

train4.head()

Unnamed: 0,Survived,Pclass,SibSp,Parch,Age_band,Fare_cat,C,Q,S,female,male
0,0,3,1,0,1,0,0,0,1,0,1
1,1,1,1,0,2,3,1,0,0,1,0
2,1,3,0,0,1,1,0,0,1,1,0
3,1,1,1,0,2,3,0,0,1,1,0
4,0,3,0,0,2,1,0,0,1,0,1


# 분류 분석(분류모델, 앙상블, 개선방안, 평가방법)

In [64]:
y = train4['Survived']

X = train4.copy()
del X['Survived']

In [65]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

In [66]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

In [67]:
model1 = SVC(kernel = 'rbf', C=1, gamma = 0.1) #'linear'
model2 = LogisticRegression()
model3 = KNeighborsClassifier() #deafult 5
model4 = GaussianNB()
model5 = RandomForestClassifier()
model6 = AdaBoostClassifier(n_estimators=100, random_state=0)
model7 = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)

In [68]:
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, accuracy_score

In [69]:
model1.fit(X_train, y_train)

y_pred = model1.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.88      0.86       184
           1       0.78      0.73      0.75       110

    accuracy                           0.82       294
   macro avg       0.81      0.80      0.80       294
weighted avg       0.82      0.82      0.82       294



In [70]:
roc_auc_score(y_test, y_pred)

0.8011363636363636

In [71]:
confusion_matrix( y_test, y_pred )

array([[161,  23],
       [ 30,  80]], dtype=int64)

In [72]:
accuracy_score(y_test, y_pred )

0.8197278911564626

In [89]:
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict

cv = KFold(n_splits=5) # k = 10, split the data into 10 equal parts

pred_percent = []

for t,v in cv.split(train4):
    
    train_X, train_Y = X.iloc[t], y.iloc[t]       # 훈련용
    val_X, val_Y = X.iloc[v], y.iloc[v]       # 검증용 분리.
    
    model1.fit(train_X, train_Y)
    
    y_pred_val = model1.predict(val_X)
    
    pred_percent.append(accuracy_score(val_Y, y_pred_val))
    
    
    
    



In [90]:
pred_percent

[0.8258426966292135,
 0.8202247191011236,
 0.8089887640449438,
 0.7865168539325843,
 0.8418079096045198]

In [40]:
model2.fit(X_train, y_train)

y_pred = model2.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.83      0.85       184
           1       0.73      0.79      0.76       110

    accuracy                           0.81       294
   macro avg       0.80      0.81      0.80       294
weighted avg       0.82      0.81      0.81       294



In [42]:
model3.fit(X_train, y_train)

y_pred = model3.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.76      0.78       184
           1       0.63      0.70      0.66       110

    accuracy                           0.73       294
   macro avg       0.72      0.73      0.72       294
weighted avg       0.74      0.73      0.74       294



In [43]:
model4.fit(X_train, y_train)

y_pred = model4.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.80      0.83       184
           1       0.70      0.76      0.73       110

    accuracy                           0.79       294
   macro avg       0.78      0.78      0.78       294
weighted avg       0.79      0.79      0.79       294



In [44]:
model5.fit(X_train, y_train)

y_pred = model5.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.85      0.84       184
           1       0.74      0.68      0.71       110

    accuracy                           0.79       294
   macro avg       0.78      0.77      0.77       294
weighted avg       0.79      0.79      0.79       294



In [45]:
model6.fit(X_train, y_train)

y_pred = model6.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.82      0.84       184
           1       0.72      0.79      0.75       110

    accuracy                           0.81       294
   macro avg       0.79      0.80      0.80       294
weighted avg       0.81      0.81      0.81       294



In [46]:
model7.fit(X_train, y_train)

y_pred = model7.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.84      0.83       184
           1       0.72      0.69      0.70       110

    accuracy                           0.78       294
   macro avg       0.77      0.76      0.77       294
weighted avg       0.78      0.78      0.78       294

