# 預測生存機率

# 資料匯入與預處理

In [1]:
import pandas as pd
pd.set_option("display.max_columns",25)

titanic_train_df = pd.read_csv("train.csv")
titanic_test_df = pd.read_csv("test.csv")
titanic_train_df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### One-hot Encoding

In [2]:
# 訓練資料
df_train_sex = pd.get_dummies(titanic_train_df[ 'Sex'])
df_train_sex.sample(5)

Unnamed: 0,female,male
447,0,1
842,1,0
579,0,1
809,1,0
788,0,1


### 訓練資料合併

In [3]:
df_train_ml = pd.merge(titanic_train_df, df_train_sex, left_index=True, right_index=True)
df_train_ml.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,female,male
638,639,0,3,"Panula, Mrs. Juha (Maria Emilia Ojala)",female,41.0,0,5,3101295,39.6875,,S,1,0
815,816,0,1,"Fry, Mr. Richard",male,,0,0,112058,0.0,B102,S,0,1
795,796,0,2,"Otter, Mr. Richard",male,39.0,0,0,28213,13.0,,S,0,1
631,632,0,3,"Lundahl, Mr. Johan Svensson",male,51.0,0,0,347743,7.0542,,S,0,1
548,549,0,3,"Goldsmith, Mr. Frank John",male,33.0,1,1,363291,20.525,,S,0,1


### LabelEncoding

In [4]:
# 訓練資料
#df_train_ml['Embarked_rank'] = df_ml['Embarked'].replace({'C':1,'S':2,'Q':3})
#df_train_ml.sample(5)

In [5]:
# 測試資料
#df_test_ml['Embarked_rank'] = df_ml['Embarked'].replace({'C':1,'S':2,'Q':3})
#df_test_ml.sample(5)

### 拿掉空值

In [6]:
df_train_ml = df_train_ml.dropna().reset_index(drop=True)

In [7]:

from sklearn.model_selection import train_test_split

#X = df_ml[['satisfaction_level','last_evaluation','number_project','average_montly_hours','time_spend_company','Work_accident','promotion_last_5years','IT','RandD','accounting','hr','management','marketing','product_mng','sales','support','technical','salary_rank']]
#y = df_ml['left']
#X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3)


X = df_train_ml[['PassengerId','Pclass','Age','SibSp','female','male']]
y = df_train_ml['Survived']

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3)





from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

#y_test = df_test_ml['Survived']

# 使用單一分類器進行預測

### 決策分類樹

In [8]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

tree = DecisionTreeClassifier(criterion = 'gini', max_depth=5)
tree.fit(X_train_std, y_train)
print(metrics.classification_report(y_test, tree.predict(X_test_std)))

             precision    recall  f1-score   support

          0       0.73      0.69      0.71        16
          1       0.88      0.90      0.89        39

avg / total       0.83      0.84      0.83        55



### KNN

In [9]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train_std, y_train)
print(metrics.classification_report(y_test, knn.predict(X_test_std)))

             precision    recall  f1-score   support

          0       0.62      0.94      0.75        16
          1       0.97      0.77      0.86        39

avg / total       0.87      0.82      0.83        55



### SVC

In [10]:
from sklearn.svm import SVC

svc = SVC(C=1.0, kernel="rbf")
svc.fit(X_train_std, y_train)
print(metrics.classification_report(y_test, svc.predict(X_test_std)))

             precision    recall  f1-score   support

          0       0.67      0.62      0.65        16
          1       0.85      0.87      0.86        39

avg / total       0.80      0.80      0.80        55



# 使用整體學習進行預測

### VotingClassifier

In [11]:
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

clf1 = DecisionTreeClassifier(max_depth=5)
clf2 = KNeighborsClassifier(n_neighbors=2)
clf3 = SVC(kernel='rbf',probability=True)

eclf = VotingClassifier(estimators=[('dt' , clf1) , ('knn' , clf2) , ('svc' , clf3)] , voting = 'soft', weights=[2,1,1])
eclf.fit(X_train_std , y_train)
print(metrics.classification_report(y_test, eclf.predict(X_test_std)))

Y_pred = eclf.predict(X_test_std)


             precision    recall  f1-score   support

          0       0.71      0.62      0.67        16
          1       0.85      0.90      0.88        39

avg / total       0.81      0.82      0.81        55



### Bagging

In [12]:
from sklearn.ensemble import BaggingClassifier

bagc = BaggingClassifier(n_estimators=100)
bagc.fit(X_train_std, y_train)
print(metrics.classification_report(y_test, bagc.predict(X_test_std)))

             precision    recall  f1-score   support

          0       0.67      0.62      0.65        16
          1       0.85      0.87      0.86        39

avg / total       0.80      0.80      0.80        55



### 隨機森林(Random Forest)

In [13]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100, criterion='gini', max_features="auto" )
rfc.fit(X_train, y_train)
print(metrics.classification_report(y_test, rfc.predict(X_test)))

             precision    recall  f1-score   support

          0       0.62      0.62      0.62        16
          1       0.85      0.85      0.85        39

avg / total       0.78      0.78      0.78        55

