# Titanic 生存率預測(分類預測)

In [1]:
import pandas as pd 

In [2]:
# Load the Titanic training and test sets from their CSV files.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("./dataset/titanic/train.csv", "./dataset/titanic/test.csv")
)

In [3]:
# Count the missing values in each column of the training set.
train_df.isna().sum(axis=0)

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
# Count the missing values in each column of the test set.
test_df.isna().sum(axis=0)

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [5]:
# Fill missing numeric values with the per-column median of the training set.
# numeric_only=True restricts the median to numeric columns: on pandas >= 2.0
# a bare median() raises TypeError because the frame also holds string columns
# (Name, Sex, Ticket, ...). Older pandas silently skipped those columns, so the
# resulting medians are the same there.
# Non-numeric gaps (Cabin, Embarked) are not touched by this fill.
med = train_df.median(numeric_only=True)
train_df = train_df.fillna(med)
train_df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
# Fill gaps in the test set with the medians computed on the training set
# (med), then re-count what is still missing.
test_df = test_df.fillna(value=med)
test_df.isna().sum(axis=0)

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
dtype: int64

## One-Hot-Encoding

In [7]:
# One-hot encode 'Embarked' for the training set and append the indicator
# columns to the right of the existing ones.
dummy = pd.get_dummies(train_df["Embarked"])
train_df = pd.concat((train_df, dummy), axis="columns")

In [8]:
# One-hot encode 'Embarked' for the test set and append the indicator
# columns to the right of the existing ones.
dummy = pd.get_dummies(test_df["Embarked"])
test_df = pd.concat((test_df, dummy), axis="columns")

In [9]:
# One-hot encode 'Sex' for the training set and append the indicator columns.
dummy = pd.get_dummies(train_df["Sex"])
train_df = pd.concat((train_df, dummy), axis="columns")

In [10]:
# One-hot encode 'Sex' for the test set and append the indicator columns.
dummy = pd.get_dummies(test_df["Sex"])
test_df = pd.concat((test_df, dummy), axis="columns")

### 取出稱謂

In [11]:
# Extract the title (honorific) from a passenger name, as in the example below.
# The example is kept in a string literal, so this cell simply echoes it.
'''
s = "Braund, Mr. Owen Harris"
s.split(",")[-1].split(".")[0].replace(" ", "")
'''

'\ns = "Braund, Mr. Owen Harris"\ns.split(",")[-1].split(".")[0].replace(" ", "")\n'

In [12]:
def midget(s):
    """Return the title found in a name such as "Braund, Mr. Owen Harris".

    The title is the text between the last comma and the first period that
    follows it, with all space characters removed (e.g. "Mr").
    """
    after_last_comma = s.rpartition(",")[2]
    before_first_dot = after_last_comma.partition(".")[0]
    return before_first_dot.replace(" ", "")
# Tally how often each extracted title occurs in the training set.
train_df["Name"].map(midget).value_counts()

Mr             517
Miss           182
Mrs            125
Master          40
Dr               7
Rev              6
Mlle             2
Major            2
Col              2
Sir              1
Capt             1
Jonkheer         1
Don              1
theCountess      1
Mme              1
Lady             1
Ms               1
Name: Name, dtype: int64

In [13]:
def midget2(s):
    """Return the title in a name, collapsing uncommon titles into "X".

    The title is the text between the last comma and the first period after
    it, with spaces removed. It is returned unchanged when it is one of
    Mr, Miss, Mrs or Master; any other value yields "X".
    """
    kept_titles = ('Mr', 'Miss', 'Mrs', 'Master')
    title = s.rpartition(",")[2].partition(".")[0].replace(" ", "")
    return title if title in kept_titles else "X"

# One-hot encode the simplified title of every training passenger and append
# the indicator columns to the training frame.
n = train_df["Name"].map(midget2)
dummy = pd.get_dummies(n)
train_df = pd.concat((train_df, dummy), axis="columns")

In [14]:
# One-hot encode the simplified title of every test passenger and append the
# indicator columns to the test frame.
n = test_df["Name"].map(midget2)
dummy = pd.get_dummies(n)
test_df = pd.concat((test_df, dummy), axis="columns")

In [15]:
# Show every column of the training frame after the encoding steps above.
train_df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'C', 'Q', 'S', 'female',
       'male', 'Master', 'Miss', 'Mr', 'Mrs', 'X'],
      dtype='object')

In [16]:
# Build the modelling frame: remove the identifier, the raw text columns that
# were one-hot encoded or are unused, and the catch-all title indicator "X".
train = train_df.drop(
    columns=["PassengerId", "Sex", "Name", "Ticket", "Cabin", "Embarked", "X"]
)
train

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,C,Q,S,female,male,Master,Miss,Mr,Mrs
0,0,3,22.0,1,0,7.2500,0,0,1,0,1,0,0,1,0
1,1,1,38.0,1,0,71.2833,1,0,0,1,0,0,0,0,1
2,1,3,26.0,0,0,7.9250,0,0,1,1,0,0,1,0,0
3,1,1,35.0,1,0,53.1000,0,0,1,1,0,0,0,0,1
4,0,3,35.0,0,0,8.0500,0,0,1,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,27.0,0,0,13.0000,0,0,1,0,1,0,0,0,0
887,1,1,19.0,0,0,30.0000,0,0,1,1,0,0,1,0,0
888,0,3,28.0,1,2,23.4500,0,0,1,1,0,0,1,0,0
889,1,1,26.0,0,0,30.0000,1,0,0,0,1,0,0,1,0


In [17]:
# Drop the same columns from the test frame as were dropped from the
# training frame.
test = test_df.drop(
    columns=["PassengerId", "Sex", "Name", "Ticket", "Cabin", "Embarked", "X"]
)
test

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,C,Q,S,female,male,Master,Miss,Mr,Mrs
0,3,34.5,0,0,7.8292,0,1,0,0,1,0,0,1,0
1,3,47.0,1,0,7.0000,0,0,1,1,0,0,0,0,1
2,2,62.0,0,0,9.6875,0,1,0,0,1,0,0,1,0
3,3,27.0,0,0,8.6625,0,0,1,0,1,0,0,1,0
4,3,22.0,1,1,12.2875,0,0,1,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,3,28.0,0,0,8.0500,0,0,1,0,1,0,0,1,0
414,1,39.0,0,0,108.9000,1,0,0,1,0,0,0,0,0
415,3,38.5,0,0,7.2500,0,0,1,0,1,0,0,1,0
416,3,28.0,0,0,8.0500,0,0,1,0,1,0,0,1,0


# 建立模型 隨機森林

In [29]:
# Separate the target column ("Survived") from the feature columns.
y_train = train["Survived"]
x_train = train.drop(columns=["Survived"])

In [30]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
# Search n_estimators in [25, 35) and max_depth in [6, 10) for a random forest
# using 10-fold cross-validation on the training data.
# random_state is fixed because a random forest is stochastic: without a seed
# the selected best_params_ can change from one run to the next.
clf = RandomForestClassifier(random_state=42)
grid = {
    "n_estimators": range(25, 35),
    "max_depth": range(6, 10),
}
gridsearch = GridSearchCV(clf, param_grid=grid, cv=10, n_jobs=8)
gridsearch.fit(x_train, y_train)
gridsearch.best_params_

{'max_depth': 7, 'n_estimators': 28}

In [48]:
from sklearn.model_selection import cross_val_score
import numpy as np
# Score a random forest with fixed hyperparameters using 10-fold
# cross-validation, then print the per-fold scores and their mean.
# NOTE(review): n_estimators=27 differs from the 28 shown in the saved
# grid-search output - confirm which value is intended.
clf = RandomForestClassifier(max_depth=7, n_estimators=27)
scores = cross_val_score(clf, x_train, y_train, cv=10)
print("十次:", scores)
print("平均:", np.mean(scores))

十次: [0.81111111 0.83333333 0.78651685 0.88764045 0.86516854 0.82022472
 0.79775281 0.7752809  0.87640449 0.85227273]
平均: 0.8305705935762114


In [49]:
# Fit the random forest on the full training set; fit() returns the estimator
# itself, so clf is the fitted model and is echoed as the cell output.
clf = RandomForestClassifier(max_depth=7, n_estimators=27).fit(x_train, y_train)
clf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=7, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=27,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [50]:
# Predict survival for the test passengers with the fitted clf and write the
# PassengerId/Survived pairs to titanic.csv (without the index column).
pid = test_df["PassengerId"]
pre = clf.predict(test)
result = pd.DataFrame({"PassengerId": pid, "Survived": pre})
result.to_csv("titanic.csv", index=False)
result

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


### 檢視重要變數

In [23]:
# Pair each training feature name with the importance reported by the
# currently fitted clf.
im = pd.DataFrame(
    {
        "name": x_train.columns,
        "importance": clf.feature_importances_,
    }
)
im

Unnamed: 0,name,importance
0,Pclass,0.102354
1,Age,0.110597
2,SibSp,0.05624
3,Parch,0.039795
4,Fare,0.159855
5,C,0.012807
6,Q,0.009252
7,S,0.017424
8,female,0.138758
9,male,0.133088


# KNN

In [24]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
# Learn the min-max scaling from the training features only, then apply that
# same scaling to both the training and the test features.
scaler = MinMaxScaler().fit(x_train)
x_train_scale = scaler.transform(x_train)
x_test_scale = scaler.transform(test)

In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
# Search n_neighbors in [5, 100) for a KNN classifier using 10-fold
# cross-validation on the scaled training features.
grid = {"n_neighbors": range(5, 100)}
clf = KNeighborsClassifier()
gridsearch = GridSearchCV(estimator=clf, param_grid=grid, cv=10, n_jobs=8)
gridsearch.fit(x_train_scale, y_train)
gridsearch.best_params_

{'n_neighbors': 17}

In [26]:
# Fit KNN with 17 neighbours on the scaled training features, predict the
# scaled test features and write the PassengerId/Survived pairs to
# titanic_knn.csv (without the index column).
clf = KNeighborsClassifier(n_neighbors=17).fit(x_train_scale, y_train)
pid = test_df["PassengerId"]
pre = clf.predict(x_test_scale)
result = pd.DataFrame({"PassengerId": pid, "Survived": pre})
result.to_csv("titanic_knn.csv", index=False)
result

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
