## Titanic disaster ML Practice (Kaggle)
by 張景筌

- 練習data preprocessing
- 練習將文字轉為可用信息
- 練習Feature Selection
- 測試Logistic Regression, SVM model 

In [1]:
## Import useful package
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## 資料預處理
先做資料簡單的瀏覽與整理



In [2]:
# read titanic_train data
url = "https://drive.google.com/uc?export=download&id=0BxTpuNriAdKlbnlsSUdpQVJRTk0"
titanic_train = pd.read_csv(url)

# see titanic file
titanic_train.head(10) # find NaN in Cabin column

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


## VARIABLE DESCRIPTIONS:
- survival        :Survival  (0 = No; 1 = Yes)

- pclass          :Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
               
- name            :Name
- sex             :Sex
- age             :Age
- sibsp           :Number of Siblings/Spouses Aboard
- parch           :Number of Parents/Children Aboard
- ticket          :Ticket Number
- fare            :Passenger Fare
- cabin           :Cabin
- embarked        :Port of Embarkation  (C = Cherbourg; Q = Queenstown; S = Southampton)
               

## SPECIAL NOTES:
* Pclass is a proxy for socio-economic status (SES)
 1st ~ Upper; 2nd ~ Middle; 3rd ~ Lower

* Age is in Years; Fractional if Age less than One (1)
 If the Age is Estimated, it is in the form xx.5

   With respect to the family relation variables (i.e. sibsp and parch)
some relations were ignored.  The following are the definitions used
for sibsp and parch.

* Sibling:  Brother, Sister, Stepbrother, or Stepsister of Passenger Aboard Titanic
* Spouse:   Husband or Wife of Passenger Aboard Titanic (Mistresses and Fiances Ignored)
* Parent:   Mother or Father of Passenger Aboard Titanic
* Child:    Son, Daughter, Stepson, or Stepdaughter of Passenger Aboard Titanic

 Other family relatives excluded from this study include cousins,
nephews/nieces, aunts/uncles, and in-laws.  Some children travelled
only with a nanny, therefore parch=0 for them.  As well, some
travelled with very close friends or neighbors in a village, however,
the definitions do not support such relations.

## 資料瀏覽

In [3]:
titanic_train.describe() # find NaN in age column



Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,,0.0,0.0,7.9104
50%,446.0,0.0,3.0,,0.0,0.0,14.4542
75%,668.5,1.0,3.0,,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


## Age處理

In [4]:
# fix NaN in Age column 利用中位數補值

age_median = np.nanmedian(titanic_train['Age'])
new_age = np.where(titanic_train['Age'].isnull(), age_median, titanic_train['Age']) # change null to median
titanic_train['Age'] = new_age
titanic_train.describe() # review

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.361582,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,13.019697,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


## Embarked 處理

In [5]:
titanic_train['Embarked'].isnull().values.sum() #查看embarked是否有null 

2

In [6]:
#　由於個數很少，將不符合之資料丟棄
print (np.where(titanic_train['Embarked'].isnull()) )
titanic_train = titanic_train.drop(titanic_train.index[[61,829]])

(array([ 61, 829], dtype=int64),)


## 轉換文字資料成可處理訊息

In [7]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

titanic_train['Sex'] = le.fit_transform(titanic_train['Sex'])
titanic_train['Embarked'] = le.fit_transform(titanic_train['Embarked'])
titanic_train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,2
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,2
5,6,0,3,"Moran, Mr. James",1,28.0,0,0,330877,8.4583,,1
6,7,0,1,"McCarthy, Mr. Timothy J",1,54.0,0,0,17463,51.8625,E46,2
7,8,0,3,"Palsson, Master. Gosta Leonard",1,2.0,3,1,349909,21.075,,2
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",0,27.0,0,2,347742,11.1333,,2
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",0,14.0,1,0,237736,30.0708,,0


## 將資料正則化與標準化(Normalization/Standarization)

In [8]:
# 將 Pclass Age SibSp Parch Fare Embarked　做normalization
scale1 = preprocessing.MinMaxScaler(feature_range=(-1, 1))

titanic_train['Pclass'] = scale1.fit_transform(titanic_train['Pclass'])
titanic_train['Age'] = scale1.fit_transform(titanic_train['Age'])
titanic_train['SibSp'] = scale1.fit_transform(titanic_train['SibSp'])
titanic_train['Parch'] = scale1.fit_transform(titanic_train['Parch'])
titanic_train['Fare'] = scale1.fit_transform(titanic_train['Fare'])
titanic_train['Embarked'] = scale1.fit_transform(titanic_train['Embarked'])
titanic_train.head(10)



Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,1.0,"Braund, Mr. Owen Harris",1,-0.457653,-0.75,-1.0,A/5 21171,-0.971698,,1.0
1,2,1,-1.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,-0.055542,-0.75,-1.0,PC 17599,-0.721729,C85,-1.0
2,3,1,1.0,"Heikkinen, Miss. Laina",0,-0.357125,-1.0,-1.0,STON/O2. 3101282,-0.969063,,1.0
3,4,1,-1.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,-0.130937,-0.75,-1.0,113803,-0.792711,C123,1.0
4,5,0,1.0,"Allen, Mr. William Henry",1,-0.130937,-1.0,-1.0,373450,-0.968575,,1.0
5,6,0,1.0,"Moran, Mr. James",1,-0.306861,-1.0,-1.0,330877,-0.966981,,0.0
6,7,0,-1.0,"McCarthy, Mr. Timothy J",1,0.346569,-1.0,-1.0,17463,-0.797542,E46,1.0
7,8,0,1.0,"Palsson, Master. Gosta Leonard",1,-0.960292,-0.25,-0.666667,349909,-0.917729,,1.0
8,9,1,1.0,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",0,-0.331993,-1.0,-0.333333,347742,-0.956538,,1.0
9,10,1,0.0,"Nasser, Mrs. Nicholas (Adele Achem)",0,-0.658708,-0.75,-1.0,237736,-0.882611,,-1.0


## 建立 X 與 y

In [9]:
titanic_X = pd.DataFrame([titanic_train['Pclass'],
                         titanic_train['Sex'],
                         titanic_train['Age'],
                         titanic_train['SibSp'],
                         titanic_train['Parch'],
                         titanic_train['Fare'],
                         titanic_train['Embarked']
]).T
titanic_y = titanic_train["Survived"]

## 特徵選擇(Feature Selection)

In [10]:
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier()
model.fit(titanic_X, titanic_y)
# display the relative importance of each attribute
print(model.feature_importances_)

[ 0.11269915  0.28997475  0.24993068  0.04630342  0.04159911  0.22880159
  0.03069131]


- ## 推論 Sex,Age,Fare為較重要的feature

## 重新建立X,Y

In [11]:
titanic_X = pd.DataFrame([titanic_train['Sex'],
                         titanic_train['Age'],
                         titanic_train['Fare']
]).T
titanic_y = titanic_train["Survived"]

## 切分訓練與測試資料
利用 70%/30%　比例切分訓練與測試資料
## 測試SVM模型
利用 sex,age,fare推論 survived

In [30]:
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.cross_validation import train_test_split

def SVM_predict(time):
    train_X, test_X, train_y, test_y = train_test_split(titanic_X, titanic_y, test_size = 0.3)
    # 切分訓練與測試資料
    model = SVC()
    model.fit(train_X, train_y)
    SVM_model = model.fit(train_X, train_y)
    #print(model)
    # make predictions
    expected = test_y
    predicted = model.predict(test_X)
    # summarize the fit of the model
    #print(metrics.classification_report(expected, predicted))
    #print(metrics.confusion_matrix(expected, predicted))

    # 準確率
    accuracy = metrics.accuracy_score(expected, predicted)
    #print('第%d次預測準確率='%(time),accuracy)
    return accuracy,train_X,train_y

max_predict = 0
min_predict = 1
sum_predict = 0
best_SVM_train_X,best_SVM_train_y = None,None
for times in range(500):
    temp_predict, temp_X, temp_y = SVM_predict(times+1)
    sum_predict += temp_predict
    if temp_predict > max_predict:
        max_predict = temp_predict
        best_SVM_train_X,best_SVM_train_y = temp_X, temp_y 
    if min_predict > temp_predict:
        min_predict = temp_predict
print('SVM_model-500次-maximum prediction=',max_predict,'minimum prediction=',min_predict,'average prediction=',sum_predict/500)        

SVM_model-500次-maximum prediction= 0.87265917603 minimum prediction= 0.719101123596 average prediction= 0.786382022472


## 測試Logistic Regression

In [13]:
from sklearn import metrics, linear_model
from sklearn.cross_validation import train_test_split

def Logistic_predict(time):
    train_X, test_X, train_y, test_y = train_test_split(titanic_X, titanic_y, test_size = 0.3)
    # 切分訓練與測試資料
    model = linear_model.LogisticRegression()
    model.fit(train_X, train_y)
    #print(model)
    # make predictions
    expected = test_y
    predicted = model.predict(test_X)
    # summarize the fit of the model
    #print(metrics.classification_report(expected, predicted))
    #print(metrics.confusion_matrix(expected, predicted))

    # 準確率
    accuracy = metrics.accuracy_score(expected, predicted)
    #print('第%d次預測準確率='%(time),accuracy)
    return accuracy

max_predict = 0
min_predict = 1
sum_predict = 0
for times in range(500):
    temp_predict = Logistic_predict(times+1)
    sum_predict += temp_predict
    if temp_predict > max_predict:
        max_predict = temp_predict
    if min_predict > temp_predict:
        min_predict = temp_predict
print ('Logistic_model-500次-maximum prediction=',max_predict,'minimum prediction=',min_predict,'average prediction=',sum_predict/500)

Logistic_model-500次-maximum prediction= 0.87265917603 minimum prediction= 0.7265917603 average prediction= 0.785086142322


根據測試結果，運用SVM 與 Logistic Regression 所得之準確率差異不大
## 使用SVM作為submission

In [31]:
#資料處理
url = "https://drive.google.com/uc?export=download&id=0BxTpuNriAdKlb3dVODhoVkUxaHM"
titanic_submit = pd.read_csv(url)

age_median = np.nanmedian(titanic_submit['Age'])
new_age = np.where(titanic_submit['Age'].isnull(), age_median, titanic_submit['Age']) # change null to median
titanic_submit['Age'] = new_age
Fare_median = np.nanmedian(titanic_submit['Fare'])
new_Fare = np.where(titanic_submit['Fare'].isnull(), age_median, titanic_submit['Fare']) # change null to median
titanic_submit['Fare'] = new_Fare

titanic_submit['Sex'] = le.fit_transform(titanic_submit['Sex'])
titanic_submit['Fare'] = scale1.fit_transform(titanic_submit['Fare'])
titanic_submit['Age'] = scale1.fit_transform(titanic_submit['Age'])



#建立 X
titanic_submit_X = pd.DataFrame([titanic_submit['Sex'],
                         titanic_submit['Age'],
                         titanic_submit['Fare']
]).T

#用 SVM 預測 y
submit_model = SVC()
submit_model.fit(best_SVM_train_X,best_SVM_train_y)
titanic_submit_y = submit_model.predict(titanic_submit_X)
print (titanic_submit_y)
#準備要上傳的檔案
submit_dict = {
   "PassengerId": titanic_submit["PassengerId"],
   "Survived": titanic_submit_y
}
titanic_submit_df = pd.DataFrame(submit_dict)

# 輸出成 csv
titanic_submit_df.to_csv("titanic_submit2.csv", index = False)




[0 1 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 1 1 0 0 1 0 1 0 1 0 0 0 0 0 1 1 0 0 1
 1 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 0 1 1 0 0 1 1 0 1 0
 1 0 0 1 0 1 0 0 0 0 0 0 1 1 1 0 1 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0
 0 0 1 0 0 1 0 0 1 1 0 1 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1
 0 1 0 1 0 0 0 0 0 0 0 0 1 0 1 1 0 0 1 0 0 1 0 1 0 0 0 0 1 1 0 1 0 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0
 1 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 1 0 1 0 0 1 0 1 1 0 1 1 0 1 1 0
 0 1 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 0 1 1 1 1 1 0 1 0 0 0]


