In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.svm import SVC

In [37]:
# Input CSV locations: the training set and the verification ("question") set.
train_path, verify_path = 'titanicTrain.csv', 'titanicQuestion.csv'

In [38]:
# Read both CSV files into DataFrames (training first, then verification).
train_df, verify_df = map(pd.read_csv, (train_path, verify_path))

Data cleaning: remove empty rows and unused columns, then derive the model features.

In [39]:
# Strip every row whose columns are all NaN, from both frames, in place.
for frame in (train_df, verify_df):
    frame.dropna(axis=0, how='all', inplace=True)

# A cell renders only its final expression, so preview the verification frame.
verify_df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,3,,"McCormack, Mr. Thomas Joseph",male,,0,0,367228,7.75,,Q,,,
1,3,,"McCoy, Miss. Agnes",female,,2,0,367226,23.25,,Q,16.0,,
2,3,,"McCoy, Miss. Alicia",female,,2,0,367226,23.25,,Q,16.0,,
3,3,,"McCoy, Mr. Bernard",male,,2,0,367226,23.25,,Q,16.0,,
4,3,,"McDermott, Miss. Brigdet Delia",female,,0,0,330932,7.7875,,Q,13.0,,


In [40]:
# Columns judged unhelpful for the model; removed from both frames in place.
unhelpful_columns = ['home.dest', 'body', 'embarked', 'cabin', 'name', 'ticket']
for frame in (train_df, verify_df):
    frame.drop(columns=unhelpful_columns, inplace=True)

# A cell renders only its final expression, so preview the verification frame.
verify_df.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,boat
0,3,,male,,0,0,7.75,
1,3,,female,,2,0,23.25,16.0
2,3,,female,,2,0,23.25,16.0
3,3,,male,,2,0,23.25,16.0
4,3,,female,,0,0,7.7875,13.0


In [41]:
# Merge the number of direct relatives (parch) with the number of
# spouses + siblings (sibsp) into a single family-size feature.
for frame in (train_df, verify_df):
    frame['family_size'] = frame['sibsp'] + frame['parch']

In [42]:
# Encode the two categorical inputs as 0/1 integers, then drop the raw columns.
#   sex  -> sex_number: 'male' = 0, anything else (female or missing) = 1
#   boat -> on_boat:    0 when `boat` is missing (NaN), otherwise 1
#
# The encodings are computed column-wise instead of with a row-wise apply().
# Series.notna() detects missing values regardless of whether the scalar is a
# Python float or a numpy float, which an exact `type(x) == float` test does not.
for frame in (train_df, verify_df):
    frame['sex_number'] = (frame['sex'] != 'male').astype(int)
    frame['on_boat'] = frame['boat'].notna().astype(int)
    frame.drop(columns=['sex', 'boat'], inplace=True)

# A cell renders only its final expression, so preview the verification frame.
verify_df.head()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,family_size,sex_number,on_boat
0,3,,,0,0,7.75,0,0,0
1,3,,,2,0,23.25,2,1,1
2,3,,,2,0,23.25,2,1,1
3,3,,,2,0,23.25,2,0,1
4,3,,,0,0,7.7875,0,1,1


In [43]:
# Columns fed to the imputer and the classifier, in this order.
features = [
    'sex_number',
    'age',
    'pclass',
    'fare',
    'sibsp',
    'parch',
    'on_boat',
    'family_size',
]

In [44]:
# Missing-value preprocessing: replace NaN in each feature column with that
# column's mean (in the previewed rows, `age` is the column with gaps).
#
# `preprocessing.Imputer` was deprecated in scikit-learn 0.20 and removed in
# 0.22; `SimpleImputer` is its replacement. Fall back to the legacy class only
# when running on an older installation.
try:
    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
except ImportError:
    imputer = preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0)

# Learn the column means from the training data only (a single fit) ...
train_arr_imputed = imputer.fit_transform(train_df.loc[:, features])
# ... and reuse those same means for the verification data. Re-fitting here
# would fill the two sets with different values and leak verification-set
# statistics into preprocessing.
verify_arr_imputed = imputer.transform(verify_df.loc[:, features])


In [45]:
# Train the model: a support-vector classifier with default settings, fitted on
# the imputed training features against the `survived` labels.
svc = SVC()
svc_trained = svc.fit(X=train_arr_imputed, y=train_df['survived'])

In [48]:
# Run the trained classifier on the verification data and store the
# predictions in the `survived` column.
predicted_survival = svc_trained.predict(verify_arr_imputed)
verify_df['survived'] = predicted_survival
verify_df

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,family_size,sex_number,on_boat
0,3,0.0,,0,0,7.7500,0,0,0
1,3,0.0,,2,0,23.2500,2,1,1
2,3,0.0,,2,0,23.2500,2,1,1
3,3,0.0,,2,0,23.2500,2,0,1
4,3,1.0,,0,0,7.7875,0,1,1
5,3,0.0,,0,0,15.5000,0,0,0
6,3,1.0,,0,0,7.8792,0,1,1
7,3,1.0,15.0,0,0,8.0292,0,1,0
8,3,0.0,35.0,0,0,7.7500,0,1,0
9,3,0.0,,0,0,7.7500,0,0,0


In [49]:
# Write the verification frame to disk.
# NOTE(review): to_csv also writes the row index by default, in addition to the
# frame's existing 'Unnamed: 0' column; pass index=False if that extra leading
# column is unwanted -- confirm with whoever consumes this file.
solved_path = 'titanicQuestion.solved.csv'
verify_df.to_csv(solved_path)