In [1]:
import urllib.request
import os

In [2]:
# Download the Titanic dataset to local disk (skipped when the file already exists).
url = "http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls"
file_path = "/Users/PChomeIM/pywork/Dataset/titanic3.xls"
if not os.path.isfile(file_path):
    # urlretrieve does not create missing parent directories, so do it first.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    # Fixed: the original passed the undefined name `filepath`, which raised
    # NameError whenever the file was not already cached locally.
    result = urllib.request.urlretrieve(url, file_path)
    print("downloaded:", result)

In [3]:
import numpy
import pandas as pd

In [4]:
# Load the downloaded Excel file into a DataFrame
all_df = pd.read_excel(file_path)

In [5]:
# Preview the first two rows of the raw data
all_df.head(2)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"


In [6]:
# Check the dataset dimensions: 1309 rows, 14 columns
all_df.shape

(1309, 14)

In [7]:
# Keep only the columns considered useful for predicting passenger survival
cols = [
    "survived",
    "name",
    "pclass",
    "sex",
    "age",
    "sibsp",
    "parch",
    "fare",
    "embarked",
]
all_df = all_df.loc[:, cols]

In [8]:
# Preview the first two rows after column selection
all_df.head(2)

Unnamed: 0,survived,name,pclass,sex,age,sibsp,parch,fare,embarked
0,1,"Allen, Miss. Elisabeth Walton",1,female,29.0,0,0,211.3375,S
1,1,"Allison, Master. Hudson Trevor",1,male,0.9167,1,2,151.55,S


In [9]:
# Check the dimensions of the reduced dataset: 1309 rows, 9 columns
all_df.shape

(1309, 9)

In [10]:
# The name column is not needed for training, so build a working frame without it
df = all_df.drop(labels=['name'], axis='columns')
df.head(2)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,1,1,female,29.0,0,0,211.3375,S
1,1,1,male,0.9167,1,2,151.55,S


In [11]:
# Count, per column, how many rows hold a null value
all_df.isnull().sum(axis=0)

survived      0
name          0
pclass        0
sex           0
age         263
sibsp         0
parch         0
fare          1
embarked      2
dtype: int64

In [12]:
# Replace missing ages with the mean of the known ages
age_mean = df['age'].mean(skipna=True)
df['age'] = df['age'].fillna(value=age_mean)

In [13]:
# Replace missing fares with the mean of the known fares
fare_mean = df['fare'].mean(skipna=True)
df['fare'] = df['fare'].fillna(value=fare_mean)

In [14]:
# Encode the sex column numerically: female -> 0, male -> 1
sex_codes = {'female': 0, 'male': 1}
df['sex'] = df['sex'].map(sex_codes).astype(int)

In [15]:
# One-hot encode the embarked column into embarked_C / embarked_Q / embarked_S.
# Rows whose embarked value is missing get 0 in all three indicator columns,
# because get_dummies skips NaN by default.
onehot_df = pd.get_dummies(df, columns=['embarked'])

In [16]:
# Confirm that no column contains null values any more
onehot_df.isnull().sum(axis=0)

survived      0
pclass        0
sex           0
age           0
sibsp         0
parch         0
fare          0
embarked_C    0
embarked_Q    0
embarked_S    0
dtype: int64

In [17]:
# Preview the first two rows of the prepared data
onehot_df.head(2)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked_C,embarked_Q,embarked_S
0,1,1,0,29.0,0,0,211.3375,0,0,1
1,1,1,1,0.9167,1,2,151.55,0,0,1


In [18]:
# Convert the prepared DataFrame into a float NumPy array.
# The explicit cast keeps the array numeric on pandas >= 2.0, where get_dummies
# produces bool indicator columns and .values on the mixed int/float/bool frame
# would otherwise come back as dtype=object. On older pandas the cast is a no-op copy.
ndarray = onehot_df.values.astype(float)

In [19]:
# Check the dimensions of the prepared array
ndarray.shape

(1309, 10)

In [20]:
# Show the first two rows of the array
ndarray[:2]

array([[   1.    ,    1.    ,    0.    ,   29.    ,    0.    ,    0.    ,
         211.3375,    0.    ,    0.    ,    1.    ],
       [   1.    ,    1.    ,    1.    ,    0.9167,    1.    ,    2.    ,
         151.55  ,    0.    ,    0.    ,    1.    ]])

In [21]:
# Split the array by column with NumPy slicing:
# column 0 (survived) is the label, every remaining column is a feature.
label = ndarray[:, 0]
features = ndarray[:, 1:]

In [22]:
# Print the dimensions of features and label, one per line
print(features.shape, label.shape, sep="\n")

(1309, 9)
(1309,)


In [23]:
# Show the first two rows of the feature matrix
features[:2]

array([[   1.    ,    0.    ,   29.    ,    0.    ,    0.    ,  211.3375,
           0.    ,    0.    ,    1.    ],
       [   1.    ,    1.    ,    0.9167,    1.    ,    2.    ,  151.55  ,
           0.    ,    0.    ,    1.    ]])

In [25]:
# 將 feature 的值進行標準化，讓所有值介於 0~1 之間
from sklearn import preprocessing

In [26]:
# Fit a min-max scaler on the features and rescale them into the range [0, 1]
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0,1))
features = minmax_scale.fit_transform(features)

In [27]:
# Show the first two rows after min-max scaling
features[:2]

array([[ 0.        ,  0.        ,  0.36116884,  0.        ,  0.        ,
         0.41250333,  0.        ,  0.        ,  1.        ],
       [ 0.        ,  1.        ,  0.00939458,  0.125     ,  0.22222222,
         0.2958059 ,  0.        ,  0.        ,  1.        ]])