# 下載泰坦尼克號上旅客的數據集

In [None]:
import urllib.request
import os

In [None]:
# Remote location of the Titanic passenger workbook and the local cache path.
url="https://biostat.app.vumc.org/wiki/pub/Main/DataSets/titanic3.xls"
filepath = "./data/titanic3.xls"

# Download only when no cached copy exists, so re-running the cell is cheap.
if not os.path.isfile(filepath):
    os.makedirs(os.path.dirname(filepath), exist_ok=True) # create the directory if it doesn't exist
    # Fetch into a temporary sibling file and rename it only after the
    # transfer finished. Otherwise an interrupted download would leave a
    # truncated file at `filepath` that passes the isfile() check forever.
    partial_path = filepath + ".part"
    try:
        result = urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, filepath)
    finally:
        # Only present here if the download or the rename failed.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print("Downloaded:", filepath)

# 使用Pandas dataframe讀取數據並進行處理

In [None]:
import numpy
import pandas as pd

In [None]:
# Read the downloaded workbook into a DataFrame.
all_df = pd.read_excel(filepath)

In [None]:
# Peek at the first two passengers with every original column.
all_df.head(2)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"


In [None]:
# Keep the label (`survived`) and the columns used downstream; this discards
# ticket, cabin, boat, body and home.dest.
cols = [
    'survived', 'name', 'pclass', 'sex', 'age',
    'sibsp', 'parch', 'fare', 'embarked',
]
all_df = all_df.loc[:, cols]

In [None]:
# Confirm the column selection on the first two rows.
all_df.head(2)

Unnamed: 0,survived,name,pclass,sex,age,sibsp,parch,fare,embarked
0,1,"Allen, Miss. Elisabeth Walton",1,female,29.0,0,0,211.3375,S
1,1,"Allison, Master. Hudson Trevor",1,male,0.9167,1,2,151.55,S


In [None]:
# Working frame without the free-text `name` column; `all_df` itself is untouched.
df = all_df.drop(columns=['name'])

In [None]:
# Count missing values per column to see which ones need imputation.
all_df.isna().sum()

survived      0
name          0
pclass        0
sex           0
age         263
sibsp         0
parch         0
fare          1
embarked      2
dtype: int64

In [None]:
# Replace missing ages with the mean of the known ages.
age_mean = df['age'].mean()
df['age'] = df['age'].fillna(value=age_mean)

In [None]:
# Replace missing fares with the mean of the known fares.
fare_mean = df['fare'].mean()
df['fare'] = df['fare'].fillna(value=fare_mean)

In [None]:
# Encode sex numerically: female -> 0, male -> 1.
df['sex'] = (
    df['sex']
    .map({'female': 0, 'male': 1})
    .astype(int)
)

In [None]:
# First two rows after imputation and sex encoding.
df.head(2)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,1,1,0,29.0,0,0,211.3375,S
1,1,1,1,0.9167,1,2,151.55,S


In [None]:
# One-hot encode the port of embarkation into embarked_* indicator columns.
# The dtype is pinned because pandas >= 2.0 emits boolean dummy columns by
# default, and a frame mixing bool with numeric columns turns into an object
# array when converted with `.values` instead of a numeric matrix.
x_OneHot_df = pd.get_dummies(data=df, columns=["embarked"], dtype=int)

In [None]:
# First two rows with the embarked_* indicator columns appended.
x_OneHot_df.head(2)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked_C,embarked_Q,embarked_S
0,1,1,0,29.0,0,0,211.3375,0,0,1
1,1,1,1,0.9167,1,2,151.55,0,0,1


# 轉換為array

In [None]:
# Convert the frame to a plain float matrix. The dtype is requested explicitly
# so the result is float64 even when some columns are boolean (a mixed
# bool/numeric frame would otherwise come back as an object array).
ndarray = x_OneHot_df.to_numpy(dtype=float)

In [None]:
# (rows, columns) of the converted matrix.
ndarray.shape

(1309, 10)

In [None]:
# First two rows of the matrix.
ndarray[:2]

array([[  1.    ,   1.    ,   0.    ,  29.    ,   0.    ,   0.    ,
        211.3375,   0.    ,   0.    ,   1.    ],
       [  1.    ,   1.    ,   1.    ,   0.9167,   1.    ,   2.    ,
        151.55  ,   0.    ,   0.    ,   1.    ]])

In [None]:
# Column 0 holds `survived` (the label); every remaining column is a feature.
Label, Features = ndarray[:, 0], ndarray[:, 1:]

In [None]:
# First two labels.
Label[:2]

array([1., 1.])

In [None]:
# First two feature rows (label column removed).
Features[:2]

array([[  1.    ,   0.    ,  29.    ,   0.    ,   0.    , 211.3375,
          0.    ,   0.    ,   1.    ],
       [  1.    ,   1.    ,   0.9167,   1.    ,   2.    , 151.55  ,
          0.    ,   0.    ,   1.    ]])

In [None]:
# One label per row.
Label.shape

(1309,)

In [None]:
# (rows, feature columns).
Features.shape

(1309, 9)

# 將array進行標準化

In [None]:
from sklearn import preprocessing

In [None]:
# Scaler configured to map each feature column into the range [0, 1].
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))

In [None]:
# Learn each column's min/max from `Features`, then rescale those same rows.
minmax_scale.fit(Features)
scaledFeatures = minmax_scale.transform(Features)

In [None]:
# First two rows after scaling.
scaledFeatures[:2]

array([[0.        , 0.        , 0.36116884, 0.        , 0.        ,
        0.41250333, 0.        , 0.        , 1.        ],
       [0.        , 1.        , 0.00939458, 0.125     , 0.22222222,
        0.2958059 , 0.        , 0.        , 1.        ]])

In [None]:
# First five labels.
Label[:5]

array([1., 1., 0., 0., 0.])

# 將數據分為訓練數據與測試數據

In [None]:
# Fixed seed so the random split, and everything computed from it, is
# reproducible across kernel restarts.
SPLIT_SEED = 42
split_rng = numpy.random.default_rng(SPLIT_SEED)

# Each row independently goes to the training set with probability 0.8, so the
# split is roughly (not exactly) 80/20.
msk = split_rng.random(len(all_df)) < 0.8
train_df = all_df[msk]
test_df = all_df[~msk]

In [None]:
# Report how many rows landed in each split.
print('total:', len(all_df), 'train:', len(train_df), 'test:', len(test_df))

total: 1309 train: 1037 test: 272


In [None]:
def PreprocessData(raw_df, embarked_categories=('C', 'Q', 'S')):
    """Turn a raw passenger frame into scaled features and a label vector.

    Steps, in order:
      1. drop the ``name`` column;
      2. fill missing ``age`` and ``fare`` with that column's mean in ``raw_df``;
      3. encode ``sex`` as 0 (female) / 1 (male);
      4. one-hot encode ``embarked`` into one column per entry of
         ``embarked_categories`` (rows with a missing port get all zeros);
      5. min-max scale every feature column to [0, 1].

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Must contain the columns ``survived``, ``name``, ``sex``, ``age``,
        ``fare`` and ``embarked``; any other columns are passed through as
        features and must be numeric. ``raw_df`` is not modified.
    embarked_categories : sequence of str, optional
        Port codes to one-hot encode, in output column order. Fixing this list
        keeps the feature width identical for every subset, even one where a
        port never occurs.

    Returns
    -------
    (scaledFeatures, Label) : tuple of numpy.ndarray
        ``scaledFeatures`` is a float array with one row per input row and the
        feature columns in frame order followed by the ``embarked_*`` columns;
        ``Label`` is the ``survived`` column as a 1-D float array.

    Raises
    ------
    ValueError
        If ``embarked`` holds a non-missing value not in ``embarked_categories``.

    Notes
    -----
    The imputation means and the scaler are computed from ``raw_df`` on every
    call, so two frames processed by separate calls are scaled independently
    of each other.
    """
    df = raw_df.drop(columns=['name'])

    # Mean-impute the two numeric columns that contain gaps.
    df['age'] = df['age'].fillna(df['age'].mean())
    df['fare'] = df['fare'].fillna(df['fare'].mean())

    df['sex'] = df['sex'].map({'female': 0, 'male': 1}).astype(int)

    # Refuse unknown ports explicitly; converting to a fixed-category
    # Categorical would otherwise silently turn them into missing values.
    unknown_ports = set(df['embarked'].dropna().unique()) - set(embarked_categories)
    if unknown_ports:
        raise ValueError(
            "unexpected embarked value(s): %s; expected one of %s"
            % (sorted(map(str, unknown_ports)), list(embarked_categories))
        )

    # A Categorical with a fixed category list makes get_dummies emit the same
    # columns for every subset. dtype=float avoids the boolean dummies that
    # pandas >= 2.0 produces by default.
    df['embarked'] = pd.Categorical(df['embarked'],
                                    categories=list(embarked_categories))
    x_OneHot_df = pd.get_dummies(data=df, columns=['embarked'], dtype=float)

    # Select the label by name rather than by position so the result does not
    # depend on `survived` being the first column.
    Label = x_OneHot_df['survived'].to_numpy(dtype=float)
    Features = x_OneHot_df.drop(columns=['survived']).to_numpy(dtype=float)

    minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
    scaledFeatures = minmax_scale.fit_transform(Features)

    return scaledFeatures, Label

In [None]:
# Build the model inputs for each split.
# NOTE(review): every PreprocessData call computes its own imputation means and
# fits its own MinMaxScaler on the frame it receives, so the train and test
# features are scaled independently of each other.
train_Features,train_Label=PreprocessData(train_df)
test_Features,test_Label=PreprocessData(test_df)

In [None]:
# First two scaled training rows.
train_Features[:2]

array([[0.        , 0.        , 0.36116884, 0.        , 0.        ,
        0.41250333, 0.        , 0.        , 1.        ],
       [0.        , 1.        , 0.00939458, 0.125     , 0.22222222,
        0.2958059 , 0.        , 0.        , 1.        ]])

In [None]:
# First two training labels.
train_Label[:2]

array([1., 1.])