# 下载泰坦尼克号上旅客的数据集

In [1]:
import urllib.request
import os

In [2]:
# Remote spreadsheet and the local path where it is cached.
url="http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls"
filepath="data/titanic3.xls"
# Download only when no local copy exists, so re-running the cell is cheap.
if not os.path.isfile(filepath):
    # urlretrieve opens the target path for writing and does not create
    # missing parent directories, so make sure "data/" exists first.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    result=urllib.request.urlretrieve(url,filepath)
    print('downloaded:',result)

downloaded: ('data/titanic3.xls', <http.client.HTTPMessage object at 0x0000012D1CE78940>)


# 使用Pandas dataframe读取数据并进行处理

In [3]:
import numpy
import pandas as pd

In [4]:
# Load the spreadsheet stored at `filepath` into a DataFrame.
all_df = pd.read_excel(filepath)

In [5]:
all_df[:2]

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"


In [6]:
# Restrict the frame to the label, the passenger name and the columns
# that are used later on, in this order.
cols = [
    'survived',
    'name',
    'pclass',
    'sex',
    'age',
    'sibsp',
    'parch',
    'fare',
    'embarked',
]
all_df = all_df[cols]

In [7]:
all_df[:2]

Unnamed: 0,survived,name,pclass,sex,age,sibsp,parch,fare,embarked
0,1,"Allen, Miss. Elisabeth Walton",1,female,29.0,0,0,211.3375,S
1,1,"Allison, Master. Hudson Trevor",1,male,0.9167,1,2,151.55,S


In [8]:
all_df.isnull().sum()

survived      0
name          0
pclass        0
sex           0
age         263
sibsp         0
parch         0
fare          1
embarked      2
dtype: int64

In [9]:
# Work on a copy without the free-text name column; all_df stays untouched.
df = all_df.drop(labels=['name'], axis=1)

In [10]:
# Replace missing ages with the mean of the observed ages.
age_mean = df['age'].mean()
df['age'] = df['age'].where(df['age'].notnull(), age_mean)

In [11]:
# Replace missing fares with the mean of the observed fares.
fare_mean = df['fare'].mean()
df['fare'] = df['fare'].where(df['fare'].notnull(), fare_mean)

In [12]:
# Encode sex numerically: female -> 0, male -> 1.
sex_codes = {'female':0, 'male': 1}
df['sex'] = df['sex'].map(sex_codes).astype(int)

In [13]:
df[:2]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,1,1,0,29.0,0,0,211.3375,S
1,1,1,1,0.9167,1,2,151.55,S


In [14]:
x_One Hot_df = pd.get_dummies(data=df,columns=["embarked" ])

In [15]:
x_One Hot_df[:2]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked_C,embarked_Q,embarked_S
0,1,1,0,29.0,0,0,211.3375,0,0,1
1,1,1,1,0.9167,1,2,151.55,0,0,1


# 转换为array

In [16]:
ndarray = x_One Hot_df.values

In [17]:
ndarray.shape

(1309, 10)

In [18]:
ndarray[:2]

array([[   1.    ,    1.    ,    0.    ,   29.    ,    0.    ,    0.    ,
         211.3375,    0.    ,    0.    ,    1.    ],
       [   1.    ,    1.    ,    1.    ,    0.9167,    1.    ,    2.    ,
         151.55  ,    0.    ,    0.    ,    1.    ]])

In [19]:
# Column 0 holds the label; every remaining column is a feature.
Label, Features = ndarray[:,0], ndarray[:,1:]

In [20]:
Features.shape

(1309, 9)

In [21]:
Features[:2]

array([[   1.    ,    0.    ,   29.    ,    0.    ,    0.    ,  211.3375,
           0.    ,    0.    ,    1.    ],
       [   1.    ,    1.    ,    0.9167,    1.    ,    2.    ,  151.55  ,
           0.    ,    0.    ,    1.    ]])

In [22]:
Label.shape

(1309,)

In [23]:
Label[:2]

array([ 1.,  1.])

# 将array进行归一化（Min-Max缩放到[0, 1]区间）

In [24]:
from sklearn import preprocessing

In [25]:
# Min-max scaler configured to map each feature column onto the range [0, 1].
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))

In [26]:
# Fit the scaler on the whole feature matrix and rescale it in one step.
scaledFeatures=minmax_scale.fit_transform(Features)

In [27]:
scaledFeatures[:2]

array([[ 0.        ,  0.        ,  0.36116884,  0.        ,  0.        ,
         0.41250333,  0.        ,  0.        ,  1.        ],
       [ 0.        ,  1.        ,  0.00939458,  0.125     ,  0.22222222,
         0.2958059 ,  0.        ,  0.        ,  1.        ]])

In [28]:
Label[:5]

array([ 1.,  1.,  0.,  0.,  0.])

# 将数据分为训练数据与测试数据

In [29]:
# Draw one uniform random number per row: rows whose draw is below 0.8
# (roughly 80%) form the training set, the remaining rows the test set.
# No seed is set in this cell, so the split differs from run to run.
msk = numpy.random.rand(len(all_df)) < 0.8
train_df, test_df = all_df[msk], all_df[~msk]

In [30]:
# Report how many rows ended up in each split.
split_sizes = (
    'total:', len(all_df),
    'train:', len(train_df),
    'test:', len(test_df),
)
print(*split_sizes)

total: 1309 train: 1043 test: 266


In [31]:
def PreprocessData(raw_df):
    """Turn a raw passenger frame into scaled features and a label vector.

    Steps, in order: drop the ``name`` column; fill missing ``age`` and
    ``fare`` values with the respective column mean of ``raw_df``; encode
    ``sex`` as 0 (female) / 1 (male); one-hot encode ``embarked``; take the
    first remaining column as the label and min-max scale all other columns
    to the range [0, 1].

    Args:
        raw_df: DataFrame containing ``name``, ``age``, ``fare``, ``sex`` and
            ``embarked`` columns. After ``name`` is dropped, its first column
            is used as the label. The frame itself is not modified.

    Returns:
        Tuple ``(scaledFeatures, Label)``: the 2-D array of scaled feature
        columns and the 1-D float array of label values.

    Note:
        The imputation means and the scaler are derived from ``raw_df`` on
        every call, so separate calls (for example one per data split) each
        use their own statistics, and the number of ``embarked`` indicator
        columns depends on which values occur in ``raw_df``.
    """
    df = raw_df.drop(['name'], axis=1)

    # Mean imputation for the two numeric columns handled here.
    age_mean = df['age'].mean()
    df['age'] = df['age'].fillna(age_mean)
    fare_mean = df['fare'].mean()
    df['fare'] = df['fare'].fillna(fare_mean)

    df['sex'] = df['sex'].map({'female':0, 'male': 1}).astype(int)

    # One-hot encode the embarkation port into embarked_* indicator columns.
    x_onehot_df = pd.get_dummies(data=df, columns=["embarked"])

    # Cast explicitly so the array is float whatever dtype the indicator
    # columns have (they may be boolean, which would otherwise make .values
    # an object array and leave the label as objects).
    ndarray = x_onehot_df.values.astype(float)
    Features = ndarray[:, 1:]
    Label = ndarray[:, 0]

    minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
    scaledFeatures = minmax_scale.fit_transform(Features)

    return scaledFeatures, Label

In [32]:
# Preprocess each split with its own call; PreprocessData computes the fill
# values and fits the scaler on whatever frame it is given.
train_Features,train_Label=PreprocessData(train_df)
test_Features,test_Label=PreprocessData(test_df)

In [33]:
train_Features[:2]

array([[ 0.        ,  0.        ,  0.36116884,  0.        ,  0.        ,
         0.41250333,  0.        ,  0.        ,  1.        ],
       [ 0.        ,  1.        ,  0.00939458,  0.125     ,  0.22222222,
         0.2958059 ,  0.        ,  0.        ,  1.        ]])

In [34]:
train_Label[:2]

array([ 1.,  1.])