In [1]:
import pandas as pd
import numpy as np
from sklearn import ensemble, model_selection, decomposition, preprocessing, tree
from matplotlib import pyplot as plt

In [2]:
# 进一步探索泰坦尼克号数据
data = pd.read_csv('./data/train.csv')
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [7]:
data.drop(['Cabin'],axis=1, inplace=True)

In [9]:
data.drop(['Name'],axis=1, inplace=True)

In [12]:
data.drop(['Ticket'],axis=1, inplace=True)

In [31]:
# 对性别和Embarked进行独热编码
en = preprocessing.OneHotEncoder()
en_data = en.fit_transform(data['Sex'].values.reshape(-1, 1)).toarray()
en_data

array([[0., 1.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [0., 1.],
       [0., 1.]])

In [33]:
en.categories_

[array(['female', 'male'], dtype=object)]

In [35]:
# 对性别进行独热编码 在拼接到数据上
data = pd.concat([data, pd.DataFrame(en_data, columns=['female', 'male'])], axis=1)

In [36]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,female,male
0,1,0,3,male,22.0,1,0,7.25,S,0.0,1.0
1,2,1,1,female,38.0,1,0,71.2833,C,1.0,0.0
2,3,1,3,female,26.0,0,0,7.925,S,1.0,0.0
3,4,1,1,female,35.0,1,0,53.1,S,1.0,0.0
4,5,0,3,male,35.0,0,0,8.05,S,0.0,1.0


In [38]:
# Embarked 中又两个缺失值值 数量比较少 直接删掉
index = data[data['Embarked'].isnull()].index
data.drop(index=index, inplace=True)

In [44]:
# 对Embarked进行独热编码
en = preprocessing.OneHotEncoder()
en_data = en.fit_transform(data['Embarked'].values.reshape(-1, 1)).toarray()
data = pd.concat([data, pd.DataFrame(en_data, columns=['Embarked_C', 'Embarked_Q', 'Embarked_S'])], axis=1)


In [47]:
# 删除原来的Sex和Embarked列
data.drop(['Embarked', 'Sex'], axis=1, inplace=True)

In [48]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,female,male,Embarked_C,Embarked_Q,Embarked_S
0,1.0,0.0,3.0,22.0,1.0,0.0,7.25,0.0,1.0,0.0,0.0,1.0
1,2.0,1.0,1.0,38.0,1.0,0.0,71.2833,1.0,0.0,1.0,0.0,0.0
2,3.0,1.0,3.0,26.0,0.0,0.0,7.925,1.0,0.0,0.0,0.0,1.0
3,4.0,1.0,1.0,35.0,1.0,0.0,53.1,1.0,0.0,0.0,0.0,1.0
4,5.0,0.0,3.0,35.0,0.0,0.0,8.05,0.0,1.0,0.0,0.0,1.0


In [49]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    889 non-null float64
Survived       889 non-null float64
Pclass         889 non-null float64
Age            712 non-null float64
SibSp          889 non-null float64
Parch          889 non-null float64
Fare           889 non-null float64
female         889 non-null float64
male           889 non-null float64
Embarked_C     889 non-null float64
Embarked_Q     889 non-null float64
Embarked_S     889 non-null float64
dtypes: float64(12)
memory usage: 90.5 KB


In [62]:
# 只有age有缺失项 尝试使用随机森林回归进行填充
x_train = data.loc[data['Age'].notnull().index, data.columns != 'Age']
y_train = data.loc[data['Age'].notnull().index, 'Age']

x_test = data.loc[data['Age'].isnull().index, data.columns != 'Age']
y_test = data.loc[data['Age'].isnull().index, 'Age']


In [60]:
# 接下来开始使用随机森立回归开始插值  以Age为标签值 剩下的其他的作为特征列
clf = ensemble.RandomForestRegressor()


(891,)

In [61]:
data[data['Age'].notnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,female,male,Embarked_C,Embarked_Q,Embarked_S
0,1.0,0.0,3.0,22.0,1.0,0.0,7.2500,0.0,1.0,0.0,0.0,1.0
1,2.0,1.0,1.0,38.0,1.0,0.0,71.2833,1.0,0.0,1.0,0.0,0.0
2,3.0,1.0,3.0,26.0,0.0,0.0,7.9250,1.0,0.0,0.0,0.0,1.0
3,4.0,1.0,1.0,35.0,1.0,0.0,53.1000,1.0,0.0,0.0,0.0,1.0
4,5.0,0.0,3.0,35.0,0.0,0.0,8.0500,0.0,1.0,0.0,0.0,1.0
6,7.0,0.0,1.0,54.0,0.0,0.0,51.8625,0.0,1.0,0.0,0.0,1.0
7,8.0,0.0,3.0,2.0,3.0,1.0,21.0750,0.0,1.0,0.0,0.0,1.0
8,9.0,1.0,3.0,27.0,0.0,2.0,11.1333,1.0,0.0,0.0,0.0,1.0
9,10.0,1.0,2.0,14.0,1.0,0.0,30.0708,1.0,0.0,1.0,0.0,0.0
10,11.0,1.0,3.0,4.0,1.0,1.0,16.7000,1.0,0.0,0.0,0.0,1.0
