In [23]:
import pandas as pd

In [24]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [25]:
train_data.info()
# 注意在需要的特征中，Age、Embarked 有缺失

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [26]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [27]:
test_data.info()
# 注意在需要的特征中，Age、Fare 有缺失

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [28]:
# 人工选取对预测有效的特征
selected_features = ['Pclass', 'Sex', 'Embarked', 'SibSp', 'Parch', 'Fare']

X_train = train_data[selected_features]
X_test = test_data[selected_features]

In [29]:
y_train = train_data['Survived']

In [30]:
# 使用出现频率最高的值来填充 Embarked 特征（类别型）的缺失值
X_train['Embarked'].fillna('S', inplace=True)
X_test['Embarked'].fillna('S', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [31]:
# 使用平均值来填充 Fare 特征（数值型）的缺失值
X_test['Fare'].fillna(X_test['Fare'].mean(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [32]:
# 查验处理后的训练和测试数据
X_train.info()
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 6 columns):
Pclass      891 non-null int64
Sex         891 non-null object
Embarked    891 non-null object
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
dtypes: float64(1), int64(3), object(2)
memory usage: 41.8+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 6 columns):
Pclass      418 non-null int64
Sex         418 non-null object
Embarked    418 non-null object
SibSp       418 non-null int64
Parch       418 non-null int64
Fare        418 non-null float64
dtypes: float64(1), int64(3), object(2)
memory usage: 19.7+ KB


In [33]:
# 采用 DictVectorizer 对特征抽取和特征向量化
from sklearn.feature_extraction import DictVectorizer
dict_vec = DictVectorizer(sparse=False)    # sparse=False 意思是不用稀疏矩阵表示
X_train = dict_vec.fit_transform(X_train.to_dict(orient='record'))
X_test = dict_vec.fit_transform(X_test.to_dict(orient='record'))
dict_vec.feature_names_

['Embarked=C',
 'Embarked=Q',
 'Embarked=S',
 'Fare',
 'Parch',
 'Pclass',
 'Sex=female',
 'Sex=male',
 'SibSp']

In [34]:
# 输出处理后特征向量的维度
print(len(dict_vec.feature_names_))

9


In [35]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()

from sklearn.cross_validation import cross_val_score
# 5 折交叉验证以进行性能评估，并获得平均分类准确性的得分
cross_val_score(rfc, X_train, y_train, cv=5).mean()

0.7924714526309967

In [36]:
# 预测
rfc.fit(X_train, y_train)
predict_y_test = rfc.predict(X_test)
# 存储预测结果
rfc_submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': predict_y_test
})
rfc_submission.to_csv('rfc_submission2.csv', index=False)