# Titanic - Machine Learning from Disaster

---

## 1. 导入数据集

In [1]:
import pandas as pd
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

## 2. 查看数据详细信息

In [2]:
print(train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None


In [3]:
print(test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB
None


## 3. 选择特征及标签

In [4]:
selected_features=['Pclass', 'Sex', 'Age', 'Embarked', 'SibSp', 'Parch', 'Fare']

X_train = train[selected_features]
X_test = test[selected_features]

y_train = train['Survived']

## 4. 处理丢失数据

In [5]:
print(X_train['Embarked'].value_counts())

S    644
C    168
Q     77
Name: Embarked, dtype: int64


In [6]:
X_train['Embarked'].fillna('S', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [7]:
print(X_test['Embarked'].value_counts())

S    270
C    102
Q     46
Name: Embarked, dtype: int64


In [8]:
X_test['Embarked'].fillna('S', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [9]:
X_train['Age'].fillna(X_train['Age'].mean(), inplace=True)
X_test['Age'].fillna(X_test['Age'].mean(), inplace=True)
X_test['Fare'].fillna(X_test['Fare'].mean(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


## 5. 检查清洗后的数据

In [10]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Pclass      891 non-null int64
Sex         891 non-null object
Age         891 non-null float64
Embarked    891 non-null object
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
dtypes: float64(2), int64(3), object(2)
memory usage: 48.8+ KB


In [11]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
Pclass      418 non-null int64
Sex         418 non-null object
Age         418 non-null float64
Embarked    418 non-null object
SibSp       418 non-null int64
Parch       418 non-null int64
Fare        418 non-null float64
dtypes: float64(2), int64(3), object(2)
memory usage: 22.9+ KB


## 6. 使用向量化增加特征数量

In [12]:
from sklearn.feature_extraction import DictVectorizer

dictVectorizer = DictVectorizer(sparse=False)
dictVectorizer.fit(X_train.to_dict(orient='record'))

X_train = dictVectorizer.transform(X_train.to_dict(orient='record'))
X_test = dictVectorizer.transform(X_test.to_dict(orient='record'))

print(dictVectorizer.feature_names_)

['Age', 'Embarked=C', 'Embarked=Q', 'Embarked=S', 'Fare', 'Parch', 'Pclass', 'Sex=female', 'Sex=male', 'SibSp']


In [13]:
print(X_train[:10,:])

[[22.          0.          0.          1.          7.25        0.
   3.          0.          1.          1.        ]
 [38.          1.          0.          0.         71.2833      0.
   1.          1.          0.          1.        ]
 [26.          0.          0.          1.          7.925       0.
   3.          1.          0.          0.        ]
 [35.          0.          0.          1.         53.1         0.
   1.          1.          0.          1.        ]
 [35.          0.          0.          1.          8.05        0.
   3.          0.          1.          0.        ]
 [29.69911765  0.          1.          0.          8.4583      0.
   3.          0.          1.          0.        ]
 [54.          0.          0.          1.         51.8625      0.
   1.          0.          1.          0.        ]
 [ 2.          0.          0.          1.         21.075       1.
   3.          0.          1.          3.        ]
 [27.          0.          0.          1.         11.1333      2

## 7. 使用RandomForestClassifier和XGBClassifier进行预测

In [14]:
from sklearn.ensemble import RandomForestClassifier

randomForestClassifier = RandomForestClassifier()

In [15]:
from xgboost import XGBClassifier

xgbClassifier = XGBClassifier()

In [16]:
from sklearn.model_selection import cross_val_score

In [17]:
cross_val_score(randomForestClassifier, X_train, y_train, cv=5)

array([0.76536313, 0.77653631, 0.86516854, 0.7752809 , 0.83050847])

In [18]:
from sklearn import warnings

warnings.filterwarnings(module='sklearn*', action='ignore', category=DeprecationWarning)

In [19]:
cross_val_score(xgbClassifier, X_train, y_train, cv=5)

array([0.80446927, 0.81005587, 0.8258427 , 0.79775281, 0.85310734])

In [20]:
randomForestClassifier.fit(X_train, y_train)
y_predict_randomForestClassifier = randomForestClassifier.predict(X_test)

submission_randomForestClassifier = pd.DataFrame({'PassengerId':test['PassengerId'], 'Survived':y_predict_randomForestClassifier})
submission_randomForestClassifier.to_csv('./submission_randomForestClassifier.csv', index=False)

In [21]:
xgbClassifier.fit(X_train, y_train)
y_predict_xgbClassifier = xgbClassifier.predict(X_test)

submission_xgbClassifier = pd.DataFrame({'PassengerId':test['PassengerId'], 'Survived':y_predict_xgbClassifier})
submission_xgbClassifier.to_csv('./submission_xgbClassifier.csv', index=False)

## 8. 使用网格搜索及xgbClassifier进行预测

In [22]:
from sklearn.grid_search import GridSearchCV
import numpy as np

params = {'max_depth':np.array(range(2, 7)), 
          'n_estimators':np.array(range(100, 1100, 200)), 
          'learning_rate':[0.05, 0.1, 0.25, 0.5, 1.0]}

xgbClassifier_best = XGBClassifier()
gridSearchCV = GridSearchCV(xgbClassifier_best, params, n_jobs=-1, cv=5, verbose=1)
gridSearchCV.fit(X_train, y_train)

Fitting 5 folds for each of 125 candidates, totalling 625 fits


[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done 328 tasks      | elapsed:   16.7s
[Parallel(n_jobs=-1)]: Done 578 tasks      | elapsed:   30.0s
[Parallel(n_jobs=-1)]: Done 625 out of 625 | elapsed:   33.3s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'max_depth': array([2, 3, 4, 5, 6]), 'n_estimators': array([100, 300, 500, 700, 900]), 'learning_rate': [0.05, 0.1, 0.25, 0.5, 1.0]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=1)

In [23]:
print(gridSearchCV.best_score_)
print(gridSearchCV.best_params_)

0.835016835016835
{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}


In [24]:
y_predict_xgbClassifier_best = gridSearchCV.predict(X_test)
submission_xgbClassifier_best = pd.DataFrame({'PassengerId':test['PassengerId'], 'Survived':y_predict_xgbClassifier_best})
submission_xgbClassifier.to_csv('./submission_xgbClassifier_best.csv', index=False)

---

# -END-