<a href="https://colab.research.google.com/github/qinyunkone/AQIstudy/blob/master/Titanic_Survived_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
import re
import warnings
warnings.filterwarnings("ignore")

In [0]:
# Read the training and test tables from the working directory.
train_data, test_data = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# Keep the test-set passenger ids before the column is dropped further down;
# they are needed to build a submission file.
PassengerId = test_data['PassengerId']

# 特征提取

In [0]:
full_data = [train_data, test_data]

# Dedicated seeded generator so the randomly imputed ages are reproducible
# across "Restart & Run All" without touching NumPy's global random state.
AGE_IMPUTE_SEED = 0
age_rng = np.random.RandomState(AGE_IMPUTE_SEED)

# Missing-value handling (each frame is imputed from its own statistics).
for data in full_data:
    age_missing = data['Age'].isna()
    age_mean = data['Age'].mean()
    age_std = data['Age'].std()

    # Fill missing ages with random integers drawn from [mean - std, mean + std).
    # .loc is used instead of chained indexing (data['Age'][mask] = ...), which
    # triggers SettingWithCopyWarning and is a no-op under pandas copy-on-write.
    data.loc[age_missing, 'Age'] = age_rng.randint(
        int(age_mean - age_std),
        int(age_mean + age_std),
        size=int(age_missing.sum()),
    )

    # Original author's rationale: the passengers with the closest ticket
    # numbers all embarked at 'S'.
    # Assign the result back rather than using inplace=True on a column selection.
    data['Embarked'] = data['Embarked'].fillna('S')

    # Fill missing fares with the median fare. The result must be assigned:
    # fillna returns a new Series and the original call discarded it, so the
    # missing fares were never actually filled.
    data['Fare'] = data['Fare'].fillna(data['Fare'].median())

# Categorical encoding: replace string categories with integer codes.
# Series.map turns any value that is absent from the mapping into NaN.
category_codes = {
    'Sex': {'female': 0, 'male': 1},
    'Embarked': {'C': 0, 'Q': 1, 'S': 2},
}

for data in full_data:
    for column, codes in category_codes.items():
        data[column] = data[column].map(codes)
    
# Other feature engineering
def get_title(name):
    """Return the honorific found in a passenger name.

    The title is the first run of ASCII letters that is immediately followed
    by a period, e.g. ``'Braund, Mr. Owen Harris'`` -> ``'Mr'``.

    Parameters
    ----------
    name : str
        Passenger name to search.

    Returns
    -------
    str
        The matched title without the trailing period, or ``''`` when the
        name contains no such token.
    """
    # Raw string: '\.' inside a normal string literal is an invalid escape
    # sequence and raises a DeprecationWarning/SyntaxWarning on modern Python.
    title_search = re.search(r'([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ''

# Raw title -> normalised title. The listed uncommon titles are pooled into a
# single 'Rare' bucket and alternate spellings are folded into Miss / Mrs.
TITLE_ALIASES = {
    **dict.fromkeys(
        ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev',
         'Sir', 'Jonkheer', 'Dona'],
        'Rare',
    ),
    'Mlle': 'Miss',
    'Ms': 'Miss',
    'Mme': 'Mrs',
}

# Normalised title -> integer code. Titles missing from this table become NaN.
TITLE_CODES = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, "Rare": 5}

for data in full_data:
    # 1 when a cabin value is recorded, 0 when it is missing. Vectorised
    # replacement for the per-row `x == x` NaN test.
    data['Cabin'] = data['Cabin'].notna().astype('int64')

    # Family size: siblings/spouses + parents/children + the passenger.
    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1

    # 1 when the passenger travels alone (family size of exactly one).
    data['IsAlone'] = (data['FamilySize'] == 1).astype('int64')

    # Extract the honorific, normalise it, then encode it as an integer.
    data['Title'] = (
        data['Name']
        .apply(get_title)
        .replace(TITLE_ALIASES)
        .map(TITLE_CODES)
    )

In [0]:
# Columns removed from both frames before modelling.
drop_cols = ['PassengerId', 'Name', 'Ticket', 'SibSp']
train_data = train_data.drop(columns=drop_cols)
test_data  = test_data.drop(columns=drop_cols)

# Select the target by name instead of by position so the split does not
# silently depend on 'Survived' happening to be the first remaining column.
y_train = train_data['Survived']
X_train = train_data.drop(columns='Survived')

# Force the test features into the same column order the model is trained on.
X_test = test_data[X_train.columns]

# NOTE(review): on Kaggle, gender_submission.csv is the sample submission
# (a sex-based guess), not ground-truth labels. If that is the file used
# here, scores against y_test measure agreement with that baseline rather
# than real test accuracy. Confirm. Rows are assumed to be in the same order
# as test.csv.
y_test = pd.read_csv('gender_submission.csv').Survived

# 模型

In [0]:
from xgboost import XGBClassifier

# Baseline model: 100 boosting rounds, everything else left at library defaults.
# fit() returns the fitted estimator, so construction and training are chained.
xgb_clf = XGBClassifier(n_estimators=100).fit(X_train, y_train)

# Accuracy of the baseline's predictions against y_test (shown as cell output).
xgb_clf.score(X_test, y_test)

0.8947368421052632

In [0]:
from sklearn.model_selection import GridSearchCV

# Estimator to tune; the number of boosting rounds stays fixed at 100.
xgb = XGBClassifier(n_estimators=100)

# Exhaustive search space: 3 * 3 * 5 * 5 * 3 * 3 = 2025 parameter combinations,
# each fitted once per cross-validation fold.
params = dict(
    learning_rate=[0.1, 0.3, 0.5],
    gamma=[0, 0.1, 1],
    max_depth=np.arange(1, 6),
    min_child_weight=np.arange(1, 6),
    subsample=[0.5, 0.8, 1],
    colsample_bytree=[0.5, 0.8, 1],
)

# n_jobs=-1 runs the fits in parallel on all available cores; the
# cross-validation strategy is left at the scikit-learn default.
grid = GridSearchCV(estimator=xgb, param_grid=params, n_jobs=-1)
grid.fit(X_train, y_train)

GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0,...
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid='warn', n_jobs=-1,
             param_grid={'colsample_bytree': [0.5, 0.8, 1],
                         'gamma': [0, 0.1, 1], 'learning_rate': [0.1, 0.3, 0.5],
                         'max_dept

In [0]:
# Report the winning parameter combination and its cross-validated score.
print(grid.best_params_, grid.best_score_, sep='\n')

# Keep the best estimator found by the search and score it against y_test
# (the score is shown as the cell output).
model = grid.best_estimator_
model.score(X_test, y_test)

{'colsample_bytree': 0.5, 'gamma': 0.1, 'learning_rate': 0.3, 'max_depth': 5, 'min_child_weight': 4, 'subsample': 0.8}
0.8507295173961841


0.8660287081339713

In [0]:
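# Disabled submission export: uncomment the lines below to predict on X_test
# with the tuned model and write Submission.csv (PassengerId, Survived).
# predictions = model.predict(X_test)
# Submission = pd.DataFrame({ 'PassengerId': PassengerId,
#                             'Survived': predictions })
# Submission.to_csv('Submission.csv', index=False)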