# 鐵達尼號Kaggle實戰
參考自[XGBoost调参技巧（二）Titanic实战Top9%](https://zhuanlan.zhihu.com/p/28739256)

In [1]:
# -*- coding: utf-8 -*- 
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import svm
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
%matplotlib inline



## 讀取資料

In [2]:
train=pd.read_csv('train.csv',encoding='utf8')
test=pd.read_csv('test.csv',encoding='utf8')
submit = pd.read_csv('gender_submission.csv',encoding='utf8')

## data_cleasing_one_hot
這是使用one-hot encoding的方式來處理類別變數

## data_cleasing
這是單純把類別轉變為1、2、3的處理方式




In [3]:
def data_cleasing_one_hot(titanic):
    """Clean a Titanic passenger frame and append one-hot encoded categoricals.

    Missing ``Age`` and ``Fare`` values are replaced by the column median and
    missing ``Embarked`` values by ``"S"``.  The derived columns ``child``,
    ``sex``, ``familysize``, ``Title1`` and ``Title2`` are written onto the
    frame that was passed in.

    Returns a new frame made of ``titanic`` followed by the dummy columns
    ``pd.get_dummies`` generates for ``Embarked``, ``Sex`` and ``Title2``.
    """
    def is_child(age):
        # Passengers younger than 15 are flagged as children.
        if age < 15:
            return 1
        return 0

    def is_male(gender):
        if gender == "male":
            return 1
        return 0

    # Less common honorifics are folded into Miss / Mrs / Mr; titles that are
    # not listed here are left unchanged.
    title_aliases = {
        'Mlle': 'Miss',
        'Mme': 'Mrs',
        'Ms': 'Miss',
        'Dr': 'Mr',
        'Major': 'Mr',
        'Lady': 'Mrs',
        'the Countess': 'Mrs',
        'Jonkheer': 'Mr',
        'Col': 'Mr',
        'Rev': 'Mr',
        'Capt': 'Mr',
        'Sir': 'Mr',
        'Don': 'Mr',
        'Dona': 'Mrs',
    }

    median_age = titanic["Age"].median()
    titanic["Age"] = titanic["Age"].fillna(median_age)
    titanic["child"] = titanic["Age"].apply(is_child)

    titanic["sex"] = titanic["Sex"].apply(is_male)

    # "S" is the fill value; the original author notes it is the mode.
    titanic["Embarked"] = titanic["Embarked"].fillna("S")

    # Family size counts siblings/spouses, parents/children and the passenger.
    titanic["familysize"] = titanic["SibSp"] + titanic["Parch"] + 1

    # Pull the honorific out of the name: keep what follows the first ", "
    # and then what precedes the first ".".
    after_surname = titanic['Name'].str.split(", ", expand=True)[1]
    titanic['Title1'] = after_surname.str.split(".", expand=True)[0]
    titanic['Title2'] = titanic['Title1'].replace(title_aliases)

    median_fare = titanic["Fare"].median()
    titanic["Fare"] = titanic["Fare"].fillna(median_fare)

    # One-hot encode the categorical columns and append them on the right.
    dummies = pd.get_dummies(titanic[['Embarked', 'Sex', 'Title2']])
    return pd.concat([titanic, dummies], axis=1)


def data_cleasing(titanic):
    """Clean a Titanic passenger frame using integer codes for categoricals.

    The frame is modified in place and the same object is returned.

    Missing ``Age`` and ``Fare`` values are replaced by the column median and
    missing ``Embarked`` values by ``"S"``.  Columns added:

    * ``child``      -- 1 when ``Age`` is below 15, otherwise 0
    * ``sex``        -- 1 for ``"male"``, otherwise 0
    * ``embark``     -- 1 for ``"S"``, 2 for ``"C"``, 3 for anything else
    * ``fimalysize`` -- ``SibSp + Parch + 1``
    * ``name``       -- 2 when the name contains ``"Mrs"``, 1 when it
      contains ``"Mr"``, otherwise 0
    """
    titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())
    # child
    titanic["child"] = titanic["Age"].apply(lambda x: 1 if x < 15 else 0)

    # sex
    titanic["sex"] = titanic["Sex"].apply(lambda x: 1 if x == "male" else 0)

    titanic["Embarked"] = titanic["Embarked"].fillna("S")

    # embark
    def getEmbark(Embarked):
        if Embarked == "S":
            return 1
        elif Embarked == "C":
            return 2
        else:
            return 3
    titanic["embark"] = titanic["Embarked"].apply(getEmbark)

    # familysize
    # NOTE: the column name is misspelled, but it is kept because the
    # module-level feature list selects the column under this spelling.
    titanic["fimalysize"] = titanic["SibSp"] + titanic["Parch"] + 1

    # name
    def getName(name):
        text = str(name)
        # "Mrs" has to be tested before "Mr": every string containing "Mrs"
        # also contains "Mr", so the opposite order can never return 2.
        if "Mrs" in text:
            return 2
        elif "Mr" in text:
            return 1
        else:
            return 0
    titanic["name"] = titanic["Name"].apply(getName)

    titanic["Fare"] = titanic["Fare"].fillna(titanic["Fare"].median())

    return titanic

# Clean the training frame first, then the test frame, with the
# integer-coded variant of the cleaning routine.
train_data, test_data = [data_cleasing(frame) for frame in (train, test)]

In [4]:
# Columns fed to the model.  "fimalysize" is the (misspelled) name under
# which data_cleasing stores the family-size column.
features = [
    "Pclass",
    "sex",
    "child",
    "fimalysize",
    "Fare",
    "embark",
]

## 建立模型，並在cross validation找出好參數

- 樹的棵數（n_estimators）
- 每棵樹最大深度（max_depth）


In [5]:
# Candidate values: 30, 32, ..., 48 trees and tree depths 2 through 6.
param_test = dict(
    n_estimators=range(30, 50, 2),
    max_depth=range(2, 7),
)

# Base estimator; the grid supplies its own max_depth candidates.
clf = XGBClassifier(
    objective='binary:logistic',
    learning_rate=0.1,
    max_depth=2,
    silent=True,
)

# Score every parameter combination by 5-fold cross-validated accuracy.
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_test,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(train_data[features], train_data["Survived"])

# Last expression of the cell: shows the winning parameters and their score.
grid_search.best_params_, grid_search.best_score_

({'max_depth': 6, 'n_estimators': 32}, 0.83164983164983164)

## 將最好的參數帶入

In [6]:
# Fit on the full training set with max_depth=6 and n_estimators=32, the
# values reported by the grid search output above.
model = XGBClassifier(
    objective='binary:logistic',
    learning_rate=0.1,
    n_estimators=32,
    max_depth=6,
    silent=True,
)
model.fit(train_data[features], train_data["Survived"])

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=6, min_child_weight=1, missing=None, n_estimators=32,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

## 預測test data

In [80]:
# Predict on the cleaned test frame, using the same feature columns the
# model was fitted on.
y_pred = model.predict(test_data[features])

In [81]:
# Store the predictions on the test frame as the "Survived" column.
test_data["Survived"] = y_pred

## 做成csv

In [82]:
# Build the submission table.  An explicit copy is taken so that the dtype
# conversion below assigns to an independent frame instead of a slice of
# test_data, which is what raised pandas' SettingWithCopyWarning.
submit = test_data[['PassengerId', 'Survived']].copy()
submit['Survived'] = submit['Survived'].astype(int)

# Write the two-column table without the index column.
submit.to_csv('submit.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
