In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Read the training and test splits from the shared input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

# Both frames are kept in one list so the feature-engineering cells below can
# apply the same transformation to each of them in a single loop.
data = [train, test]

# Preview the first rows and the summary statistics of the training split
# (both are displayed because ast_node_interactivity is set to "all").
train.head(n=3)
train.describe()

In [None]:
# FamilySize is the sum of the SibSp and Parch columns, added to both frames.
for frame in data:
    frame['FamilySize'] = frame['SibSp'].add(frame['Parch'])

# Distribution of the new feature in the training split.
train['FamilySize'].plot.hist()

In [None]:
# Mean of 'Survived' by passenger class and sex, computed while 'Sex' still
# holds its original string labels.
train.pivot_table(values='Survived', index='Pclass', columns='Sex')

# Encode 'Sex' as an integer in both frames: male -> 0, female -> 1.
sex_codes = {'male': 0, 'female': 1}
for frame in data:
    encoded = frame['Sex'].map(sex_codes)
    frame['Sex'] = encoded.astype('int')

In [None]:
# Raw fare distribution in the training split, before binning.
train['Fare'].plot(kind='hist', bins=20)

# The fill value comes from the training split only (no statistics leak from
# the test split) and is captured BEFORE the loop, because the loop overwrites
# train['Fare'] with bin codes.
train_fare_median = train['Fare'].median()

for dataset in data:
    # A missing fare would turn into NaN after pd.cut and make astype('int')
    # raise, so any gap is filled first. This is a no-op when nothing is missing.
    fares = dataset['Fare'].fillna(train_fare_median)
    # Bins: (-1, 50] -> 0, (50, 200] -> 1, (200, 10000] -> 2.
    binned = pd.cut(fares, bins=[-1, 50, 200, 10000], labels=[0, 1, 2])
    dataset['Fare'] = binned.astype('int')

# Open a new figure so the binned histogram is not drawn on top of the raw one.
fig = plt.figure()
train['Fare'].hist()

In [None]:
# Mean of 'Survived' per embarkation port, computed on the original labels.
pd.pivot_table(train, values='Survived', index='Embarked')

# Integer code for each port; 'Other' stands in for a missing value.
embarked_codes = {'Other': 0, 'S': 1, 'Q': 2, 'C': 3}
for dataset in data:
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the selected column: the chained in-place form does not update the frame
    # when pandas Copy-on-Write is active.
    filled = dataset['Embarked'].fillna('Other')
    dataset['Embarked'] = filled.map(embarked_codes).astype('int')

# Sanity check: how many passengers fall under each code.
train['Embarked'].value_counts()

In [None]:
# Show the frame once more before the raw columns are removed.
train.head(3)

# SibSp and Parch are already summarised by FamilySize; Ticket and Cabin are
# not used by any later cell in this notebook.
for dataset in data:
    # drop(..., inplace=True) returns None, so its result must not be assigned
    # back to the loop variable. The frames in `data` are modified in place.
    dataset.drop(['SibSp', 'Parch', 'Ticket', 'Cabin'], axis=1, inplace=True)

In [None]:
def extract_title(name):
    """Return the word that immediately precedes the first period in ``name``.

    The title is the substring between the last space before the first period
    and that period. If there is no space before the period, everything up to
    the period is returned.

    Raises:
        ValueError: if ``name`` contains no period.
    """
    period = name.index('.')
    return name[name.rfind(' ', 0, period) + 1:period]


# Integer code for each title that is kept; every other title becomes 'Other'.
title_codes = {'Other': 0, 'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4}
kept_titles = [title for title in title_codes if title != 'Other']

for dataset in data:
    titles = dataset['Name'].apply(extract_title)
    # Collapse every title outside the kept set (and any missing value) into
    # 'Other' in a single step.
    titles = titles.where(titles.isin(kept_titles), 'Other')
    dataset['Title'] = titles.map(title_codes).astype('int')
    # drop(..., inplace=True) returns None, so the result is not assigned.
    dataset.drop('Name', axis=1, inplace=True)

# Frequency of each title code, and the mean of 'Survived' per code.
train['Title'].value_counts()
pd.pivot_table(train, values='Survived', index='Title')

In [None]:
# Count missing values per column in the training split.
train.isnull().sum()

# Missing 'Age' values are deliberately left in place here: an Imputer step
# inside each modelling pipeline fills them during cross validation.
#
# The earlier group-median approach is kept below for reference only. It is
# written as comments rather than a string literal, because with
# ast_node_interactivity = "all" a bare string would be echoed as cell output.
#
# pivot = pd.pivot_table(train, values='Age', index='Pclass', columns=['Sex'], aggfunc='median')
# pivot
#
# def impute_age(x):
#     return pivot.loc[x['Pclass'], x['Sex']]
#
# for dataset in data:
#     dataset['Age'].fillna(dataset[dataset['Age'].isnull()].apply(impute_age, axis=1), inplace=True)
# train.apply(lambda x: x.isnull()).sum()

Try to find good models: 

In [None]:
# SVC, KNeighbors, Random Forest, XGBoost
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import Imputer
from sklearn.model_selection import KFold

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [None]:
# One pipeline per candidate classifier. Each starts with a default Imputer so
# missing values are filled from the data the pipeline is fitted on.
svc_pipeline, knn_pipeline, forest_pipeline, xgb_pipeline = (
    make_pipeline(Imputer(), classifier_cls())
    for classifier_cls in (SVC, KNeighborsClassifier, RandomForestClassifier, XGBClassifier)
)

# The order here defines the "Model #" index printed by the comparison cell.
models = [
    svc_pipeline,
    knn_pipeline,
    forest_pipeline,
    xgb_pipeline,
]

In [None]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Feature matrix = every column except the target.
# NOTE(review): no cell shown here drops PassengerId, so it presumably remains
# in X as a feature. Confirm that this is intended.
X = train.drop('Survived', axis=1)
y = train['Survived']
np_X = np.array(X)
np_y = np.array(y)

kfold = KFold(n_splits=10)
# Take the divisor from the splitter itself so the mean stays correct if
# n_splits is changed above.
n_folds = kfold.get_n_splits()

for i, model in enumerate(models):
    # model.score() returns accuracy for these classifiers, so this accumulates
    # accuracy, not error.
    accuracy_sum = 0
    for train_indexes, test_indexes in kfold.split(X):
        train_X, test_X = np_X[train_indexes], np_X[test_indexes]
        train_y, test_y = np_y[train_indexes], np_y[test_indexes]

        model.fit(train_X, train_y)
        accuracy_sum += model.score(test_X, test_y)
    mean_accuracy = accuracy_sum / n_folds
    print("Score for Model #{}: {}".format(i, mean_accuracy))

XGBoost is the best model. 

Tune parameters (n_estimators, early_stopping_rounds, and learning_rate):

In [None]:
# Grid of values tried for n_estimators, early_stopping_rounds and learning_rate.
estimators = [100, 500, 1000]
stopping_rounds = [1, 5, 10]
rates = [0.1, 0.5, 1]

# Take the divisor from the splitter so it always matches the number of folds.
n_folds = kfold.get_n_splits()

for estimator in estimators:
    for stopping_round in stopping_rounds:
        for rate in rates:
            # The keyword must be n_estimators; `estimators=` is not a tree
            # count parameter of XGBClassifier, so it never set the number of trees.
            xgb_pipeline = Pipeline([('imputer', Imputer()),
                                     ('xgb', XGBClassifier(n_estimators=estimator, learning_rate=rate))])
            # Pipeline.score() returns accuracy, so this accumulates accuracy.
            accuracy_sum = 0
            # Bind the held-out indices under the same name that is used for
            # indexing below. Previously the loop bound `test_index`, while the
            # body read a stale `test_indexes` left over from an earlier cell.
            for train_indexes, test_indexes in kfold.split(X):
                train_X, test_X = np_X[train_indexes], np_X[test_indexes]
                train_y, test_y = np_y[train_indexes], np_y[test_indexes]
                # NOTE(review): the held-out fold is used both for early
                # stopping and for scoring, so the scores are optimistic. The
                # eval_set is also handed straight to the xgb step, so it does
                # not pass through the imputer.
                xgb_pipeline.fit(train_X, train_y, xgb__eval_set=[(test_X, test_y)],
                                 xgb__early_stopping_rounds=stopping_round, xgb__verbose=False)
                accuracy_sum += xgb_pipeline.score(test_X, test_y)
            mean_accuracy = accuracy_sum / n_folds
            print("n_estimators: {} \t early_stopping_rounds: {} \t learning_rate: {} \t Score: {}".format(estimator, stopping_round, rate, mean_accuracy))


Learning rate and early stopping rounds seem to be the two parameters that affect the score, with learning_rate=1 and early_stopping_rounds=10 being the best combination.

In [2]:
from sklearn.model_selection import train_test_split
# Fixed seed so the 70/30 hold-out split, and therefore the reported error, is
# the same on every run.
train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.30, random_state=0)

xgb_pipeline = Pipeline([('imputer', Imputer()), ('xgb', XGBClassifier(learning_rate=0.1))])
# The result is assigned only to keep the fitted pipeline's repr out of the
# cell output. The validation split is used for early stopping; it is passed
# straight to the xgb step, so it does not go through the imputer.
output = xgb_pipeline.fit(train_X, train_y, xgb__eval_set=[(np.array(val_X), np.array(val_y))], xgb__early_stopping_rounds=10, xgb__verbose=False)
predictions = xgb_pipeline.predict(val_X)

from sklearn.metrics import mean_absolute_error
# Arguments are given in the documented (y_true, y_pred) order.
mean_absolute_error(val_y, predictions)

NameError: name 'X' is not defined

learning_rate=1 seems to cause overfitting, so the final model sticks to the default of 0.1 instead.

In [None]:
# Predict on the engineered test frame with the pipeline fitted in the
# previous cell.
predictions = xgb_pipeline.predict(test)

# Two-column frame (PassengerId, Survived), written without the index column.
submission = test[['PassengerId']].assign(Survived=predictions)
submission.to_csv('submission.csv', index=False)