In [1]:
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.cross_validation import cross_val_score, cross_val_predict, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.grid_search import GridSearchCV
from sklearn.decomposition import PCA
from pprint import pprint
from time import time

# Feature Engineering

In [3]:
def get_title(name):
    # Use a regular expression to search for a title.  Titles always consist of capital and lowercase letters, and end with a period.
    title_search = re.search(' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return None

def get_cabin_deck(cabin):
    if cabin.isnan():
        return None
    deck_search = re.search(' ([A-Z])', cabin)
    if deck_search:
        return deck_search.group(1)
    
def get_cabin_number(cabin):
    number_search = re.search(' ([0-9]+)', cabin)
    if number_search:
        return number_search.group(1)

In [4]:
sex_map = {
        'male': 0,
        'female': 1
    }

embarked_map = {
    'S': 0,
    'C': 1,
    'Q': 2
}

title_map = {
    'Mr': 1,
    'Miss': 2,
    'Mrs': 3,
    'Master': 4,
    'Dr': 5,
    'Rev': 6,
    'Major': 7,
    'Col': 7,
    'Mlle': 8,
    'Mme': 8,
    'Don': 9,
    'Lady': 10,
    'Countess': 10,
    'Jonkheer': 10,
    'Sir': 9,
    'Capt': 7,
    'Ms': 2
}

In [5]:
def process_dataset(df):
    
    df['Age'] = df['Age'].fillna(df['Age'].median())
    df['Embarked'] = df['Embarked'].fillna('S')
    df['Sex'] = df['Sex'].map(sex_map)
    df['Embarked'] = df['Embarked'].map(embarked_map)
    df['FamilySize'] = df['SibSp'] + df['Parch']
    df['NameLength'] = df['Name'].apply(lambda x: len(x))
    df['Title'] = df['Name'].apply(get_title).map(title_map)
    df['Title'] = df['Title'].fillna(0)
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
#     df['Cabin'] = df['Cabin'].fillna('')
#     df['CabinDeck'] = df['Cabin'].apply(get_cabin_deck)
#     df['CabinNumber'] = df['Cabin'].apply(get_cabin_number)
    
    return df

In [6]:
titanic = pd.read_csv('data/train.csv')
titanic = process_dataset(titanic)

titanic.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,NameLength,Title
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,0,1,23,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,1,1,51,3
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,0,0,22,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,0,1,44,3
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,0,0,24,1


In [7]:
titanic_test = pd.read_csv('data/test.csv')
titanic_test = process_dataset(titanic_test)

titanic_test.head(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,NameLength,Title
0,892,3,"Kelly, Mr. James",0,34.5,0,0,330911,7.8292,,2,0,16,1.0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,363272,7.0,,0,1,32,3.0
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,240276,9.6875,,2,0,25,1.0
3,895,3,"Wirz, Mr. Albert",0,27.0,0,0,315154,8.6625,,0,0,16,1.0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,,0,2,44,3.0


In [8]:
titanic.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex              int64
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked         int64
FamilySize       int64
NameLength       int64
Title            int64
dtype: object

# Feature Selection

In [9]:
predictors = ['Sex', 'Title', 'Pclass', 'NameLength', 'Fare', 'Age', 'SibSp', 'Parch', 'Embarked', 'FamilySize']

# OR

# Select all columns which contain ints or floats
predictors = titanic.select_dtypes(include=['int','float']).columns.tolist()

# Remove target column. Obviously it would perfectly correlate and must be excluded.
predictors.remove('Survived')

X = titanic[predictors]
y = titanic['Survived']

X_test = titanic_test[predictors]

print(X.shape)
print(y.shape)
print(X_test.shape)

(891, 11)
(891,)
(418, 11)


In [10]:
def fit_grid_search(grid_search, pipeline, parameters):
    
    print('pipeline:', [name for name, _ in pipeline.steps])
    print('parameters:')
    pprint(parameters)

    t0 = time()
    grid_search.fit(X, y)
    print('done in {:0.3}s'.format(time() - t0))

    print('Best score: {:.3}'.format(grid_search.best_score_))
    print('Best parameters set:')
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print('\t{}: {}'.format(param_name, best_parameters[param_name]))

    return grid_search.best_estimator_

# Random Forest

In [11]:
# Create Pipeline
pipeline = Pipeline([
        ('selectk', SelectKBest()),
        ('rf', RandomForestClassifier())
    ])

parameters = {
    'selectk__k': [ 5, 7, 9, 11],
    'rf__n_estimators': [100, 150],
    'rf__min_samples_split': [2, 4, 6],
    'rf__min_samples_leaf': [2, 4, 6]
    }

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, cv=5)

best_estimator = fit_grid_search(grid_search, pipeline, parameters)

best_estimator = grid_search.best_estimator_
print(best_estimator.score(X, y))

predictions = best_estimator.predict(X_test)
predictions

pipeline: ['selectk', 'rf']
parameters:
{'rf__min_samples_leaf': [2, 4, 6],
 'rf__min_samples_split': [2, 4, 6],
 'rf__n_estimators': [100, 150],
 'selectk__k': [5, 7, 9, 11]}
Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:   33.0s
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:   59.3s finished


done in 59.8s
Best score: 0.834
Best parameters set:
	rf__min_samples_leaf: 2
	rf__min_samples_split: 4
	rf__n_estimators: 100
	selectk__k: 7
0.915824915825


array([0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0,

# Save Submission To CSV

In [None]:
def check_submission_rules(df):
    assert df.shape[0] == 418

check_submission_rules(predictions)

submission = pd.DataFrame({
        "PassengerId": titanic_test["PassengerId"],
        "Survived": predictions
    })

submission.to_csv('submissions/advanced_method.csv', index=False)