In [1]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split

In [2]:
data = pd.read_csv('/Users/balaji/moonshot/workspace/pycharm/titanic/train.csv')

Feature Engineering

In [3]:
import re

# A function to get the title from a name.
def get_title(name):
    # Use a regular expression to search for a title.  Titles always consist of capital and lowercase letters, and end with a period.
    title_search = re.search(' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

# Get all the titles and print how often each one occurs.
titles = data["Name"].apply(get_title)
#print(pd.value_counts(titles))

# Map each title to an integer.  Some titles are very rare, and are compressed into the same codes as other titles.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, "Major": 7, "Col": 7, "Mlle": 8, "Mme": 8, "Don": 9, "Lady": 10, "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2}
for k,v in title_mapping.items():
    titles[titles == k] = v

# Verify that we converted everything.
#print(pd.value_counts(titles))

# Add in the title column.
data["Title"] = titles

In [4]:
print data.head()

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex  Age  SibSp  \
0                            Braund, Mr. Owen Harris    male   22      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female   38      1   
2                             Heikkinen, Miss. Laina  female   26      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female   35      1   
4                           Allen, Mr. William Henry    male   35      0   

   Parch            Ticket     Fare Cabin Embarked Title  
0      0         A/5 21171   7.2500   NaN        S     1  
1      0          PC 17599  71.2833   C85        C     3  
2      0  STON/O2. 3101282   7.9250   NaN        S     2  
3      0            113803  53.1000  C123        S     3  
4      0            373450   8.

In [5]:
# Generating a familysize column
data["FamilySize"] = data["SibSp"] + data["Parch"]

# The .apply method generates a new series
data["NameLength"] = data["Name"].apply(lambda x: len(x))

In [6]:
import operator

# A dictionary mapping family name to id
family_id_mapping = {}

# A function to get the id given a row
def get_family_id(row):
    # Find the last name by splitting on a comma
    last_name = row['Name'].split(',')[0]
    # Create the family id
    family_id = '{0}{1}'.format(last_name, row['FamilySize'])
    # Look up the id in the mapping
    if family_id not in family_id_mapping:
        if len(family_id_mapping) == 0:
            current_id = 1
        else:
            # Get the maximum id from the mapping and add one to it if we don't have an id
            current_id = (max(family_id_mapping.items(), key=operator.itemgetter(1))[1] + 1)
        family_id_mapping[family_id] = current_id
    return family_id_mapping[family_id]

# Get the family ids with the apply method
family_ids = data.apply(get_family_id, axis=1)

# There are a lot of family ids, so we'll compress all of the families under 3 members into one code.
family_ids[data['FamilySize'] < 3] = -1

# Print the count of each unique id.
#print(pd.value_counts(family_ids))

data['FamilyId'] = family_ids

In [7]:
print data.head()

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex  Age  SibSp  \
0                            Braund, Mr. Owen Harris    male   22      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female   38      1   
2                             Heikkinen, Miss. Laina  female   26      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female   35      1   
4                           Allen, Mr. William Henry    male   35      0   

   Parch            Ticket     Fare Cabin Embarked Title  FamilySize  \
0      0         A/5 21171   7.2500   NaN        S     1           1   
1      0          PC 17599  71.2833   C85        C     3           1   
2      0  STON/O2. 3101282   7.9250   NaN        S     2           0   
3      0            113803  53.1000  C

In [8]:
feature_names = ['Age', 'Pclass', 'Fare', 'Title', 'FamilySize', 'FamilyId']

In [9]:
features = pd.concat([data[feature_names], pd.get_dummies(data['Sex'], prefix='Sex')], axis=1)

In [10]:
features = features.fillna(features.dropna().median())
features = features.drop('Sex_male', 1)

In [11]:
target = data['Survived']

Logistic Regression

In [12]:
log_reg = LogisticRegression(C=1)
log_reg_scores = cross_val_score(log_reg, features, target, cv=3)

In [13]:
print log_reg_scores.min()

0.79797979798


Random Forest

In [14]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, max_depth=5, bootstrap=True, criterion='entropy', max_features=3, min_samples_leaf=1,min_samples_split=9)
rf_scores = cross_val_score(rf, features, target, cv=5, n_jobs=4, scoring='accuracy')

In [15]:
print rf_scores

[ 0.87150838  0.82681564  0.8258427   0.80337079  0.83615819]


Gradient Boosting

In [22]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, subsample=.8, max_features=0.5, max_depth=4)
gb_scores = cross_val_score(gb, features, target, cv=5, n_jobs=4, scoring='accuracy')

In [23]:
print gb_scores

[ 0.84916201  0.82122905  0.87640449  0.82022472  0.83615819]


Grid search CV for gradient boosting

In [18]:
from sklearn.grid_search import GridSearchCV

gb = GradientBoostingClassifier(n_estimators=100, subsample=.8)
params = {
    "learning_rate" : [0.05, 0.1, 0.5],
    "max_features" : [0.5, 1],
    "max_depth" : [3, 4, 5],
}
gs = GridSearchCV(gb, params, cv=5, scoring='roc_auc', n_jobs=1)
gs.fit(features, target)

GridSearchCV(cv=5, error_score='raise',
       estimator=GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=0.8, verbose=0,
              warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_features': [0.5, 1], 'learning_rate': [0.05, 0.1, 0.5], 'max_depth': [3, 4, 5]},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0)

In [19]:
scores = cross_val_score(gs, features, target, cv=5, n_jobs=4,
                         scoring='accuracy')

In [20]:
print scores

[ 0.83240223  0.80446927  0.8258427   0.8258427   0.81920904]


In [21]:
gs.best_params_

{'learning_rate': 0.1, 'max_depth': 4, 'max_features': 0.5}

Grid search for random forest

Randomized Grid Search CV

In [None]:
from scipy.stats import randint as sp_randint
from sklearn.grid_search import RandomizedSearchCV

params = {"max_depth": [3, 5, 10, None],
              "max_features": sp_randint(1, 4),
              "min_samples_split": sp_randint(1, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(rf, param_distributions=params,
                                   n_iter=n_iter_search)
random_search.fit(features, target)

In [183]:
random_search.best_params_

{'bootstrap': True,
 'criterion': 'entropy',
 'max_depth': 5,
 'max_features': 3,
 'min_samples_leaf': 1,
 'min_samples_split': 9}

Bagging Classifier

In [24]:
from sklearn.ensemble import BaggingClassifier

bclf = BaggingClassifier(n_estimators=100, max_features=3)
bclf_scores = cross_val_score(bclf, features, target, cv=5, n_jobs=4, scoring='accuracy')

In [25]:
print bclf_scores

[ 0.79888268  0.82122905  0.87078652  0.82022472  0.84745763]


SVC

In [76]:
from sklearn.svm import SVC

svc = SVC()
svc_scores = cross_val_score(svc, features, target, cv=5, n_jobs=4, scoring='accuracy')

In [77]:
print svc_scores

[ 0.7150838   0.67597765  0.75842697  0.75842697  0.74576271]


Model training

In [26]:
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.20, random_state=0)

In [27]:
gb.fit(features_train, target_train).score(features_test, target_test)

0.84357541899441346

In [28]:
bclf.fit(features_train, target_train).score(features_test, target_test)

0.85474860335195535

In [29]:
rf.fit(features_train, target_train).score(features_test, target_test)

0.82122905027932958

In [30]:
log_reg.fit(features_train, target_train).score(features_test, target_test)

0.77653631284916202

In [41]:
target_predicted =bclf.predict(features_test)

Model evaluation

In [42]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(target_test, target_predicted)
print (cm)

[[100  10]
 [ 16  53]]


In [43]:
from sklearn.metrics import classification_report

In [44]:
print(classification_report(target_test, target_predicted,
                            target_names=['not survived', 'survived']))

              precision    recall  f1-score   support

not survived       0.86      0.91      0.88       110
    survived       0.84      0.77      0.80        69

 avg / total       0.85      0.85      0.85       179



In [45]:
from sklearn.metrics import explained_variance_score

In [46]:
explained_variance_score(target_test, target_predicted)

0.39156785243741765

Model testing

In [47]:
test_data = pd.read_csv('/Users/balaji/moonshot/workspace/pycharm/titanic/test.csv')

In [67]:
# Get all the titles and print how often each one occurs.
titles = test_data["Name"].apply(get_title)
#print(pd.value_counts(titles))

# Map each title to an integer.  Some titles are very rare, and are compressed into the same codes as other titles.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, "Major": 7, "Col": 7, "Mlle": 8, "Mme": 8, "Don": 9, "Lady": 10, "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2, "Dona":2}
for k,v in title_mapping.items():
    titles[titles == k] = v

# Verify that we converted everything.
#print(pd.value_counts(titles))

# Add in the title column.
test_data["Title"] = titles

In [68]:
# Generating a familysize column
test_data["FamilySize"] = test_data["SibSp"] + test_data["Parch"]

# The .apply method generates a new series
test_data["NameLength"] = test_data["Name"].apply(lambda x: len(x))

In [69]:
# Get the family ids with the apply method
family_ids = test_data.apply(get_family_id, axis=1)

# There are a lot of family ids, so we'll compress all of the families under 3 members into one code.
family_ids[test_data['FamilySize'] < 3] = -1

# Print the count of each unique id.
#print(pd.value_counts(family_ids))

test_data['FamilyId'] = family_ids

In [70]:
test_features = pd.concat([test_data[feature_names], pd.get_dummies(test_data['Sex'], prefix='Sex')], axis=1)
test_features = test_features.fillna(test_features.dropna().median())
test_features = test_features.drop('Sex_male', 1)

In [71]:
bclf.fit(features, target)

BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=3, max_samples=1.0,
         n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
         verbose=0, warm_start=False)

In [73]:
predictions = bclf.predict(test_features)

Kaggle submission

In [74]:
submission = pd.DataFrame({"PassengerId": test_data["PassengerId"], "Survived": predictions})

In [75]:
submission.to_csv("/Users/balaji/moonshot/workspace/pycharm/titanic/submission4.csv", index=False)