In [None]:
# import headers
import pandas as pd
import numpy as np
import sklearn
import re
import math
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier
from sklearn.cross_validation import cross_val_score

In [None]:
# load titanic dataset
train = pd.read_csv('titanic-train.csv')
train.drop('Survived',1,inplace=True)
test = pd.read_csv('titanic-test.csv')
combined=train.append(test)
combined.reset_index(inplace=True)
combined.drop('index',inplace=True,axis=1)

In [None]:
# checking the data
combined.head(5)

In [None]:
# feature engineering on name field
# Extract titles from name field
combined['Title']=combined['Name'].apply(lambda xa:xa[xa.find(',')+1:xa.find('.')])
print combined['Title'].value_counts()
# Create mapping for titles
# create  a dictionary of names
name_title_dict = {
                        "Capt":       "Officer",
                        "Col":        "Officer",
                        "Major":      "Officer",
                        "Jonkheer":   "Royalty",
                        'Don':        'Royalty',
                        "Sir" :       "Royalty",
                        "Dr":         "Officer",
                        "Rev":        "Officer",
                        "the Countess":"Royalty",
                        "Dona":       "Royalty",
                        "Mme":        "Mrs",
                        "Mlle":       "Miss",
                        "Ms":         "Mrs",
                        "Mr" :        "Mr",
                        "Mrs" :       "Mrs",
                        "Miss" :      "Miss",
                        "Master" :    "Master",
                        "Lady" :      "Royalty"
}

In [None]:
# Check age null values
combined['Age'].isnull().sum()
# replace null values with median based on sex and pclass
# groupby age with sex,pclass,title
median_age=combined.groupby(['Sex','Pclass','Title'])['Age'].median()
# flatten multiindex series and assign to dataframe
median_age_df=median_age.reset_index()
print median_age_df
# replace sex=female, pclass=3 and title=Ms which has Nan values for age with mean age for sex=female and pclass=3
median_age_df['Age'].ix[13] = 24.5
# for all null values of age , replace with corresponding median from sex,pclass,title
# create combined_v01 dataframe to merge combined and median dataframe
combined_v01 = pd.merge(combined,median_age_df,on=['Sex','Pclass','Title'],how='left')
# new column to assign new age values which wont have any null values
combined_v01['Newage']=combined_v01.apply(lambda x:x['Age_y'] if np.isnan(x['Age_x']) else x['Age_x'],axis=1)
combined_v01.drop(['Age_x'],inplace=True,axis=1)


In [None]:
# Drop name from dataframe
combined_v01.drop('Name',axis=1,inplace=True)

In [None]:
# replace null values for fare with mean
combined_v01['Fare'].fillna(combined.Fare.mean(),inplace=True)

In [None]:
# replace null values for embarked with mode
combined_v01.Embarked.fillna(combined.Embarked.mode()[0],inplace=True)

In [None]:
#replace null values with mode
combined_v01.Cabin.fillna('U',inplace=True)
# replace cabin with first letters
combined_v01.Cabin=[s[0] for s in combined_v01.Cabin]
combined_v01.shape

In [None]:
# create dummies function
def get_dummies_func(df,column_name):
    df1=pd.get_dummies(df[column_name])
    df1.columns = [column_name+'_'+s for s in df1.columns]
    df = pd.concat([df,df1],axis=1)
    df.drop(column_name,axis=1,inplace=True)
    return df    

In [None]:
# create dummies
combined_v02=pd.DataFrame()
a = ['Cabin','Title','Embarked']
for s in a:
    print s
    combined_v01=get_dummies_func(combined_v01,s)
combined_v01.shape

In [None]:
# Code sex
combined_v01.Sex=combined_v01.Sex.map({'male':0,'female':1})
combined_v01.shape

In [None]:
# create family size
combined_v01['Familysize'] = combined_v01.SibSp + combined_v01.Parch
combined_v01.shape

In [None]:
# define family status function
def family_size_func(s):    
    if s==1:
        return 'singleton'
    elif s>=2 & s<=4:
        return 'smallfamily'
    elif s>=5:
        return 'largefamily'   
combined_v01['family_status'] = [family_size_func(s) for s in combined_v01.Familysize]
combined_v01.shape

In [None]:
# get dummies for family status
combined_v01 = get_dummies_func(combined_v01,'family_status')
combined_v01.shape

In [None]:
# get dummies for pclass
pclass_dummies = pd.get_dummies(combined_v01['Pclass'])
pclass_dummies.columns = ['Pclass_'+str(s) for s in pclass_dummies.columns]
print pclass_dummies.head(5)
combined=pd.concat([combined_v01,pclass_dummies],axis=1)
#combined.head(50)
combined_v01.drop('Pclass',axis=1,inplace=True)
combined_v01.shape

In [None]:
# drop ticket column
combined_v01.drop('Ticket',axis=1,inplace=True)
combined_v01.drop('PassengerId',axis=1,inplace=True)
combined_v01.shape

In [None]:
combined_v01.info()

In [None]:
from sklearn import preprocessing
features = ['Newage','Fare']
normalized_X = preprocessing.normalize(combined_v01[features])

In [None]:
combined_v02 = combined_v01.copy()
combined_v02.drop(['Age_y'],axis=1,inplace=True)
combined_v02[features] = normalized_X
combined_v02.info()

In [None]:
# Scaling & normalization
'''combined.info()
combined_v02 = combined_v01.copy()
combined_v02.drop(['Age_y'],axis=1,inplace=True)
combined_v02.info()
features = ['Newage','Fare']
combined_v02[features] = combined_v02[features].apply(lambda x: x/x.max(), axis=0)
combined_v02.shape
'''

In [None]:
# The data is stored in such a way that train data has survived class and test data doesnt have it. We have combined them into a
# single dataframe. Now seperate them to train and test
train = pd.read_csv('titanic-train.csv')
train_x = combined_v02.ix[0:890]
train_y=train.Survived
test_x=combined_v02.ix[891:]

In [434]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(max_features='sqrt')

parameter_grid = {
                 'max_depth' : [4,5,6,7,8],
                 'n_estimators': [200,210,240,250],
                 'criterion': ['gini','entropy']
                 }

cross_validation = StratifiedKFold(train_y, n_folds=5)

grid_search = GridSearchCV(forest,
                           param_grid=parameter_grid,
                           cv=cross_validation)

grid_search.fit(train_x, train_y)

print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))

Best score: 0.828282828283
Best parameters: {'n_estimators': 210, 'criterion': 'entropy', 'max_depth': 7}


In [None]:
# References : http://ahmedbesbes.com/how-to-score-08134-in-titanic-kaggle-challenge.html