# Titanic Submission to Kaggle (Part 2)
## 2nd part of my tutorial series to Kaggle.
This is the optimization of my first solution to the Titanic competition. I use this code for the second part of my tutorial series I am writing on Medium at LINK. You find the the first part here: [https://www.kaggle.com/adamjermann/titanic/first-titanic-submission/][1].

  [1]: https://www.kaggle.com/adamjermann/titanic/first-titanic-submission/

In [1]:
# importing the basic libs
import pandas as pd
import numpy as np

# loading the data
data = pd.read_csv("train.csv")

# dropping the `Cabin` and `Ticket` columns.
data.drop('Cabin', axis=1, inplace=True)

In [2]:
# extracting the titles form the names and using it for better imputation of Age
def name_extract(word):
    return word.split(',')[1].split('.')[0].strip()
titles = pd.DataFrame({'Title': data['Name'].apply(name_extract)})

def title_groups(old_title):
    if old_title == 'Mr':
        return('Mr')
    elif old_title == 'Mrs':
        return('Mrs')
    elif old_title == 'Master':
        return('Master')
    elif old_title == 'Miss':
        return('Miss')
    elif old_title == 'Master':
        return('Master')
    elif old_title == 'Dr':
        return('Dr')
    elif old_title == 'Rev':
        return('Rev')
    else:
        return('Others')

titles = pd.DataFrame({'Title': titles['Title'].apply(title_groups)})
data = pd.merge(data, titles, left_index = True, right_index = True)
table = data.pivot_table(values='Age', index=['Title'], columns=['Sex'], aggfunc=np.mean)
def fill_age(x):
    return table[x['Sex']][x['Title']]
data['Age'].fillna(data[data['Age'].isnull()].apply(fill_age, axis=1), inplace=True)

# drop the rest of the missing data.
data.dropna(inplace=True)

In [3]:
# trimming outliers
data.ix[data.Fare > 200, 'Fare'] = np.percentile(data.Fare, 98)

# creating new (family size) feature
data['Family_size'] = data['SibSp'] + data['Parch'] + 1

# encoding nominal features
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data.Embarked = le.fit_transform(data.Embarked)
data.Sex = le.fit_transform(data.Sex)

from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(categorical_features='all')
data_encoded = enc.fit_transform(data[['Embarked', 'Pclass']]).toarray()
data_encoded = pd.DataFrame(data_encoded)
data = pd.merge(data, data_encoded, left_index = True, right_index = True)

In [4]:
# creating the feature matrix (with numerical features) and label vector
X = data.drop(['PassengerId','Survived', 'Name', 'Sex', 'Embarked', 'Ticket', 'Title', 'Parch', 'SibSp'], axis=1)
y = data.ix[:, 'Survived']

# import the decision tree classifier and the cross-validation package.
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

# create the first model and assess its accuracy.
clf = DecisionTreeClassifier()
cross_val_score(clf, X, y, cv=10).mean()

0.67316681929719502

In [5]:
# performing GridSearch to find the best parameters.

from sklearn.model_selection import GridSearchCV

criterion = ['gini', 'entropy']
splitter = ['best', 'random']
max_depth = np.arange(1,10)
min_samples_leaf = np.arange(1,20,2)
min_samples_split = np.arange(2,20,2)

parameters = {'criterion': criterion, 'splitter': splitter, 'max_depth': max_depth, 
              'min_samples_leaf': min_samples_leaf, 'min_samples_split': min_samples_split}
clf = DecisionTreeClassifier(random_state=0)
clf = GridSearchCV(clf, parameters, cv=10)
clf.fit(X, y)

print(clf.best_score_)
print(clf.best_params_)

0.727170236753
{'min_samples_split': 18, 'splitter': 'random', 'criterion': 'entropy', 'max_depth': 9, 'min_samples_leaf': 7}


In [7]:
# import the test data and have a look at it.
data_test = pd.read_csv("test.csv")
data_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [8]:
# perform the same feature engineering steps
data_test.drop(('Cabin'), axis=1, inplace=True)

titles = pd.DataFrame({'Title': data_test['Name'].apply(name_extract)})
titles = pd.DataFrame({'Title': titles['Title'].apply(title_groups)})
data_test = pd.merge(data_test, titles, left_index = True, right_index = True)
data_test['Age'].fillna(data_test[data_test['Age'].isnull()].apply(fill_age, axis=1), inplace=True)

data.ix[data.Fare > 200, 'Fare'] = np.percentile(data.Fare, 98)
data_test['Family_size'] = data_test['SibSp'] + data_test['Parch'] + 1

# we need 1 plus step: we have missing values in 'Fare' as well. Lets fill in the average
data_test['Fare'].fillna(data['Fare'].mean(), inplace=True)

le = LabelEncoder()
data_test.Embarked = le.fit_transform(data_test.Embarked)
data_test.Sex = le.fit_transform(data_test.Sex)

enc = OneHotEncoder(categorical_features='all')
data_encoded = enc.fit_transform(data_test[['Embarked', 'Pclass']]).toarray()
data_encoded = pd.DataFrame(data_encoded)
data_test = pd.merge(data_test, data_encoded, left_index = True, right_index = True)

In [9]:
# create the feature matrix
X_test = data_test.drop(['PassengerId', 'Name', 'Sex', 'Embarked', 'Ticket', 'Title', 'Parch', 'SibSp'], axis=1)

# predict the labels for the test data
predicted = pd.Series(clf.predict(X_test))

# create a dataframe to store the prediction in the format Kaggle needs for submission.
solution = pd.concat([data_test['PassengerId'], predicted], axis=1)
solution.columns = ['PassengerId','Survived']

# output the results in a csv
solution.to_csv('submission.csv', index=False)