In [0]:
# import libraries for data analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [0]:
# list data files that are connected to the kernel
import os
os.listdir('../input/')

['gender_submission.csv', 'test.csv', 'train.csv']

In [0]:
# read the train.csv file into a datframe
df_train = pd.read_csv('../input/train.csv')
print('Shape: ', df_train.shape)
df_train.head()

Shape:  (891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [0]:
# read the test.csv file into a datframe
df_test = pd.read_csv('../input/test.csv')
print('Shape: ', df_test.shape)
df_test.head()

Shape:  (418, 11)


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [0]:
# create df_full by merging both train and test data
df_full = df_train.append(df_test, sort=False)
print('Shape: ', df_full.shape)

Shape:  (1309, 12)


# Exploratory Data Analysis

In [0]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [0]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [0]:
# import LabelEncoder
from sklearn.preprocessing import LabelEncoder

# create function EDA_helper to do which is doing 3 things: binning, encoding of the feature and calculating the impact on the target feature
def EDA_helper(feature, bin_number=5, train_size=891):
    '''function creates a new column as 'old feature name_bin' and bins the values (only if the number of unique values is more than 10). After that it calculates the sum, count and mean of the feature values'''    
    # calculate number of unique values for the feature
    unique_features = len(list(df_full[feature].unique()))
    # if there are more than 10 unique values
    if unique_features > 10:
        print('Number of unique features is %d, starting to bin...' % unique_features)
        # create a new column for the bins
        df_full[feature + '_bin'] = pd.qcut(df_full[feature], bin_number)
        # assign the bins to the train and test dataframe
        df_train[feature + '_bin'] = df_full[feature + '_bin'][:train_size]
        df_test[feature + '_bin'] = df_full[feature + '_bin'][train_size:]
        # define LabelEncoder instance 
        label = LabelEncoder()
        # fit and transform the data
        df_full[feature + '_bin_code'] = label.fit_transform(df_full[feature + '_bin'].astype(str))
        # assign the encoded bins to the train and test dataframe
        df_train[feature + '_bin_code'] = df_full[feature + '_bin_code'][:train_size]
        df_test[feature + '_bin_code'] = df_full[feature + '_bin_code'][train_size:]
        print('Binning successful, calculating impact...')
        # calculate the statistics
        impact = df_full[[feature + '_bin', 'Survived']].groupby([feature + '_bin']).agg(['sum','count','mean']).rename(columns={'sum':'Yes','count':'Total','mean':'In %'})
    else:
        print('Number of unique features is %d, binning not needed. Calculating impact...' % unique_features)
        # define LabelEncoder instance 
        label = LabelEncoder()
        # fit and transform the data
        df_full[feature + '_code'] = label.fit_transform(df_full[feature])
        # assign the encoded bins to the train and test dataframe
        df_train[feature + '_code'] = df_full[feature + '_code'][:train_size]
        df_test[feature + '_code'] = df_full[feature + '_code'][train_size:]
        # calculate the statistics for not binned features
        impact = df_full[[feature, 'Survived']].groupby([feature]).agg(['sum','count','mean']).rename(columns={'sum':'Yes','count':'Total','mean':'In %'})
    return impact

### PassengerId

In [0]:
# using the function on the 'PassengerId' column
EDA_helper('PassengerId')

Number of unique features is 1309, starting to bin...
Binning successful, calculating impact...


Unnamed: 0_level_0,Survived,Survived,Survived
Unnamed: 0_level_1,Yes,Total,In %
PassengerId_bin,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
"(0.999, 262.6]",92.0,262,0.351145
"(262.6, 524.2]",112.0,262,0.427481
"(524.2, 785.8]",100.0,261,0.383142
"(785.8, 1047.4]",38.0,106,0.358491
"(1047.4, 1309.0]",0.0,0,


### Survived

In [0]:
# unique value counts in 'Survived' column
df_train['Survived'].value_counts()

0    549
1    342
Name: Survived, dtype: int64

### Pclass

In [0]:
EDA_helper('Pclass')

Number of unique features is 3, binning not needed. Calculating impact...


Unnamed: 0_level_0,Survived,Survived,Survived
Unnamed: 0_level_1,Yes,Total,In %
Pclass,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,136.0,216,0.62963
2,87.0,184,0.472826
3,119.0,491,0.242363


### Name

In [0]:
# extract the title from the 'Name' column
for name in df_full['Name']:
    df_full['Title'] = df_full['Name'].str.extract('([A-Za-z]+)\.', expand=False)

# check how the different titles are distributed by gender
print(pd.crosstab(df_full['Title'], df_full['Sex']))

Sex       female  male
Title                 
Capt           0     1
Col            0     4
Countess       1     0
Don            0     1
Dona           1     0
Dr             1     7
Jonkheer       0     1
Lady           1     0
Major          0     2
Master         0    61
Miss         260     0
Mlle           2     0
Mme            1     0
Mr             0   757
Mrs          197     0
Ms             2     0
Rev            0     8
Sir            0     1


In [0]:
# categorize titles
for title in df_full['Title']:
    df_full['Title'] = df_full['Title'].replace(['Lady', 'Countess','Capt', 'Col',
                                                'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare'
                                               )
    df_full['Title'] = df_full['Title'].replace('Mlle', 'Miss')
    df_full['Title'] = df_full['Title'].replace('Ms', 'Miss')
    df_full['Title'] = df_full['Title'].replace('Mme', 'Mrs')
    
# use the EDA_helper function
EDA_helper('Title')

Number of unique features is 5, binning not needed. Calculating impact...


Unnamed: 0_level_0,Survived,Survived,Survived
Unnamed: 0_level_1,Yes,Total,In %
Title,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Master,23.0,40,0.575
Miss,130.0,185,0.702703
Mr,81.0,517,0.156673
Mrs,100.0,126,0.793651
Rare,8.0,23,0.347826


### Sex

In [0]:
EDA_helper('Sex')

Number of unique features is 2, binning not needed. Calculating impact...


Unnamed: 0_level_0,Survived,Survived,Survived
Unnamed: 0_level_1,Yes,Total,In %
Sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
female,233.0,314,0.742038
male,109.0,577,0.188908


### Age

In [0]:
# fill the missing age info with median from the full dataset
for age in df_full['Age']:
    df_full['Age'].fillna(df_full['Age'].median(), inplace=True)

# using the EDA_helper function and setting number of bins to 4
EDA_helper('Age', 4)

Number of unique features is 98, starting to bin...
Binning successful, calculating impact...


Unnamed: 0_level_0,Survived,Survived,Survived
Unnamed: 0_level_1,Yes,Total,In %
Age_bin,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
"(0.169, 22.0]",98.0,231,0.424242
"(22.0, 28.0]",102.0,308,0.331169
"(28.0, 35.0]",59.0,135,0.437037
"(35.0, 80.0]",83.0,217,0.382488


### SibSp & Parch

In [0]:
# combine both columns as 'Family size'
df_full['Family Size'] = df_full['SibSp'] + df_full['Parch']

EDA_helper('Family Size')

Number of unique features is 9, binning not needed. Calculating impact...


Unnamed: 0_level_0,Survived,Survived,Survived
Unnamed: 0_level_1,Yes,Total,In %
Family Size,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,163.0,537,0.303538
1,89.0,161,0.552795
2,59.0,102,0.578431
3,21.0,29,0.724138
4,3.0,15,0.2
5,3.0,22,0.136364
6,4.0,12,0.333333
7,0.0,6,0.0
10,0.0,7,0.0


### Ticket

In [0]:
# import library
import re

# remove non-digits from the ticket and change to numeric type
for ticket in df_full['Ticket']:
    df_full['Ticket'] = df_full['Ticket'].apply(lambda x: x if x.isdigit() else re.sub('\D','', x))

# changing the type to numeric
df_full['Ticket'] = df_full['Ticket'].apply(pd.to_numeric)
    
EDA_helper('Ticket')

Number of unique features is 924, starting to bin...
Binning successful, calculating impact...


Unnamed: 0_level_0,Survived,Survived,Survived
Unnamed: 0_level_1,Yes,Total,In %
Ticket_bin,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
"(1.999, 11773.4]",63.0,174,0.362069
"(11773.4, 29150.0]",102.0,166,0.614458
"(29150.0, 239854.4]",87.0,179,0.486034
"(239854.4, 349236.2]",46.0,183,0.251366
"(349236.2, 23101294.0]",43.0,185,0.232432


### Fare

In [0]:
# fill the missing fare info with median fare from the full dataset
for fare in df_full['Fare']:
    df_full['Fare'].fillna(df_full['Fare'].median(), inplace=True)

EDA_helper('Fare')

Number of unique features is 281, starting to bin...
Binning successful, calculating impact...


Unnamed: 0_level_0,Survived,Survived,Survived
Unnamed: 0_level_1,Yes,Total,In %
Fare_bin,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
"(-0.001, 7.854]",39.0,179,0.217877
"(7.854, 10.5]",37.0,184,0.201087
"(10.5, 21.558]",73.0,171,0.426901
"(21.558, 41.579]",82.0,185,0.443243
"(41.579, 512.329]",111.0,172,0.645349


### Cabin

In [0]:
# fill the missing info with string 'N' and extract the first letter as new column 'N' for the full dataset
for cabin in df_full['Cabin']:
    df_full['Cabin'].fillna('N', inplace=True)
    df_full['Deck'] = df_full['Cabin'].apply(lambda x: 'N' if pd.isnull(x) else x[0])

EDA_helper('Deck')

Number of unique features is 9, binning not needed. Calculating impact...


Unnamed: 0_level_0,Survived,Survived,Survived
Unnamed: 0_level_1,Yes,Total,In %
Deck,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
A,7.0,15,0.466667
B,35.0,47,0.744681
C,35.0,59,0.59322
D,25.0,33,0.757576
E,24.0,32,0.75
F,8.0,13,0.615385
G,2.0,4,0.5
N,206.0,687,0.299854
T,0.0,1,0.0


### Embarked

In [0]:
# fill the missing info with the most common value
for cabin in df_full['Embarked']:
    df_full['Embarked'].fillna('S', inplace=True)

EDA_helper('Embarked')

Number of unique features is 3, binning not needed. Calculating impact...


Unnamed: 0_level_0,Survived,Survived,Survived
Unnamed: 0_level_1,Yes,Total,In %
Embarked,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
C,93.0,168,0.553571
Q,30.0,77,0.38961
S,219.0,646,0.339009


# Feature Engineering and Selection

### New Feature: Family Survival


In [0]:
# extract the last name from the 'Name' column (using the full_data)
for name in df_full['Name']:
    df_full['Last Name'] = df_full['Name'].str.extract('([A-Za-z]+)\,', expand=False)

DEFAULT_SURVIVAL_VALUE = 0.5
df_full['Family_Survival'] = DEFAULT_SURVIVAL_VALUE

for grp, grp_df in df_full[['Survived','Name', 'Last Name', 'Fare', 'Ticket', 'PassengerId',
                           'SibSp', 'Parch', 'Age', 'Cabin']].groupby(['Last Name', 'Fare']):
    
    if (len(grp_df) != 1):
        # A Family group is found.
        for ind, row in grp_df.iterrows():
            smax = grp_df.drop(ind)['Survived'].max()
            smin = grp_df.drop(ind)['Survived'].min()
            passID = row['PassengerId']
            if (smax == 1.0):
                df_full.loc[df_full['PassengerId'] == passID, 'Family_Survival'] = 1
            elif (smin==0.0):
                df_full.loc[df_full['PassengerId'] == passID, 'Family_Survival'] = 0

print("Number of passengers with family survival information:", 
      df_full.loc[df_full['Family_Survival']!=0.5].shape[0])

Number of passengers with family survival information: 421


In [0]:
for _, grp_df in df_full.groupby('Ticket'):
    if (len(grp_df) != 1):
        for ind, row in grp_df.iterrows():
            if (row['Family_Survival'] == 0) | (row['Family_Survival']== 0.5):
                smax = grp_df.drop(ind)['Survived'].max()
                smin = grp_df.drop(ind)['Survived'].min()
                passID = row['PassengerId']
                if (smax == 1.0):
                    df_full.loc[df_full['PassengerId'] == passID, 'Family_Survival'] = 1
                elif (smin==0.0):
                    df_full.loc[df_full['PassengerId'] == passID, 'Family_Survival'] = 0
                        
print("Number of passenger with family/group survival information: " 
      +str(df_full[df_full['Family_Survival']!=0.5].shape[0]))

train_size = len(df_train)

# Family_Survival in df_train and df_test:
df_train['Family_Survival'] = df_full['Family_Survival'][:train_size]
df_test['Family_Survival'] = df_full['Family_Survival'][train_size:]

Number of passenger with family/group survival information: 551


### Selecting Most Important Features

In [0]:
# define a list of columns to work with going further
columns_to_keep = ['Sex_code', 'Pclass', 'Fare_bin_code', 'Age_bin_code', 'Family Size_code', 'Family_Survival']

# create new datafames with the desired columns
train = df_train[columns_to_keep]
test = df_test[columns_to_keep]

# save the target column for later use
train_labels = df_train['Survived']

print('Train data shape: ', train.shape)
print('Test data shape: ', test.shape)

Train data shape:  (891, 6)
Test data shape:  (418, 6)


In [0]:
train.head()

Unnamed: 0,Sex_code,Pclass,Fare_bin_code,Age_bin_code,Family Size_code,Family_Survival
0,1,3,0,0,1,0.5
1,0,1,3,3,1,0.5
2,0,3,4,1,0,0.5
3,0,1,3,2,1,0.0
4,1,3,4,2,0,0.5


# Modeling

### Gender Model


In [0]:
# create simple predicition based on gender (women live, men die)
gender_pred = df_test['Sex'].apply(lambda x: '1' if x=='female' else '0')
gender_pred.value_counts()

0    266
1    152
Name: Sex, dtype: int64

When submitted, this gender-only model will get a **score of 0.76555.**

### Random Forest Model

In [0]:
# import libraries
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

# define the scaler instance
scaler = StandardScaler()

# fit on train data, transform both train and test data
train = scaler.fit_transform(train)
test = scaler.transform(test)
        
# define the classifier instance
clf = RandomForestClassifier(n_estimators=100, random_state = 42)

# fit the classifier on the train data and previously saved train labels
clf.fit(train, train_labels)

# predict on test data
rf_results = clf.predict(test)

In [0]:
# make a submission dataframe
submit = df_test.loc[:, ['PassengerId']]
submit.loc[:, 'Survived'] = rf_results

# save the submission dataframe
submit.to_csv('submissionRF_.csv', index = False)

When submitted, this random forest model will get a **score of 0.79425**

### TPOT Model


Interestingly, TPOT also selected a random forest model which ** scored 0.81818.**

In [0]:
# import TPOT
from tpot import TPOTClassifier

# create instance
pipeline_optimizer = TPOTClassifier(max_time_mins=120, n_jobs = -1, random_state=42, verbosity=2, cv=5)

# fit TPOT on the train data
# commented out after the run
#pipeline_optimizer.fit(train, train_labels)

# export optimized code
# commented out after the run
#pipeline_optimizer.export('tpot_titanic_pipeline.py')

# import libraries
from sklearn.pipeline import make_pipeline

# create the pipeline from TPOT
# original pipeline inluded a Binarizer and RBFSampler which scored only 0.78947 
exported_pipeline = make_pipeline(
    RandomForestClassifier(bootstrap=False, criterion="gini", max_features=0.45, min_samples_leaf=14, min_samples_split=13, n_estimators=100)
)

# fit the pipeline on the train data
exported_pipeline.fit(train, train_labels)

# predict on the test data
results = exported_pipeline.predict(test)

In [0]:
# make a submission dataframe
submit = df_test.loc[:, ['PassengerId']]
submit.loc[:, 'Survived'] = results

# save the submission dataframe
submit.to_csv('submission.csv', index = False)