In [None]:
import pandas as pd
# Display options: two decimals for floats, show (up to 1000) columns, never truncate cell text.
# Fully-qualified option names are used because the short pattern 'precision' matches more than
# one option on recent pandas, and None (not -1) is the supported "no limit" value for max_colwidth.
pd.set_option('display.precision', 2)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_colwidth', None)
import numpy as np
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import accuracy_score

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

import seaborn as sb
import matplotlib.pyplot as plt
# Apply seaborn's default theme to the matplotlib figures drawn below.
sb.set()
# Use the 'Set1' colour palette for the plots that follow.
sb.set_palette('Set1')
# Default figure size as (width, height), in inches.
plt.rcParams['figure.figsize'] = (11.7, 8.27)

In [None]:
# Load the training set (the path is relative to the notebook's working directory)
# and print each column's dtype and non-null count.
data = pd.read_csv('data/titanic_train.csv')
data.info()

There are 12 columns of various types: `int`, `object` and `float`. Some columns also have missing values.
Let's look at how much data is missing from each column.

In [None]:
# Number of missing (NaN) entries in every column.
data.isna().sum()

3 columns have missing data. We will deal with this a little later.
For now, let's look at some samples from the data and try to understand the columns with `object` as their type.

In [None]:
# Preview the first rows of the raw data.
data.head()

First lets look at the columns with object type.
* `Name`: `string`
* `Sex`: `string` most likely a categorical variable
* `Ticket`: `string` seems like ticket ID
* `Cabin`: `string` seems like cabin no. / room id
* `Embarked`: `string` seems like code of station from which passenger embarked

Another interesting thing to observe is that `Age` is a float and not an integer, which is an uncommon representation for an age. We can try to deal with this later.

Let's find the number of unique values in each column.

In [None]:
# One line per column: name, dtype and number of distinct values.
row_template = "%-20s %-10s %-10s"
for col in data.columns:
    column = data[col]
    print(row_template % (col, column.dtype, column.nunique()))

* `PassengerId` has datatype of `int` and all are distinct values. This feature will not be very useful for modelling.
* `Survived` is the target value we need to predict
* `Pclass` has 3 distinct `int` values. This defines passenger class.
* `Sex` has only 2 distinct values.
* `SibSp` holds `int` values: the total number of siblings and spouses the passenger is travelling with.
* `Parch` holds `int` values: the total number of parents and children the passenger is travelling with.
* `Embarked` has 3 distinct values.
* `Ticket`, `Fare`, `Cabin` and `Name` have a number of distinct values.

As a first step, let us ignore the columns with a large number of missing values and convert the categorical data into their **one-hot** representations.

In [None]:
# Baseline feature set. An explicit copy is taken so that filling values below never
# touches (or is mistaken for a view of) the raw `data` frame.
features_v1 = data[['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']].copy()
labels = data[['Survived']]

# Impute missing 'Embarked' values with the most frequent value.
embarked_mode = features_v1['Embarked'].mode()[0]
print(embarked_mode)
# Rebind instead of inplace=True: re-running the cell is idempotent and no
# chained-assignment warning is raised (the global warnings filter would hide it anyway).
features_v1 = features_v1.fillna(value = {'Embarked': embarked_mode})
features_v1.isnull().sum()

In [None]:
# One-hot encode the two categorical columns; the remaining columns are passed through.
# NOTE(review): this rebinds features_v1, so re-running this cell on its own looks like it
# would fail because 'Sex' and 'Embarked' no longer exist - re-run the previous cell first.
features_v1 = pd.get_dummies(features_v1, columns = ['Sex', 'Embarked'])
features_v1.describe()

In [None]:
# 70/30 shuffled hold-out split with a fixed seed.
(train_features,
 test_features,
 train_labels,
 test_labels) = train_test_split(
    features_v1,
    labels,
    test_size = 0.3,
    random_state = 73,
    shuffle = True,
)

In [None]:
# Baseline classifiers, all with default hyper-parameters.
# The tree-based models are randomised, so they get the same fixed seed as the data split;
# otherwise their scores change on every run.
models = {
    'lr': LogisticRegression(),
    'svm': SVC(),
    'dt': DecisionTreeClassifier(random_state = 73),
    'rf': RandomForestClassifier(random_state = 73),
    'knn': KNeighborsClassifier(),
    'nb': GaussianNB(),
}

In [None]:
print("%-10s %-10s %-10s %-10s" % ("model", "cv acc", "cv stddev", "train acc"))

# The splitter does not depend on the model, so it is built once.
# random_state is omitted: KFold only accepts it together with shuffle=True (recent
# scikit-learn raises otherwise), and the rows were already shuffled by train_test_split.
kfold = KFold(n_splits = 10)
# Estimators expect a 1-D target; train_labels is a one-column frame.
train_target = np.ravel(train_labels)

for name, model in models.items():
    # 10-fold CV accuracy on the training part ...
    cv_results = cross_val_score(model, train_features, train_target, cv = kfold, scoring = 'accuracy')
    # ... then refit on the whole training part to measure training accuracy.
    model.fit(train_features, train_target)
    train_accuracy = accuracy_score(train_target, model.predict(train_features))
    print("%-10s %-10.4f %-10.4f %-10.4f" % (name, cv_results.mean(), cv_results.std(), train_accuracy))
  

So far we have not performed any **feature scaling**.
Let us apply it and see if it helps.

In [None]:
# Rescale every feature to [0, 1]. The result is wrapped back into a DataFrame so the column
# labels are kept, as is done for the later feature sets.
features_v2 = pd.DataFrame(MinMaxScaler().fit_transform(features_v1), columns = features_v1.columns)

def run_experiment(features):
    """Train and evaluate the baseline classifiers on one feature set.

    The features are split 70/30 (fixed seed) together with the notebook-level
    ``labels`` frame. Every model is scored with 10-fold cross-validation on the
    training part and then refit on the whole training part to measure training
    accuracy. One line per model is printed: CV mean accuracy, CV standard
    deviation, training accuracy and their difference ("overfitting").

    Parameters
    ----------
    features : DataFrame or 2-D array
        Feature matrix with one row per row of ``labels``.

    Returns
    -------
    None
        Results are printed only. The hold-out part of the split is not scored here.
    """
    train_features, _, train_labels, _ = train_test_split(
        features, labels, test_size = 0.3, random_state = 73, shuffle = True)
    # Estimators expect a 1-D target; `labels` is a one-column frame.
    train_target = np.ravel(train_labels)

    # The tree-based models are randomised; seed them so repeated runs are comparable.
    models = {
        'lr': LogisticRegression(),
        'svm': SVC(),
        'dt': DecisionTreeClassifier(random_state = 73),
        'rf': RandomForestClassifier(random_state = 73),
        'knn': KNeighborsClassifier(),
        'nb': GaussianNB(),
    }

    # Built once: the splitter is the same for every model.
    # random_state is omitted because KFold only accepts it together with shuffle=True
    # (recent scikit-learn raises otherwise); the rows were already shuffled by the split above.
    kfold = KFold(n_splits = 10)

    print("%-10s %-10s %-10s %-10s %-10s" % ("model", "cv acc", "cv stddev", "train acc", "overfitting"))
    for name, model in models.items():
        cv_results = cross_val_score(model, train_features, train_target, cv = kfold, scoring = 'accuracy')
        model.fit(train_features, train_target)
        train_accuracy = accuracy_score(train_target, model.predict(train_features))
        cv_mean = cv_results.mean()
        print("%-10s %-10.4f %-10.4f %-10.4f %-10.4f" % (
            name,
            cv_mean,
            cv_results.std(),
            train_accuracy,
            train_accuracy - cv_mean,  # gap between training and CV accuracy
        ))

# Same models as before, now on the min-max scaled features.
run_experiment(features_v2)

Overall, train accuracy is around 80%.
Decision Tree and Random Forest seem to overfit more than the other models.
We **assume** that this accuracy is unacceptable and that the models are currently **underfitting**.
To fix this underfitting, **we need more and better features**.
To derive these features, **we need to look at the data**. We will do some **EDA** on the complete available training data.

#### We also assume that the test data has a similar distribution to the training data, but we will need to test that later. **TODO**

In [None]:
# Reminder of the raw columns: first two rows.
data.head(2)

The **first thing** we should look at is the distribution of classes to ensure there is no **class imbalance**.

In [None]:
# Share of each target class, followed by the raw counts as a bar chart.
class_share = data['Survived'].value_counts().div(len(data))
print(class_share)
sb.countplot(data = data, x = 'Survived')

* The class distribution is 62% and 38%. Not very imbalanced. 
* If needed we can **adjust class weights** if the model permits. **TODO**

* The next feature we look at is **Pclass**.
* This defines the class with which passenger travelled.
* As already seen this has only 3 distinct values.
* Lets see how passenger class relates to passenger survival.

In [None]:
# Passenger counts per class, split by survival.
sb.countplot(data = data, x = 'Pclass', hue = 'Survived')

* Significant number of passengers of class 3 did not survive
* more passengers of class 1 survived
* so passenger class seems to be a very important feature.
* to help the models even more we should **One Hot encode** this feature. **TODO**

Next lets look at `Sex` feature.

In [None]:
# Passenger counts per sex, split by survival.
sb.countplot(data = data, x = 'Sex', hue = 'Survived')

* More females survived than died.
* A significantly higher number of males did not survive.
* This is an important feature to consider.

* `Parch` denotes the total number of parents and children the person is travelling with.
* `SibSp` denotes the total number of siblings and spouses the person is travelling with.
* Let's observe how they relate to survival.

In [None]:
# Passenger counts per number of parents/children aboard, split by survival.
sb.countplot(data = data, x = 'Parch', hue = 'Survived')

In [None]:
# Passenger counts per number of siblings/spouses aboard, split by survival.
sb.countplot(data = data, x = 'SibSp', hue = 'Survived')

* Most persons travelling alone did not survive.
* Persons travelling with 1 to 3 parents/children had better chances of survival.
* Very few persons travelled with more than 3 parents/children, and they did not survive.
* We can create the features **alone**, **small parch** and **large parch** based on these observations. **TODO**


* Persons with 1 `SibSp` survived more often than not.
* Persons with 2 `SibSp` were about as likely to survive as not.
* Many persons with more than 3 `SibSp` did not survive.
* We can create the features **alone**, **small sibsp** and **large sibsp** based on these observations. **TODO**

We can also build a `Family Size` for a person using `Parch` and `SibSp`.
Lets see how that relates to survival

In [None]:
# Family size = siblings/spouses + parents/children travelling with the passenger
# (the passenger is not counted). The column is added to the raw frame.
data['familySize'] = data['SibSp'] + data['Parch']
sb.countplot(data = data, x = 'familySize', hue = 'Survived')

* Persons from small families (fewer than 4 members) survived more often.
* Persons from large families mostly did not survive.
* We can create **small family** and **large family** as features. **TODO**


Lets look at `Embarked` feature

In [None]:
# Passenger counts per port of embarkation, split by survival.
sb.countplot(data = data, x = 'Embarked', hue = 'Survived')

* people who embarked on `C` survived more
* people who embarked on `S` died more

Lets create features based on these initial observations and check the model performance

In [None]:
# v3 = v1 with one-hot passenger class plus the family-based indicator features from the EDA.
features_v3 = pd.get_dummies(features_v1, columns = ['Pclass'])

parch = features_v3['Parch']
sibsp = features_v3['SibSp']
family_size = parch + sibsp

# Each indicator is a boolean mask turned into a 0/1 column (same thresholds as in the EDA notes).
features_v3['familySize'] = family_size
features_v3['alone'] = (family_size == 0).astype(int)
features_v3['smallFamily'] = ((family_size > 0) & (family_size < 4)).astype(int)
features_v3['largeFamily'] = (family_size >= 4).astype(int)
features_v3['smallParch'] = ((parch > 0) & (parch < 4)).astype(int)
features_v3['largeParch'] = (parch >= 4).astype(int)
features_v3['smallSibSp'] = ((sibsp > 0) & (sibsp < 3)).astype(int)
features_v3['largeSibSp'] = (sibsp >= 3).astype(int)

# Rescale everything to [0, 1] and keep the column labels.
scaled_values = MinMaxScaler().fit_transform(features_v3)
features_v3 = pd.DataFrame(scaled_values, columns = features_v3.columns)
features_v3.head()

In [None]:
# Evaluate the models with the one-hot class and family-based features added.
run_experiment(features_v3)

So far we have not looked at `Name`, `Fare`, `Age` and `Cabin`.

`Fare` has no missing values. So lets look at that first.

In [None]:
# Fare density for each outcome, drawn on the same axes.
ax = sb.kdeplot(data[data['Survived'] == 0]['Fare'], label = 'Died')
sb.kdeplot(data[data['Survived'] == 1]['Fare'], label = 'Survived', ax = ax)
# Request the legend explicitly; without it the two curves cannot be told apart
# on seaborn versions that do not draw one automatically.
ax.legend();

* People who paid lower fares died more often.
* Many people paid low fares, so the data is skewed. We can try a **log transformation** to reduce this skewness.

In [None]:
# Same plot on a log scale. log(Fare + 1) is used, as for the `logFare` feature built
# in the next step, because log(0) is -inf for any zero fare.
log_fare_died = np.log(data[data['Survived'] == 0]['Fare'] + 1)
log_fare_survived = np.log(data[data['Survived'] == 1]['Fare'] + 1)
ax = sb.kdeplot(log_fare_died, label = 'Died')
sb.kdeplot(log_fare_survived, label = 'Survived', ax = ax)
# Request the legend explicitly so the two curves can be told apart.
ax.legend();

The data seems much more discriminative now. Let's add this feature and see how the models perform.

In [None]:
# Raw fare next to its log(1 + x) transform.
raw_fare = data['Fare']
fare_features = pd.DataFrame({
    'Fare': raw_fare,
    'logFare': np.log(raw_fare + 1),
})
fare_features.describe()

The two fare columns are not on the same scale. Let's apply **standard scaling** to them.

In [None]:
# Standardise both fare columns, keeping the column labels.
fare_scaler = StandardScaler()
fare_features_scaled = pd.DataFrame(
    fare_scaler.fit_transform(fare_features),
    columns = fare_features.columns,
)
fare_features_scaled.describe()

In [None]:
# v4 = v3 plus the two standardised fare columns, joined side by side.
features_v4 = pd.concat(
    [features_v3, fare_features_scaled],
    axis = 'columns',
)
features_v4.describe()

In [None]:
# Evaluate the models with the fare features added.
run_experiment(features_v4)

Decision tree and random forest are overfitting. We will handle it later. **TODO**

Lets look at a few names and see if we can extract some information out of them

In [None]:
# Show the first ten passenger names to inspect their format.
data[['Name']].head(10)

* Every name seems to contain a title.
* A few names contain another name in brackets. These look like the maiden names of married women; we can check whether the `Mrs` title and the presence of a maiden name are correlated.
* We can check how titles are related to survival.

In [None]:
def extract_title(name):
    """Return the title embedded in a passenger name.

    The title is the text between the first comma and the next period, with
    surrounding whitespace removed.

    Raises IndexError when `name` contains no comma.
    """
    after_surname = name.split(',', 2)[1]
    title_part = after_surname.split('.', 1)[0]
    return title_part.strip()

In [None]:
# One-column frame of raw titles, re-indexed from 0 in the original row order.
title_values = data['Name'].map(extract_title).values
titles = pd.DataFrame({'Title': title_values})
titles.head()

Lets find out how many distinct title are there

In [None]:
# Frequency of every distinct raw title.
titles['Title'].value_counts()

* `Mlle`, `Ms`, `Lady` seem to be referring to women. We can verify by their `Sex`

In [None]:
# Name and sex next to the extracted title, then only the rows with the three titles in question.
people = pd.concat([data[['Name', 'Sex']], titles], axis = 1)
titles_to_check = ['Mlle', 'Ms', 'Lady']
people[people['Title'].isin(titles_to_check)]

Lets find out how survival relates to title.

In [None]:
# Survival flag next to the raw title, plotted as counts per title.
survived_column = data[['Survived']]
title_survival = pd.concat([survived_column, titles], axis = 1)
sb.countplot(data = title_survival, x = 'Title', hue = 'Survived')

* Most people with the `Mr` title died.
* `Mrs`, `Miss` and `Master` had better survival.
* There is very little data for the other titles. Let's drop the four frequent titles and look at the survival of the remaining ones.

In [None]:
# Keep only the rare titles by excluding the four frequent ones in a single step.
frequent_titles = ['Mr', 'Mrs', 'Miss', 'Master']
is_frequent = title_survival['Title'].isin(frequent_titles)
filtered = title_survival[~is_frequent]
sb.countplot(data = filtered, x = 'Title', hue = 'Survived')

* All `Rev` died.
* Officers (`Dr`, `Major`, `Col`, `Capt`) mostly died.
* `Mme`, `Ms` and `Mlle` are alternative spellings of `Mrs` / `Miss`; they survived.
* Royalty (`Don`, `Lady`, `Sir`, `Countess`, `Jonkheer`) survived more often.
* Let's check survival on the basis of such groups.

In [None]:
# Raw title -> coarse title group. Titles missing from this mapping become NaN in new_titles.
title_dictionary = {
    # common titles (and their alternative spellings)
    "Mr" : "Mr",
    "Mrs" : "Mrs",
    "Mme": "Mrs",
    "Ms": "Mrs",
    "Miss" : "Miss",
    "Mlle": "Miss",
    "Master" : "Master",
    # officers
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Dr": "Officer",
    "Rev": "Officer",
    # royalty
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Dona": "Royalty",
    "Sir" : "Royalty",
    "Lady" : "Royalty",
    "the Countess":"Royalty",
}
grouped_title_values = titles['Title'].map(title_dictionary)
new_titles = grouped_title_values.to_frame(name = 'Title')
new_title_survival = pd.concat([data[['Survived']], new_titles], axis = 1)
sb.countplot(data = new_title_survival, x = 'Title', hue = 'Survived')

In [None]:
# v5 = v4 plus one indicator column per grouped title.
grouped_title_dummies = pd.get_dummies(new_titles)
features_v5 = pd.concat([features_v4, grouped_title_dummies], axis = 1)
features_v5.describe()

In [None]:
# Evaluate the models with the grouped-title indicators added.
run_experiment(features_v5)

* Some improvement in the models, so the title is an important feature.
* Let's also check how the models would have performed if we had not grouped the titles.

In [None]:
# v6 = v4 plus one indicator column per raw (ungrouped) title, for comparison with v5.
raw_title_dummies = pd.get_dummies(titles)
features_v6 = pd.concat([features_v4, raw_title_dummies], axis = 1)
run_experiment(features_v6)

* Minor accuracy drop. We will continue to use combined titles
* Lets now check how `Cabin` relates to survival

In [None]:
# Work on an explicit copy: this frame is modified further down, and a plain column
# selection of `data` would make that a chained assignment on the raw frame.
cabin_survival = data[['Cabin', 'Survived']].copy()
cabin_survival.head(20)

Each cabin seems to start with a uppercase letter. Lets see how many distinct such values are there

In [None]:
# Distinct first letters among the known cabin values.
known_cabins = cabin_survival.dropna()['Cabin']
known_cabins.str[0].unique()

Lets replace `NaN` values with letter `X` and check survival against cabin

In [None]:
# Reduce every cabin to its first letter, using 'X' for unknown cabins.
# Only the 'Cabin' column is filled (the placeholder must never end up in 'Survived'),
# and the frame is rebound rather than mutated in place, so the cell can be re-run safely.
cabin_letter = cabin_survival['Cabin'].fillna('X').str[0]
cabin_survival = cabin_survival.assign(Cabin = cabin_letter)

# First without the unknown cabins, which would otherwise dwarf the other bars ...
sb.countplot(x = 'Cabin', hue = 'Survived', data = cabin_survival[cabin_survival['Cabin'] != 'X'].sort_values('Cabin'))
plt.show()
# ... then with every passenger.
sb.countplot(x = 'Cabin', hue = 'Survived', data = cabin_survival.sort_values('Cabin'))

`Cabin` seems to be an important feature. Lets add it to our features and check model performance.

In [None]:
# v7 = v5 plus one indicator column per cabin letter.
cabin_dummies = pd.get_dummies(cabin_survival[['Cabin']])
features_v7 = pd.concat([features_v5, cabin_dummies], axis = 1)
features_v7.describe()

In [None]:
# Evaluate the models with the cabin-letter indicators added.
run_experiment(features_v7)

Similar performance. Later we will look at feature importance and see which features we should keep.
Finally lets look at `Age`.

In [None]:
age_survival = data[['Age', 'Survived']]
# Age has missing values; drop them explicitly so the density estimate never sees NaN.
age_died = age_survival.loc[age_survival['Survived'] == 0, 'Age'].dropna()
age_survived = age_survival.loc[age_survival['Survived'] == 1, 'Age'].dropna()
ax = sb.kdeplot(age_died, label = 'Died')
sb.kdeplot(age_survived, label = 'Survived', ax = ax)
# Request the legend explicitly so the two curves can be told apart.
ax.legend();

* Relatively similar behavior
* Better chances of survival when age is less than about 10
* We observed `Master` as a title earlier. Lets look at age range for that title

In [None]:
# Name, sex, raw title and age; rows with any missing value are dropped.
title_age = pd.concat([people, data['Age']], axis = 1).dropna()
is_master = title_age['Title'] == 'Master'
title_age[is_master].describe()

* Max age for master title is 12. Lets check the same for girls as well

In [None]:
# Females aged 12 or younger.
is_female = title_age['Sex'] == 'female'
is_child = title_age['Age'] <= 12
title_age[is_female & is_child]

Every title is `Miss`. Lets look at titles for age above 12.

In [None]:
# Females older than 12 who still carry the 'Miss' title (first five rows).
older_miss = (
    (title_age['Sex'] == 'female')
    & (title_age['Age'] > 12)
    & (title_age['Title'] == 'Miss')
)
title_age[older_miss].head()

We can impute the missing `Age` values with the mean age for each title.

In [None]:
# Distinct grouped titles (the values produced by title_dictionary).
new_titles['Title'].unique()

In [None]:
# Mean age per *grouped* title. The keys come from new_titles, so the ages have to be looked up
# by grouped title as well: title_age carries the raw titles, which never equal
# 'Officer' or 'Royalty'.
grouped_title_age = pd.concat([new_titles['Title'], data['Age']], axis = 1).dropna()
mean_age_by_title = {}
for title in new_titles['Title'].unique():
    # Only the numeric 'Age' column is averaged (averaging the name/sex text columns is an
    # error on recent pandas). The value stays a one-element Series indexed by 'Age'.
    mean_age_by_title[title] = grouped_title_age.loc[grouped_title_age['Title'] == title, ['Age']].mean()