In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the read-only Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the two competition splits; the train file is loaded first, then the test file.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('/kaggle/input/titanic/train.csv', '/kaggle/input/titanic/test.csv')
)


In [None]:
# Quick sanity check: (rows, columns) and column names of each split
print(f'Number of Training Examples {train_df.shape}')
print(f'Number of Test Examples {test_df.shape}')
print('Train Features:\n', train_df.columns)
print('Test Features\n', test_df.columns)

# Exploratory Data Analysis
* PassengerId
A unique identifier for each passenger; it is used for identification purposes only.
* Survived
Our target variable. It takes the values 0 and 1, where 1 corresponds to survivors.
* Pclass 
The ticket class (a proxy for socio-economic status) of the passenger; it takes the values 1, 2 and 3.
* Name, Sex, Age and Fare are self-explanatory.
* SibSp 
The number of siblings or spouses the passenger had aboard the Titanic.
* Parch
The number of parents or children the passenger had aboard the Titanic.
* Cabin 
The cabin number of the passenger.
* Embarked
The port of embarkation; it has three unique values:
    * C Cherbourg 
    * Q Queenstown
    * S Southampton


In [None]:
# Column dtypes and non-null counts of the training split (used to spot missing values)
train_df.info()

As we can see, the Age (177/891), Cabin (687/891) and Embarked (2/891) columns have missing values (shown as missing/total).

In [None]:
# Column dtypes and non-null counts of the test split (used to spot missing values)
test_df.info()

Here we can see that the Age (86/418), Fare (1/418) and Cabin (327/418) columns have missing values (shown as missing/total).

## Combining Train And Test Datasets
We will combine train_df and test_df so that whatever preprocessing we apply is applied to both of them; afterwards we can separate them again and make predictions on the test data with our model. I admit that in reality you will not have a test dataset available and will be required to build machine learning pipelines instead. But for my first competition I guess this is a start. I will improve the code later.

In [None]:
def concat_df(train_data, test_data):
    """Stack `train_data` on top of `test_data` and return the result with a fresh 0..n-1 index."""
    stacked = pd.concat((train_data, test_data), axis=0, ignore_index=True)
    return stacked
def divide_df(merged_df, n_train=891):
    """Split a frame built by `concat_df` back into its train and test parts.

    Parameters
    ----------
    merged_df : DataFrame indexed 0..n-1 with the training rows first.
    n_train : number of leading rows that belong to the training split.
        Defaults to 891, the split point this notebook previously hard-coded.

    Returns
    -------
    (train, test) : the first `n_train` rows, and the remaining rows with the
        'Survived' column removed.
    """
    # .loc slices by label and is end-inclusive, hence the "- 1".
    train_part = merged_df.loc[:n_train - 1]
    test_part = merged_df.loc[n_train:].drop(['Survived'], axis=1)
    return train_part, test_part

# Single frame holding the train rows followed by the test rows; all cleaning below is applied to it
df = concat_df(train_df, test_df)

## Age
Missing values in Age are best filled with a median, but the median of the whole dataset is not a good choice.
Instead, we will fill each missing age with the median age of passengers of the same Pclass and Sex, as suggested by the correlation table below.

In [None]:
# Absolute pairwise correlations between the numeric columns.
# String columns (Name, Sex, Ticket, ...) are excluded explicitly: recent pandas
# versions raise on them instead of silently dropping them.
df.select_dtypes(include='number').corr().abs()

In [None]:
# Median age for every (Sex, Pclass) combination
age_by_pclass_sex = (
    df[['Sex', 'Pclass', 'Age']]
    .groupby(['Sex', 'Pclass'])
    .median()
)
age_by_pclass_sex

In [None]:
# Fill each missing age with the median age of passengers of the same sex and class.
# `transform` returns a Series aligned to df's own index, so the fill is safe on every
# pandas version; groupby.apply can prepend the group keys to the result's index in
# newer pandas, which breaks the column assignment.
group_median_age = df.groupby(['Sex', 'Pclass'])['Age'].transform('median')
df['Age'] = df['Age'].fillna(group_median_age)

## Embarked
We will fill the missing values of embarked feature.

In [None]:
# Show the rows whose port of embarkation is missing
df[df['Embarked'].isna()]

In [None]:
# Fill the missing ports with 'S' (Southampton). The author's stated reason is that this is
# the port for one of the affected passengers ("martha evelyn").
# NOTE(review): the source of that fact is not shown in this notebook — confirm.
df['Embarked'] = df['Embarked'].fillna('S')

## Fare
It is missing for only one person.

In [None]:
# Show the row(s) whose fare is missing
df[df['Fare'].isna()]

In the correlation table above you can see that Fare is most strongly correlated with Pclass, Parch, Age and SibSp, so we can fill the missing value with the median fare of similar passengers. Let us group them by Pclass, Parch and SibSp.

In [None]:
# Median fare for every (Pclass, Parch, SibSp) combination; the missing fare is filled with
# the value for a 3rd-class passenger with no parents/children and no siblings/spouse aboard.
median_fare_by_group = df.groupby(['Pclass', 'Parch', 'SibSp'])['Fare'].median()
fare_for_alone_traveller_of_3rd_class = median_fare_by_group.loc[(3, 0, 0)]
df['Fare'] = df['Fare'].fillna(fare_for_alone_traveller_of_3rd_class)

## Cabin 
About 77% of the Cabin values are missing, so dropping the column would be a sensible choice. However, the first letter of a cabin number identifies the deck the cabin was on, and some decks may have had higher survival rates than others. Let us look at how passengers are distributed across decks.

In [None]:
cabin_decks = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T', 'M']
def map_to_deck(cabin: str) -> str:
    """Return the first entry of `cabin_decks` (in list order) that occurs anywhere in `cabin`.

    If no deck letter occurs in `cabin`, the string is returned unchanged.
    """
    matching_decks = (letter for letter in cabin_decks if letter in cabin)
    return next(matching_decks, cabin)
# Encode missing cabins as the placeholder deck 'M', reduce every cabin string to its
# deck letter, then fold deck 'T' into deck 'A'.
df['Cabin'] = df['Cabin'].fillna('M')
df['Cabin'] = df['Cabin'].apply(map_to_deck)
df['Cabin'] = df['Cabin'].replace('T', 'A')

# Survival rate per *known* deck. Rows with the placeholder deck 'M' are excluded
# (the previous check compared against "Missing", which never occurs, so the placeholder
# was plotted as if it were a real deck). Rows without a label (Survived is NaN, i.e. the
# test rows) are excluded as well. sort=False keeps decks in order of first appearance.
labelled_known_deck = df[(df['Cabin'] != 'M') & df['Survived'].notna()]
survival_by_deck = (
    labelled_known_deck.groupby('Cabin', sort=False)['Survived'].mean().to_dict()
)
sns.barplot(x=list(survival_by_deck.keys()), y=list(survival_by_deck.values()))

It turns out that passengers on decks D, B and E were the most likely to survive, whereas passengers on decks A and G had less than a 50% chance of survival.

In [None]:
# Number of passengers for every (Pclass, deck) combination
df.groupby(['Pclass', 'Cabin']).size()

It turns out that decks A, B and C were reserved for 1st class passengers. We will therefore group the decks according to the passenger classes they hold.

In [None]:
# Merge individual decks into three groups; any other value (e.g. the 'M' placeholder) is left as is.
deck_to_group = {
    'A': 'ABC', 'B': 'ABC', 'C': 'ABC',
    'D': 'DE', 'E': 'DE',
    'F': 'FG', 'G': 'FG',
}
df['Cabin'] = df['Cabin'].replace(deck_to_group)
df['Cabin'].value_counts()

In [None]:
# Remaining missing values per column after imputation
df.isna().sum()

In [None]:
# Split the cleaned frame back into its train and test parts (train_df / test_df are overwritten)
train_df, test_df = divide_df(df)

## Target Distribution

In [None]:
# Class balance of the target. Unlabelled rows (Survived is NaN) are ignored by value_counts.
# Working on the Series indexed by class label (0 = not survived, 1 = survived) avoids relying
# on the column names produced by value_counts().reset_index(), which differ between pandas versions.
survived_counts = df['Survived'].value_counts().sort_index()
survived_share = survived_counts / survived_counts.sum()

plt.figure(figsize=(8,6))
sns.barplot(x=survived_counts.index, y=survived_counts.values)
plt.title('Survival Percentage')
plt.ylabel('Number of passengers')
# '{:.2%}' multiplies by 100 before appending '%', so 0.6162 is shown as 61.62%.
plt.xticks((0,1), ['Not Survived {:.2%}'.format(survived_share.get(0.0, 0.0)), 'Survived {:.2%}'.format(survived_share.get(1.0, 0.0))])

In [None]:
# Side-by-side correlation heatmaps for the train and test splits.
# Only numeric columns are correlated: both frames still contain string columns at this
# point, and recent pandas versions raise on them instead of silently dropping them.
plt.figure(figsize=(10,10))

heatmap_panels = (('Train set correlations', train_df), ('Test set correlations', test_df))
for panel_number, (panel_title, frame) in enumerate(heatmap_panels, start=1):
    plt.subplot(1,2,panel_number)
    plt.title(panel_title)
    # `linewidths` is the parameter seaborn.heatmap documents for the cell borders.
    sns.heatmap(frame.select_dtypes(include='number').corr(), annot=True, linewidths=0.5, cmap='coolwarm')
plt.tight_layout()

In [None]:
plt.figure(figsize=(16,10))

# Left panel: age distribution of the training rows, split by outcome
plt.subplot(1,2,1)
for outcome, legend_label in ((1, 'Survived'), (0, 'Not Survived')):
    sns.distplot(a=train_df.loc[train_df['Survived'] == outcome, 'Age'], label=legend_label)
plt.title('Distribution of Age and Survival')
plt.legend()

# Right panel: age distribution of the train split against the test split
plt.subplot(1,2,2)
for frame, legend_label in ((train_df, 'Train Set'), (test_df, 'Test Set')):
    sns.distplot(a=frame['Age'], label=legend_label)
plt.title('Ages Test set vs train set')
plt.legend()

In [None]:
plt.figure(figsize=(16,6))

# Left panel: fare distribution of the training rows, split by outcome
plt.subplot(1,2,1)
for outcome, legend_label in ((1, 'Survived'), (0, 'Not Survived')):
    sns.distplot(a=train_df.loc[train_df['Survived'] == outcome, 'Fare'], label=legend_label)
plt.title('Distribution of Fare and Survival')
plt.legend()

# Right panel: fare distribution of the train split against the test split
plt.subplot(1,2,2)
for frame, legend_label in ((train_df, 'Train Set'), (test_df, 'Test Set')):
    sns.distplot(a=frame['Fare'], label=legend_label)
plt.title('Fares Test set vs train set')
plt.legend()

## Categorical Features

In [None]:
def plot_data(categoryA):
    """Draw, on the current axes, passenger counts per value of `categoryA`, split by 'Survived'.

    Counts are taken from the module-level `train_df`; the title is '<categoryA> vs Survival'.
    """
    counts = (
        train_df
        .groupby([categoryA, 'Survived'])
        .size()
        .reset_index(name='Count')
    )
    sns.barplot(data=counts, x=categoryA, y='Count', hue='Survived')
    plt.title('{} vs Survival'.format(categoryA))
    
# One panel per categorical feature, laid out on a 2 x 3 grid
plt.figure(figsize=(20,10))
categorical_features = ('Embarked', 'Sex', 'Pclass', 'SibSp', 'Parch', 'Cabin')
for panel_number, feature in enumerate(categorical_features, start=1):
    plt.subplot(2,3,panel_number)
    plot_data(feature)

plt.tight_layout()


# Feature Engineering

## Name
We will extract the title (Mr, Mrs, Miss, ...) from each name and use it as a feature in place of the full name.

In [None]:
# Honorifics recognised by to_title.
# NOTE(review): 'Lady' and 'Sir' were added because they appear to occur in the public
# Titanic data — confirm against df['Name'].
titles = ['Mr', 'Mrs', 'Ms','Master', 'Dr','Miss', 'Don', 'Capt', 'Col', 'Dona', 'Rev', 'Mlle', 'Mme', 'Major', 'Jonkheer', 'Countess', 'Lady', 'Sir']
def to_title(name: str) -> str:
    """Extract the honorific from a passenger name.

    Names are expected in the form 'Surname, Title. Given names'. The text between the
    first comma and the following period is taken as the title; for a multi-word title
    such as 'the Countess' only its last word is used. The title is matched as a whole
    word, so 'Mrs' is no longer reported as 'Mr', 'Dona' as 'Don', and fragments of a
    surname (e.g. 'Dr' in 'Drew') are no longer mistaken for a title.

    Returns the matched entry of `titles`, or `name` unchanged when no known title is found.
    """
    _, comma, remainder = name.partition(',')
    if comma:
        honorific_words = remainder.partition('.')[0].split()
        if honorific_words and honorific_words[-1] in titles:
            return honorific_words[-1]
    # Fallback for names that do not follow the 'Surname, Title.' layout:
    # return the first whole word (punctuation stripped) that is a known title.
    for word in name.replace(',', ' ').replace('.', ' ').split():
        word = word.strip('()"\'')
        if word in titles:
            return word
    return name
def replace_titles(x: pd.Series) -> str:
    """Collapse rare honorifics into 'Mr', 'Mrs' or 'Miss'.

    Parameters
    ----------
    x : one passenger row (as passed by `DataFrame.apply(..., axis=1)`); only its
        'Title' and 'Sex' entries are read.

    Returns
    -------
    The normalised title. Titles that are not listed below are returned unchanged.
    """
    title = x['Title']
    if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col', 'Sir']:
        return 'Mr'
    if title in ['Countess', 'Mme', 'Dona', 'Lady']:
        return 'Mrs'
    if title in ['Mlle', 'Ms']:
        return 'Miss'
    if title == 'Dr':
        # Compare case-insensitively: the previous exact match on 'Male' never succeeds
        # when Sex is stored in lower case, which sent every 'Dr' to 'Mrs'.
        # NOTE(review): the public Titanic CSV appears to use 'male'/'female' — confirm.
        return 'Mr' if str(x['Sex']).lower() == 'male' else 'Mrs'
    return title
# Step 1: raw title extracted from the name. Step 2: rare titles collapsed row by row
# (replace_titles also reads the row's 'Sex'). The last line previews the result.
df['Title'] = df['Name'].apply(to_title)
df['Title'] = df.apply(replace_titles, axis=1)
df.head(10)

In [None]:

# Passenger counts per (Title, Survived). The count column is named 'Count' explicitly
# (as plot_data does) instead of being addressed through the unnamed integer column 0,
# which also gives the y-axis a readable label.
title_df = df.groupby(['Title', 'Survived']).size().reset_index(name='Count')
plt.figure(figsize=(14,8))
sns.barplot(x='Title', y='Count', hue='Survived', data=title_df)
plt.title('Title vs Survival')

## Creating new features
A single family-size feature makes more sense than keeping SibSp and Parch as separate features, so we will create one by adding them together.

In [None]:
# Family size = siblings/spouses + parents/children aboard (0 means travelling alone).
df['Family_size'] = df['SibSp'] + df['Parch']

# Passenger counts per (family size, outcome); unlabelled rows (Survived is NaN) are
# dropped by groupby.
family_size_df = df.groupby(['Family_size', 'Survived']).size()
family_size_counts = family_size_df.reset_index(name='Count')

# Survival rate in percent for every family size that has labelled passengers.
# mean() ignores NaN labels; sizes with no labelled rows come out as NaN and are dropped.
survival_pct = df.groupby('Family_size')['Survived'].mean().dropna() * 100
sizes = list(survival_pct.index)

plt.figure(figsize=(16,10))
# `order` pins the bar order so the tick labels below line up with the bars. Labels are
# built only for sizes that are actually drawn; a fixed range(11) would mislabel the bars
# whenever some family size is absent from the data.
sns.barplot(x='Family_size', y='Count', hue='Survived', data=family_size_counts, order=sizes)
labels = ['Size {} \nSurvived {:.2f}%'.format(size, survival_pct.loc[size]) for size in sizes]
plt.xticks(range(len(sizes)), labels)
family_size_df.head(25)

So it seems that people travelling alone had only about a 30% chance of surviving, whereas the chances progressively improve for family sizes of 1, 2 and 3 (about 55%, 58% and 72% respectively). As family size increases further, the chances of survival drop sharply.

In [None]:
# Preview the frame with the new Title and Family_size columns
df.head()

## Binning Continuous Features

In [None]:
# Replace the raw fare with 13 quantile-based bins (pd.qcut).
# NOTE(review): this overwrites df['Fare'] in place, so re-running the cell would try to
# bin the already-binned column — re-run the notebook from the top instead.
df['Fare'] = pd.qcut(df['Fare'], 13)
df['Fare']

In [None]:
# Replace the raw age with 10 quantile-based bins (pd.qcut).
# NOTE(review): this overwrites df['Age'] in place, so re-running the cell would try to
# bin the already-binned column — re-run the notebook from the top instead.
df['Age'] = pd.qcut(df['Age'], 10)
df["Age"]

In [None]:
# Preview the frame after binning Fare and Age
df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

le_cols = ['Sex', 'Embarked','Title','Cabin', 'Age', 'Fare']
cols_to_drop = ['Name', 'Ticket']

# Integer-encode every categorical/binned column, using a fresh encoder per column
for column_name in le_cols:
    encoder = LabelEncoder()
    df[column_name] = encoder.fit_transform(df[column_name])

# Drop the free-text columns, then split back into the train and test parts
df = df.drop(columns=cols_to_drop)
train_df, test_df = divide_df(df)
train_df.head()

In [None]:
# Preview the encoded test split (divide_df has removed its 'Survived' column)
test_df.head()

In [None]:
# Target vector and feature matrix taken from the training split
y = train_df['Survived']
X = train_df.drop(columns=['Survived'])
print(X.shape, y.shape)
X.head()

In [None]:
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# model = RandomForestClassifier(random_state=0)
# param_grid = {
#     'n_estimators':[50,100,150,200],
#     'min_samples_split':[2,4,6,8],
#     'min_samples_leaf':[1,2,4, 6],
# }
# clf = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, scoring='accuracy')
# start = time.time()
# clf.fit(X,y)
# print('Grid Search took {}s'.format(time.time()-start))
# print(clf.best_params_)
# scores = cross_val_score(RandomForestClassifier(random_state = 0,**clf.best_params_,),X, y, cv=5, n_jobs=-1)
# print('Mean Accuracy {} Mean Std. {}'.format(scores.mean(), scores.std()))

In [None]:
# NOTE(review): presumably the best parameters found by the grid search that is kept
# commented out in the previous cell — confirm.
best_params = {'min_samples_leaf': 1, 'min_samples_split': 8, 'n_estimators': 50}

# PassengerId only identifies a row (see the feature descriptions at the top of the
# notebook), so it is kept out of the model's inputs. Selecting the same column list from
# both frames also guarantees that train and test features match in name and order.
feature_cols = [column for column in X.columns if column != 'PassengerId']

model = RandomForestClassifier(random_state=0, **best_params)
model.fit(X[feature_cols], y)
predictions = model.predict(test_df[feature_cols])
predictions

In [None]:
# Build the submission frame. The target became float when the train and test rows were
# concatenated (test rows have NaN labels), so the model predicts 0.0 / 1.0; cast back to
# integer labels so the file contains 0 / 1.
# NOTE(review): the competition's submission format is assumed to expect integer labels — confirm.
output = pd.DataFrame({'PassengerId':test_df.PassengerId, 'Survived':np.asarray(predictions).astype(int)})
output.head()

In [None]:
# Write the predictions to submission.csv in the current working directory, without the index column
output.to_csv('submission.csv', index=False)