## Import libraries

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns


import os
# Print the full path of every file found below the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    file_paths = (os.path.join(dirname, filename) for filename in filenames)
    for file_path in file_paths:
        print(file_path)

In [None]:
# Seaborn settings: dark grid background and the 'deep' colour palette for the plots below
sns.set_style('darkgrid')
sns.set_palette('deep')

## Load data

In [None]:
# Read the train and test splits from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/titanic/train.csv')
test = pd.read_csv('/kaggle/input/titanic/test.csv')

In [None]:
# (rows, columns) of each split
train.shape, test.shape

In [None]:
# Preview the first rows of the training data
train.head()

# Exploratory Data Analysis

In [None]:
# Column dtypes and non-null counts
train.info()

### Overview
* Target variable: Survived. 
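* Categorical variables: Pclass, Sex, Cabin, Embarked
* Numerical variables: Age and Fare (continuous); SibSp and Parch (discrete counts)
* PassengerId, Name and Ticket are not used directly as features (Name is mined for a Title feature in the Feature Engineering section)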

In [None]:
# Correlation matrix heat map of the numeric features.
# Numeric columns are selected explicitly: pandas >= 2.0 no longer drops non-numeric
# columns (e.g. Sex, Embarked) silently in `.corr()` and raises instead.
numeric_features = train.select_dtypes(include='number').drop(columns=['PassengerId'])
corr = numeric_features.corr()
f, ax = plt.subplots(figsize=(5, 5))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
# vmax=.3 saturates the colour scale at 0.3, so stronger correlations share one colour;
# the annotated numbers still show the exact values.
sns.heatmap(corr, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, annot=True, ax=ax);

* Survived correlates with both Fare and Pclass
* Fare and Pclass are correlated with each other (both indicate the socio-economic level of the passenger)

### Survived
* Indicates whether the passenger survived the shipwreck or not.
* Target variable. This is what we want to predict. 


In [None]:
# Class balance of the target variable
train['Survived'].value_counts()

### Pclass
* Ticket class in which the passenger is traveling
* Three categories: 1st class (wealthier), 2nd class, 3rd class (poorer)
* Do wealthier people have a higher survival probability?

In [None]:
# Number of passengers per ticket class
train['Pclass'].value_counts()

#### vs Survived

In [None]:
# Survival counts per ticket class (red = did not survive, green = survived).
# `x` is passed by keyword: seaborn >= 0.12 no longer accepts it positionally.
sns.countplot(x='Pclass', hue='Survived', palette=['firebrick', 'seagreen'], data=train);

* Passengers with a 3rd class ticket tend to die considerably more often than those in the other ticket classes
* This seems a relevant variable for the model

### Age

* Age of the passenger
* We know about the "women and children first" rescue protocol, so younger passengers should survive more often than older ones.

In [None]:
# Number of passengers with no recorded age
train['Age'].isna().sum()

* There are 177 missing values for age. We could infer it from other variables afterwards

In [None]:
# Histogram of passenger ages
# NOTE(review): `distplot` is deprecated since seaborn 0.11 (replacements: `histplot` /
# `displot`) — migrate once the minimum seaborn version of the environment is confirmed.
sns.distplot(train['Age']);

#### vs Survived

In [None]:
# Overlay the age distribution of survivors and non-survivors on the same axes.
# NOTE(review): `distplot` is deprecated since seaborn 0.11 — consider `histplot` later.
for outcome, outcome_label in ((1, 'Survived'), (0, 'Not Survived')):
    outcome_ages = train[train['Survived'] == outcome].Age
    sns.distplot(outcome_ages, bins=20, label=outcome_label)

plt.title('Survived vs not survived age passengers dist.')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.legend(prop={'size': 12});

* Both distributions are quite similar, except that children (approx. 0-10 years old) have a higher survival probability, as expected from the "women and children first" protocol.
* Young adults (20-30 years old) seem the most likely to die.

### Sex
* Sex of the passenger

In [None]:
# Number of passengers per sex.
# `x` is passed by keyword: seaborn >= 0.12 no longer accepts it positionally.
sns.countplot(x='Sex', data=train);

In [None]:
# Survival counts per sex (red = did not survive, green = survived).
# `x` is passed by keyword: seaborn >= 0.12 no longer accepts it positionally.
sns.countplot(x='Sex', hue='Survived', palette=['firebrick', 'seagreen'], data=train);

* Women survive more often than men, just like children, as expected.

### Cabin

In [None]:
# Missing Cabin values next to the frame shape, to judge the missing fraction
train['Cabin'].isna().sum(), train.shape

* Most of the Cabin values are missing (around 77%), we are going to ignore this variable for now

### Embarked 
* Port where the passenger embarked
* Three categories: C = Cherbourg, Q = Queenstown, S = Southampton

In [None]:
# Number of passengers with no recorded embarkation port
train['Embarked'].isna().sum()

* There are just two missing values. We can fill them with the port from which most passengers embarked (Southampton)

In [None]:
# Fill the missing embarkation ports with the most frequent port.
embarked_mode = train['Embarked'].mode()[0]
# Assign the result back instead of calling fillna(inplace=True) on the column selection:
# the in-place form acts on an intermediate object and no longer updates `train`
# when pandas Copy-on-Write is active. Re-running this cell is harmless.
train['Embarked'] = train['Embarked'].fillna(embarked_mode)

In [None]:
# Sanity check: should be 0 after the fill above
train['Embarked'].isna().sum()

In [None]:
# Number of passengers per embarkation port.
# `x` is passed by keyword (seaborn >= 0.12 no longer accepts it positionally). The
# two-colour red/green palette was dropped here: it encodes Survived in the other plots,
# and with three ports and no hue it would only cycle colours misleadingly.
sns.countplot(x='Embarked', data=train);

In [None]:
# Survival counts per embarkation port (red = did not survive, green = survived).
# `x` is passed by keyword: seaborn >= 0.12 no longer accepts it positionally.
sns.countplot(x='Embarked', hue='Survived', palette=['firebrick', 'seagreen'], data=train);

* The survival rate is around 50% for both Cherbourg and Queenstown, but most of the passengers who embarked at Southampton died.

In [None]:
# Ticket-class composition of each embarkation port.
# `x` is passed by keyword: seaborn >= 0.12 no longer accepts it positionally.
sns.countplot(x='Embarked', hue='Pclass', data=train);

* The low survival ratio could come from the fact that many 3rd class passengers embarked at Southampton

## Continuous variables
### Fare
* Ticket fare
* We can transform this variable into a categorical one by splitting the fare values into bins.

In [None]:
# Number of passengers with no recorded fare
train['Fare'].isna().sum()

In [None]:
# Box plot of the fares (horizontal).
# The Series is passed explicitly as `x`: in seaborn >= 0.12 the first positional
# argument is `data`, which would change how the Series is interpreted.
sns.boxplot(x=train['Fare']);

In [None]:
# Histogram of the fares
# NOTE(review): `distplot` is deprecated since seaborn 0.11 (replacements: `histplot` /
# `displot`) — migrate once the minimum seaborn version of the environment is confirmed.
sns.distplot(train['Fare']);

* Fare has a very high skewness to the right.

#### vs Survived

In [None]:
# Fare distribution for non-survivors (0) and survivors (1).
# `x`/`y` are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.boxplot(x='Survived', y='Fare', data=train);

* Passengers with high-fare tickets tend to survive more often than passengers with low-fare ones

### Sibsp and Parch
* SibSp = Number of siblings and spouses aboard
* Parch = Number of parents/children aboard
* We'll combine them into a single variable called Family

In [None]:
# Family = siblings/spouses + parents/children aboard, added to both splits.
for df in (train, test):
    df['Family'] = df['SibSp'].add(df['Parch'])

In [None]:
# Number of passengers per family size (0 = travelling alone).
# `x` is passed by keyword: seaborn >= 0.12 no longer accepts it positionally.
sns.countplot(x='Family', data=train);

* Most travel alone

In [None]:
# Survival counts per family size (red = did not survive, green = survived).
plt.figure(figsize=(15, 5))
# `x` is passed by keyword: seaborn >= 0.12 no longer accepts it positionally.
sns.countplot(x='Family', hue='Survived', palette=['firebrick', 'seagreen'], data=train);

* It seems passengers travelling with their families tend to survive more often. This may be because women and children did not usually travel alone.

In [None]:
# Family-size counts side by side for three groups: males, females and
# children (Age < 10). The groups overlap: children are also counted by sex.
fig, ax = plt.subplots(1, 3, figsize=(15, 5))
ax[0].set_title("Men")
ax[1].set_title("Women")
ax[2].set_title("Children")
# `x` is passed by keyword: seaborn >= 0.12 no longer accepts it positionally.
sns.countplot(x='Family', data=train[train['Sex'] == 'male'], ax=ax[0])
sns.countplot(x='Family', data=train[train['Sex'] == 'female'], ax=ax[1])
sns.countplot(x='Family', data=train[train['Age'] < 10], ax=ax[2]);

In [None]:
# Survival counts per family size for adult men only (male and Age > 15;
# rows with a missing age are excluded by the comparison).
adult_men = train[(train['Sex'] == 'male') & (train['Age'] > 15)]
# `x` is passed by keyword: seaborn >= 0.12 no longer accepts it positionally.
sns.countplot(x='Family', hue='Survived', palette=['firebrick', 'seagreen'], data=adult_men)
plt.legend(loc='upper right')
plt.title('Adult men: Family vs Survived');

## Feature Engineering
### Title

* Some passengers have titles in their names. We'll extract them from the 'Name' column and turn them into a new categorical variable.

In [None]:
# Look at the raw name format before extracting titles from it
train['Name'].head()

In [None]:
# Extract the raw title: the text between the comma and the first period of each name.
# Surrounding whitespace is stripped so the values match the keys of Title_Dictionary
# (get_titles strips in the same way).
titles = train['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())
titles.head()

In [None]:
# Frequency of each raw title
titles.value_counts()

In [None]:
# Maps each raw title found in a passenger name to one of six categories:
# [Mr, Miss, Mrs, Master, Royalty, Officer]
Title_Dictionary = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    # Feminine form of "Don"; without this entry such names would map to NaN.
    # NOTE(review): presumably needed for the test split — confirm with test['Title'].isna().sum()
    "Dona": "Royalty",
    "Sir" : "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "the Countess":"Royalty",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr" : "Mr",
    "Mrs" : "Mrs",
    "Miss" : "Miss",
    "Master" : "Master",
    "Lady" : "Royalty"
}

def get_titles(df):
    """Add a 'Title' column with the title category of each passenger.

    The raw title is the text between the first comma and the following period
    of the 'Name' column (e.g. "Braund, Mr. Owen Harris" -> "Mr"), stripped of
    surrounding whitespace and then mapped through Title_Dictionary.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string 'Name' column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with the 'Title' column added or overwritten.
        Raw titles that are not keys of Title_Dictionary become NaN.
    """
    raw_titles = df['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())
    df['Title'] = raw_titles.map(Title_Dictionary)
    return df

In [None]:
# Add the Title column to both splits (get_titles modifies each frame in place)
for df in [train, test]:
    get_titles(df)

In [None]:
# Inspect the new Title column
train['Title']

In [None]:
# Columns of `train` after adding Family and Title
train.columns

### Missing values: Age

* We are going to impute the missing age values from other variables, so we need to look for variables that correlate with age.

In [None]:
# Correlation matrix heat map of the numeric features, without the target,
# to look for variables related to Age.
# Numeric columns are selected explicitly: pandas >= 2.0 no longer drops non-numeric
# columns (e.g. Sex, Embarked, Title) silently in `.corr()` and raises instead.
numeric_features = train.select_dtypes(include='number').drop(columns=['PassengerId', 'Survived'])
corr = numeric_features.corr()
f, ax = plt.subplots(figsize=(5, 5))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
# vmax=.3 saturates the colour scale at 0.3; the annotations still show exact values.
sns.heatmap(corr, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, annot=True, ax=ax);

In [None]:
# Age distribution per ticket class.
# `x`/`y` are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.boxplot(x='Pclass', y='Age', data=train);

* Age correlates with Pclass, so we use it to impute the missing age values.

In [None]:
# Mean age per title category (pivot_table aggregates with the mean by default),
# sorted from youngest to oldest
title_age = pd.pivot_table(train, index='Title', values='Age').sort_values(by='Age', ascending=True)
title_age

In [None]:
# Age distribution per title category, ordered by mean age.
# `x`/`y` are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.boxplot(x='Title', y='Age', data=train, order=title_age.index);

* Title seems a good variable for estimating age, so we use both Title and Pclass to compute the missing age values.

In [None]:
# Mean age for every (Title, Pclass) combination; the result has a
# (Title, Pclass) MultiIndex and a single 'Age' column
title_pclass_age = pd.pivot_table(train, index=['Title', 'Pclass'], values='Age')
title_pclass_age

In [None]:
# Impute missing ages with the mean age of passengers sharing the same Title and Pclass.
missing_age = train['Age'].isna()
# Left-join the (Title, Pclass) -> mean Age table onto the rows lacking an age. The join
# keeps the original row index, so the result aligns with `train` on assignment;
# combinations absent from the table yield NaN instead of raising a KeyError.
filled_age = train.loc[missing_age, ['Title', 'Pclass']].join(
    title_pclass_age, on=['Title', 'Pclass'])['Age']
# Write back ONLY the Age column: assigning a one-column frame to `train[missing_age]`
# targets every column of those rows, not just Age.
train.loc[missing_age, 'Age'] = filled_age

In [None]:
# Sanity check: remaining missing ages after imputation
train['Age'].isna().sum()

In [None]:
# Columns of `train` after the age imputation step
train.columns

In [None]:
#
# TODO: decide whether to bin Fare and Age into categories
#