# Data Exploration Starter
## Data Exploration for the Titanic Dataset
### Fork from the work of Sergei Neviadomski  (https://www.kaggle.com/adamjm32/titanic-data-exploration-starter)
### Also inspired by and forked from the notebook presented (https://www.kaggle.com/startupsci/titanic-data-science-solutions)

In [1]:
### Necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Seaborn style: apply seaborn's built-in "whitegrid" theme
sns.set_style("whitegrid")

ModuleNotFoundError: No module named 'pandas'

In [None]:
### Load the training and test sets
train_df = pd.read_csv('data/train.csv') # no index_col: PassengerId stays an ordinary column
test_df = pd.read_csv('data/test.csv') # no index_col here either


### Preview the first rows to confirm the load worked

train_df.head()

In [None]:
# Print the DataFrame.info() summary of the training frame
train_df.info()
# Visual separator (40 underscores) between the two reports
print('_'*40)
# Same summary for the test frame
test_df.info()

In [None]:
### Let's check out the summary statistics of the training set

train_df.describe()


In [None]:
### Summary statistics of the test set
test_df.describe()

# Assumptions, Correlations, Completions and Correcting 

## What are some assumptions we can likely make about the dataset?

## What are some correlations that may be present in the dataset?

## What data should we definitely focus on completing (fix missing data)?

## What data could we consider dropping?

## What new features could we create?




### Correlations 

In [None]:
# Mean of Survived for each Pclass value, sorted from highest to lowest
(
    train_df[['Pclass', 'Survived']]
    .groupby(['Pclass'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

## Exercise: Check the correlation of Sex vs Survived and of SibSp vs Survived

In [None]:
# Mean of Survived for each Sex value, sorted from highest to lowest.
# NOTE(review): the original line stopped at an unclosed parenthesis; it is
# completed here with the same groupby/mean/sort pattern as the Pclass cell.
# If the stub was left open on purpose for the exercise, keep the stub instead.
(
    train_df[["Sex", "Survived"]]
    .groupby(['Sex'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Mean of Survived for each SibSp value, sorted from highest to lowest.
# NOTE(review): the original cell only selected the two columns; the
# aggregation is added so the cell answers the exercise the same way the
# Pclass cell does. If the stub was intentional, keep the stub instead.
(
    train_df[["SibSp", "Survived"]]
    .groupby(['SibSp'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Read the training file into a separate frame with PassengerId as the row index
train_data = pd.read_csv('data/train.csv', index_col='PassengerId')

In [None]:
### Count the missing (null / NaN) values in each column
train_data.isnull().sum()

In [None]:
### Column groups that the plotting cells iterate over
# Numeric features
numeric_features = ['Age', 'Fare']
# Categorical features
ordinal_features = ['Pclass', 'SibSp', 'Parch'] # multi-class categories; also included in the correlation matrix
nominal_features = ['Sex', 'Embarked'] # unordered categories (NOTE(review): labelled "binary" before, but Embarked presumably has more than two values — confirm)

In [None]:
### Readable label for the target: 0 -> 'Not Survived', 1 -> 'Survived'
train_data['target_name'] = train_data['Survived'].map(
    {0: 'Not Survived', 1: 'Survived'}
)

### Target Exploration

In [None]:
### Target variable exploration
# Bar chart of how many rows fall under each target_name label.
# The column is passed by keyword (x=..., data=...), like the other seaborn
# calls in this notebook, so the call does not depend on what seaborn takes
# as its first positional argument.
# NOTE(review): newer seaborn releases reportedly no longer treat the first
# positional argument as x — confirm against the installed version.
sns.countplot(x='target_name', data=train_data)
plt.xlabel('Survived?')
plt.ylabel('Number of occurrences')
plt.show()

### Correlation between features (variables)

In [None]:
### Correlation matrix heatmap
# Pairwise correlations of the numeric and ordinal columns, rounded to 2 decimals
cor_matrix = (
    train_data[numeric_features + ordinal_features]
    .corr()
    .round(2)
)
# Single 12x12 figure holding one axes for the heatmap
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(
    cor_matrix,
    annot=True,
    center=0,
    cmap=sns.diverging_palette(250, 10, as_cmap=True),
    ax=ax,
)
plt.show()

### Numeric Features Exploration

In [None]:
### Plotting Numeric Features
# One 2x2 figure per numeric feature:
#   top-left      distribution of the feature (missing values dropped)
#   top-right     the same distribution drawn separately per outcome
#   bottom-left   barplot of the feature per outcome
#   bottom-right  boxplot of the feature per outcome
# NOTE(review): sns.distplot is deprecated in recent seaborn releases —
# confirm the installed version still provides it.
for column in numeric_features:
    fig = plt.figure(figsize=(18, 12))
    # Super title (one for the whole figure)
    fig.suptitle('Plots for ' + column, fontsize=18)

    ### Distribution plot
    dist_ax = plt.subplot(221)
    sns.distplot(train_data[column].dropna(), ax=dist_ax)
    dist_ax.set_xlabel(column, fontsize=14)
    dist_ax.set_ylabel('Density', fontsize=14)

    ### Distribution per Survived / Not Survived value (both on one axes)
    split_ax = plt.subplot(222)
    for outcome, colour, label in ((0, 'red', 'Not Survived'),
                                   (1, 'blue', 'Survived')):
        subset = train_data.loc[train_data.Survived == outcome, column].dropna()
        sns.distplot(subset, color=colour, label=label, ax=split_ax)
    split_ax.legend(loc='best')
    split_ax.set_xlabel(column, fontsize=14)
    split_ax.set_ylabel('Density per Survived / Not Survived Value', fontsize=14)

    ### Average column value per Survived / Not Survived value
    mean_ax = plt.subplot(223)
    sns.barplot(x="target_name", y=column, data=train_data, ax=mean_ax)
    mean_ax.set_xlabel('Survived or Not Survived?', fontsize=14)
    mean_ax.set_ylabel('Average ' + column, fontsize=14)

    ### Boxplot of column per Survived / Not Survived value
    box_ax = plt.subplot(224)
    sns.boxplot(x="target_name", y=column, data=train_data, ax=box_ax)
    box_ax.set_xlabel('Survived or Not Survived?', fontsize=14)
    box_ax.set_ylabel(column, fontsize=14)

    # Render the finished figure
    plt.show()

### Categorical (Ordinal) Features Exploration

In [None]:
### Plotting Categorical Features
# One figure per ordinal feature:
#   top-left   barplot of the feature per outcome
#   top-right  boxplot of the feature per outcome
#   middle     row counts per (feature value, outcome) pair with percent labels
#   bottom     pointplot of Survived for every feature value


def _label_bar_shares(ax):
    """Write above each bar its share of the bars drawn at the same x position.

    Bars are read from ``ax.patches``. A NaN height is counted as 0, and
    zero-width patches are skipped because there is no visible bar to label.
    Bars are grouped by rounding their horizontal centre to the nearest
    integer, so the totals do not depend on how many patches there are or on
    their order.
    NOTE(review): this assumes the categories sit at integer x positions —
    confirm for the installed seaborn version.
    """
    bars = [p for p in ax.patches if p.get_width() > 0]
    heights = [0 if np.isnan(p.get_height()) else p.get_height() for p in bars]
    slots = [int(round(p.get_x() + p.get_width() / 2)) for p in bars]

    # Total height of all bars that share an x position
    totals = {}
    for slot, bar_height in zip(slots, heights):
        totals[slot] = totals.get(slot, 0) + bar_height

    for p, slot, bar_height in zip(bars, slots, heights):
        if not totals[slot]:
            # Nothing was counted at this position, so a share is undefined
            continue
        ax.text(p.get_x() + p.get_width() / 2, bar_height * 1.01 + 10,
                '{:1.0%}'.format(bar_height / totals[slot]), ha="center", size=14)


# Looping through and plotting ordinal features
for column in ordinal_features:
    # Figure initiation
    fig = plt.figure(figsize=(18, 18))
    # Super title (one for the whole figure)
    plt.suptitle('Plots for ' + column, fontsize=18)

    ### Average column value per Survived / Not Survived value
    sns.barplot(x="target_name", y=column, data=train_data, ax=plt.subplot(321))
    plt.xlabel('Survived or Not Survived?', fontsize=14)
    plt.ylabel('Average ' + column, fontsize=14)

    ### Boxplot of column per Survived / Not Survived value
    sns.boxplot(x="target_name", y=column, data=train_data, ax=plt.subplot(322))
    plt.xlabel('Survived or Not Survived?', fontsize=14)
    plt.ylabel(column, fontsize=14)

    ### Number of occurrences per category - target pair
    ax = sns.countplot(x=column, hue="target_name", data=train_data, ax=plt.subplot(312))
    plt.xlabel(column, fontsize=14)
    plt.ylabel('Number of occurrences', fontsize=14)
    # Legend in the upper-right corner
    plt.legend(loc=1)

    ### Adding percents over bars
    _label_bar_shares(ax)

    ### Survived percentage for every value of feature
    sns.pointplot(x=column, y='Survived', data=train_data, ax=plt.subplot(313))
    plt.xlabel(column, fontsize=14)
    plt.ylabel('Survived Percentage', fontsize=14)
    # Printing chart
    plt.show()

### Categorical (Nominal) Features Exploration

In [None]:
### Plotting Categorical Features
# One figure per nominal feature:
#   top     row counts per (feature value, outcome) pair with percent labels
#   bottom  pointplot of Survived for every feature value


def _label_bar_shares(ax):
    """Write above each bar its share of the bars drawn at the same x position.

    Bars are read from ``ax.patches``. A NaN height is counted as 0, and
    zero-width patches are skipped because there is no visible bar to label.
    Bars are grouped by rounding their horizontal centre to the nearest
    integer, so the totals do not depend on how many patches there are or on
    their order.
    NOTE(review): this assumes the categories sit at integer x positions —
    confirm for the installed seaborn version.
    """
    bars = [p for p in ax.patches if p.get_width() > 0]
    heights = [0 if np.isnan(p.get_height()) else p.get_height() for p in bars]
    slots = [int(round(p.get_x() + p.get_width() / 2)) for p in bars]

    # Total height of all bars that share an x position
    totals = {}
    for slot, bar_height in zip(slots, heights):
        totals[slot] = totals.get(slot, 0) + bar_height

    for p, slot, bar_height in zip(bars, slots, heights):
        if not totals[slot]:
            # Nothing was counted at this position, so a share is undefined
            continue
        ax.text(p.get_x() + p.get_width() / 2, bar_height * 1.01 + 10,
                '{:1.0%}'.format(bar_height / totals[slot]), ha="center", size=14)


# Looping through and plotting nominal features
for column in nominal_features:
    # Figure initiation
    fig = plt.figure(figsize=(18, 12))
    # Super title (one for the whole figure)
    plt.suptitle('Plots for ' + column, fontsize=18)

    ### Number of occurrences per category - target pair
    ax = sns.countplot(x=column, hue="target_name", data=train_data, ax=plt.subplot(211))
    plt.xlabel(column, fontsize=14)
    plt.ylabel('Number of occurrences', fontsize=14)
    # Legend in the upper-right corner
    plt.legend(loc=1)

    ### Adding percents over bars
    _label_bar_shares(ax)

    ### Survived percentage for every value of feature
    sns.pointplot(x=column, y='Survived', data=train_data, ax=plt.subplot(212))
    plt.xlabel(column, fontsize=14)
    plt.ylabel('Survived Percentage', fontsize=14)
    # Printing chart
    plt.show()