In [None]:
from collections import Counter
from pprint import pprint

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import utility as ut

## Read data

In [None]:
# Load the training CSV; the PassengerId column is used as the row index
# rather than being kept as a regular column.
df = pd.read_csv("../data/train.csv", index_col="PassengerId")

## Data exploration

### General
- how many records (rows) does the data set have?
- how many features (columns) does the data set have?
- do you know the meaning of each feature?
- what are the types of the features (categorical / numerical / ordered), are they as expected?
- do we have missing values?
- have a look at the first rows of the data set

### Univariate
- for numerical features: show histogram and summary statistics
- for categorical data: show value count or bar diagram of the occurrences of each category

### Bivariate
- show scatter plot and correlation between features
- especially consider the relationship between each feature and the target variable

In [None]:
# (number of rows, number of columns) of the loaded frame
df.shape

- the data set has 891 records (rows) and 11 features (columns) including the target variable (`Survived`)

In [None]:
# Per-column dtype and non-null count; used to spot columns with missing values
df.info()

- the data set has 6 numerical and 5 object features
- the data type of each column matches the expectation
- we have missing values in the following columns
    - `Age`
    - `Cabin` (over 75% of the data missing)
    - `Embarked` (only 2 values missing)

In [None]:
# Preview the first 10 rows
df.head(10)

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns
df.describe()

In [None]:
# Univariate view of the numeric features: one histogram figure per column.
numerical = df.select_dtypes(include=[np.number])
hist_options = {"figsize": (20, 10), "legend": True, "align": "mid"}
for col, values in numerical.items():
    # legend=True labels the bars with the column name, so each figure
    # identifies its feature without an explicit title.
    values.hist(**hist_options)
    plt.show()

- only 38% survived, so the constant estimator P(x) = 0 (always predicting "did not survive") already gives 62% accuracy
- we have 3 ticket classes with relations 2 : 2 : 5
- age distribution is centered around 30 and is skewed to the right
- number of siblings and spouses is mostly 0 (600 times), followed by 1 (200 times) and the others are exceptions
- number of parents and children is also mostly 0 (660 times), followed by 1 (120 times), then 2 (80 times) and the others are exceptions
- fare is centered around 14 with most values below 100

In [None]:
# Univariate view of the text (object-dtype) features: a frequency table per column.
#
# Series.value_counts replaces the hand-rolled Counter over the column:
#   - rows are ordered from most to least frequent,
#   - dropna=False reports missing entries as a single explicit row,
#   - pandas abbreviates long tables when printing, so columns with very many
#     distinct values do not flood the cell output with every value.
categorical = df.select_dtypes(include="object")
for col in categorical:
    print(col)
    print(df[col].value_counts(dropna=False))
    print()  # blank line between the tables of consecutive columns

- from the names we could extract some further information, otherwise drop this column
- we have 577 male and 314 female passengers
- from the ticket we could extract some further information, otherwise drop this column
- from the cabin we could extract some further information, otherwise drop this column
- 644 embarked in Southampton, 168 in Cherbourg and 77 in Queenstown

In [None]:
# Bivariate view: survival counts split by each low-cardinality feature.
# Every feature gets its own figure, with one panel (facet column) per category.
categories = ["Embarked", "Sex", "Pclass"]
count_plot_options = {"data": df, "kind": "count", "x": "Survived"}
for col in categories:
    # catplot's `col` argument names the feature that defines the panels.
    sns.catplot(col=col, **count_plot_options)
    plt.show()