### Initial exploration of titanic use case data

**Objective**: Validate the data quality

**Results**: 

    - There are 891 samples in the train dataset and 418 in the test dataset  
    - Some columns have missing values (age, cabin and embarked)  
    - Some columns contain only unique values; they are probably IDs  
    - Some columns have a very low count of unique values; they are probably categories
    - The test dataset has no 'Survived' column

In [None]:
# Import the tools
import os 

import pandas as pd  

In [None]:
# Constants

# Root directory of the Titanic data. The default is the original author's
# local path; set the TITANIC_DATA_PATH environment variable to point the
# notebook at another location without editing this cell.
DATA_PATH = os.environ.get(
    'TITANIC_DATA_PATH',
    '/home/wedeueis/Projetos/Artefact/Meetup/titanic/data/',
)

# CSV file locations, relative to DATA_PATH
TRAIN_FILE = "raw/train.csv"
TEST_FILE = "raw/test.csv"

In [None]:
def variable_description(df, var_name):
    """Compute basic data-quality metrics for one column of a DataFrame.

    Args:
        df: pandas DataFrame that contains the column.
        var_name: Label of the column to describe.

    Returns:
        A dict (a single record, not a DataFrame) with the keys ``'name'``,
        ``'data type'``, ``'unique values'``, ``'% of unique values'``,
        ``'missing values'`` and ``'% of missing values'``. Percentages are
        relative to the number of rows and are 0.0 for an empty column.

    Note:
        The unique count comes from ``Series.unique()``, which keeps NaN, so
        a column with missing entries counts the missing value as one
        distinct value.
    """
    column = df[var_name]
    n_rows = len(column)
    # Scan the column once per metric and reuse the counts below.
    n_unique = len(column.unique())
    n_missing = int(column.isna().sum())

    def percentage(count):
        # An empty column has no rows to divide by; report 0% instead of
        # raising ZeroDivisionError.
        return (count / n_rows) * 100 if n_rows else 0.0

    return {
        'name': var_name,
        'data type': column.dtype,
        'unique values': n_unique,
        '% of unique values': percentage(n_unique),
        'missing values': n_missing,
        '% of missing values': percentage(n_missing),
    }

#### Train Data

In [None]:
# Load the training split (the data used to fit the model), report its row
# count and preview the first rows.
train = pd.read_csv(
    os.path.join(DATA_PATH, TRAIN_FILE)
)
print(f"Number of training samples: {train.shape[0]}")
train.head()

In [None]:
# Descriptive statistics: DataFrame.describe() summary of the training data
# (by default pandas summarizes only the numeric columns when any are present)
train.describe()

In [None]:
# Overall quality: one row of metrics per column of the training data.
# The per-column records are collected first and the DataFrame is built once;
# growing it with pd.concat inside a loop re-copies the whole frame on every
# iteration.
train_quality_df = pd.DataFrame(
    [variable_description(train, col) for col in train.columns]
)
train_quality_df

#### Test Data

In [None]:
# Load the test split (the data used to measure model performance), report
# its row count and preview the first rows.
test = pd.read_csv(
    os.path.join(DATA_PATH, TEST_FILE)
)
print(f"Number of test samples: {test.shape[0]}")
test.head()

In [None]:
# Descriptive statistics: DataFrame.describe() summary of the test data
# (by default pandas summarizes only the numeric columns when any are present)
test.describe()

In [None]:
# Overall quality: one row of metrics per column of the test data.
# The per-column records are collected first and the DataFrame is built once;
# growing it with pd.concat inside a loop re-copies the whole frame on every
# iteration.
test_quality_df = pd.DataFrame(
    [variable_description(test, col) for col in test.columns]
)
test_quality_df