### Initial exploration of the Titanic use case data

**Objective**: Validate the data quality

**Results**: 

    - There are 891 samples in the train dataset and 418 in the test dataset  
    - Some columns have missing values (Age, Cabin and Embarked, plus one Fare value in the test dataset)  
    - Some columns contain only unique values; they are probably IDs  
    - Some columns have a very low count of unique values; they are probably categories
    - For the test dataset there is no 'Survived' column

In [53]:
# Import the tools
import os 

import pandas as pd  

In [54]:
# Directory holding the raw CSV files; the path is relative, so it is resolved
# against the kernel's current working directory.
RAW_PATH = "../../data/raw/"

In [55]:
def variable_description(df, var_name):
    """Compute uniqueness and missing-value metrics for one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to describe.
    var_name : str
        Label of the column in ``df``.

    Returns
    -------
    dict
        One record (a plain dict, not a DataFrame) with the keys ``'name'``,
        ``'data type'``, ``'unique values'``, ``'% of unique values'``,
        ``'missing values'`` and ``'% of missing values'``. Percentages are
        expressed on a 0-100 scale relative to the number of rows; for an
        empty column they are reported as ``0.0``.

    Raises
    ------
    KeyError
        If ``var_name`` is not a column of ``df``.
    """
    column = df[var_name]
    n_rows = len(column)
    # Series.unique() keeps NaN, so a column with missing values counts NaN
    # as one additional distinct value.
    n_unique = len(column.unique())
    n_missing = int(column.isna().sum())

    def _percent(count):
        # An empty column has no meaningful ratio; report 0.0 rather than
        # failing with ZeroDivisionError.
        return (count / n_rows) * 100 if n_rows else 0.0

    return {
        'name' : var_name,
        'data type': column.dtype,
        'unique values': n_unique,
        '% of unique values': _percent(n_unique),
        'missing values': n_missing,
        '% of missing values': _percent(n_missing),
    }

#### Train Data

In [56]:
# Read the training split from the raw data directory
train = pd.read_csv(os.path.join(RAW_PATH, "train.csv")) # data used to fit the model
print(f"Number of training samples: {len(train)}")
# Last expression of the cell: preview the first rows
train.head()

Number of training samples: 891


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [57]:
# Descriptive statistics of the training data
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [58]:
# Overall quality: one row of uniqueness / missing-value metrics per column.
# The per-column records are collected first and the frame is built once,
# instead of growing it with pd.concat inside a loop (which re-copies the
# accumulated frame on every iteration and leaks loop variables into the kernel).
train_quality_df = pd.DataFrame(
    [variable_description(train, col) for col in train.columns]
)
train_quality_df

Unnamed: 0,name,data type,unique values,% of unique values,missing values,% of missing values
0,PassengerId,int64,891,100.0,0,0.0
1,Survived,int64,2,0.224467,0,0.0
2,Pclass,int64,3,0.3367,0,0.0
3,Name,object,891,100.0,0,0.0
4,Sex,object,2,0.224467,0,0.0
5,Age,float64,89,9.988777,177,19.86532
6,SibSp,int64,7,0.785634,0,0.0
7,Parch,int64,7,0.785634,0,0.0
8,Ticket,object,681,76.430976,0,0.0
9,Fare,float64,248,27.833895,0,0.0


#### Test Data

In [59]:
# Read the test split from the raw data directory
test  = pd.read_csv(os.path.join(RAW_PATH, "test.csv"))  # data used to measure model performance
print(f"Number of test samples: {len(test)}")
# Last expression of the cell: preview the first rows
test.head()

Number of test samples: 418


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [60]:
# Descriptive statistics of the test data
test.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


In [61]:
# Overall quality: one row of uniqueness / missing-value metrics per column.
# The per-column records are collected first and the frame is built once,
# instead of growing it with pd.concat inside a loop (which re-copies the
# accumulated frame on every iteration and leaks loop variables into the kernel).
test_quality_df = pd.DataFrame(
    [variable_description(test, col) for col in test.columns]
)
test_quality_df

Unnamed: 0,name,data type,unique values,% of unique values,missing values,% of missing values
0,PassengerId,int64,418,100.0,0,0.0
1,Pclass,int64,3,0.717703,0,0.0
2,Name,object,418,100.0,0,0.0
3,Sex,object,2,0.478469,0,0.0
4,Age,float64,80,19.138756,86,20.574163
5,SibSp,int64,7,1.674641,0,0.0
6,Parch,int64,8,1.913876,0,0.0
7,Ticket,object,363,86.842105,0,0.0
8,Fare,float64,170,40.669856,1,0.239234
9,Cabin,object,77,18.421053,327,78.229665
