In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pathlib import Path
import sys
# Put the parent of the notebook's working directory on sys.path so the local
# `src` package can be imported. The membership check keeps this cell
# idempotent: re-running it does not append the same entry again.
path_root = Path.cwd().parent
if str(path_root) not in sys.path:
    sys.path.append(str(path_root))
from src.utils import read_data

%load_ext autoreload
%autoreload 2

## First encounter with data

In [3]:
# Raw CSV locations, given relative to the notebook's working directory.
path_to_train = '../data/01raw/train.csv'
path_to_test = '../data/01raw/test.csv'

# Read both splits through the project helper, then keep the two frames
# together in one list alongside their individual names.
train_data, test_data = read_data(path_to_train, path_to_test)
combine = [train_data, test_data]

Train data imported successfully!!
Test data imported successfully!!


In [4]:
# Preview the first five rows of the training split.
train_data.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Preview the first five rows of the test split.
test_data.head(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [6]:
# Show dtypes and non-null counts for each split, with a ruler line between
# the two reports.
ruler = '=' * 50
train_data.info()
print(ruler, '\n')
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass 

- The test data has the same features as the training data, except for Survived (because it is the target).
- Feature types:
- - Categorical:
- - - Pclass (ordinal)
- - - Name (nominal)
- - - Sex (nominal)
- - Numerical:
- - - Age (continuous)
- - - Fare (continuous)
- - - SibSp (discrete)
- - - Parch (discrete)
- - Mixed:
- - - Ticket (numeric and alphanumeric)
- - - Cabin (alphanumeric)

In [7]:
# Per-column share of missing values, as a percentage of each split's row count.
train_missing_pct = 100 * train_data.isna().sum() / len(train_data)
test_missing_pct = 100 * test_data.isna().sum() / len(test_data)

ruler = '=' * 50
print("Train data missed values, %:\n")
print(train_missing_pct)
print('\n', ruler, '\n')
print("Test data missed values, %:")
print(test_missing_pct)

Train data missed values, %:

PassengerId     0.000000
Survived        0.000000
Pclass          0.000000
Name            0.000000
Sex             0.000000
Age            19.865320
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.000000
Cabin          77.104377
Embarked        0.224467
dtype: float64


Test data missed values, %:
PassengerId     0.000000
Pclass          0.000000
Name            0.000000
Sex             0.000000
Age            20.574163
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.239234
Cabin          78.229665
Embarked        0.000000
dtype: float64


As we can see, the training data has some missing values:
- Age: we can try to fill it
- Cabin: more than 75% is missing; these gaps would be hard to fill, so we will drop it
- Embarked: less than 0.5% is missing; we can drop the rows with missing values, or fill them if this column turns out to be necessary as a feature

Test data:
- Age
- Fare: less than 0.5% is missing, we can fill it
- Cabin

In [8]:
# Summary statistics for the numeric training columns, one feature per row.
train_data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PassengerId,891.0,446.0,257.353842,1.0,223.5,446.0,668.5,891.0
Survived,891.0,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0
Pclass,891.0,2.308642,0.836071,1.0,2.0,3.0,3.0,3.0
Age,714.0,29.699118,14.526497,0.42,20.125,28.0,38.0,80.0
SibSp,891.0,0.523008,1.102743,0.0,0.0,0.0,1.0,8.0
Parch,891.0,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
Fare,891.0,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292


In [9]:
# Count / unique / top / freq summary for the object-dtype training columns.
train_data.describe(include=[object])

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Braund, Mr. Owen Harris",male,347082,B96 B98,S
freq,1,577,7,4,644


In [10]:
# Summary statistics for the numeric test columns, one feature per row.
test_data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PassengerId,418.0,1100.5,120.810458,892.0,996.25,1100.5,1204.75,1309.0
Pclass,418.0,2.26555,0.841838,1.0,1.0,3.0,3.0,3.0
Age,332.0,30.27259,14.181209,0.17,21.0,27.0,39.0,76.0
SibSp,418.0,0.447368,0.89676,0.0,0.0,0.0,1.0,8.0
Parch,418.0,0.392344,0.981429,0.0,0.0,0.0,0.0,9.0
Fare,417.0,35.627188,55.907576,0.0,7.8958,14.4542,31.5,512.3292


In [11]:
# Count / unique / top / freq summary for the object-dtype test columns.
test_data.describe(include=[object])

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,418,418,418,91,418
unique,418,2,363,76,3
top,"Kelly, Mr. James",male,PC 17608,B57 B59 B63 B66,S
freq,1,266,5,3,270


Numerical data distribution discovering results:
- The survival rate in the training data is 38.4%, so the classes are imbalanced.
- About 75% of the passengers are 38 years old or younger.
- There are very few elderly passengers.
- Most passengers travel alone.
- There are a few outliers in the Fare, Age, SibSp and Parch features.

insights:

The Age feature is right-skewed (in the training data the mean, 29.7, is above the median, 28), so if we fill its missing values we will not use the mean.
There are outliers in the Fare, Age, SibSp and Parch features. This gives us a direction for the EDA step.
The train and test datasets have similar, but not identical, feature distributions.

Categorical data distribution discovering results:
- There are no duplicated names.
- 64.8% of the passengers are male.
- There are duplicated values in the Ticket feature (23.6% are duplicated).
- There are duplicated values in the Cabin feature (27.9% are duplicated).
- 72.4% of the passengers embarked at port "S".

insights:

The uniqueness of the names gives us an insight for the Data Engineering step.
Duplicate values in the Ticket and Cabin features give us insights for making the right decision when filling missing values.
We will fill the missing Embarked values with 'S', the most frequent value.