In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.core.display import display, HTML
import seaborn as sns
pd.set_option('max_columns', 90)

In [12]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
all_data = pd.concat((train, test)).reset_index(drop=True)

In [13]:
all_data.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0.0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0.0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0.0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1.0,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1.0,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


# First Understand the Features

## Variable Type:
- `Ordinal` => Categorial + Ordering
- `Numerical`
- `Cardinal or Nominal` => Categorial + Not Ordered

In [14]:
numerical_vars = ['Age', 'SibSp', 'Parch', 'Fare']
ordinal_vars = ['Pclass']
nominal_vars = ['Survived', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

In [15]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1046 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1308 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1307 non-null   object 
dtypes: float64(3), int64(4), object(5)
memory usage: 122.8+ KB


## See the distribution of Numerical Variables

In [16]:
def multi_table(table_list):
    return HTML(f"<table><tr> {''.join(['<td>' + table._repr_html_() + '</td>' for table in table_list])} </tr></table>")

In [17]:
train0 = train[train.Survived == 0]
train1 = train[train.Survived == 1]
cnt = 0
detail_desc = []

for c in train.columns:
    if c == 'PassengerId':
        continue
#   To check int float and other numerical dtypes
    if train[c].dtypes != 'object':
        desc = pd.DataFrame(columns=['feature', 'data', 'type', 'count', 'mean', 'median', 'std', 'min', 'max', 'skew', 'null'])
        desc.loc[0] = [c, 'Train', train[c].dtype.name, train[c].count(), train[c].mean(), train[c].median(), train[c].std(), train[c].min(), train[c].max(), train[c].skew(), train[c].isnull().sum()]
        desc.loc[1] = [c, 'All', train[c].dtype.name, all_data[c].count(), all_data[c].mean(), all_data[c].median(), all_data[c].std(), all_data[c].min(), all_data[c].max(), all_data[c].skew(), all_data[c].isnull().sum()]
        desc.loc[2] = [c, 'Target=0', train0[c].dtype.name, train0[c].count(), train0[c].mean(), train0[c].median(), train0[c].std(), train0[c].min(), train0[c].max(), train0[c].skew(), train0[c].isnull().sum()]      
        desc.loc[3] = [c, 'Target=1', train1[c].dtype.name, train1[c].count(), train1[c].mean(), train1[c].median(), train1[c].std(), train1[c].min(), train1[c].max(), train1[c].skew(), train1[c].isnull().sum()]
        desc = desc.set_index(['feature', 'data'],drop=True)
        detail_desc.append(desc.style.background_gradient())

In [18]:
multi_table(detail_desc)

Unnamed: 0_level_0,Unnamed: 1_level_0,type,count,mean,median,std,min,max,skew,null
feature,data,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Unnamed: 0_level_2,Unnamed: 1_level_2,type,count,mean,median,std,min,max,skew,null
feature,data,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3
Unnamed: 0_level_4,Unnamed: 1_level_4,type,count,mean,median,std,min,max,skew,null
feature,data,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5
Unnamed: 0_level_6,Unnamed: 1_level_6,type,count,mean,median,std,min,max,skew,null
feature,data,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7
Unnamed: 0_level_8,Unnamed: 1_level_8,type,count,mean,median,std,min,max,skew,null
feature,data,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9
Unnamed: 0_level_10,Unnamed: 1_level_10,type,count,mean,median,std,min,max,skew,null
feature,data,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11
Survived,Train,int64,891,0.383838,0.000000,0.486592,0.0,1.0,0.478523,0.0
Survived,All,int64,891,0.383838,0.000000,0.486592,0.0,1.0,0.478523,418.0
Survived,Target=0,int64,549,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
Survived,Target=1,int64,342,1.000000,1.000000,0.0,1.0,1.0,0.0,0.0
Pclass,Train,int64,891,2.308642,3.000000,0.836071,1.0,3.0,-0.630548,0.0
Pclass,All,int64,1309,2.294882,3.000000,0.837836,1.0,3.0,-0.598647,0.0
Pclass,Target=0,int64,549,2.531876,3.000000,0.735805,1.0,3.0,-1.213793,0.0
Pclass,Target=1,int64,342,1.950292,2.000000,0.863321,1.0,3.0,0.096007,0.0
Age,Train,float64,714,29.699118,28.000000,14.526497,0.42,80.0,0.389108,177.0
Age,All,float64,1046,29.881138,28.000000,14.413493,0.17,80.0,0.407675,263.0

Unnamed: 0_level_0,Unnamed: 1_level_0,type,count,mean,median,std,min,max,skew,null
feature,data,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Survived,Train,int64,891,0.383838,0.0,0.486592,0.0,1.0,0.478523,0
Survived,All,int64,891,0.383838,0.0,0.486592,0.0,1.0,0.478523,418
Survived,Target=0,int64,549,0.0,0.0,0.0,0.0,0.0,0.0,0
Survived,Target=1,int64,342,1.0,1.0,0.0,1.0,1.0,0.0,0

Unnamed: 0_level_0,Unnamed: 1_level_0,type,count,mean,median,std,min,max,skew,null
feature,data,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Pclass,Train,int64,891,2.308642,3.0,0.836071,1,3,-0.630548,0
Pclass,All,int64,1309,2.294882,3.0,0.837836,1,3,-0.598647,0
Pclass,Target=0,int64,549,2.531876,3.0,0.735805,1,3,-1.213793,0
Pclass,Target=1,int64,342,1.950292,2.0,0.863321,1,3,0.096007,0

Unnamed: 0_level_0,Unnamed: 1_level_0,type,count,mean,median,std,min,max,skew,null
feature,data,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Age,Train,float64,714,29.699118,28.0,14.526497,0.42,80.0,0.389108,177
Age,All,float64,1046,29.881138,28.0,14.413493,0.17,80.0,0.407675,263
Age,Target=0,float64,424,30.626179,28.0,14.17211,1.0,74.0,0.585584,125
Age,Target=1,float64,290,28.34369,28.0,14.950952,0.42,80.0,0.180458,52

Unnamed: 0_level_0,Unnamed: 1_level_0,type,count,mean,median,std,min,max,skew,null
feature,data,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
SibSp,Train,int64,891,0.523008,0.0,1.102743,0,8,3.695352,0
SibSp,All,int64,1309,0.498854,0.0,1.041658,0,8,3.84422,0
SibSp,Target=0,int64,549,0.553734,0.0,1.288399,0,8,3.515888,0
SibSp,Target=1,int64,342,0.473684,0.0,0.708688,0,4,1.958168,0

Unnamed: 0_level_0,Unnamed: 1_level_0,type,count,mean,median,std,min,max,skew,null
feature,data,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Parch,Train,int64,891,0.381594,0.0,0.806057,0,6,2.749117,0
Parch,All,int64,1309,0.385027,0.0,0.86556,0,9,3.669078,0
Parch,Target=0,int64,549,0.32969,0.0,0.823166,0,6,3.323133,0
Parch,Target=1,int64,342,0.464912,0.0,0.771712,0,5,1.754737,0

Unnamed: 0_level_0,Unnamed: 1_level_0,type,count,mean,median,std,min,max,skew,null
feature,data,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Fare,Train,float64,891,32.204208,14.4542,49.693429,0.0,512.3292,4.787317,0
Fare,All,float64,1308,33.295479,14.4542,51.758668,0.0,512.3292,4.367709,1
Fare,Target=0,float64,549,22.117887,10.5,31.388207,0.0,263.0,4.55314,0
Fare,Target=1,float64,342,48.395408,26.0,66.596998,0.0,512.3292,3.862779,0


In [23]:
for c in train.columns:
    if c == 'PassengerId':
        continue
    if train[c].dtypes == 'object':
        print(train[c])

0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
                             ...                        
886                                Montvila, Rev. Juozas
887                         Graham, Miss. Margaret Edith
888             Johnston, Miss. Catherine Helen "Carrie"
889                                Behr, Mr. Karl Howell
890                                  Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object
0        male
1      female
2      female
3      female
4        male
        ...  
886      male
887    female
888    female
889      male
890      male
Name: Sex, Length: 891, dtype: object
0             A/5 21171
1              PC 17599
2      STON/O2. 3101282
3                113803
4                373450
             ...      