In [43]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [44]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [45]:
df = pd.read_csv('../data/titanic.csv', usecols=['Age', 'Fare', 'Survived'])

In [46]:
df.head()

Unnamed: 0,Survived,Age,Fare
0,0,34.5,7.8292
1,1,47.0,7.0
2,0,62.0,9.6875
3,0,27.0,8.6625
4,1,22.0,12.2875


In [47]:
df.describe()

Unnamed: 0,Survived,Age,Fare
count,418.0,332.0,417.0
mean,0.363636,30.27259,35.627188
std,0.481622,14.181209,55.907576
min,0.0,0.17,0.0
25%,0.0,21.0,7.8958
50%,0.0,27.0,14.4542
75%,1.0,39.0,31.5
max,1.0,76.0,512.3292


In [48]:
df.isnull().sum()

Survived     0
Age         86
Fare         1
dtype: int64

In [49]:
df.isnull().mean()

Survived    0.000000
Age         0.205742
Fare        0.002392
dtype: float64

In [50]:
X = df.drop(columns=['Survived'])
y = df['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [51]:
X_train.shape, X_test.shape

((334, 2), (84, 2))

In [52]:
X_train.isnull().mean()

Age     0.215569
Fare    0.002994
dtype: float64

In [53]:
mean_age = X_train['Age'].mean()
median_age = X_train['Age'].median()

mean_fare = X_train['Fare'].mean()
median_fare = X_train['Fare'].median()

In [54]:
X_train['Age_mean'] = X_train['Age'].fillna(mean_age)
X_train['Age_median'] = X_train['Age'].fillna(median_age)

X_train['Fare_mean'] = X_train['Fare'].fillna(mean_fare)
X_train['Fare_median'] = X_train['Fare'].fillna(median_fare)

In [55]:
X_train.sample(5)

Unnamed: 0,Age,Fare,Age_mean,Age_median,Fare_mean,Fare_median
11,46.0,26.0,46.0,46.0,26.0,26.0
107,,7.75,29.307252,27.0,7.75,7.75
374,54.0,81.8583,54.0,54.0,81.8583,81.8583
135,24.0,7.8542,24.0,24.0,7.8542,7.8542
284,2.0,20.2125,2.0,2.0,20.2125,20.2125


In [56]:
print('Original Age variance : ', X_train['Age'].var())
print('Median Age variance : ', X_train['Age_median'].var())
print('Mean Age variance : ', X_train['Age_mean'].var())

print('Original Fare variance : ', X_train['Fare'].var())
print('Median Fare variance : ', X_train['Fare_median'].var())
print('Mean Fare variance : ', X_train['Fare_mean'].var())

Original Age variance :  184.70402996695034
Median Age variance :  145.67090989552426
Mean Age variance :  144.76802348760972
Original Fare variance :  2333.007047160699
Median Fare variance :  2327.1458643048395
Mean Fare variance :  2326.0010199920484


Using SkLearn

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [58]:
imputer1 = SimpleImputer(strategy='mean')
imputer2 = SimpleImputer(strategy='median')

In [61]:
trf = ColumnTransformer([
    ('imputer1' , imputer1,['Age']),
    ('imputer2', imputer2,['Fare'])
], remainder='passthrough')

In [62]:
trf.fit(X_train)

0,1,2
,transformers,"[('imputer1', ...), ('imputer2', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False


In [63]:
ColumnTransformer(remainder='passthrough',
                  transformers=[('imputer1', SimpleImputer(strategy='median'), ['Age']),
                                ('imputer2', SimpleImputer(), ['Fare'])])

0,1,2
,transformers,"[('imputer1', ...), ('imputer2', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False


In [64]:
trf.named_transformers_['imputer1'].statistics_

array([29.30725191])

In [65]:
trf.named_transformers_['imputer2'].statistics_

array([14.1083])

In [66]:
X_train = trf.transform(X_train)
X_test = trf.transform(X_test)

In [67]:
X_test

array([[ 29.30725191,  51.8625    ],
       [ 21.        ,   7.7958    ],
       [ 61.        ,  12.35      ],
       [ 64.        ,  26.55      ],
       [ 29.30725191,  69.55      ],
       [ 61.        , 262.375     ],
       [ 33.        ,  20.575     ],
       [ 29.30725191,   8.05      ],
       [ 55.        ,  59.4       ],
       [ 29.        ,  26.        ],
       [ 31.        ,   7.7333    ],
       [ 21.        ,  21.        ],
       [ 15.        ,  39.        ],
       [ 30.        ,  21.        ],
       [ 28.        , 263.        ],
       [ 34.        ,  26.        ],
       [ 24.        ,   7.775     ],
       [ 23.        ,   7.7958    ],
       [ 21.        ,  11.5       ],
       [ 47.        ,  42.4       ],
       [ 38.        ,  21.        ],
       [ 29.30725191,   7.8875    ],
       [ 29.30725191,   7.55      ],
       [  7.        ,  15.2458    ],
       [ 29.30725191,   7.7333    ],
       [ 55.        ,  25.7       ],
       [ 53.        ,  28.5       ],
 