In [11]:
import numpy as np 
import pandas as pd 
from sklearn.impute import  SimpleImputer

In [12]:
# univariate, which imputes values in the i-th feature dimension using only non-missing values in that feature dimension
# SimpleImputer is one of the example
# multivariate imputation algorithms use the entire set of available feature dimensions to estimate the missing values (e.g. IterativeImputer).
df = pd.read_csv('titanic_toy.csv')
print("The number of empty values are: \n",df.isna().sum())
print("The percentage of empty values are: \n", (df.isna().mean() *100))
df.head()


The number of empty values are: 
 Age         177
Fare         45
Family        0
Survived      0
dtype: int64
The percentage of empty values are: 
 Age         19.865320
Fare         5.050505
Family       0.000000
Survived     0.000000
dtype: float64


Unnamed: 0,Age,Fare,Family,Survived
0,22.0,7.25,1,0
1,38.0,71.2833,1,1
2,26.0,7.925,0,1
3,35.0,53.1,1,1
4,35.0,8.05,0,0


In [13]:
imp = SimpleImputer(missing_values=np.nan, strategy = 'mean')
#imp.fit_transform([[1, 2], [np.nan, 3], [7, 6]])
# When we use imp.fit() the values are stored in imp
fareValues = imp.fit_transform([df['Fare']])
print(type(fareValues))

<class 'numpy.ndarray'>


 373 378 400 418 429 434 466 469 490 513 518 520 521 562 633 649 662 673
 698 702 763 816 831 860 871 878 889]. At least one non-missing value is needed for imputation with strategy='mean'.


In [14]:
df.isna().sum()

Age         177
Fare         45
Family        0
Survived      0
dtype: int64

In [15]:
# So one way is univariate imputation -> Taking help of one column
# Other is taking for help of more columns -> Iterative Imputer
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [16]:
X = df.drop(columns=['Survived'])
y = df['Survived']

In [17]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=2)

In [18]:
print("The percentage of empty values are: \n", X_train.isnull().mean()) 

The percentage of empty values are: 
 Age       0.207865
Fare      0.050562
Family    0.000000
dtype: float64


In [19]:
## Using strategy = most_frequent
df = pd.DataFrame([["a", "x"],[np.nan, "y"],["a", np.nan],["b", "y"]], dtype="category")
imp = SimpleImputer(strategy='most_frequent')
someVal = imp.fit_transform(df)
someVal

array([['a', 'x'],
       ['a', 'y'],
       ['a', 'y'],
       ['b', 'y']], dtype=object)

### Multivariate Feature Imputation

In [None]:
## Using Multivariate Feature imputation
from sklearn.impute import IterativeImputer
itImp = IterativeImputer