In [1]:
import pandas as pd
import numpy as np


# This is for univariate imputation

To “impute” means to assign a value to something by inference from the value of the products or processes to which it contributes. In statistics, imputation is the process of replacing missing data with substituted values.

SimpleImputer is a class in the sklearn.impute package. It is used to impute (replace) missing numerical or categorical data in one or more features with appropriate values.

The SimpleImputer class is imported from the sklearn.impute package.
SimpleImputer's two most commonly used arguments are missing_values and strategy.
The fit_transform method is called on a SimpleImputer instance to impute the missing values.

class sklearn.impute.SimpleImputer(*, missing_values=nan, strategy='mean', fill_value=None, copy=True, add_indicator=False, keep_empty_features=False)

The strategy parameter is either a string (for example 'mean', 'median', 'most_frequent' or 'constant') or a callable.



In [2]:
# Toy dataset: follower counts (two entries are missing) plus a 0/1 sold-out flag.
data = dict(
    Social_media_followers=[
        1_000_000, np.nan, 2_000_000, 1_310_000, 1_700_000,
        np.nan, 4_100_000, 1_600_000, 2_200_000, 1_000_000,
    ],
    Sold_out=[1, 0, 0, 1, 0, 0, 0, 1, 0, 1],
)

# Wrap the raw dict in a DataFrame and preview the first five rows.
df1 = pd.DataFrame(data)
df1.head()

Unnamed: 0,Social_media_followers,Sold_out
0,1000000.0,1
1,,0
2,2000000.0,0
3,1310000.0,1
4,1700000.0,0


In [3]:
# One-column frame of farthest-run distances; the third entry is missing (NaN).
df2 = pd.Series(
    [50, 62, np.nan, 100, 26, 13, 31, 50],
    name='farthest_run_mi',
).to_frame()
df2

Unnamed: 0,farthest_run_mi
0,50.0
1,62.0
2,
3,100.0
4,26.0
5,13.0
6,31.0
7,50.0


In [4]:
# Count the missing (NaN) entries in each column of df2.
df2.isna().sum()

farthest_run_mi    1
dtype: int64

In [5]:
from sklearn.impute import SimpleImputer

In [6]:
# Imputer configured with the 'mean' strategy.
imp_mean = SimpleImputer(strategy="mean")


In [7]:
# Fit the mean imputer on df2 and return the imputed values as an array.
imp_mean.fit_transform(df2)

array([[ 50.        ],
       [ 62.        ],
       [ 47.42857143],
       [100.        ],
       [ 26.        ],
       [ 13.        ],
       [ 31.        ],
       [ 50.        ]])

In [8]:
# Same data, but fill the gap using the 'median' strategy.
imp_median = SimpleImputer(strategy="median")
imp_median.fit_transform(df2)

array([[ 50.],
       [ 62.],
       [ 50.],
       [100.],
       [ 26.],
       [ 13.],
       [ 31.],
       [ 50.]])

In [9]:
# Fill the gap using the 'most_frequent' strategy (the mode).
imp_mode = SimpleImputer(strategy="most_frequent")
imp_mode.fit_transform(df2)

array([[ 50.],
       [ 62.],
       [ 50.],
       [100.],
       [ 26.],
       [ 13.],
       [ 31.],
       [ 50.]])

In [10]:
# 'constant' strategy: every missing entry is replaced by the given fill_value.
imp_constant = SimpleImputer(
    strategy="constant",
    fill_value=13,
)
imp_constant.fit_transform(df2)

array([[ 50.],
       [ 62.],
       [ 13.],
       [100.],
       [ 26.],
       [ 13.],
       [ 31.],
       [ 50.]])

In [11]:
# Handling categorical values: a one-column frame of names in which the
# sixth entry is missing.
df3 = pd.DataFrame({'names': ['sagar', 'alok', 'jignesh', 'chhotu', 'piyush', np.nan, 'ryan']})
df3

Unnamed: 0,names
0,sagar
1,alok
2,jignesh
3,chhotu
4,piyush
5,
6,ryan


In [12]:
# Replace the missing name with a fixed placeholder string.
imp_cat_constant = SimpleImputer(
    strategy="constant",
    fill_value="babe",
)
imp_cat_constant.fit_transform(df3)

array([['sagar'],
       ['alok'],
       ['jignesh'],
       ['chhotu'],
       ['piyush'],
       ['babe'],
       ['ryan']], dtype=object)

In [13]:
# Add an indicator column that flags which values were imputed.
imp_cat_constant = SimpleImputer(
    strategy="constant",
    fill_value="babe",
    add_indicator=True,
)
imp_cat_constant.fit_transform(df3)

array([['sagar', False],
       ['alok', False],
       ['jignesh', False],
       ['chhotu', False],
       ['piyush', False],
       ['babe', True],
       ['ryan', False]], dtype=object)

In [28]:
# Another simple example: a mixed-type table in which some marks are NaN
# and some genders are None.
students = [
    [85, 'M', 'verygood'],
    [95, 'F', 'excellent'],
    [75, None, 'good'],
    [np.nan, 'M', 'average'],
    [70, 'M', 'good'],
    [np.nan, None, 'verygood'],
    [92, 'F', 'verygood'],
    [98, 'M', 'excellent'],
]

# Name the columns at construction time.
dfstd = pd.DataFrame(students, columns=['marks', 'gender', 'result'])
dfstd

Unnamed: 0,marks,gender,result
0,85.0,M,verygood
1,95.0,F,excellent
2,75.0,,good
3,,M,average
4,70.0,M,good
5,,,verygood
6,92.0,F,verygood
7,98.0,M,excellent


In [49]:
# Missing marks are stored as NaN, so np.nan is passed as the missing-value
# marker. If the gaps were empty fields instead, missing_values would be ''.

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# The imputer expects 2-D input: hand it the column as an (n, 1) array and
# take column 0 of the result to get a 1-D sequence back.
dfstd['marks'] = imputer.fit_transform(dfstd[['marks']].to_numpy())[:, 0]
dfstd

Unnamed: 0,marks,gender,result
0,85.0,M,verygood
1,95.0,F,excellent
2,75.0,,good
3,85.833333,M,average
4,70.0,M,good
5,85.833333,,verygood
6,92.0,F,verygood
7,98.0,M,excellent


In [50]:

# Missing genders are stored as None, so None is the missing-value marker
# here; the gaps are filled using the 'most_frequent' strategy.
imputer = SimpleImputer(missing_values=None, strategy='most_frequent')
# Same 2-D in / 1-D out handling as any single-column imputation.
dfstd['gender'] = imputer.fit_transform(dfstd[['gender']].to_numpy())[:, 0]
dfstd

Unnamed: 0,marks,gender,result
0,85.0,M,verygood
1,95.0,F,excellent
2,75.0,M,good
3,85.833333,M,average
4,70.0,M,good
5,85.833333,M,verygood
6,92.0,F,verygood
7,98.0,M,excellent


In [29]:
# Display the student frame.
# NOTE(review): this cell's execution count (29) is lower than that of the
# imputation cells (49, 50), and its recorded output still shows the missing
# values -- it appears to have been run before them. Re-run in order to refresh.
dfstd

Unnamed: 0,marks,gender,result
0,85.0,M,verygood
1,95.0,F,excellent
2,75.0,,good
3,,M,average
4,70.0,M,good
5,,,verygood
6,92.0,F,verygood
7,98.0,M,excellent


array([['M'],
       ['F'],
       [None],
       ['M'],
       ['M'],
       [None],
       ['F'],
       ['M']], dtype=object)

In [36]:
# Count the rows whose gender is 'M' by summing the boolean mask.
dfstd['gender'].eq('M').sum()

np.int64(4)

ValueError: Cannot use <function impute_values at 0x000002711511DA60> strategy with non-numeric data:
could not convert string to float: 'M'