# Missing Value Imputation using Scikit-learn

Here we'll fill the missing value using Scikit learn library. Here using pipeline we will fill the value using 4 different 
ways(Mean, Median, Most frequent and Constant).

In [1]:
pwd

'C:\\Users\\User\\Git-Hup Files\\Feature Eng'

In [2]:
cd C:\Users\User\Desktop\Anindita Mitra\Machine Learning\EDA

C:\Users\User\Desktop\Anindita Mitra\Machine Learning\EDA


In [3]:
#Import all the library
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [4]:
#load the dataset
df = pd.read_csv("Icecream_Sales.csv")

In [5]:
df.head()

Unnamed: 0,ORDERNUMBER,PRODUCTLINE,Country,ITEMSSOLD,SALES
0,10381,Strawberry Icecream,Belgium,30.0,8254.8
1,10391,Strawberry Icecream,Belgium,34.0,4140.23
2,10411,,Belgium,41.0,12001.0
3,10424,,Belgium,,19861.77
4,10107,,Belgium,49.0,27722.54


In [6]:
#Lets check the data types
df.dtypes

ORDERNUMBER      int64
PRODUCTLINE     object
Country         object
ITEMSSOLD      float64
SALES          float64
dtype: object

In [7]:
# Numerical Missing Value
#Assigning an oject which has only the numerical value 
num = df.select_dtypes(include=["int64","float64"]).columns

In [8]:
num

Index(['ORDERNUMBER', 'ITEMSSOLD', 'SALES'], dtype='object')

In [9]:
#Checking individual data frame respectively
#df1 : Numerical Value
df[num].isnull().sum()

ORDERNUMBER     0
ITEMSSOLD      14
SALES          11
dtype: int64

In [10]:
# Categorical missing value
#Assigning an oject which has only the categorical value 
cat = df.select_dtypes(include=["object"]).columns

In [11]:
cat

Index(['PRODUCTLINE', 'Country'], dtype='object')

In [12]:
df[cat].isnull().sum()

PRODUCTLINE    18
Country        26
dtype: int64

In [13]:
num_mean = ["ITEMSSOLD"]
num_median= ["SALES"]
cat_mode = ["PRODUCTLINE"]
cat_constant = ["Country"]

In [14]:
# Create 4 different Pipelines like Mean, Median, Most frequent and Constant
num_mean_imputer = Pipeline(steps=[("imputer", SimpleImputer(strategy="mean"))])
num_median_imputer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])
cat_mode_imputer = Pipeline(steps=[("imputer", SimpleImputer(strategy="most_frequent"))])
cat_constant_imputer = Pipeline(steps=[("imputer", SimpleImputer(strategy="constant", fill_value="Belgium"))])


In [15]:
preprocessor = ColumnTransformer(transformers=[("mean_imputer",num_mean_imputer,num_mean),
                                              ("median_imputer",num_median_imputer,num_median),
                                             ("mode_imputer",cat_mode_imputer,cat_mode),
                                             ("constant_imputer",cat_constant_imputer,cat_constant)])

In [16]:
preprocessor.fit(df)

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('mean_imputer',
                                 Pipeline(memory=None,
                                          steps=[('imputer',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='mean',
                                                                verbose=0))],
                                          verbose=False),
                                 ['ITEMSSOLD']),
                                ('median_imputer',
                                 Pipeline(memory=None,
                       

In [17]:
preprocessor.transform

<bound method ColumnTransformer.transform of ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('mean_imputer',
                                 Pipeline(memory=None,
                                          steps=[('imputer',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='mean',
                                                                verbose=0))],
                                          verbose=False),
                                 ['ITEMSSOLD']),
                                ('median_imputer',
                                 

In [18]:
preprocessor.named_transformers_["mean_imputer"].named_steps["imputer"].statistics_

array([35.13333333])

In [19]:
preprocessor.named_transformers_["median_imputer"].named_steps["imputer"].statistics_

array([5163.4546665])

In [20]:
preprocessor.named_transformers_["mode_imputer"].named_steps["imputer"].statistics_

array(['Chocolate Icecream'], dtype=object)

In [21]:
df_new = preprocessor.transform(df)

In [22]:
preprocessor.transformers_

[('mean_imputer',
  Pipeline(memory=None,
           steps=[('imputer',
                   SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                                 missing_values=nan, strategy='mean',
                                 verbose=0))],
           verbose=False),
  ['ITEMSSOLD']),
 ('median_imputer',
  Pipeline(memory=None,
           steps=[('imputer',
                   SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                                 missing_values=nan, strategy='median',
                                 verbose=0))],
           verbose=False),
  ['SALES']),
 ('mode_imputer',
  Pipeline(memory=None,
           steps=[('imputer',
                   SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                                 missing_values=nan, strategy='most_frequent',
                                 verbose=0))],
           verbose=False),
  ['PRODUCTLINE']),
 ('constant_imputer',
  Pipeline(memory=N

In [23]:
#new dataframe is created and filled the missing value
df_new2 = pd.DataFrame(df_new, columns=num_mean+num_median+cat_mode+cat_constant)

In [24]:
df_new2.head()

Unnamed: 0,ITEMSSOLD,SALES,PRODUCTLINE,Country
0,30.0,8254.8,Strawberry Icecream,Belgium
1,34.0,4140.23,Strawberry Icecream,Belgium
2,41.0,12001.0,Chocolate Icecream,Belgium
3,35.1333,19861.8,Chocolate Icecream,Belgium
4,49.0,27722.5,Chocolate Icecream,Belgium


In [25]:
df_new2.dtypes

ITEMSSOLD      object
SALES          object
PRODUCTLINE    object
Country        object
dtype: object

In [26]:
df_new2.isnull().sum()

ITEMSSOLD      0
SALES          0
PRODUCTLINE    0
Country        0
dtype: int64