In [130]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [131]:
data=pd.read_csv('Travel.csv')

In [132]:
num_col=[fea for fea in data.columns if (data[fea].dtype !='O' and data[fea].dtype !=int)]#segregating categorical and numerical variables 
num_col

['Age',
 'DurationOfPitch',
 'NumberOfFollowups',
 'PreferredPropertyStar',
 'NumberOfTrips',
 'NumberOfChildrenVisiting',
 'MonthlyIncome']

In [133]:
data[num_col].isnull().mean()

Age                         0.046236
DurationOfPitch             0.051350
NumberOfFollowups           0.009206
PreferredPropertyStar       0.005319
NumberOfTrips               0.028642
NumberOfChildrenVisiting    0.013502
MonthlyIncome               0.047668
dtype: float64

Method 1: Performing Mean Imputation

In [134]:
# Method 1: Performing Mean Imputation

for var in num_col:
    value=data[var].mean()
    data[var]=data[var].fillna(value)

In [135]:
data[num_col].isnull().mean() # Checking after imputation

Age                         0.0
DurationOfPitch             0.0
NumberOfFollowups           0.0
PreferredPropertyStar       0.0
NumberOfTrips               0.0
NumberOfChildrenVisiting    0.0
MonthlyIncome               0.0
dtype: float64

Method 2: Imputing Missing Values by mean using scikit-learn

In [136]:
# Method 2: Imputing Missing Values by mean using scikit-learn
import sklearn
from sklearn.impute import SimpleImputer
imputer= SimpleImputer(strategy='mean')

In [137]:
data=pd.read_csv('Travel.csv')
num_col=[fea for fea in data.columns if (data[fea].dtype !='O' and data[fea].dtype !=int)]
#segregating categorical and numerical variables 
X=data[num_col]

In [138]:
imputer.fit(X) # fitting SimpleImputer to the datafrane X

In [139]:
X

Unnamed: 0,Age,DurationOfPitch,NumberOfFollowups,PreferredPropertyStar,NumberOfTrips,NumberOfChildrenVisiting,MonthlyIncome
0,41.0,6.0,3.0,3.0,1.0,0.0,20993.0
1,49.0,14.0,4.0,4.0,2.0,2.0,20130.0
2,37.0,8.0,4.0,3.0,7.0,0.0,17090.0
3,33.0,9.0,3.0,3.0,2.0,1.0,17909.0
4,,8.0,3.0,4.0,1.0,0.0,18468.0
...,...,...,...,...,...,...,...
4883,49.0,9.0,5.0,4.0,2.0,1.0,26576.0
4884,28.0,31.0,5.0,3.0,3.0,2.0,21212.0
4885,52.0,17.0,4.0,4.0,7.0,3.0,31820.0
4886,19.0,16.0,4.0,3.0,3.0,2.0,20289.0


In [140]:
X=imputer.transform(X)
X

array([[4.1000e+01, 6.0000e+00, 3.0000e+00, ..., 1.0000e+00, 0.0000e+00,
        2.0993e+04],
       [4.9000e+01, 1.4000e+01, 4.0000e+00, ..., 2.0000e+00, 2.0000e+00,
        2.0130e+04],
       [3.7000e+01, 8.0000e+00, 4.0000e+00, ..., 7.0000e+00, 0.0000e+00,
        1.7090e+04],
       ...,
       [5.2000e+01, 1.7000e+01, 4.0000e+00, ..., 7.0000e+00, 3.0000e+00,
        3.1820e+04],
       [1.9000e+01, 1.6000e+01, 4.0000e+00, ..., 3.0000e+00, 2.0000e+00,
        2.0289e+04],
       [3.6000e+01, 1.4000e+01, 4.0000e+00, ..., 3.0000e+00, 2.0000e+00,
        2.4041e+04]])

In [143]:
result=pd.DataFrame(X,columns=['Age',
 'DurationOfPitch',
 'NumberOfFollowups',
 'PreferredPropertyStar',
 'NumberOfTrips',
 'NumberOfChildrenVisiting',
 'MonthlyIncome'])
result # dataframe after mean imputation by sklearn's simple Imputer

Unnamed: 0,Age,DurationOfPitch,NumberOfFollowups,PreferredPropertyStar,NumberOfTrips,NumberOfChildrenVisiting,MonthlyIncome
0,41.000000,6.0,3.0,3.0,1.0,0.0,20993.0
1,49.000000,14.0,4.0,4.0,2.0,2.0,20130.0
2,37.000000,8.0,4.0,3.0,7.0,0.0,17090.0
3,33.000000,9.0,3.0,3.0,2.0,1.0,17909.0
4,37.622265,8.0,3.0,4.0,1.0,0.0,18468.0
...,...,...,...,...,...,...,...
4883,49.000000,9.0,5.0,4.0,2.0,1.0,26576.0
4884,28.000000,31.0,5.0,3.0,3.0,2.0,21212.0
4885,52.000000,17.0,4.0,4.0,7.0,3.0,31820.0
4886,19.000000,16.0,4.0,3.0,3.0,2.0,20289.0


In [144]:
result.isnull().mean() # Checking after imputation

Age                         0.0
DurationOfPitch             0.0
NumberOfFollowups           0.0
PreferredPropertyStar       0.0
NumberOfTrips               0.0
NumberOfChildrenVisiting    0.0
MonthlyIncome               0.0
dtype: float64

In [145]:
# The above imputations are applicable for median imputation also.

Method 3: Replacing missing values in categorical features by the feature's mode.

In [146]:
data=pd.read_csv('Travel.csv')

In [147]:
cat_col=[fea for fea in data.columns if data[fea].dtype =='O'] #segregating categorical and numerical variables 
cat_col

['TypeofContact',
 'Occupation',
 'Gender',
 'ProductPitched',
 'MaritalStatus',
 'Designation']

In [149]:
X=data[cat_col]
X

Unnamed: 0,TypeofContact,Occupation,Gender,ProductPitched,MaritalStatus,Designation
0,Self Enquiry,Salaried,Female,Deluxe,Single,Manager
1,Company Invited,Salaried,Male,Deluxe,Divorced,Manager
2,Self Enquiry,Free Lancer,Male,Basic,Single,Executive
3,Company Invited,Salaried,Female,Basic,Divorced,Executive
4,Self Enquiry,Small Business,Male,Basic,Divorced,Executive
...,...,...,...,...,...,...
4883,Self Enquiry,Small Business,Male,Deluxe,Unmarried,Manager
4884,Company Invited,Salaried,Male,Basic,Single,Executive
4885,Self Enquiry,Salaried,Female,Standard,Married,Senior Manager
4886,Self Enquiry,Small Business,Male,Basic,Single,Executive


In [150]:
X.isnull().mean() 

TypeofContact     0.005115
Occupation        0.000000
Gender            0.000000
ProductPitched    0.000000
MaritalStatus     0.000000
Designation       0.000000
dtype: float64

In [151]:
# Only the first feature that is TypeofContact has missing values

In [152]:
value=X['TypeofContact'].mode()[0]
X['TypeofContact']=X['TypeofContact'].fillna(value)
X

Unnamed: 0,TypeofContact,Occupation,Gender,ProductPitched,MaritalStatus,Designation
0,Self Enquiry,Salaried,Female,Deluxe,Single,Manager
1,Company Invited,Salaried,Male,Deluxe,Divorced,Manager
2,Self Enquiry,Free Lancer,Male,Basic,Single,Executive
3,Company Invited,Salaried,Female,Basic,Divorced,Executive
4,Self Enquiry,Small Business,Male,Basic,Divorced,Executive
...,...,...,...,...,...,...
4883,Self Enquiry,Small Business,Male,Deluxe,Unmarried,Manager
4884,Company Invited,Salaried,Male,Basic,Single,Executive
4885,Self Enquiry,Salaried,Female,Standard,Married,Senior Manager
4886,Self Enquiry,Small Business,Male,Basic,Single,Executive


In [153]:
X.isnull().mean() # Checking after imputation, null values of 'TypeofContact' feature is zero.

TypeofContact     0.0
Occupation        0.0
Gender            0.0
ProductPitched    0.0
MaritalStatus     0.0
Designation       0.0
dtype: float64

Imputing using sklearn

In [154]:
#Imputing using sklearn
X_1=data[cat_col]

imputer_1=SimpleImputer(strategy='most_frequent')
X_1.isnull().mean()

TypeofContact     0.005115
Occupation        0.000000
Gender            0.000000
ProductPitched    0.000000
MaritalStatus     0.000000
Designation       0.000000
dtype: float64

In [155]:
imputer_1.fit(X_1)
X_1=imputer_1.transform(X_1)
X_1


array([['Self Enquiry', 'Salaried', 'Female', 'Deluxe', 'Single',
        'Manager'],
       ['Company Invited', 'Salaried', 'Male', 'Deluxe', 'Divorced',
        'Manager'],
       ['Self Enquiry', 'Free Lancer', 'Male', 'Basic', 'Single',
        'Executive'],
       ...,
       ['Self Enquiry', 'Salaried', 'Female', 'Standard', 'Married',
        'Senior Manager'],
       ['Self Enquiry', 'Small Business', 'Male', 'Basic', 'Single',
        'Executive'],
       ['Self Enquiry', 'Salaried', 'Male', 'Basic', 'Unmarried',
        'Executive']], dtype=object)

In [156]:
result_1=pd.DataFrame(X_1,columns=['TypeofContact',# converting back to DataFrame from Numpy Array
 'Occupation',
 'Gender',
 'ProductPitched',
 'MaritalStatus',
 'Designation'])
result_1

Unnamed: 0,TypeofContact,Occupation,Gender,ProductPitched,MaritalStatus,Designation
0,Self Enquiry,Salaried,Female,Deluxe,Single,Manager
1,Company Invited,Salaried,Male,Deluxe,Divorced,Manager
2,Self Enquiry,Free Lancer,Male,Basic,Single,Executive
3,Company Invited,Salaried,Female,Basic,Divorced,Executive
4,Self Enquiry,Small Business,Male,Basic,Divorced,Executive
...,...,...,...,...,...,...
4883,Self Enquiry,Small Business,Male,Deluxe,Unmarried,Manager
4884,Company Invited,Salaried,Male,Basic,Single,Executive
4885,Self Enquiry,Salaried,Female,Standard,Married,Senior Manager
4886,Self Enquiry,Small Business,Male,Basic,Single,Executive


In [157]:
result_1.isnull().mean() # Checking after imputation

TypeofContact     0.0
Occupation        0.0
Gender            0.0
ProductPitched    0.0
MaritalStatus     0.0
Designation       0.0
dtype: float64

Method 4:Replacing Missing Values with an arbitary number

In [158]:
data=pd.read_csv('Travel.csv')
num_col=[fea for fea in data.columns if (data[fea].dtype !='O' and data[fea].dtype !=int)]
#segregating categorical and numerical variables 
X=data[num_col]

In [159]:
X.max()

Age                            61.0
DurationOfPitch               127.0
NumberOfFollowups               6.0
PreferredPropertyStar           5.0
NumberOfTrips                  22.0
NumberOfChildrenVisiting        3.0
MonthlyIncome               98678.0
dtype: float64

In [160]:
X.isnull().sum()

Age                         226
DurationOfPitch             251
NumberOfFollowups            45
PreferredPropertyStar        26
NumberOfTrips               140
NumberOfChildrenVisiting     66
MonthlyIncome               233
dtype: int64

In [161]:
X['Age'].fillna(70,inplace=True) # Replacing by Null values by an arbitary number 70 for Age feature because it is greater than
# the max value of Age

In [162]:
X.isnull().sum() # Checking after imputation

Age                           0
DurationOfPitch             251
NumberOfFollowups            45
PreferredPropertyStar        26
NumberOfTrips               140
NumberOfChildrenVisiting     66
MonthlyIncome               233
dtype: int64

In [163]:
# Using sklearn
imputer=SimpleImputer(strategy='constant',fill_value=70)
data=pd.read_csv('Travel.csv')
X_1=data['Age'].to_numpy() # Converting to Numpy array
X_1=X_1.reshape(-1, 1)


In [164]:
imputer.fit(X_1)


In [165]:
X_1=imputer.transform(X_1)

In [166]:
result_1=pd.DataFrame(X_1,columns=['Age'])# converting back to DataFrame from Numpy Array
 
result_1.isnull().sum() # checking after Imputation

Age    0
dtype: int64

In [168]:
result_1

Unnamed: 0,Age
0,41.0
1,49.0
2,37.0
3,33.0
4,70.0
...,...
4883,49.0
4884,28.0
4885,52.0
4886,19.0


Method 5: Replacing Missing Values in Categorical Variables

In [169]:
data=pd.read_csv('Travel.csv')
X=data[cat_col] # We would replace missing values with a string "Missing"
X.isnull().sum()

TypeofContact     25
Occupation         0
Gender             0
ProductPitched     0
MaritalStatus      0
Designation        0
dtype: int64

In [170]:
for var in cat_col:
    X[var].fillna('Missing',inplace=True)

In [171]:
X.isnull().sum() # Checking after Imputation

TypeofContact     0
Occupation        0
Gender            0
ProductPitched    0
MaritalStatus     0
Designation       0
dtype: int64

In [172]:
# Using sklearn
imputer=SimpleImputer(strategy='constant',fill_value='Missing')
data=pd.read_csv('Travel.csv')
X=data[cat_col]
imputer.fit(X)

In [173]:
X=imputer.transform(X)

In [174]:
X

array([['Self Enquiry', 'Salaried', 'Female', 'Deluxe', 'Single',
        'Manager'],
       ['Company Invited', 'Salaried', 'Male', 'Deluxe', 'Divorced',
        'Manager'],
       ['Self Enquiry', 'Free Lancer', 'Male', 'Basic', 'Single',
        'Executive'],
       ...,
       ['Self Enquiry', 'Salaried', 'Female', 'Standard', 'Married',
        'Senior Manager'],
       ['Self Enquiry', 'Small Business', 'Male', 'Basic', 'Single',
        'Executive'],
       ['Self Enquiry', 'Salaried', 'Male', 'Basic', 'Unmarried',
        'Executive']], dtype=object)

In [175]:
result_1=pd.DataFrame(X)# converting back to DataFrame from Numpy Array
 
result_1.isnull().sum() # Checking after Imputation

0    0
1    0
2    0
3    0
4    0
5    0
dtype: int64

Method 6:

Replacing missing values with a value at the end of the distribution


Replacing missing values with a value at the end of the distribution, is equivalent to replacing with an arbitary value manually. But in this case we do it by automatically selecting as those at the very end of the variable distribution.
As per the IQR proximity rule,missing values are replaced with q3+1.5(IQR) at the right tail or by q1-1.5(IQR) at the left tail.

In [176]:
X=data[num_col]
X.isnull().sum()

Age                         226
DurationOfPitch             251
NumberOfFollowups            45
PreferredPropertyStar        26
NumberOfTrips               140
NumberOfChildrenVisiting     66
MonthlyIncome               233
dtype: int64

In [177]:
for var in num_col:
    IQR =X[var].quantile(0.75)-X[var].quantile(0.25)
    value=X[var].quantile(0.75)+(1.5*IQR)
    X[var]=X[var].fillna(value)

In [178]:
X

Unnamed: 0,Age,DurationOfPitch,NumberOfFollowups,PreferredPropertyStar,NumberOfTrips,NumberOfChildrenVisiting,MonthlyIncome
0,41.0,6.0,3.0,3.0,1.0,0.0,20993.0
1,49.0,14.0,4.0,4.0,2.0,2.0,20130.0
2,37.0,8.0,4.0,3.0,7.0,0.0,17090.0
3,33.0,9.0,3.0,3.0,2.0,1.0,17909.0
4,63.5,8.0,3.0,4.0,1.0,0.0,18468.0
...,...,...,...,...,...,...,...
4883,49.0,9.0,5.0,4.0,2.0,1.0,26576.0
4884,28.0,31.0,5.0,3.0,3.0,2.0,21212.0
4885,52.0,17.0,4.0,4.0,7.0,3.0,31820.0
4886,19.0,16.0,4.0,3.0,3.0,2.0,20289.0


In [179]:
X.isnull().sum()# Checking after imputation

Age                         0
DurationOfPitch             0
NumberOfFollowups           0
PreferredPropertyStar       0
NumberOfTrips               0
NumberOfChildrenVisiting    0
MonthlyIncome               0
dtype: int64

In [180]:
# Doing the same with feature Engine
X=data[num_col]

import feature_engine
from feature_engine.imputation import EndTailImputer


In [181]:
# set up the imputer
tail_imputer = EndTailImputer(imputation_method='gaussian',
                          tail='right',
                          fold=3,
                          variables=num_col)
# fit the imputer
tail_imputer.fit(X)

In [183]:
X=tail_imputer.transform(X)
X

Unnamed: 0,Age,DurationOfPitch,NumberOfFollowups,PreferredPropertyStar,NumberOfTrips,NumberOfChildrenVisiting,MonthlyIncome
0,41.000000,6.0,3.0,3.0,1.0,0.0,20993.0
1,49.000000,14.0,4.0,4.0,2.0,2.0,20130.0
2,37.000000,8.0,4.0,3.0,7.0,0.0,17090.0
3,33.000000,9.0,3.0,3.0,2.0,1.0,17909.0
4,65.571426,8.0,3.0,4.0,1.0,0.0,18468.0
...,...,...,...,...,...,...,...
4883,49.000000,9.0,5.0,4.0,2.0,1.0,26576.0
4884,28.000000,31.0,5.0,3.0,3.0,2.0,21212.0
4885,52.000000,17.0,4.0,4.0,7.0,3.0,31820.0
4886,19.000000,16.0,4.0,3.0,3.0,2.0,20289.0


Method 7: Multivariate Imputation by Chained Equations

Multivariate Imputation by Chained Equations is a multiple Imputation Technique that models each variable with missing values as a function of the remaining variables and uses that estimate for Impputation

A more sophisticated approach is to use the IterativeImputer class, which models each feature with missing values as a function of other features, and uses that estimate for imputation. It does so in an iterated round-robin fashion: at each step, a feature column is designated as output y and the other feature columns are treated as inputs X. A regressor is fit on (X, y) for known y. Then, the regressor is used to predict the missing values of y. This is done for each feature in an iterative fashion, and then is repeated for max_iter imputation rounds. The results of the final imputation round are returned.

In [184]:
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [185]:
X=data[num_col]
X

Unnamed: 0,Age,DurationOfPitch,NumberOfFollowups,PreferredPropertyStar,NumberOfTrips,NumberOfChildrenVisiting,MonthlyIncome
0,41.0,6.0,3.0,3.0,1.0,0.0,20993.0
1,49.0,14.0,4.0,4.0,2.0,2.0,20130.0
2,37.0,8.0,4.0,3.0,7.0,0.0,17090.0
3,33.0,9.0,3.0,3.0,2.0,1.0,17909.0
4,,8.0,3.0,4.0,1.0,0.0,18468.0
...,...,...,...,...,...,...,...
4883,49.0,9.0,5.0,4.0,2.0,1.0,26576.0
4884,28.0,31.0,5.0,3.0,3.0,2.0,21212.0
4885,52.0,17.0,4.0,4.0,7.0,3.0,31820.0
4886,19.0,16.0,4.0,3.0,3.0,2.0,20289.0


In [186]:
X.isnull().sum()

Age                         226
DurationOfPitch             251
NumberOfFollowups            45
PreferredPropertyStar        26
NumberOfTrips               140
NumberOfChildrenVisiting     66
MonthlyIncome               233
dtype: int64

In [187]:
imp = IterativeImputer(max_iter=10,random_state=0)

In [188]:
imp.fit(X)

In [189]:
X=imp.transform(X)
X

array([[4.1000e+01, 6.0000e+00, 3.0000e+00, ..., 1.0000e+00, 0.0000e+00,
        2.0993e+04],
       [4.9000e+01, 1.4000e+01, 4.0000e+00, ..., 2.0000e+00, 2.0000e+00,
        2.0130e+04],
       [3.7000e+01, 8.0000e+00, 4.0000e+00, ..., 7.0000e+00, 0.0000e+00,
        1.7090e+04],
       ...,
       [5.2000e+01, 1.7000e+01, 4.0000e+00, ..., 7.0000e+00, 3.0000e+00,
        3.1820e+04],
       [1.9000e+01, 1.6000e+01, 4.0000e+00, ..., 3.0000e+00, 2.0000e+00,
        2.0289e+04],
       [3.6000e+01, 1.4000e+01, 4.0000e+00, ..., 3.0000e+00, 2.0000e+00,
        2.4041e+04]])

In [190]:
result_1=pd.DataFrame(X)# Converting back to a DataFrame
result_1.isnull().sum()

0    0
1    0
2    0
3    0
4    0
5    0
6    0
dtype: int64