# Missing Value Imputation using Pipeline (Scikit-Learn)

In [48]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer 
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [49]:
#ignore warnings
import warnings
warnings.filterwarnings("ignore")

# show all rows and all columns
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

In [50]:
df = pd.read_csv(r"C:\Users\user\Pictures\PYTHON_PANDAS\banglore.csv")
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [51]:
X_train = df.drop(columns='price')
y_train = df["price"]
print("shape of train data : ",X_train.shape )
print("shape of test data : ",y_train.shape )

shape of train data :  (13320, 8)
shape of test data :  (13320,)


In [52]:
isnull_sum = X_train.isnull().sum()
isnull_sum                      #isnull_sum['size']

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
dtype: int64

In [53]:
X_train.drop(columns='society',inplace=True)   #drop sociwty columns beacuse of more then 20% NanN value
X_train.columns

Index(['area_type', 'availability', 'location', 'size', 'total_sqft', 'bath',
       'balcony'],
      dtype='object')

In [57]:
#finding numerical variables which have missing values
num_vars = X_train.select_dtypes(include=['int64','float64']).columns
num_vars_miss = [var for var in num_vars if isnull_sum[var]>0]
num_vars_miss

['bath', 'balcony']

In [58]:
#finding categorical variables which have missing values
cat_vars = X_train.select_dtypes(include=['object']).columns
cat_vars_miss = [var for var in cat_vars if isnull_sum[var]>0]
cat_vars_miss

['location', 'size']

In [59]:
num_vars_mean = ["bath"] # in this we will fill mean value
num_vars_median = ["balcony"] # in this we will fill median value
cat_vars_mode = ["location"] # in this we will fill most_frequent value
cat_vars_missing = ["size"] # in this we will fill constant value

In [63]:
# actually we are applying different strategies to same data type variable, so we will use PIPELINE .
num_vars_mean_imputer = Pipeline(steps=[("imputer",SimpleImputer(strategy="mean"))])
num_vars_median_imputer = Pipeline(steps=[("imputer",SimpleImputer(strategy="median"))])
cat_vars_mode_imputer = Pipeline(steps=[("imputer",SimpleImputer(strategy="most_frequent"))])
cat_vars_missing_imputer = Pipeline(steps=[("imputer",SimpleImputer(strategy="constant",fill_value="missing"))])

In [64]:
preprocessor = ColumnTransformer( transformers=[
    ("mean-imputer",num_vars_mean_imputer,num_vars_mean),
    ("median-imputer",num_vars_median_imputer,num_vars_median),
    ("mode-imputer",cat_vars_mode_imputer,cat_vars_mode),
    ("missing-imputer",cat_vars_missing_imputer,cat_vars_missing)
], remainder="passthrough")

In [65]:
#now fit data
preprocessor.fit(X_train)

ColumnTransformer(remainder='passthrough',
                  transformers=[('mean-imputer',
                                 Pipeline(steps=[('imputer', SimpleImputer())]),
                                 ['bath']),
                                ('median-imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median'))]),
                                 ['balcony']),
                                ('mode-imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent'))]),
                                 ['location']),
                                ('missing-imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(fill_value='missing',
                                                                strategy='constant'))]

In [66]:
X_train_clean = preprocessor.transform(X_train)
X_train_clean[0]   # this is in 2D numpy array

array([2.0, 1.0, 'Electronic City Phase II', '2 BHK',
       'Super built-up  Area', '19-Dec', '1056'], dtype=object)

In [67]:
# convert array to dataframe
X_train_clean_df= pd.DataFrame(X_train_clean,columns=X_train.columns)
X_train_clean_df.head()

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony
0,2,1,Electronic City Phase II,2 BHK,Super built-up Area,19-Dec,1056
1,5,3,Chikka Tirupathi,4 Bedroom,Plot Area,Ready To Move,2600
2,2,3,Uttarahalli,3 BHK,Built-up Area,Ready To Move,1440
3,3,1,Lingadheeranahalli,3 BHK,Super built-up Area,Ready To Move,1521
4,2,1,Kothanur,2 BHK,Super built-up Area,Ready To Move,1200


In [68]:
X_train.head()

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,1056,2.0,1.0
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,1440,2.0,3.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,1521,3.0,1.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,1200,2.0,1.0
