## Step 3 in ML process:

1. Missing Value Treatment for numerical data.
2. Missing Value Treatment for text data.
3. Feature Scaling for numerical data
4. Feature Encoding for text data.

### Using Pipeline we can automate this process of the all the Data Preprocessing steps.



### Step 1: Loading all the libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Step 2: Load the data

In [2]:
data = pd.read_csv('loan_Feb.csv')
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [3]:
data.shape

(614, 13)

## Step 3: Data Preprocessing

In [4]:
data.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

## Observation:

1. Gender, Self, Married - Employed are text columns having missing values
2. Dependents, LoanAmount, Loan_Amount_Term, Credit_History - are numerical column having missing values

In [5]:
## Missing Value treatment for numerical data

from sklearn.impute import SimpleImputer
sim_imp = SimpleImputer(strategy = 'median')
sim_imp 

In [6]:
data['LoanAmount'] = sim_imp.fit_transform(data[['LoanAmount']])
data.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [7]:
## Missing Value treatment with mode strategy

from sklearn.impute import SimpleImputer
si = SimpleImputer(strategy = 'most_frequent')
si

In [8]:
data[['Gender', 'Dependents', 'Married', 'Self_Employed', 'Loan_Amount_Term', 'Credit_History']] = si.fit_transform(data[['Gender', 'Dependents', 'Married', 'Self_Employed', 'Loan_Amount_Term', 'Credit_History']])
data.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [9]:
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,128.0,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


## Feature Scaling

In [10]:
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
mms

In [11]:
data[['LoanAmount', 'ApplicantIncome', 'CoapplicantIncome']] = mms.fit_transform(data[['LoanAmount', 'ApplicantIncome', 'CoapplicantIncome']])
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,0.070489,0.0,0.172214,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,0.05483,0.036192,0.172214,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,0.03525,0.0,0.082489,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,0.030093,0.056592,0.160637,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,0.072356,0.0,0.191027,360.0,1.0,Urban,Y


### Feature Encoding

In [12]:
dic1 = {'Male' : 0, 'Female' : 1}
dic2 = {'Yes' : 1, 'No': 0}
dic3 = {'Graduate' : 0, 'Not Graduate' : 1}
dic5 = {"Y" : 1, 'N' : 0}

In [13]:
data[['Married', 'Self_Employed']] = data[['Married', 'Self_Employed']].replace(dic2)
data['Gender'] = data['Gender'].replace(dic1)
data['Education'] = data['Education'].replace(dic3)
data['Loan_Status'] = data['Loan_Status'].replace(dic5)

In [14]:
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,0,0,0,0,0,0.070489,0.0,0.172214,360.0,1.0,Urban,1
1,LP001003,0,1,1,0,0,0.05483,0.036192,0.172214,360.0,1.0,Rural,0
2,LP001005,0,1,0,0,1,0.03525,0.0,0.082489,360.0,1.0,Urban,1
3,LP001006,0,1,0,1,0,0.030093,0.056592,0.160637,360.0,1.0,Urban,1
4,LP001008,0,0,0,0,0,0.072356,0.0,0.191027,360.0,1.0,Urban,1


In [15]:
## Loan_ID is of no use hence delete the column

data = data.drop('Loan_ID', axis = 1)
data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0,0,0,0,0,0.070489,0.0,0.172214,360.0,1.0,Urban,1
1,0,1,1,0,0,0.05483,0.036192,0.172214,360.0,1.0,Rural,0
2,0,1,0,0,1,0.03525,0.0,0.082489,360.0,1.0,Urban,1
3,0,1,0,1,0,0.030093,0.056592,0.160637,360.0,1.0,Urban,1
4,0,0,0,0,0,0.072356,0.0,0.191027,360.0,1.0,Urban,1


In [16]:
## One hot Encoding the Property Area Col

data_ohe = pd.get_dummies(data['Property_Area'])
data_ohe

Unnamed: 0,Rural,Semiurban,Urban
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
609,1,0,0
610,1,0,0
611,0,0,1
612,0,0,1


In [17]:
data = pd.concat([data, data_ohe], axis = 1)
data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Rural,Semiurban,Urban
0,0,0,0,0,0,0.070489,0.0,0.172214,360.0,1.0,Urban,1,0,0,1
1,0,1,1,0,0,0.05483,0.036192,0.172214,360.0,1.0,Rural,0,1,0,0
2,0,1,0,0,1,0.03525,0.0,0.082489,360.0,1.0,Urban,1,0,0,1
3,0,1,0,1,0,0.030093,0.056592,0.160637,360.0,1.0,Urban,1,0,0,1
4,0,0,0,0,0,0.072356,0.0,0.191027,360.0,1.0,Urban,1,0,0,1


In [18]:
data = data.drop('Property_Area', axis = 1)
data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Rural,Semiurban,Urban
0,0,0,0,0,0,0.070489,0.0,0.172214,360.0,1.0,1,0,0,1
1,0,1,1,0,0,0.05483,0.036192,0.172214,360.0,1.0,0,1,0,0
2,0,1,0,0,1,0.03525,0.0,0.082489,360.0,1.0,1,0,0,1
3,0,1,0,1,0,0.030093,0.056592,0.160637,360.0,1.0,1,0,0,1
4,0,0,0,0,0,0.072356,0.0,0.191027,360.0,1.0,1,0,0,1


## Automating all the above steps performed using Pipeline Function from the sklearn library

#### Steps to create a pipeline

- Step 1 : Import all the libraries and functions needed for Data Preprocessing
- Step 2 : Mention the steps needed to perform automation and store all the steps in the 'steps' variable
- Step 3 : Import Pipeline function from sklearn library and create pipeline with the 'steps' variable.

#### Step 1: Import all the libraries and functions needed for Data Preprocessing for numerical data

In [19]:
## Use this for numerical data

from sklearn.impute import SimpleImputer
sim_imp = SimpleImputer(strategy = 'median')
sim_imp

In [20]:
## Use this for categorical data

from sklearn.impute import SimpleImputer
si = SimpleImputer(strategy = 'most_frequent')
si

In [22]:
## Use this for feature Scaling

from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
mms

### Step 2 : Mention the steps needed to perform automation on numerical data and store all the steps in the 'steps' variable

In [36]:
num_steps = [('Num Missing treatment', sim_imp), ('Num Scaling', mms)]
num_steps

[('Num Missing treatment', SimpleImputer()), ('Num Scaling', MinMaxScaler())]

### Step 3: Import Pipeline function from sklearn library and create pipeline with the num_steps variable.

In [37]:
from sklearn.pipeline import Pipeline
num_pipe = Pipeline(num_steps)
num_pipe

In [41]:
### Shortcut method for Step 2 and Step 3

from sklearn.pipeline import make_pipeline
num_pipe = make_pipeline('sim_imp', 'mms')
num_pipe

## Read the data from the march month

In [38]:
data_mar = pd.read_csv('loan_March.csv')
data_mar

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban
...,...,...,...,...,...,...,...,...,...,...,...,...
362,LP002971,Male,Yes,3+,Not Graduate,Yes,4009,1777,113.0,360.0,1.0,Urban
363,LP002975,Male,Yes,0,Graduate,No,4158,709,115.0,360.0,1.0,Urban
364,LP002980,Male,No,0,Graduate,No,3250,1993,126.0,360.0,,Semiurban
365,LP002986,Male,Yes,0,Graduate,No,5000,2393,158.0,360.0,1.0,Rural


In [39]:
data_mar[['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']] = num_pipe.fit_transform(data_mar[['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']])
data_mar.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,0.078865,0.0,0.157088,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,0.042411,0.0625,0.187739,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,0.068938,0.075,0.344828,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,0.032263,0.106083,0.137931,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,0.045168,0.0,0.095785,360.0,1.0,Urban


# Create a Categorical Pipeline

#### Step 1 : To import all the libraries and functions necessary for performing Data Preprocessing

In [42]:
## Missing Value treatment on the categorical data

from sklearn.impute import SimpleImputer
si = SimpleImputer(strategy = 'most_frequent')
si

In [43]:
## Encoding the categorical data

from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()
ohe

#### Step 2: Define the steps for categorical pipeline and store it in 'cat_steps' variable

In [44]:
cat_steps = [('Cat Missing treatment', si), ('One Hot Encoding', ohe)]
cat_steps

[('Cat Missing treatment', SimpleImputer(strategy='most_frequent')),
 ('One Hot Encoding', OneHotEncoder())]

#### Step 3: Create the pipeline for the categorical columns

In [45]:
from sklearn.pipeline import Pipeline
cat_pipe = Pipeline(cat_steps)
cat_pipe

### apply the cat_pipe on the march data

In [47]:
data_mar.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,0.078865,0.0,0.157088,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,0.042411,0.0625,0.187739,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,0.068938,0.075,0.344828,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,0.032263,0.106083,0.137931,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,0.045168,0.0,0.095785,360.0,1.0,Urban


In [48]:
data_cat_pipe = cat_pipe.fit_transform(data_mar[['Property_Area']])
data_cat_pipe

<367x3 sparse matrix of type '<class 'numpy.float64'>'
	with 367 stored elements in Compressed Sparse Row format>

In [49]:
data_cat_pipe.toarray()

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       ...,
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.]])

In [50]:
data_ohe = pd.DataFrame(data_cat_pipe.toarray(), columns = ['Rural', 'Semiurban', 'Urban'])
data_ohe

Unnamed: 0,Rural,Semiurban,Urban
0,0.0,0.0,1.0
1,0.0,0.0,1.0
2,0.0,0.0,1.0
3,0.0,0.0,1.0
4,0.0,0.0,1.0
...,...,...,...
362,0.0,0.0,1.0
363,0.0,0.0,1.0
364,0.0,1.0,0.0
365,1.0,0.0,0.0


In [51]:
data_mar = pd.concat([data_mar, data_ohe], axis = 1)

In [52]:
data_mar.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Rural,Semiurban,Urban
0,LP001015,Male,Yes,0,Graduate,No,0.078865,0.0,0.157088,360.0,1.0,Urban,0.0,0.0,1.0
1,LP001022,Male,Yes,1,Graduate,No,0.042411,0.0625,0.187739,360.0,1.0,Urban,0.0,0.0,1.0
2,LP001031,Male,Yes,2,Graduate,No,0.068938,0.075,0.344828,360.0,1.0,Urban,0.0,0.0,1.0
3,LP001035,Male,Yes,2,Graduate,No,0.032263,0.106083,0.137931,360.0,,Urban,0.0,0.0,1.0
4,LP001051,Male,No,0,Not Graduate,No,0.045168,0.0,0.095785,360.0,1.0,Urban,0.0,0.0,1.0


### You have automated the Data preprocessing steps using num_pipe and cat_pipe
#### You can also automate the pipeline as well. Using a function called as Column Transformer()

In [None]:
data_apr = num_pipe.fit_transform(data_apr)
data_apr = cat_pipe.fit_transform(data_apr)

In [None]:
data_apr = column_transformer.fit_transform(data_apr)

In [53]:
data_mar = pd.read_csv('loan_March.csv')
data_mar

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban
...,...,...,...,...,...,...,...,...,...,...,...,...
362,LP002971,Male,Yes,3+,Not Graduate,Yes,4009,1777,113.0,360.0,1.0,Urban
363,LP002975,Male,Yes,0,Graduate,No,4158,709,115.0,360.0,1.0,Urban
364,LP002980,Male,No,0,Graduate,No,3250,1993,126.0,360.0,,Semiurban
365,LP002986,Male,Yes,0,Graduate,No,5000,2393,158.0,360.0,1.0,Rural


# Create a Column Transformer for num_pipe and cat_pipe

#### To create the Column Transformer follow the below steps
- Step 1 : Seperate all numerical and categorical variables
- Step 2 : Create a Column Transformer
- Step 3 : Apply the column transformer on the data

## Step 1: Seperate all numerical and categorical variables

In [54]:
num_features = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']
cat_features = ['Property_Area']

### Step 2: Create the column transformer

In [55]:
from sklearn.compose import ColumnTransformer
col_trans = ColumnTransformer(
                                [('num_pipeline', num_pipe, num_features), 
                                 ('categorical pipeline', cat_pipe, cat_features)]
)
col_trans

In [64]:
data_col_trans = col_trans.fit_transform(data_mar)

ValueError: not enough values to unpack (expected 3, got 2)