## Loading the standard libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [34]:
data = pd.read_csv('loan_Feb.csv')
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [35]:
data.shape

(614, 13)

In [36]:
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


## Data Preprocessing

In [6]:
data.isnull().sum()

ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
dtype: int64

In [7]:
from sklearn.impute import SimpleImputer
sim_imp = SimpleImputer(strategy = 'mean')
sim_imp

In [8]:
data['LoanAmount'] = sim_imp.fit_transform(data[['LoanAmount']])
data.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount
0,5849,0.0,146.412162
1,4583,1508.0,128.0
2,3000,0.0,66.0
3,2583,2358.0,120.0
4,6000,0.0,141.0


In [9]:
data.isnull().sum()

ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
dtype: int64

## Feature Scaling

In [10]:
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
mms

In [11]:
data = mms.fit_transform(data)
data

array([[0.07048856, 0.        , 0.19885986],
       [0.05482993, 0.03619171, 0.17221418],
       [0.03525046, 0.        , 0.08248915],
       ...,
       [0.09798392, 0.00575995, 0.35311143],
       [0.09193568, 0.        , 0.25759768],
       [0.05482993, 0.        , 0.17945007]])

In [13]:
data = pd.DataFrame(data, columns = ['ApplicantIncome', 'CoapplicantIncome','LoanAmount'])
data.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount
0,0.070489,0.0,0.19886
1,0.05483,0.036192,0.172214
2,0.03525,0.0,0.082489
3,0.030093,0.056592,0.160637
4,0.072356,0.0,0.191027


## Automating these steps using Pipeline function from sklearn library

## To create a pipeline :
- Step 1 : Import all the libraries need for preprocessing 
- example : for this loan data we need library for missing value treatment and library for Scaling
- Step 2 : Mention the steps needed to perform automation
- Step 3 : Import Pipeline function from sklearn library and create the pipeline

In [14]:
from sklearn.impute import SimpleImputer
sim_imp = SimpleImputer(strategy = 'median')
sim_imp

In [15]:
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
mms

#### Mentioning the steps needed to perform automation

In [17]:
steps = [('MissingValueImputer Function', sim_imp), ('FeatureScaling', mms)]
steps

[('MissingValueImputer Function', SimpleImputer(strategy='median')),
 ('FeatureScaling', MinMaxScaler())]

### Import Pipeline function from sklearn library and create pipeline

In [18]:
from sklearn.pipeline import Pipeline
pipe = Pipeline(steps)
pipe

### Apply the pipeline on the data

In [20]:
data_piped = pipe.fit_transform(data)
data_piped

array([[0.07048856, 0.        , 0.19885986],
       [0.05482993, 0.03619171, 0.17221418],
       [0.03525046, 0.        , 0.08248915],
       ...,
       [0.09798392, 0.00575995, 0.35311143],
       [0.09193568, 0.        , 0.25759768],
       [0.05482993, 0.        , 0.17945007]])

In [21]:
data_pipe = pd.DataFrame(data_piped, columns = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount'])
data_pipe.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount
0,0.070489,0.0,0.19886
1,0.05483,0.036192,0.172214
2,0.03525,0.0,0.082489
3,0.030093,0.056592,0.160637
4,0.072356,0.0,0.191027


## The above pipeline was created and applied on loan_data for feburary month. I import March month data as well 

In [23]:
data_march = pd.read_csv('loan_March.csv')
data_march.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [26]:
data_march[['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']] = pipe.fit_transform(data_march[['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']])
data_march.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,0.078865,0.0,0.157088,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,0.042411,0.0625,0.187739,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,0.068938,0.075,0.344828,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,0.032263,0.106083,0.137931,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,0.045168,0.0,0.095785,360.0,1.0,Urban


## Create a pipeline for Categorical data

In [28]:
## Import all the libraries necessary for performing preprocessing

from sklearn.impute import SimpleImputer
si = SimpleImputer(strategy = 'most_frequent')
si

In [29]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()
ohe

In [30]:
## Define the steps

steps_cat = [('MissingValueImputer Fuction', si), ('FeatureEncoding', ohe)]
steps_cat

[('MissingValueImputer Fuction', SimpleImputer(strategy='most_frequent')),
 ('FeatureEncoding', OneHotEncoder())]

In [31]:
## Create the pipeline

from sklearn.pipeline import Pipeline
pipe_cat = Pipeline(steps_cat)
pipe_cat

In [37]:
data_cat_piped = pipe_cat.fit_transform(data[['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area']])
data_cat_piped

<614x11 sparse matrix of type '<class 'numpy.float64'>'
	with 3070 stored elements in Compressed Sparse Row format>

In [39]:
data_cat_piped.toarray()

array([[0., 1., 1., ..., 0., 0., 1.],
       [0., 1., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 0., 0., 1.],
       ...,
       [0., 1., 0., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 0., 1.],
       [1., 0., 1., ..., 0., 1., 0.]])

In [41]:
data_pipe = pd.DataFrame(data_cat_piped.toarray(), columns = ['female', 'Male', 'Yes', 'No', 'Graduate', 'Non-Graduate', 'No', 'Yes','Urban', 'Semi_urban', 'Rural'])
data_pipe.head()

Unnamed: 0,female,Male,Yes,No,Graduate,Non-Graduate,No.1,Yes.1,Urban,Semi_urban,Rural
0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
1,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
2,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
4,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0


In [42]:
data_march.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,0.078865,0.0,0.157088,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,0.042411,0.0625,0.187739,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,0.068938,0.075,0.344828,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,0.032263,0.106083,0.137931,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,0.045168,0.0,0.095785,360.0,1.0,Urban


In [45]:
pipe_cat.fit_transform(data_march[['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area']])

<367x11 sparse matrix of type '<class 'numpy.float64'>'
	with 1835 stored elements in Compressed Sparse Row format>