## Loading the Standard libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Loading the dataset

In [3]:
use_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']
df_feb = pd.read_csv('loan_Feb.csv', usecols = use_cols)
df_feb.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount
0,5849,0.0,
1,4583,1508.0,128.0
2,3000,0.0,66.0
3,2583,2358.0,120.0
4,6000,0.0,141.0


In [4]:
df_feb.shape

(614, 3)

## Missing Values Treatment

In [5]:
df_feb.isnull().sum()

ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
dtype: int64

In [8]:
from sklearn.impute import SimpleImputer
sim_imp = SimpleImputer(strategy = 'median')

In [16]:
df_feb['LoanAmount'] = sim_imp.fit_transform(df_feb[['LoanAmount']])
df_feb.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount
0,5849,0.0,128.0
1,4583,1508.0,128.0
2,3000,0.0,66.0
3,2583,2358.0,120.0
4,6000,0.0,141.0


In [17]:
df_feb.isnull().sum()

ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
dtype: int64

## Scaling

In [20]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

In [23]:
data = ss.fit_transform(df_feb)
data

array([[ 0.07299082, -0.55448733, -0.21124125],
       [-0.13441195, -0.03873155, -0.21124125],
       [-0.39374734, -0.55448733, -0.94899647],
       ...,
       [ 0.43717437, -0.47240418,  1.27616847],
       [ 0.35706382, -0.55448733,  0.49081614],
       [-0.13441195, -0.55448733, -0.15174486]])

In [25]:
data = pd.DataFrame(data, columns = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount'])
data

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount
0,0.072991,-0.554487,-0.211241
1,-0.134412,-0.038732,-0.211241
2,-0.393747,-0.554487,-0.948996
3,-0.462062,0.251980,-0.306435
4,0.097728,-0.554487,-0.056551
...,...,...,...
609,-0.410130,-0.554487,-0.889500
610,-0.212557,-0.554487,-1.258378
611,0.437174,-0.472404,1.276168
612,0.357064,-0.554487,0.490816


## Creating a Pipeline

- Import the pipeline function from sklearn library
- Specify the steps needed to create a pipeline

Steps for Preprocessing are :
1. Missing Value Treatment
2. StandardScaling

## Import the necessary libraries and functions for preprocessing steps

In [29]:
from sklearn.impute import SimpleImputer
sim_imp = SimpleImputer(strategy = 'median')

In [30]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

## Spcify the steps that we want to take

In [31]:
steps = [('imputer', sim_imp), ('scaler', ss)]

## Import pipeline and create a pipeline from the steps

In [32]:
from sklearn.pipeline import Pipeline
pipe = Pipeline(steps)

## Fit the pipeline on data

In [33]:
pipe.fit_transform(df_feb)

array([[ 0.07299082, -0.55448733, -0.21124125],
       [-0.13441195, -0.03873155, -0.21124125],
       [-0.39374734, -0.55448733, -0.94899647],
       ...,
       [ 0.43717437, -0.47240418,  1.27616847],
       [ 0.35706382, -0.55448733,  0.49081614],
       [-0.13441195, -0.55448733, -0.15174486]])

In [35]:
df_feb_piped = pd.DataFrame(pipe.fit_transform(df_feb), columns = use_cols)
df_feb_piped

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount
0,0.072991,-0.554487,-0.211241
1,-0.134412,-0.038732,-0.211241
2,-0.393747,-0.554487,-0.948996
3,-0.462062,0.251980,-0.306435
4,0.097728,-0.554487,-0.056551
...,...,...,...
609,-0.410130,-0.554487,-0.889500
610,-0.212557,-0.554487,-1.258378
611,0.437174,-0.472404,1.276168
612,0.357064,-0.554487,0.490816


## Import the dataset for March Month and apply the pipe on the data

In [36]:
use_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']
df_mar = pd.read_csv('loan_March.csv', usecols = use_cols)
df_mar.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount
0,5720,0,110.0
1,3076,1500,126.0
2,5000,1800,208.0
3,2340,2546,100.0
4,3276,0,78.0


## Fit the pipe on the March data

In [37]:
pipe.fit_transform(df_mar)

array([[ 0.1864605 , -0.67333511, -0.42677996],
       [-0.35269225, -0.02984821, -0.16395333],
       [ 0.0396413 ,  0.09884917,  1.18303317],
       ...,
       [-0.31721094,  0.18164449, -0.16395333],
       [ 0.0396413 ,  0.353241  ,  0.36169994],
       [ 0.89608667, -0.67333511, -0.62389994]])

In [39]:
df_mar_piped = pd.DataFrame(pipe.fit_transform(df_mar), columns = use_cols)
df_mar_piped

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount
0,0.186461,-0.673335,-0.426780
1,-0.352692,-0.029848,-0.163953
2,0.039641,0.098849,1.183033
3,-0.502774,0.418877,-0.591047
4,-0.311909,-0.673335,-0.952433
...,...,...,...
362,-0.162439,0.088982,-0.377500
363,-0.132056,-0.369180,-0.344647
364,-0.317211,0.181644,-0.163953
365,0.039641,0.353241,0.361700


## Create pipelines for Categorical data

In [70]:
use_cols = ['Gender', 'Married', 'Self_Employed', 'Education']
df_feb = pd.read_csv('loan_Feb.csv', usecols = use_cols)
df_feb.head()

Unnamed: 0,Gender,Married,Education,Self_Employed
0,Male,No,Graduate,No
1,Male,Yes,Graduate,No
2,Male,Yes,Graduate,Yes
3,Male,Yes,Not Graduate,No
4,Male,No,Graduate,No


In [72]:
df_feb['Education'].unique()

array(['Graduate', 'Not Graduate'], dtype=object)

## Creating a pipeline for Categorical variables

- Steps for the pipline
    1. Missing Value treatment of all the columns
    2. One hot Encoding of all the columns

## Import the libraries for the steps

In [73]:
from sklearn.impute import SimpleImputer
si = SimpleImputer(strategy = 'most_frequent')

In [74]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(drop = 'first')

## Specify the steps

In [75]:
steps_cat = [('missing_imputer', si), ('Encoding', ohe)]

## Import the pipeline from sklearn

In [76]:
from sklearn.pipeline import Pipeline
pipe_cat = Pipeline(steps_cat)
pipe_cat

Pipeline(steps=[('missing_imputer', SimpleImputer(strategy='most_frequent')),
                ('Encoding', OneHotEncoder(drop='first'))])

In [77]:
## fit on the data
pipe_cat.fit_transform(df_feb)

<614x4 sparse matrix of type '<class 'numpy.float64'>'
	with 1119 stored elements in Compressed Sparse Row format>

In [81]:
df_piped_cat = pd.DataFrame(pipe_cat.fit_transform(df_feb).toarray(),columns = ['Male', 'Married', "SelfEmplyed", "Education"])#, columns = ['Gender_Male', 'Gender_Female', 'Married_Yes',
                                                                                #'Married_No', 'SelfEmployed_Yes', 'SelfEmployed_No'])
df_piped_cat

Unnamed: 0,Male,Married,SelfEmplyed,Education
0,1.0,0.0,0.0,0.0
1,1.0,1.0,0.0,0.0
2,1.0,1.0,0.0,1.0
3,1.0,1.0,1.0,0.0
4,1.0,0.0,0.0,0.0
...,...,...,...,...
609,0.0,0.0,0.0,0.0
610,1.0,1.0,0.0,0.0
611,1.0,1.0,0.0,0.0
612,1.0,1.0,0.0,0.0


## Load the March data

In [82]:
use_cols = ['Gender', 'Married', 'Self_Employed','Education']
df_mar = pd.read_csv('loan_March.csv', usecols = use_cols)
df_mar.head()

Unnamed: 0,Gender,Married,Education,Self_Employed
0,Male,Yes,Graduate,No
1,Male,Yes,Graduate,No
2,Male,Yes,Graduate,No
3,Male,Yes,Graduate,No
4,Male,No,Not Graduate,No


## Fit this pipeline on March data

In [83]:
pipe_cat.fit_transform(df_mar)

<367x4 sparse matrix of type '<class 'numpy.float64'>'
	with 651 stored elements in Compressed Sparse Row format>

In [85]:
df_mar_piped = pd.DataFrame(pipe_cat.fit_transform(df_mar).toarray())
df_mar_piped

Unnamed: 0,0,1,2,3
0,1.0,1.0,0.0,0.0
1,1.0,1.0,0.0,0.0
2,1.0,1.0,0.0,0.0
3,1.0,1.0,0.0,0.0
4,1.0,0.0,1.0,0.0
...,...,...,...,...
362,1.0,1.0,1.0,1.0
363,1.0,1.0,0.0,0.0
364,1.0,0.0,0.0,0.0
365,1.0,1.0,0.0,0.0
