# What is a pipeline?
- A pipeline is a series of steps applied in order to data, to prepare the data for Machine Learning.
- All preprocessing steps can be included in the pipeline.
- Pipelines can be used to automate the data preprocessing and the data modelling steps, making it easier to create and maintain the ML models.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [3]:
data_feb = pd.read_csv("loan_Feb.csv")
data_feb.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


# Observations for the Feb month data:
#### Following Data Preprocessing steps are a must to performed.
1. Missing values are present in the data hence they need to be treated.
2. Scaling issues in the data hence that also has to be treated.
3. Perform Feature Encoding in order to convert the text in to numerical values.

In [4]:
data_mar = pd.read_csv("loan_March.csv")
data_mar.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


# Observations for the March month data:
#### Following Data Preprocessing steps are a must to performed.
1. Missing values are present in the data hence they need to be treated.
2. Scaling issues in the data hence that also has to be treated.
3. Perform Feature Encoding in order to convert the text in to numerical values.

- Assuming you will also have data for the April month, you may have to perform the same preprocessing steps on that april data

# Creating a Pipeline
### Steps to create a pipeline:
Step 1 : Import all the necessary libraries and functions need for Preprocessing   
Step 2 : Mention the steps needed to perform automation and store all the steps in the 'steps' variable  
Step 3 : Import pipeline function from sklearn library and create a pipeline

In [5]:
data_feb.shape

(614, 13)

# Creating a pipline for numerical data

In [7]:
# Library needed for missing value treatment of numerical data
from sklearn.impute import SimpleImputer
sim_imp = SimpleImputer(strategy = "median")
sim_imp

In [8]:
# Lirary needed for Feature Scaling for numerical data
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
mms

In [9]:
num_steps = [("Missing value treatment", sim_imp), ("Feature Scaling", mms)]
num_steps

[('Missing value treatment', SimpleImputer(strategy='median')),
 ('Feature Scaling', MinMaxScaler())]

In [10]:
from sklearn.pipeline import Pipeline
num_pipe = Pipeline(num_steps)
num_pipe

In [11]:
data_feb.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [13]:
data_feb[["ApplicantIncome", "CoapplicantIncome", "LoanAmount"]] = num_pipe.fit_transform(data_feb[["ApplicantIncome", "CoapplicantIncome", "LoanAmount"]])
data_feb.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,0.070489,0.0,0.172214,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,0.05483,0.036192,0.172214,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,0.03525,0.0,0.082489,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,0.030093,0.056592,0.160637,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,0.072356,0.0,0.191027,360.0,1.0,Urban,Y


# Creating pipeline for Categorical Data

In [15]:
# Missing value for Categorical data. strategy = "most_frequent"
# Label Encoder
# One hot Encoder 

## Create 2 pipeline one with missing value treatment and label encoding
## 2nd one with missing value treatment and one hot encoding

In [16]:
from sklearn.impute import SimpleImputer
si = SimpleImputer(strategy = "most_frequent")
si

In [17]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le

In [18]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()
ohe

In [19]:
steps_le = [("Missing Value Treatment", si), ("Label Encoding", le)]
steps_ohe =  [("Missing Value Treatment", si), ("One Hot Encoding", ohe)]

In [20]:
cols_le = ["Gender", "Married", "Education", "Self_Employed", "Loan_Status"]
cols_ohe = ["Property_Area"]

In [21]:
from sklearn.pipeline import Pipeline
pipe_le = Pipeline(steps = steps_le)
pipe_le

In [22]:
from sklearn.pipeline import Pipeline
pipe_ohe = Pipeline(steps = steps_ohe)
pipe_ohe

In [33]:
data_feb["Gender"] = pipe_le.fit_transform(data_feb[["Gender"]])
data_feb.head()

TypeError: fit_transform() takes 2 positional arguments but 3 were given

In [31]:
data_ohe = pipe_ohe.fit_transform(data_feb[["Property_Area"]])
data_ohe

<614x3 sparse matrix of type '<class 'numpy.float64'>'
	with 614 stored elements in Compressed Sparse Row format>

In [34]:
data_ohe.toarray()

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.]])

In [35]:
data_ohe = pd.DataFrame(data_ohe.toarray(), columns = ["Rural", "Semiurban", "urban"])
data_ohe

Unnamed: 0,Rural,Semiurban,urban
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,0.0,1.0
3,0.0,0.0,1.0
4,0.0,0.0,1.0
...,...,...,...
609,1.0,0.0,0.0
610,1.0,0.0,0.0
611,0.0,0.0,1.0
612,0.0,0.0,1.0


In [36]:
data_feb = pd.concat([data_feb, data_ohe], axis = 1)
data_feb.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Rural,Semiurban,urban
0,LP001002,Male,No,0,Graduate,No,0.070489,0.0,0.172214,360.0,1.0,Urban,Y,0.0,0.0,1.0
1,LP001003,Male,Yes,1,Graduate,No,0.05483,0.036192,0.172214,360.0,1.0,Rural,N,1.0,0.0,0.0
2,LP001005,Male,Yes,0,Graduate,Yes,0.03525,0.0,0.082489,360.0,1.0,Urban,Y,0.0,0.0,1.0
3,LP001006,Male,Yes,0,Not Graduate,No,0.030093,0.056592,0.160637,360.0,1.0,Urban,Y,0.0,0.0,1.0
4,LP001008,Male,No,0,Graduate,No,0.072356,0.0,0.191027,360.0,1.0,Urban,Y,0.0,0.0,1.0


In [37]:
data_feb = data_feb.drop(columns = "Property_Area")
data_feb.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Rural,Semiurban,urban
0,LP001002,Male,No,0,Graduate,No,0.070489,0.0,0.172214,360.0,1.0,Y,0.0,0.0,1.0
1,LP001003,Male,Yes,1,Graduate,No,0.05483,0.036192,0.172214,360.0,1.0,N,1.0,0.0,0.0
2,LP001005,Male,Yes,0,Graduate,Yes,0.03525,0.0,0.082489,360.0,1.0,Y,0.0,0.0,1.0
3,LP001006,Male,Yes,0,Not Graduate,No,0.030093,0.056592,0.160637,360.0,1.0,Y,0.0,0.0,1.0
4,LP001008,Male,No,0,Graduate,No,0.072356,0.0,0.191027,360.0,1.0,Y,0.0,0.0,1.0
