## Recap:

1. Implemented Pipeline in python
2. Implemented Column Transformers in python
3. Implemented numerical pipelines in python
4. Implemented categorical pipelines in python

## Agenda:

1. Implement Pipelines and Column Transformers on diabetes data
2. Implement pipelines that include Machine Learning algorithms as pipeline steps

## Load the standard libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress all warnings so library messages do not clutter the cell outputs.
warnings.filterwarnings('ignore')

## Creating data for classification

In [2]:
from sklearn.datasets import make_classification

# Synthetic classification data with 1000 samples. No random_state is passed,
# so the generated values differ from run to run.
X, y = make_classification(n_samples=1000)

In [3]:
X.shape

(1000, 20)

In [4]:
y.shape

(1000,)

In [6]:
X

array([[-0.0631145 , -0.37896995, -0.77815754, ..., -1.7784885 ,
        -0.40535931,  0.80018636],
       [ 0.14962626,  0.28214349, -0.24792453, ..., -0.42977097,
         0.30481124, -0.4845037 ],
       [ 2.13019319,  1.26952   ,  2.2935895 , ...,  1.52624715,
        -0.62892109,  0.71267356],
       ...,
       [-0.32531853, -2.00712401,  0.96060621, ..., -0.46896094,
        -1.50429718,  1.57555643],
       [ 0.0732509 , -0.12458734, -0.85719873, ..., -0.29435942,
         1.70941535,  0.13851046],
       [ 0.9382999 ,  0.82590748, -1.77961731, ...,  2.49163842,
         0.41002122, -0.94408662]])

## Creating a pipeline with FeatureScaling and LogisticRegression

In [7]:
## Import StandardScaler and create the scaler used as the first pipeline step

from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
ss

In [8]:
from sklearn.linear_model import LogisticRegression
# Classifier with default hyperparameters; used as the final pipeline step.
lr = LogisticRegression()
lr

In [9]:
## Define the (name, estimator) pairs, in execution order, for the pipeline

steps = [
    ('Scaler', ss),
    ('Classifier', lr),
]
steps

[('Scaler', StandardScaler()), ('Classifier', LogisticRegression())]

In [10]:
## Build the pipeline from the steps defined above

from sklearn.pipeline import Pipeline

pipe = Pipeline(steps=steps)
pipe

## Split the data into train set and test set

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing; the fixed seed makes the split
# itself repeatable for a given X and y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)


## Fit the pipeline on X_train and y_train

In [12]:
# Fits each pipeline step in order on the training split only: the scaler
# first, then the classifier on the scaled features.
pipe.fit(X_train, y_train)

## Perform predictions on the X_test data

In [14]:
# X_test goes through the already-fitted scaler before the classifier
# predicts the labels.
y_pred = pipe.predict(X_test)
y_pred

array([1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1,
       1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0,
       0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0])

In [17]:
from sklearn.metrics import accuracy_score
# Compare the held-out true labels with the pipeline's predictions.
accuracy_score(y_test, y_pred)

0.9

# Shortcut method to create pipelines

## Create a pipeline

1. Import all the necessary libraries to create the pipeline
2. Define steps needed to create the pipeline
3. Import the pipeline function and create the pipeline using the steps

In [24]:
## Import make_pipeline, which builds a Pipeline and names each step automatically

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from sklearn.pipeline import make_pipeline

# Every argument to make_pipeline must be an estimator object.
# - A metric such as accuracy_score(y_test, y_pred) evaluates to a plain number,
#   not an estimator, so it cannot be a step; compute it on the predictions
#   after the pipeline has been fitted.
# - OneHotEncoder is not chained after StandardScaler: the features are
#   continuous numbers, and one-hot encoding scaled values would create a
#   column per distinct value and fail on unseen values at predict time.
pipe = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    StandardScaler(),
    LogisticRegression(),
)
pipe

## Shortcut for Column Transformer

In [26]:
### Numerical pipeline: fill missing values with the column mean, then standardize

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import make_pipeline

num_pipe = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
)
num_pipe

In [28]:
## Categorical pipeline: fill missing values with the most frequent category, then one-hot encode

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

from sklearn.pipeline import make_pipeline

# LabelEncoder is deliberately not a step: it is meant for encoding the target
# y, and its fit/transform accept a single argument, so it cannot run inside a
# feature pipeline. OneHotEncoder already encodes the categorical features.
cat_pipe = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(),
)
cat_pipe

## Shortcut way to create a Column Transformer

In [29]:
# Column selections for each sub-pipeline; both are left empty here.
num_features = []
cat_features = []
# A new list holding the numerical columns followed by the categorical ones.
all_features = [*num_features, *cat_features]

In [32]:
from sklearn.compose import make_column_transformer

# Each entry pairs a transformer with the columns it is applied to.
full_pipe = make_column_transformer(
    (num_pipe, num_features),
    (cat_pipe, cat_features),
)

In [33]:
full_pipe