In [1]:
# Import required libraries/packages
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

## Example 1

In [3]:
# Ordered (name, estimator) pairs: features are standardised first and the
# scaled values are then passed to the logistic-regression classifier.
steps = [
    ('Scaler', StandardScaler()),
    ('Classifier', LogisticRegression()),
]

In [4]:
# Chain the steps above into a single estimator object.
pipe = Pipeline(steps=steps)

In [5]:
# Ask scikit-learn to render estimators / pipelines as diagrams
# when they are displayed in the notebook.
from sklearn import set_config

set_config(display='diagram')

In [6]:
# Last expression of the cell, so the notebook displays the pipeline
# (display mode was set to "diagram" via set_config earlier).
pipe

In [7]:
from sklearn.datasets import make_classification

# Synthetic binary-classification dataset with 1000 samples.
# random_state pins the generator so the same X, y are produced on every
# run; without it the data (and any score computed from it) changes each
# time the notebook is re-executed.
X, y = make_classification(n_samples=1000, random_state=42)

In [8]:
# Sanity check: dimensions of the feature matrix and of the label vector.
(X.shape, y.shape)

((1000, 20), (1000,))

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation; the fixed seed keeps the
# partition the same across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [10]:
# Fit every step of the pipeline on the training split.
pipe.fit(X_train, y_train)

In [11]:
# Predictions of the fitted pipeline for the held-out test rows.
y_pred = pipe.predict(X_test)

In [12]:
# Score the fitted pipeline on the held-out test split.
pipe.score(X_test, y_test)

0.896969696969697

## Example 2

In [14]:
from sklearn.decomposition import PCA
from sklearn.svm import SVC

In [15]:
# Three-stage pipeline: standardise the features, keep 4 principal
# components, then classify with a support-vector classifier.
steps = [
    ('Scaler', StandardScaler()),
    ('PCA', PCA(n_components=4)),
    ('SVC', SVC()),
]
pipe2 = Pipeline(steps=steps)

In [16]:
# Last expression of the cell, so the notebook displays the second pipeline.
pipe2

In [17]:
# A single step can be accessed through the name it was given in `steps`,
# e.g. to run only the scaler (left commented out; uncomment to try it):
# pipe2['Scaler'].fit_transform(X_train)

In [18]:
# Fit scaler, PCA and SVC in sequence on the training split.
pipe2.fit(X_train, y_train)

In [19]:
# Predictions of the second pipeline for the held-out test rows.
y_pred2 = pipe2.predict(X_test)

In [20]:
# Score the second pipeline on the held-out test split.
pipe2.score(X_test, y_test)

0.8939393939393939

## Complex example: ColumnTransformer with numeric and categorical pipelines

In [22]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [23]:
# Pipeline for numeric columns: replace NaN entries with the column mean,
# then standardise the values.
numeric_processor = Pipeline(
    steps=[
        ('imputation_mean', SimpleImputer(missing_values=np.nan, strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

In [24]:
# Display the numeric sub-pipeline.
numeric_processor

In [25]:
# Pipeline for categorical columns: fill missing entries with the constant
# 'missing', then one-hot encode (handle_unknown='ignore' is set for
# categories not seen during fit).
categorical_processor = Pipeline(
    steps=[
        ('constant', SimpleImputer(fill_value='missing', strategy='constant')),
        ('onehotencode', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [26]:
# Display the categorical sub-pipeline.
categorical_processor

In [27]:
 from sklearn.compose import ColumnTransformer

In [28]:
# Route each group of columns through its own sub-pipeline:
# 'gender' and 'city' go to the categorical processor,
# 'age' and 'height' go to the numeric processor.
preprocessor = ColumnTransformer(
    transformers=[
        ('Categorical', categorical_processor, ['gender', 'city']),
        ('numerical', numeric_processor, ['age', 'height']),
    ]
)

In [29]:
# Display the combined column-wise preprocessor.
preprocessor

In [30]:
from sklearn.pipeline import make_pipeline

In [31]:
# End-to-end model: column-wise preprocessing followed by logistic
# regression (make_pipeline assigns the step names itself).
final_pipe = make_pipeline(preprocessor, LogisticRegression())

In [32]:
# Display the full preprocessing + classifier pipeline.
final_pipe

In [33]:
# Small hand-written dataset: two categorical features (gender, city),
# two numeric features (age, height) and the 'smoker' label.
data = {
    'gender': ['M', 'F', 'F', 'M', 'M'],
    'city': ['Pune', 'Satara', 'Nashik', 'Mumbai', 'Kolhapur'],
    'age': [24, 22, 25, 27, 28],
    'height': [176, 150, 154, 169, 170],
    'smoker': ['Y', 'N', 'N', 'Y', 'N'],
}
sample = pd.DataFrame(data=data)

In [34]:
# Display the sample DataFrame.
sample

Unnamed: 0,gender,city,age,height,smoker
0,M,Pune,24,176,Y
1,F,Satara,22,150,N
2,F,Nashik,25,154,N
3,M,Mumbai,27,169,Y
4,M,Kolhapur,28,170,N


In [35]:
# Features are every column except the label; the label is 'smoker'.
X = sample.drop(columns='smoker')
y = sample['smoker']

In [36]:
# Split the sample frame (33% held out, fixed seed), then fit the
# preprocessing + classifier pipeline on the training rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)
final_pipe.fit(X_train, y_train)

In [37]:
# Predictions of the fitted final pipeline for the held-out rows.
y_preds = final_pipe.predict(X_test)

In [38]:
# Score the final pipeline on the held-out rows.
final_pipe.score(X_test, y_test)

0.5