In [48]:
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from warnings import filterwarnings
# Suppress all warnings for the remainder of the session (no category or module filter is given).
filterwarnings('ignore')

In [16]:
# Load the iris dataset; its .data and .target attributes are what gets split into train/test below.
iris = load_iris()

In [19]:
# Inspect the dimensions of the feature matrix.
iris.data.shape

(150, 4)

In [20]:
# Hold out 33% of the samples for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.33,
    random_state=42,
)

In [26]:
# Shapes of the four arrays produced by the split, in the order train/test features, train/test labels.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((100, 4), (50, 4), (100,), (50,))

In [38]:
# Each pipeline handles the following steps:
# 1) standard scaler
# 2) PCA
# 3) classifier

In [39]:
# Logistic-regression pipeline: standardise -> project onto 2 principal components -> classify.
# Step names matter: the grid search further down addresses the PCA step as 'pca1__<param>'.
p_lr = Pipeline(
    steps=[
        ('scalar1', StandardScaler()),
        ('pca1', PCA(n_components=2)),
        ('lr_classifier', LogisticRegression()),
    ]
)

In [41]:
# Decision-tree pipeline: standardise -> project onto 2 principal components -> classify.
# The tree is seeded (same seed as the train/test split) so that re-running the
# notebook rebuilds the same tree and reports the same accuracy.
p_dt = Pipeline( [ ('scalar2', StandardScaler()),
                   ('pca2', PCA(n_components=2)),
                   ('dt_classifier', DecisionTreeClassifier(random_state=42)) ])

In [42]:
# Random-forest pipeline: standardise -> project onto 2 principal components -> classify.
# The forest is seeded (same seed as the train/test split) so that bootstrap sampling
# and feature selection are repeatable and the reported accuracy is stable across runs.
p_rf = Pipeline( [ ('scalar3', StandardScaler()),
                   ('pca3', PCA(n_components=2)),
                   ('rf_classifier', RandomForestClassifier(random_state=42)) ])

In [43]:
# All pipelines to compare; the element at position i is labelled by pipeline_dict[i + 1].
pipelines = [
    p_lr,
    p_dt,
    p_rf,
]

In [44]:
# Human-readable model names keyed 1..3, used when printing each model's score.
pipeline_dict = dict(
    enumerate(
        ('Logistic Regression', 'Decision Tree', 'Random Forest'),
        start=1,
    )
)

In [52]:
# Sanity check: train a single pipeline and display its accuracy on the held-out split.
# fit() returns the fitted pipeline, so scoring can be chained onto it.
p_lr.fit(X_train, y_train).score(X_test, y_test)

0.84

In [56]:
# Train every pipeline on the training split and report its accuracy on the held-out split.
for position, pipe in enumerate(pipelines, start=1):
    pipe.fit(X_train, y_train)
    accuracy = pipe.score(X_test, y_test)
    print(f'{pipeline_dict[position]}: accuracy score is {accuracy}')

Logistic Regression: accuracy score is 0.84
Decision Tree: accuracy score is 0.9
Random Forest: accuracy score is 0.92


### Done! A Pipeline helps organise complex code in a much more efficient way.


### Now, let's implement hyperparameter tuning via the pipeline

In [57]:
from sklearn.model_selection import GridSearchCV

In [72]:
# Search space for the grid search: a list of dictionaries, each describing one grid.
# Any step of the pipeline can be tuned, not only the classifier. Keys are written as
# <step name>__<parameter name>, i.e. the step name, a double underscore, then the parameter.
grid_param = [
    dict(pca1__svd_solver=['auto', 'full', 'arpack', 'randomized']),
]

In [59]:
# Grid search over the logistic-regression pipeline, using all available CPU cores.
# cv is pinned to 3 folds instead of None: None defers to the library default, which
# differs between scikit-learn releases (3 folds before 0.22, 5 from 0.22 on), so the
# scores would not be comparable across environments. Three folds matches the
# split0..split2 columns in the recorded cv_results_ output.
g = GridSearchCV(p_lr, grid_param, cv=3, verbose=0, n_jobs=-1)

In [60]:
# Run the search on the training split. The result is inspected in the following cells
# (best_estimator_, best_params_, best_score_, cv_results_) and then used for prediction.
bm= g.fit(X_train,y_train)

In [61]:
# Show the full pipeline configuration that the search selected as best.
bm.best_estimator_

Pipeline(memory=None,
     steps=[('scalar1', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca1', PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('lr_classifier', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [62]:
# Parameter combination chosen by the search.
bm.best_params_

{'pca1__svd_solver': 'auto'}

In [63]:
# Cross-validated score of the chosen combination (equals its mean_test_score in cv_results_).
bm.best_score_

0.81

In [64]:
# Full per-candidate results: fit/score timings and per-split test scores.
# NOTE(review): in the recorded output all four svd_solver options have the same
# mean_test_score (0.81), so this grid does not discriminate between candidates.
bm.cv_results_

{'mean_fit_time': array([0.38562584, 0.30285732, 0.33310827, 0.18217977]),
 'std_fit_time': array([0.04113137, 0.21764305, 0.07467493, 0.12151317]),
 'mean_score_time': array([0.00133117, 0.00166146, 0.00133038, 0.00099723]),
 'std_score_time': array([4.71988598e-04, 9.40605287e-04, 4.70358991e-04, 5.94720425e-07]),
 'param_pca1__svd_solver': masked_array(data=['auto', 'full', 'arpack', 'randomized'],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'pca1__svd_solver': 'auto'},
  {'pca1__svd_solver': 'full'},
  {'pca1__svd_solver': 'arpack'},
  {'pca1__svd_solver': 'randomized'}],
 'split0_test_score': array([0.91428571, 0.91428571, 0.91428571, 0.91428571]),
 'split1_test_score': array([0.72727273, 0.72727273, 0.72727273, 0.72727273]),
 'split2_test_score': array([0.78125, 0.78125, 0.78125, 0.78125]),
 'mean_test_score': array([0.81, 0.81, 0.81, 0.81]),
 'std_test_score': array([0.07955742, 0.07955742, 0.07955742, 0.07955

In [67]:
# Predict labels for the held-out split using the fitted grid-search object.
y_pred=bm.predict(X_test)

In [68]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [69]:
# Fraction of held-out samples whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.84

In [70]:
# Per-class breakdown of the predictions: rows are true classes, columns are predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[19,  0,  0],
       [ 0,  8,  7],
       [ 0,  1, 15]], dtype=int64)

## Done with tuning!