In [1]:
%matplotlib inline

In [2]:
from sklearn.svm import SVC
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [3]:
df = load_breast_cancer()

In [4]:
X_train, X_test, y_train, y_test = train_test_split(df.data, df.target, random_state=1)

In [5]:
scaler = MinMaxScaler().fit(X_train)   # finds the min and max values for the attributes in X_train
Xtrain_scaled = scaler.transform(X_train)  # Transforms all the attribute values to values between 0 and 1

In [6]:
svc = SVC()   # instantiate the SVC model

In [7]:
svc.fit(Xtrain_scaled , y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [8]:
Xtest_scaled = scaler.transform(X_test)  #scale the test data separately to prevent data leak

In [9]:
print("Test scores {:.2f}".format(svc.score(Xtest_scaled, y_test)))

Test scores 0.97


# Let us use GridsearchCV to find the best set of hyper parameters

In [10]:
from sklearn.model_selection import GridSearchCV

In [11]:
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'gamma':[0.001, 0.01, 0.1, 1, 10, 100]}

In [12]:
grid = GridSearchCV(svc, param_grid = param_grid, cv=5)
grid.fit(Xtrain_scaled, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100],
                         'gamma': [0.001, 0.01, 0.1, 1, 10, 100]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [13]:
print("Best parameters:" , grid.best_params_)
print("Test set accuracy: {:.2f}".format(grid.score(Xtest_scaled, y_test)))

Best parameters: {'C': 10, 'gamma': 1}
Test set accuracy: 0.97


#### This technique of gridsearchcv leads to data leak during the cross validation because it is done on one pre-scaled training set 
#### because the block used for triaining in CV is influenced by the blocks in the testing and vice-versa
#### The splitting of the data (for training and testing) should be done before any transformation to prevent Data Leaks

# Let us try Pipline object and reduce the lines of code...

In [14]:
from sklearn.pipeline import Pipeline


In [15]:
pipe = Pipeline([("Scaler" , MinMaxScaler()), ("svm" , SVC())])

In [16]:
pipe.fit(X_train, y_train)   # calls fit function on the first step 'Scaler', transforms the training data, finally fits SVM 
                             # with scaled data

Pipeline(memory=None,
         steps=[('Scaler', MinMaxScaler(copy=True, feature_range=(0, 1))),
                ('svm',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='scale', kernel='rbf', max_iter=-1,
                     probability=False, random_state=None, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)

In [17]:
pipe.score(X_test, y_test)   # evaluate on a separate data. The data will first be transformed and then used for prediction
                             # since the transformation is happening independently, there will be no data leak
                             # also note the less lines of coding. No need to repeat same lines of code for testing

0.965034965034965

# Lets combine the pipeline with GridSearchCV

In [18]:
# Pipeline object is subsumed by GridSearch

param_grid = {'svm__C':[0.001, 0.01, 0.1, 1, 10, 100], 'svm__gamma':[0.001, 0.01, 0.1, 1, 10, 100]}

# Note the difference beween the param_grid definition here and the one we used earlier (given below in comment)

# param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'gamma':[0.001, 0.01, 0.1, 1, 10, 100]}

# since we are using pipeline object which is could be a string of transformation functions for e.g. MinMaxScaler and SVM above,
# we need to specify which function do the hyper parameters belong to. Hence 'svm__C' and 'svm__gamma'

In [19]:
grid = GridSearchCV( pipe, param_grid = param_grid, cv = 5) 
grid.fit( X_train, y_train) 
print(" Best cross-validation accuracy: {:.2f}".format( grid.best_score_)) 
print(" Test set score: {:.2f}".format( grid.score( X_test, y_test))) 
print(" Best parameters:", grid.best_params_)


 Best cross-validation accuracy: 0.97
 Test set score: 0.97
 Best parameters: {'svm__C': 10, 'svm__gamma': 1}


# Convenience of Make Pipeline

In [20]:
from sklearn.pipeline import make_pipeline

In [21]:
pipe_longway = Pipeline([('scaler', MinMaxScaler()),('svm' , SVC(C=10, gamma=1)) ])

In [22]:
pipe_short = make_pipeline(MinMaxScaler() , SVC(C=10, gamma=1))

In [23]:
# Make_pipeline names the internals steps automatically based on the name of the class of the function

print("pipeline steps :\n{}".format(pipe_short.steps))

pipeline steps :
[('minmaxscaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('svc', SVC(C=10, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf', max_iter=-1,
    probability=False, random_state=None, shrinking=True, tol=0.001,
    verbose=False))]


# One can extract the components from every step 
# The individual functions should support components_ function for that.


In [24]:
# Note - The last item in the pipeline need not be a predictor/model. It can be a data transformation function

In [25]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA



In [26]:
pipe = make_pipeline(StandardScaler(), PCA(n_components=2) , StandardScaler())
print("Pipe steps:\n{}".format(pipe.steps))

Pipe steps:
[('standardscaler-1', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)), ('standardscaler-2', StandardScaler(copy=True, with_mean=True, with_std=True))]


In [27]:
pipe.fit(df.data)

Pipeline(memory=None,
         steps=[('standardscaler-1',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('pca',
                 PCA(copy=True, iterated_power='auto', n_components=2,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('standardscaler-2',
                 StandardScaler(copy=True, with_mean=True, with_std=True))],
         verbose=False)

In [28]:
components = pipe.named_steps['pca'].components_
print("components shape:{} ".format(components.shape))

components shape:(2, 30) 


In [29]:
print(components)

[[ 0.21890244  0.10372458  0.22753729  0.22099499  0.14258969  0.23928535
   0.25840048  0.26085376  0.13816696  0.06436335  0.20597878  0.01742803
   0.21132592  0.20286964  0.01453145  0.17039345  0.15358979  0.1834174
   0.04249842  0.10256832  0.22799663  0.10446933  0.23663968  0.22487053
   0.12795256  0.21009588  0.22876753  0.25088597  0.12290456  0.13178394]
 [-0.23385713 -0.05970609 -0.21518136 -0.23107671  0.18611302  0.15189161
   0.06016536 -0.0347675   0.19034877  0.36657547 -0.10555215  0.08997968
  -0.08945723 -0.15229263  0.20443045  0.2327159   0.19720728  0.13032156
   0.183848    0.28009203 -0.21986638 -0.0454673  -0.19987843 -0.21935186
   0.17230435  0.14359317  0.09796411 -0.00825724  0.14188335  0.27533947]]


In [35]:
print(df.data[1].size)

30


# Let us use Logistic Regression and extract the components

In [30]:
from sklearn.linear_model import LogisticRegression


In [31]:
pipe = make_pipeline(StandardScaler(), LogisticRegression())

In [32]:
param_grid = {'logisticregression__C':[0.001, 0.01, 0.1, 1, 10, 100]}

In [33]:
X_train, X_test, y_train, y_test = train_test_split(df.data, df.target, random_state=10)

In [34]:
grid = GridSearchCV(pipe, param_grid, cv=5)

In [35]:
grid.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#log

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('logisticregression',
                                        LogisticRegression(C=1.0,
                                                           class_weight=None,
                                                           dual=False,
                                                           fit_intercept=True,
                                                           intercept_scaling=1,
                                                           l1_ratio=None,
                                                           max_iter=100,
                                                           multi_clas

# Objective -  to access the coefficients of the best logistic regression model

In [36]:
print("Best estimator :\n{}".format(grid.best_estimator_))

Best estimator :
Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('logisticregression',
                 LogisticRegression(C=0.1, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)


In [37]:
# The best estimator has two steps - 'standardscaler' and 'logisticregression'
# to access these steps we can use the named_steps attribute of the pipeline

print("Logistic regression step:\n{}".format(grid.best_estimator_.named_steps['logisticregression'])) # best_estimator refers to pipe

Logistic regression step:
LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


In [38]:
# we want the coefficients from this step of the best estimator 

print("LR Coeff:\n{}".format(grid.best_estimator_.named_steps['logisticregression'].coef_))

LR Coeff:
[[-0.37697216 -0.37812164 -0.36342407 -0.35995464 -0.1068865   0.02613519
  -0.33621487 -0.38675009 -0.046877    0.27256874 -0.45329079 -0.03844042
  -0.32255151 -0.33233598 -0.03352534  0.27129634 -0.00750114 -0.16577178
   0.28184173  0.23869367 -0.49122438 -0.58741254 -0.45188663 -0.43697457
  -0.36231599 -0.14303382 -0.3949506  -0.47523598 -0.35797313 -0.15294699]]


# Summary 

In [39]:
#1 Pipeline objects are built to chain together in a logical sequence all the transformation steps that a dataset has to 
#  go thru before it is used to build the model
# The pipeline can be directly used on the test data set and all the transformations will be carried out in same sequence 
# before the model.predict() is called to predict.

# This minimizes chance of coding errors, missing steps and optimizes the code.

