In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [3]:
# Read the Pima Indians Diabetes CSV (path relative to the working directory)
# into a DataFrame.
csv_path = "Pima Indians Diabetes.csv"
pima_df = pd.read_csv(csv_path)

In [5]:
# Display the loaded frame to check the column names and a sample of rows.
pima_df

Unnamed: 0,Preg,Plas,Pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [6]:
# Features are every column except the target; the target is the "class" column.
X = pima_df.drop(columns="class")
Y = pima_df["class"]

In [7]:
# Hold out 30% of the rows for testing (70:30 train/test split).
test_size = 0.30
# Fixed random seed so the split is repeatable from run to run.
seed = 7
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)
type(X_train)

pandas.core.frame.DataFrame

In [8]:
# (rows, columns) of the training split.
X_train.shape

(537, 8)

In [9]:
# (rows, columns) of the held-out test split.
X_test.shape

(231, 8)

In [15]:
# A Pipeline is built from a list of (name, step) tuples applied in order.
# The final tuple is the modelling algorithm; here the only step before it
# is feature scaling.
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression()),
    ]
)

In [16]:
# The pipeline is used exactly like a plain classifier: a single fit() call
# runs the scaler and then trains the final estimator.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()), ('clf', LogisticRegression())])

In [17]:
from sklearn import metrics

# Evaluate the scaled pipeline on the held-out split: print its score,
# a blank line, then the confusion matrix of true vs. predicted labels.
y_predict = pipeline.predict(X_test)
model_score = pipeline.score(X_test, y_test)
print(model_score, end="\n\n")
print(metrics.confusion_matrix(y_true=y_test, y_pred=y_predict))

0.7748917748917749

[[130  17]
 [ 35  49]]


In [18]:
# Baseline for comparison: the same classifier fitted directly on the raw,
# unscaled features (no pipeline). The recorded output shows the solver
# hitting its iteration limit here and recommending scaling the data.
lr = LogisticRegression()
lr.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [19]:
# Evaluate the unscaled baseline the same way as the pipeline: print its
# score, a blank line, then the confusion matrix of true vs. predicted labels.
y_predict = lr.predict(X_test)
model_score = lr.score(X_test, y_test)
print(model_score, end="\n\n")
print(metrics.confusion_matrix(y_true=y_test, y_pred=y_predict))

0.7489177489177489

[[127  20]
 [ 38  46]]


In [20]:
from sklearn.svm import SVC 
from sklearn.datasets import load_breast_cancer 
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import MinMaxScaler

In [21]:
# Load the breast cancer dataset and split it into training and test sets.
# NOTE: this rebinds X_train / X_test / y_train / y_test, replacing the
# Pima splits created earlier in the notebook.
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    random_state=0,
)
# Compute the minimum and maximum on the training data only.
scaler = MinMaxScaler().fit(X_train)



In [24]:
# Rescale the training features with the fitted scaler and display the result.
X_train_scaled = scaler.transform(X_train)
X_train_scaled

array([[0.23044157, 0.32157676, 0.21940433, ..., 0.31484671, 0.30277942,
        0.09858323],
       [0.20062473, 0.42116183, 0.19452699, ..., 0.06965208, 0.34042973,
        0.06677161],
       [0.62232003, 0.76929461, 0.60403566, ..., 0.56079917, 0.19850187,
        0.07431457],
       ...,
       [0.11619102, 0.35726141, 0.11077327, ..., 0.17402687, 0.17524147,
        0.17263545],
       [0.12963226, 0.35311203, 0.11706171, ..., 0.        , 0.06780997,
        0.06919848],
       [0.21434995, 0.59004149, 0.21235575, ..., 0.33251808, 0.10782574,
        0.21172767]])

In [25]:
# Learn an SVM on the scaled training data.
svm = SVC()
svm.fit(X_train_scaled, y_train)

SVC()

In [26]:
# Scale the test set with the scaler fitted on the training data, then
# score the SVM on the scaled test features.
X_test_scaled = scaler.transform(X_test)
svm_test_score = svm.score(X_test_scaled, y_test)
print(" Test score: {:.2f}".format(svm_test_score))

 Test score: 0.97


# Use `Pipeline` to link all the steps into a single object

In [27]:
from sklearn.pipeline import Pipeline

# Chain the scaler and the SVM into one estimator so that fit / score /
# predict run both steps in order.
# Step names carry no stray whitespace: they are the keys used by
# pipe.named_steps and by the "<step>__<param>" syntax of set_params and
# grid search (e.g. svm__C), which a leading space would break.
pipe = Pipeline([("scaler", MinMaxScaler()), ("svm", SVC())])

In [28]:
# Fit every step of the pipeline on the raw (unscaled) training data.
pipe.fit(X_train, y_train)

Pipeline(steps=[(' scaler', MinMaxScaler()), (' svm', SVC())])

In [29]:
# Score the fitted pipeline on the raw test data; scaling happens inside it.
pipe_test_score = pipe.score(X_test, y_test)
print(" Test score: {:.2f}".format(pipe_test_score))

 Test score: 0.97


In [30]:
# Predicted labels for the test set, produced by the fitted pipeline.
y_pred = pipe.predict(X_test)

In [31]:
from sklearn import metrics

# Build the classification report comparing true and predicted test labels,
# then print it.
report = metrics.classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.98      0.94      0.96        53
           1       0.97      0.99      0.98        90

    accuracy                           0.97       143
   macro avg       0.97      0.97      0.97       143
weighted avg       0.97      0.97      0.97       143



# Use `make_pipeline` for a simpler pipeline definition

In [32]:

from sklearn.pipeline import make_pipeline



In [41]:
# make_pipeline names each step automatically after its lowercased class
# name, so no explicit (name, estimator) tuples are needed.
pipe = make_pipeline(StandardScaler(), LogisticRegression())
# "\n" (not "\ n") so the step list is printed on its own line instead of
# a literal backslash sequence appearing in the output.
print(" Pipeline steps:\n{}".format(pipe.steps))

 Pipeline steps:\ n[('standardscaler', StandardScaler()), ('logisticregression', LogisticRegression())]


In [42]:
# Fit the scaler + logistic regression pipeline on the training data.
pipe.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression())])

In [43]:
# Score the fitted pipeline on the test data and print it to two decimals.
pipe_test_score = pipe.score(X_test, y_test)
print(" Test score: {:.2f}".format(pipe_test_score))

 Test score: 0.96
