# Activity 16.01: Complete ML Workflow in a Pipeline

You have been assigned the task of doing an initial screening of patients based on their body parameters, such as cholesterol, blood pressure, pulse, and more.

The aim of this activity is to predict whether a patient has a heart ailment, using a dataset of patient parameters. To keep the data science life cycle simple, you will build the whole workflow as an ML pipeline.

In [24]:
import pandas as pd 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.compose import ColumnTransformer


In [15]:
# Source CSV has no header row; '?' entries are read as missing values (NaN).
filename = (
    'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/'
    'master/Chapter16/Dataset/processed.cleveland.data'
)

rawData = pd.read_csv(filename, header=None, na_values='?')

In [16]:
# Name the 14 columns: 13 patient attributes followed by the target column 'label'.
rawData.columns = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'label',
]

In [17]:
# Preview the first rows to confirm the column names were applied.
rawData.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,label
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [18]:
# Collapse the target to two classes: every label above 0 becomes 1,
# rows whose label is 0 are left untouched.
positive_label = rawData['label'] > 0
rawData.loc[positive_label, 'label'] = 1

In [19]:
# Discard every row that contains a missing value, then show what is left.
alteredData = rawData.dropna()
alteredData.shape

(297, 14)

In [22]:
# Separate the target from the features without mutating `alteredData`.
# (`DataFrame.pop` removes the column in place, so re-running the cell would
# raise KeyError and X would silently alias the modified frame.)
y = alteredData['label']
X = alteredData.drop(columns='label')
X.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0


In [23]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
)

In [26]:
# Preprocessing engine: standard-scale every int64/float64 column of X.
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
    ]
)

In [27]:
# Candidate models for the spot check; those that accept a seed use random_state=123.
classifiers = [
    LogisticRegression(random_state=123),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=123),
    AdaBoostClassifier(random_state=123),
]

In [33]:
# Fit one pipeline per candidate (preprocessing -> 10-component PCA -> classifier)
# on the training split and print its score on the test split.
for classifier in classifiers:
    estimator = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('dimred', PCA(n_components=10)),
            ('classifier', classifier),
        ]
    )
    estimator.fit(train_X, train_y)
    model_score = estimator.score(test_X, test_y)
    print(classifier, '\n model score', model_score)
    print('--------')

LogisticRegression(random_state=123) 
 model score 0.7888888888888889
--------
KNeighborsClassifier() 
 model score 0.7777777777777778
--------
RandomForestClassifier(random_state=123) 
 model score 0.8333333333333334
--------
AdaBoostClassifier(random_state=123) 
 model score 0.7222222222222222
--------


In [52]:
# Pipeline to tune: preprocessing -> PCA -> random forest.
# The random forest had the highest score in the spot check above.
# PCA's n_components is left unset here; it is chosen by the grid search.
pipe = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('dimred', PCA()),
        ('classifier', RandomForestClassifier(random_state=123)),
    ]
)



In [53]:
# Search space, keyed as <pipeline step>__<parameter>:
# PCA components to keep and the classifier's n_estimators.
param_grid = {
    'dimred__n_components': [10, 11, 12, 13],
    'classifier__n_estimators': [50, 100, 200],
}

In [58]:
# Grid search over `param_grid` for the `pipe` pipeline, with cv=10.
best_estimator = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=10,
)

In [59]:
# Run the grid search (cv=10 over param_grid) on the training split.
best_estimator.fit(train_X, train_y)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('scaler',
                                                                                          StandardScaler())]),
                                                                         Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal'],
      dtype='object'))])),
                                       ('dimred', PCA()),
                                       ('classifier',
                                        RandomForestClassifier(random_state=123))]),
             param_grid={'classifier__n_estimators': [50, 100, 200],
                         'dimred__n_components': [10, 11, 12, 13]})

In [60]:
# Report the best cross-validated score and the parameter combination that produced it.
for caption, value in (
    ('Best Score', best_estimator.best_score_),
    ('Best Params', best_estimator.best_params_),
):
    print(caption, value)

Best Score 0.8221428571428572
Best Params {'classifier__n_estimators': 50, 'dimred__n_components': 12}


In [61]:
# Predict labels for the held-out test features with the fitted grid search.
predictions = best_estimator.predict(test_X)

In [62]:
# classification_report expects (y_true, y_pred): the ground truth goes first so that
# precision, recall and support are computed against the true classes.
print(classification_report(test_y, predictions))

              precision    recall  f1-score   support

           0       0.80      0.80      0.80        49
           1       0.76      0.76      0.76        41

    accuracy                           0.78        90
   macro avg       0.78      0.78      0.78        90
weighted avg       0.78      0.78      0.78        90



In [63]:
# confusion_matrix expects (y_true, y_pred): rows are the true classes,
# columns are the predicted classes.
print(confusion_matrix(test_y, predictions))

[[39 10]
 [10 31]]
