# Heart Disease Random Forest Classifier

In this example we use the Heart Disease data from
https://www.kaggle.com/ronitf/heart-disease-uci

We train a pipeline composed of a DictVectorizer, RobustScaler and a RandomForest classifier and deploy it on Blazee.

In [1]:
import pandas as pd
import numpy as np
import logging
logging.getLogger().setLevel(logging.INFO)

# Load the Data
df = pd.read_csv('./data/heart.csv')
df.head(10)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
5,57,1,0,140,192,0,1,148,0,0.4,1,0,1,1
6,56,0,1,140,294,0,0,153,0,1.3,1,0,2,1
7,44,1,1,120,263,0,1,173,0,0.0,2,0,3,1
8,52,1,2,172,199,1,1,162,0,0.5,2,0,3,1
9,57,1,2,150,168,0,1,174,0,1.6,2,0,2,1


In [2]:
from sklearn.model_selection import train_test_split

# Split the data into train/test sets
X = df.drop(['target'], axis = 1)
y = df['target']
# y = y[:, np.newaxis]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 101)

# Transform the features to a list of dict, which is the
# expected input of our pipeline
X_train = X_train.to_dict(orient = 'records')
X_test = X_test.to_dict(orient = 'records')
X_train[0]

{'age': 48,
 'sex': 1,
 'cp': 0,
 'trestbps': 130,
 'chol': 256,
 'fbs': 1,
 'restecg': 0,
 'thalach': 150,
 'exang': 1,
 'oldpeak': 0.0,
 'slope': 2,
 'ca': 2,
 'thal': 3}

In [3]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier

# Create a Pipeline that transforms and scales the data
# followed by a RandomForest classifier
pipeline = make_pipeline(DictVectorizer(sparse=False),
                         RobustScaler(),
                         RandomForestClassifier(n_estimators=1000, random_state=10))

# Train the pipeline
pipeline.fit(X_train, y_train)
pipeline.score(X_test, y_test)

0.82

In [8]:
import blazee
import os

# Deploy the model on Blazee
api_key = os.environ['BLAZEE_API_KEY']
bz = blazee.Blazee(api_key)
model = bz.deploy_model(pipeline, model_name="HeartDisease RF2")
bz.all_models()

INFO:root:Uploading model version to Blazee  (5.428464 MB)...
INFO:root:Deploying new model version: v1...
INFO:root:Successfully deployed model version 5afa5b86-d0a7-412c-b5f5-5bc55942be45


[<BlazeeModel 'Pima'
 	id=078d1904-d032-4b66-87b8-fb15e4a08bc3>, <BlazeeModel 'HeartDisease RF2'
 	id=a872fb60-e749-4294-9783-d5df2341d48d>]

In [9]:
# Make sure predictions work as expected
local_predictions = pipeline.predict(X_test)
local_probas = pipeline.predict_proba(X_test)

blazee_preds = model.predict_batch(X_test[:3])
for i, pred in enumerate(blazee_preds):
    print(pred)
    assert pred.prediction == local_predictions[i]
    assert (pred.probas == local_probas[i]).all()

<Prediction
	prediction=0
	probas=[0.874, 0.126]>
<Prediction
	prediction=1
	probas=[0.204, 0.796]>
<Prediction
	prediction=1
	probas=[0.253, 0.747]>


In [10]:
from sklearn.linear_model import SGDClassifier

# We then train a new pipeline using a SGD Classifier,
# and realize it is performing better on the test set
pipeline2 = make_pipeline(DictVectorizer(sparse=False),
                         RobustScaler(),
                         SGDClassifier(max_iter=1000))
pipeline2.fit(X_train, y_train)
pipeline2.score(X_test, y_test)



0.88

In [11]:
# We upload the new version of the model on Blazee
model.update(pipeline2)

INFO:root:Uploading model version to Blazee  (2.317383 KB)...
INFO:root:Deploying new model version: v2...
INFO:root:Successfully deployed model version 25f71e6c-8d08-4fa2-9a8e-97913bb49cb3


<BlazeeModel 'HeartDisease RF2'
	id=a872fb60-e749-4294-9783-d5df2341d48d>

In [12]:
# The model has 2 versions now
model.versions()

[<ModelVersion 'HeartDisease RF2' @ v1
 	id=5afa5b86-d0a7-412c-b5f5-5bc55942be45
 	deployed=True
 	created_at=2019-04-12 09:20:03+00:00>, <ModelVersion 'HeartDisease RF2' @ v2
 	id=25f71e6c-8d08-4fa2-9a8e-97913bb49cb3
 	deployed=True
 	created_at=2019-04-12 09:21:44+00:00>]

In [14]:
# It is still using the v1 as the default version
print("Default version: " + model.default_version.name)
v1 = model.get_version('v1')
v2 = model.get_version('v2')

# This means that making predictions will still use v1
v1_prediction = v1.predict(X_test[0])
print(f'Pred from V1: {v1_prediction}')
v2_prediction = v2.predict(X_test[0])
print(f'Pred from V2: {v2_prediction}')
print('^ V2 prediction has no probas, as it is using "hinge" loss')
model_prediction = model.predict(X_test[0])
print(f'Pred from model: {model_prediction}')

assert model_prediction.prediction == v1_prediction.prediction
assert model_prediction.probas == v1_prediction.probas

Default version: v1
Pred from V1: <Prediction
	prediction=0
	probas=[0.874, 0.126]>
Pred from V2: <Prediction
	prediction=0
	probas=None>
^ V2 prediction has no probas, as it is using "hinge" loss
Pred from model: <Prediction
	prediction=0
	probas=[0.874, 0.126]>


In [15]:
# Now, we tested v2 a bunch and want to set it to production so we make it the default version
v2.make_default()


print("Default version: " + model.default_version.name)

# Making predictions from this model will use v2 now
model_prediction = model.predict(X_test[0])
print(f'Pred from model: {model_prediction}')

assert model_prediction.prediction == v2_prediction.prediction
assert model_prediction.probas == v2_prediction.probas


Default version: v2
Pred from model: <Prediction
	prediction=0
	probas=None>
