# 1.  Data scientist experimentation notebook

Think of this notebook as a data scientist's scratch notebook, where they experiment manually with data preparation, algorithm selection, hyper-parameter tuning and the like.  This notebook does not have any Azure ML Service integration.  The next notebook demonstrates how a data scientist can take this to the next level (integrating with Azure ML) in preparation for an MLOps build and release pipeline.<br>

The experiment is a classification task that uses the scikit-learn library.  The use case is coronary heart disease prediction with the well-known Kaggle dataset, framingham.csv.<br>

In [3]:
# Load necessary packages
import pandas as pd
import numpy as np
import pickle
import os

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

In [None]:
# Remove any local copy of the dataset file left over from an earlier run
stale_copy = "framingham.csv"
if os.path.exists(stale_copy):
    os.remove(stale_copy)

In [5]:
# Download load file from YOUR storage account URL
!wget "https://mlopssa.blob.core.windows.net/chd-dataset/framingham.csv"

--2020-01-24 00:22:15--  https://mlopssa.blob.core.windows.net/chd-dataset/framingham.csv
Resolving mlopssa.blob.core.windows.net (mlopssa.blob.core.windows.net)... 52.225.136.36
Connecting to mlopssa.blob.core.windows.net (mlopssa.blob.core.windows.net)|52.225.136.36|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 191805 (187K) [text/csv]
Saving to: ‘framingham.csv.1’


2020-01-24 00:22:15 (65.1 MB/s) - ‘framingham.csv.1’ saved [191805/191805]



In [None]:
# Load the downloaded CSV into a pandas DataFrame
df = pd.read_csv(filepath_or_buffer='framingham.csv')

In [None]:
# Boolean mask that is True for rows flagged as current smokers
smoke = df['currentSmoker'] == 1
# Mean cigarettes per day, computed over smokers only
smoker_mean_cigs = df.loc[smoke, 'cigsPerDay'].mean()
# Replace missing cigsPerDay values with that mean, for smoker rows only
df.loc[smoke, 'cigsPerDay'] = df.loc[smoke, 'cigsPerDay'].fillna(smoker_mean_cigs)

In [None]:
# Fill out missing values.
# Each column is assigned back explicitly: calling fillna(..., inplace=True) on
# df['col'] acts on an intermediate Series, which emits a chained-assignment
# warning in pandas 2.x and leaves df unchanged when Copy-on-Write is enabled.
df['BPMeds'] = df['BPMeds'].fillna(0)                           # constant fill: 0
df['glucose'] = df['glucose'].fillna(df['glucose'].mean())      # column mean
df['totChol'] = df['totChol'].fillna(df['totChol'].mean())      # column mean
df['education'] = df['education'].fillna(1)                     # constant fill: 1
df['BMI'] = df['BMI'].fillna(df['BMI'].mean())                  # column mean
df['heartRate'] = df['heartRate'].fillna(df['heartRate'].mean())  # column mean

#### Apply Random Forest Classifier

In [None]:
# Every column except the last is a predictor; the last column is what we are about to forecast
features, result = df.iloc[:, :-1], df.iloc[:, -1]

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
split_parts = train_test_split(
    features,
    result,
    test_size=0.2,
    random_state=14,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Shallow forest (depth 2, 100 trees) fitted on all features; the cells below
# use it to rank and select features
forest_params = {"n_estimators": 100, "max_depth": 2, "random_state": 0}
clf = RandomForestClassifier(**forest_params)
clf.fit(X_train, y_train)

In [None]:
# Build a selector around the random forest classifier; it keeps the features
# whose importance clears the 0.12 threshold
sfm = SelectFromModel(estimator=clf, threshold=0.12)

# Fit the selector on the training split
sfm.fit(X_train, y_train)

In [None]:
# Names of all candidate features, in column order
feat_labels = list(features.columns.values)
# Positions of the columns the selector kept; print the name of each one
selected_positions = sfm.get_support(indices=True)
for position in selected_positions:
    print(feat_labels[position])

In [None]:
# Per-feature importances reported by the fitted forest
importances = clf.feature_importances_
# Spread of each feature's importance across the individual trees
per_tree_importances = [tree.feature_importances_ for tree in clf.estimators_]
std = np.std(per_tree_importances, axis=0)
# Feature positions ordered from highest to lowest importance
indices = np.argsort(importances)[::-1]

print("Feature ranking:")
for rank, feature_idx in enumerate(indices[:X_train.shape[1]], start=1):
    print("%d. feature %d (%f)" % (rank, feature_idx, importances[feature_idx]))

In [None]:
# Keep only the important features in both splits.
# (X_important_train.shape[1] shows how many features were kept.)
X_important_train, X_important_test = (
    sfm.transform(split) for split in (X_train, X_test)
)

In [None]:
# Final model: a 10000-tree forest trained on the selected features only
final_forest_params = dict(n_estimators=10000, random_state=0, n_jobs=-1)
clf_important = RandomForestClassifier(**final_forest_params)
clf_important.fit(X_important_train, y_train)

#### The metrics are...

In [None]:
def _print_section(title):
    """Print `title` framed above and below by a separator rule."""
    rule = "============================"
    print(rule)
    print(title)
    print(rule)


# Class predictions for the held-out test split
predictions_y_4 = clf_important.predict(X_important_test)

_print_section("Classification Report")
print(classification_report(y_test, predictions_y_4))
print("")

_print_section("Confusion Matrix")
print(confusion_matrix(y_test, predictions_y_4))
print("")

# Area under the ROC curve, scored on the second predict_proba column
_print_section("ROC")
prob_y_4 = [row[1] for row in clf_important.predict_proba(X_important_test)]
print(roc_auc_score(y_test, prob_y_4))
print("")

_print_section("Accuracy score")
# Kept as the cell's final bare expression so the notebook displays the value
accuracy_score(y_test, predictions_y_4)

In [None]:
# Make sure the model output folder exists (no error if it is already there)
os.makedirs(name='./outputs/model', exist_ok=True)

In [2]:
# Model path on disk: the pickled model is written to this path and later read back from it
filename: str = './outputs/model/chd-rf-model'

In [None]:
# Save the model to disk.
# The context manager closes the file handle once pickling finishes, so the
# bytes are flushed before the model is read back (a bare open() call was
# never closed).
with open(filename, 'wb') as model_file:
    pickle.dump(clf_important, model_file)
print("model saved in ././outputs/model/ folder")
print("Saving model files completed.")

In [5]:
# Lets run a prediction
# Test... Actual: TenYearCHD=1

age = 61
prevalentHyp = 1
sysBP = 150
glucose = 103

# Load the model from disk. The context manager closes the file handle as soon
# as unpickling finishes (a bare open() call was never closed).
# NOTE: pickle.load can execute arbitrary code from the file, so only load
# model files you created yourself or otherwise trust.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
# The inputs must be in the column order the model was trained on
# (presumably age, prevalentHyp, sysBP, glucose - confirm against the selector output).
print(loaded_model.predict([[age, prevalentHyp, sysBP, glucose]]))

[1]


In [7]:
# Score a single patient with the reloaded model
# Test... Actual: TenYearCHD=0

age, prevalentHyp, sysBP, glucose = 43, 1, 180, 99

print(loaded_model.predict([[age, prevalentHyp, sysBP, glucose]]))

NameError: name 'loaded_model' is not defined

In [6]:
# Score a single patient with the reloaded model
# Test... Actual: TenYearCHD=1

age = 63
prevalentHyp = 0
sysBP = 138
glucose = 85

# One feature row, wrapped in a list because predict expects a batch of rows
patient = [age, prevalentHyp, sysBP, glucose]
print(loaded_model.predict([patient]))

[1]


In [7]:
# Score a single patient with the reloaded model
# Test... Actual: TenYearCHD=0
age, prevalentHyp, sysBP, glucose = (52, 1, 141, 75)

prediction = loaded_model.predict([[age, prevalentHyp, sysBP, glucose]])
print(prediction)

[0]


In [94]:
# Score several patients in a single call (one feature row per patient)
patients = [
    [61, 1, 150, 103],
    [43, 1, 180, 99],
    [63, 0, 138, 85],
]
results = loaded_model.predict(patients)
results

array([1, 0, 1])

In [8]:
# Score the same rows using the payload format expected by the REST service
# that the model will be operationalized to
import json

# Request body: the feature rows sit under the "data" key
to_be_scored_json = {
    "data": [
        [61, 1, 150, 103],
        [43, 1, 180, 99],
        [63, 0, 138, 85],
    ]
}
input_data_json = json.dumps(to_be_scored_json)

# Parse the JSON string back into an array of rows, then score it
to_be_scored_list = np.array(json.loads(input_data_json)["data"])
predicted = loaded_model.predict(to_be_scored_list)
print(predicted.tolist())

[1, 0, 1]


## Next
Return to the lab guide for the next step.