# 1.  Data scientist experimentation notebook

Think of this notebook as a data scientist's scratch notebook where they experiment manually with data preparation, algorithm selection, hyperparameter tuning and so on.  This notebook does not have any Azure ML Service integration.  The next notebook demonstrates how a data scientist can take this to the next level (integrating with Azure ML) in preparation for the MLOps build and release pipeline.<br>

The experiment is a classification task built with the scikit-learn library.  The use case is Coronary Heart Disease prediction using the well-known Framingham dataset from Kaggle - framingham.csv<br>

In [9]:
# Load necessary packages
import json
import os
import pickle

import numpy as np
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [10]:
# Delete any previously downloaded copy of the dataset so the next download starts clean
stale_copy = "framingham.csv"
if os.path.exists(stale_copy):
    os.remove(stale_copy)

In [33]:
# Download load file from YOUR storage account URL
# Replace the URL here with your storage account, dataset URL from module 3, section 3.
!wget "https://ncrmlopssa.blob.core.windows.net/chd-dataset/framingham.csv"

--2020-01-27 19:44:01--  https://ncrmlopssa.blob.core.windows.net/chd-dataset/framingham.csv
Resolving ncrmlopssa.blob.core.windows.net (ncrmlopssa.blob.core.windows.net)... 52.239.223.19
Connecting to ncrmlopssa.blob.core.windows.net (ncrmlopssa.blob.core.windows.net)|52.239.223.19|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 191805 (187K) [text/csv]
Saving to: ‘framingham.csv.3’


2020-01-27 19:44:02 (62.7 MB/s) - ‘framingham.csv.3’ saved [191805/191805]



In [34]:
# Load the downloaded CSV file into a pandas DataFrame
df = pd.read_csv(filepath_or_buffer="framingham.csv")

In [35]:
# Boolean mask that is True for rows flagged as current smokers
smoke = df['currentSmoker'].eq(1)
# Impute missing cigsPerDay for smokers only, using the mean computed over smokers only
smoker_cigs = df.loc[smoke, 'cigsPerDay']
df.loc[smoke, 'cigsPerDay'] = smoker_cigs.fillna(smoker_cigs.mean())

In [14]:
# Fill out missing values.
# Constants are used for BPMeds (0) and education (1); the remaining columns are
# filled with their own column mean.
# The fill is done with one frame-level fillna rather than
# df[col].fillna(..., inplace=True): the latter is chained assignment on a
# temporary Series, which raises a FutureWarning in pandas 2.x and leaves df
# unchanged once Copy-on-Write is enabled.
fill_values = {
    'BPMeds': 0,
    'glucose': df['glucose'].mean(),
    'totChol': df['totChol'].mean(),
    'education': 1,
    'BMI': df['BMI'].mean(),
    'heartRate': df['heartRate'].mean(),
}
df = df.fillna(value=fill_values)

#### Apply Random Forest Classifier

In [15]:
# Column-wise split of the frame: every column except the last one is a predictor,
# and the last column is the value we are about to forecast
features, result = df.iloc[:, :-1], df.iloc[:, -1]

In [16]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    features,
    result,
    test_size=0.2,
    random_state=14,
)

In [17]:
# Forest of 100 trees, each limited to depth 2, seeded for repeatability
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=2,
    random_state=0,
)
# Kept as the last expression so the notebook displays the fitted estimator
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [18]:
# Wrap the random forest classifier in a selector that keeps only the
# features whose importance reaches the 0.12 threshold
sfm = SelectFromModel(estimator=clf, threshold=0.12)

# Fit the selector on the training split
# (kept as the last expression so the notebook displays the fitted selector)
sfm.fit(X_train, y_train)

SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
        max_features=None, norm_order=1, prefit=False, threshold=0.12)

In [19]:
# Names of all feature columns, in column order
feat_labels = features.columns.tolist()
# Print the name of every column the selector decided to keep
for selected_idx in sfm.get_support(indices=True):
    print(feat_labels[selected_idx])

age
prevalentHyp
sysBP
glucose


In [20]:
# Importance score of each feature as reported by the fitted forest
importances = clf.feature_importances_
# Spread of each feature's importance across the individual trees
per_tree_importances = np.array([tree.feature_importances_ for tree in clf.estimators_])
std = per_tree_importances.std(axis=0)
# Feature positions ordered from most to least important
indices = np.argsort(importances)[::-1]

print("Feature ranking:")
for rank, feature_idx in enumerate(indices[:X_train.shape[1]], start=1):
    print("%d. feature %d (%f)" % (rank, feature_idx, importances[feature_idx]))

Feature ranking:
1. feature 1 (0.242214)
2. feature 10 (0.200736)
3. feature 14 (0.152858)
4. feature 7 (0.139117)
5. feature 11 (0.105004)
6. feature 0 (0.034898)
7. feature 12 (0.034277)
8. feature 4 (0.022181)
9. feature 5 (0.016595)
10. feature 8 (0.015724)
11. feature 9 (0.014800)
12. feature 13 (0.009745)
13. feature 2 (0.007321)
14. feature 6 (0.004530)
15. feature 3 (0.000000)


In [21]:
# Reduce both splits to the important features only.
# X_important_train.shape[1] gives the number of columns that were kept.
X_important_train, X_important_test = (
    sfm.transform(split) for split in (X_train, X_test)
)

In [22]:
# Second forest, trained only on the selected features:
# 10000 trees, seeded, with n_jobs=-1 so fitting uses all available cores
clf_important = RandomForestClassifier(
    n_estimators=10000,
    n_jobs=-1,
    random_state=0,
)
# Kept as the last expression so the notebook displays the fitted estimator
clf_important.fit(X_important_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10000, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

#### The metrics are...

In [23]:
# Hard class predictions for the held-out rows
predictions_y_4 = clf_important.predict(X_important_test)

# Per-class precision / recall / F1
print("============================")
print("Classification Report")
print("============================")
print(classification_report(y_test, predictions_y_4))
print("")

# Counts of true labels versus predicted labels
print("============================")
print("Confusion Matrix")
print("============================")
print(confusion_matrix(y_test, predictions_y_4))
print("")

# Area under the ROC curve, scored from the second predict_proba column
# (the probability of class 1, assuming the labels are 0/1).
# The column is taken with NumPy slicing instead of a Python-level loop over rows.
print("============================")
print("ROC")
print("============================")
prob_y_4 = clf_important.predict_proba(X_important_test)[:, 1]
print(roc_auc_score(y_test, prob_y_4))
print("")

# Overall accuracy; left as the last expression so the notebook renders it as the cell output
print("============================")
print("Accuracy score")
print("============================")
accuracy_score(y_test, predictions_y_4)

Classification Report
              precision    recall  f1-score   support

           0       0.86      0.95      0.91       724
           1       0.27      0.10      0.14       124

   micro avg       0.83      0.83      0.83       848
   macro avg       0.56      0.53      0.52       848
weighted avg       0.77      0.83      0.79       848


Confusion Matrix
[[691  33]
 [112  12]]

ROC
0.6730306540723578

Accuracy score


0.8290094339622641

In [24]:
# Create the model output folder (and any missing parents); no error if it already exists
os.makedirs(name='./outputs/model', exist_ok=True)

In [25]:
# Location on disk where the pickled model is written and later read back
filename = "./outputs/model/chd-rf-model"

In [26]:
# Save the model to disk.
# The context manager guarantees the file handle is flushed and closed
# even if pickling raises, instead of leaking an open handle.
with open(filename, 'wb') as model_file:
    pickle.dump(clf_important, model_file)
print("model saved in ././outputs/model/ folder")
print("Saving model files completed.")

model saved in ././outputs/model/ folder
Saving model files completed.


In [27]:
# Lets run a prediction
# Test... Actual: TenYearCHD=1

age = 61
prevalentHyp = 1
sysBP = 150
glucose = 103

# Load the model from disk; the context manager closes the file handle
# as soon as unpickling is done.
# NOTE: pickle.load executes arbitrary code from the file - only load model
# files that this notebook (or another trusted source) produced.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
print(loaded_model.predict([[age, prevalentHyp, sysBP, glucose]]))

[1]


In [28]:
# Score a single record with the reloaded model.
# Actual label for this record: TenYearCHD=0
age, prevalentHyp, sysBP, glucose = 43, 1, 180, 99

patient_row = [age, prevalentHyp, sysBP, glucose]
print(loaded_model.predict([patient_row]))

[0]


In [29]:
# Score a single record with the reloaded model.
# Actual label for this record: TenYearCHD=1
age, prevalentHyp, sysBP, glucose = 63, 0, 138, 85

patient_row = [age, prevalentHyp, sysBP, glucose]
print(loaded_model.predict([patient_row]))

[1]


In [30]:
# Score a single record with the reloaded model.
# Actual label for this record: TenYearCHD=0
age, prevalentHyp, sysBP, glucose = 52, 1, 141, 75

patient_row = [age, prevalentHyp, sysBP, glucose]
print(loaded_model.predict([patient_row]))

[0]


In [31]:
# Score several records in one call; each row is [age, prevalentHyp, sysBP, glucose]
batch_rows = [
    [61, 1, 150, 103],
    [43, 1, 180, 99],
    [63, 0, 138, 85],
]
results = loaded_model.predict(batch_rows)
# Last expression, so the notebook displays the predicted labels
results

array([1, 0, 1])

In [32]:
# Try the payload format expected by the REST service the model will be operationalized to.
# (json is imported in the notebook's top import cell rather than mid-notebook,
# so a fresh-kernel run shows every dependency up front.)

# Request body: a "data" key holding one row per record to score
to_be_scored_json = {"data":[[61, 1, 150, 103],[43, 1, 180, 99],[63,0,138,85]]}
input_data_json = json.dumps(to_be_scored_json)

# Decode the JSON string back into rows and hand them to the model as an array;
# .tolist() turns the NumPy result into a plain list
to_be_scored_list = np.array(json.loads(input_data_json)["data"])
print(loaded_model.predict(to_be_scored_list).tolist())

[1, 0, 1]


## Next
Return to the lab guide for the next step.