## 4. Model Evaluation
Confusion matrix, hyperparameter optimization (random forests), cross-validation, train_test_split (should have been done before?), metrics: accuracy, precision, recall

In [39]:
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict, TimeSeriesSplit
from sklearn.metrics import average_precision_score, accuracy_score, precision_recall_curve

# Folder that holds the prepared data set (data.npz).
# The original hard-coded location is kept as the default, so existing runs behave
# exactly as before; set the USECASE_0056_DATA_FOLDER environment variable to run
# the notebook on another machine without editing this cell.
data_folder = os.environ.get(
    'USECASE_0056_DATA_FOLDER',
    'C:/Users/Branimir/AnacondaProjects/usecase_0056/Output/DE0001102390',
)

# integer used as the random seed wherever reproducibility matters
THE_ANSWER = 42

In [12]:
# load the archive with the pre-split arrays; `data` is kept around (not closed)
# because a later cell reads further arrays (X_valid, X_devel, ...) from it
data = np.load('%s/data.npz' % data_folder)

# extract training data
X_train, y_train = data['X_train'], data['y_train']

# extract the test data here as well: the evaluation cells below use X_test and y_test,
# but they were only assigned in the very last cell of the notebook, so running the
# notebook top to bottom on a fresh kernel stopped with a NameError
X_test, y_test = data['X_test'], data['y_test']

In [35]:
# inspect the raw class labels stored in y_train
print(y_train)

# share of class-1 rows in percent (34% in the recorded output, i.e. the data set is imbalanced)
class_1_percent = 100*y_train.sum()/y_train.size
print('The class == 1 rows make %0.f%% of the data set.' % class_1_percent)

[0. 1. 1. ... 0. 0. 0.]
The class == 1 rows make 34% of the data set.


In [17]:
# create a Random Forest classifier, leaving every hyper-parameter at its default
model = RandomForestClassifier()

# a bare expression at the end of the cell makes the notebook display the model
# together with its default hyper-parameter values (hyper-parameters are covered later)
model

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [40]:
# use 500 trees and fix the random seed to THE_ANSWER so that results are reproducible
# (alternatively, re-create the model with these values passed to the constructor:
#  RandomForestClassifier(n_estimators=500, random_state=THE_ANSWER))
model.set_params(
    random_state=THE_ANSWER,
    n_estimators=500,
)

# display the model to confirm that the settings changed
model

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [36]:
# The training features contain NaNs (missing values), which the Random Forest
# classifier does not like. Work on a copy so that X_train itself stays untouched,
# and overwrite every NaN in the copy with the placeholder value 10000.
X_train_fixed = X_train.copy()
np.putmask(X_train_fixed, np.isnan(X_train_fixed), 10000)

In [41]:
# train the random forest on the NaN-free training features and their class labels
model.fit(X=X_train_fixed, y=y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [52]:
# now that we have fit the model, we need to evaluate its performance
# that can be done using various metrics, but the simplest one is accuracy.
# Accuracy is defined as the fraction of correct predictions for the test data:
# accuracy = number of correct predictions / number of all predictions

In [48]:
## accuracy on the test set, step 1: predict the class of every test row
# same NaN handling as for the training data: copy the features, then replace NaNs by 10000
X_test_fixed = X_test.copy()
X_test_fixed[np.isnan(X_test_fixed)] = 10000

# run the test features through the fitted model; `predict` returns the predicted
# class label of each row (stored in y_scores, which the accuracy cell reads)
y_scores = model.predict(X=X_test_fixed)

# show the predicted labels next to the true ones
print('Predicted class:', y_scores)
print('True class:     ', y_test)

Predicted class: [0. 0. 0. ... 1. 0. 0.]
True class:      [1. 0. 0. ... 1. 0. 1.]


In [45]:
# accuracy = share of test rows whose predicted label equals the true label
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_scores)
print('The accuracy is %.3f.' % test_accuracy)

The accuracy is 0.892.


In [None]:
# Great result, right? Maybe not, since we have an imbalanced data set.
# Here's an extreme example: Imagine a data set where 90% of rows are of class 0.
# A naive classifier that predicts 0 for all rows will have an accuracy of 90%!

In [None]:
# In my opinion, what most business people would like to know is the following:
# 1) What is the detection rate? Did we correctly identify 50%, 80%, or more of all class 1 objects?
# 2) What is the false alarm rate? What percentage of those tagged as class 1 are *not* class 1?
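# The detection rate is called "recall" in machine learning: recall = TP / (TP + FN).
# The false alarm rate as defined in 2) is the complement of "precision":
# precision = TP / (TP + FP), so false alarm rate = 1 - precision
# (statistics calls this quantity the "false discovery rate").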

In [None]:
# use graphics on this page to explain the confusion matrix, precision, and recall
# https://www.jeremyjordan.me/evaluating-a-machine-learning-model/

In [50]:
# ask the model for classification *scores* instead of hard labels:
# `predict_proba` (unlike `predict`) returns one score per class for every test row
y_scores = model.predict_proba(X=X_test_fixed)

# one column per class (0 and 1); the two scores of a row add up to 1
print(y_scores)
print('The y_scores is a 2-dimensional array:', y_scores.shape)

[[0.88866667 0.11133333]
 [0.828      0.172     ]
 [0.85693333 0.14306667]
 ...
 [0.378      0.622     ]
 [0.584      0.416     ]
 [0.804      0.196     ]]
The y_scores is a 2-dimensional array. (22034, 2)


In [51]:
# summarise the precision-recall curve of the "positive" class in a single number:
# compare the true labels (y_test) with the scores of class 1, i.e. column 1 of y_scores
positive_class_scores = y_scores[:, 1]
pr_auc = average_precision_score(y_test, positive_class_scores)
print('The area under the Precision-Recall curve is %.3f.' % pr_auc)

The area under the Precision-Recall curve is 0.923.


In [30]:
# different metrics for evaluating classification models can be found here:
# https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics

In [9]:
# pull the remaining splits (valid, devel, test) out of the loaded archive:
# first the feature arrays, then the matching label arrays
X_valid, X_devel, X_test = data['X_valid'], data['X_devel'], data['X_test']
y_valid, y_devel, y_test = data['y_valid'], data['y_devel'], data['y_test']

(65579, 33)