## Setup

In [2]:
import pandas as pd
import os
import pickle
from pathlib import Path
from typing import Tuple
import numpy as np
import yaml
import unittest

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, f1_score

In [3]:
__file__ = "ml_pred_quality_unit_test.ipynb"
PROJECT_DIR = Path(__file__).resolve().parents[1]
PROCESSED_DIR = "data\\processed"
PARAMS = yaml.safe_load(open(f'{os.path.join(PROJECT_DIR)}\\params.yaml'))["train"]
X_TRAIN = f"{os.path.join(PROJECT_DIR, PROCESSED_DIR)}\\x_train.txt"
Y_TRAIN = f"{os.path.join(PROJECT_DIR, PROCESSED_DIR)}\\y_train.txt"
X_TEST = f"{os.path.join(PROJECT_DIR, PROCESSED_DIR)}\\x_test.txt"
Y_TEST = f"{os.path.join(PROJECT_DIR, PROCESSED_DIR)}\\y_test.txt"
CLF_OUT = "models/rf_clf.pkl"

In [16]:
dt_clf = DecisionTreeClassifier()
dt_clf.fit(np.loadtxt(X_TRAIN, delimiter=','), np.loadtxt(Y_TRAIN, delimiter=','))

In [57]:
dt_clf.score(np.loadtxt(X_TEST, delimiter=','), np.loadtxt(Y_TEST, delimiter=','))

0.7077922077922078

In [26]:
y_test = np.loadtxt(Y_TEST, delimiter=',')
y_predict = dt_clf.predict(np.loadtxt(X_TEST, delimiter=','))
accuracy = accuracy_score(y_test, y_predict)
roc = roc_auc_score(y_test, y_predict)
cr = classification_report(y_predict, y_test, output_dict=True)
cm = confusion_matrix(y_predict, y_test)
f1 = f1_score(y_test, y_predict)
accuracy

0.7077922077922078

In [22]:
roc

0.6768707482993197

In [23]:
cm

array([[80, 20],
       [25, 29]], dtype=int64)

In [27]:
f1

0.5631067961165048

In [24]:
cr

{'0.0': {'precision': 0.7619047619047619,
  'recall': 0.8,
  'f1-score': 0.7804878048780488,
  'support': 100},
 '1.0': {'precision': 0.5918367346938775,
  'recall': 0.5370370370370371,
  'f1-score': 0.5631067961165048,
  'support': 54},
 'accuracy': 0.7077922077922078,
 'macro avg': {'precision': 0.6768707482993197,
  'recall': 0.6685185185185185,
  'f1-score': 0.6717973004972768,
  'support': 154},
 'weighted avg': {'precision': 0.7022705185970491,
  'recall': 0.7077922077922078,
  'f1-score': 0.7042632953123126,
  'support': 154}}

In [4]:
class SimplePipeline:
    def __init__(self):
        self.X_train, self.X_test, self.y_train, self.Y_test = None, None, None, None
        self.model = None
        self.load_dataset()

    def load_dataset(self):
        """
        load the dataset
        """
        self.X_train, self.X_test, = np.loadtxt(X_TRAIN, delimiter=','), np.loadtxt(X_TEST, delimiter=',')
        self.y_train, self.y_test = np.loadtxt(Y_TRAIN, delimiter=','), np.loadtxt(Y_TEST, delimiter=',')

    def train(self, algorithm=DecisionTreeClassifier(random_state=42)):
        self.model = algorithm
        self.model.fit(self.X_train, self.y_train)
    
    def predict(self, input_data):
        return self.model.predict(input_data)
    
    def get_accuracy(self):
         return self.model.score(X=self.X_test, y=self.y_test)

    def run_pipeline(self):
        """Helper method to run multiple pipeline methods with one call."""
        self.load_dataset()
        self.train()

In [5]:
class NewModelPipeline():
    def __init__(self):
        self.X_test, self.Y_test = None, None
        self.model = None
        self.load_model()
        self.load_dataset()

    def load_model(self):
        with (open(os.path.join(PROJECT_DIR, CLF_OUT), "rb")) as f:
            self.model = pickle.load(f)
    
    def load_dataset(self):
        """
        load the dataset
        """
        self.X_test = np.loadtxt(X_TEST, delimiter=',')
        self.y_test = np.loadtxt(Y_TEST, delimiter=',')

    def predict(self, input_data):
        return self.model.predict(input_data)

    def get_accuracy(self):
         return self.model.score(X=self.X_test, y=self.y_test)

    def run_pipeline(self):
        """Helper method to run multiple pipeline methods with one call."""
        self.load_model
        self.load_dataset()

In [27]:
class TestModelPredictions(unittest.TestCase):
    def setUp(self):
        # We prepare both pipelines for use in the tests
        self.pipeline_v1 = SimplePipeline()
        self.pipeline_v1.run_pipeline()
        self.pipeline_v2 = NewModelPipeline()
        self.pipeline_v2.run_pipeline()
    
    def test_accuracy_higher_than_benchmark(self):

        # given
        benchmark_accuracy = .65

        # predictions = self.pipeline_v1.predict(self.pipeline_v1.X_test)

        # When
        # actual_accuracy = accuracy_score(
        #     y_true=self.pipeline_v1.y_test,
        #     y_pred=predictions)

        actual_accuracy = self.pipeline_v1.get_accuracy()
            
        # Then
        print(f'model accuracy: {round(actual_accuracy, 4)}, benchmark accuracy: {benchmark_accuracy}')
        self.assertTrue(round(actual_accuracy, 4) > benchmark_accuracy)
    

    def test_f1_score_higher_than_benchmark(self):
        """
        >0.9 very good
        0.8 - 0.9 good
        0.5 - 0.8 ok
        < 0.5 not good
        """
        predictions = self.pipeline_v2.predict(self.pipeline_v1.X_test)

        # given
        benchmark_f1_score = 0.5

        actual_f1_score = f1_score(self.pipeline_v2.y_test, predictions)
        # Then
        print(f'model f1: {round(actual_f1_score, 4)}, benchmark f1: {benchmark_f1_score}')
        self.assertTrue(round(actual_f1_score, 4) > benchmark_f1_score)
    

    def test_tpr_higher_than_benchmark(self):

        predictions = self.pipeline_v2.predict(self.pipeline_v2.X_test)

        # given
        benchmark_tpr_score = 0.3
        cm = confusion_matrix(predictions, self.pipeline_v2.y_test)
        tn,fp,fn,tp = cm.ravel()

        # sensitivity, hit rate, recall, or true positive rate
        actual_tpr = tp/(tp+fn)

        # then
        print(f"model tpr: {round(actual_tpr, 4)}, benchmark tpr: {benchmark_tpr_score}")
        self.assertTrue(round(actual_tpr, 4) > benchmark_tpr_score)

    
    def test_tnr_higher_than_benchmark(self):

        predictions = self.pipeline_v2.predict(self.pipeline_v2.X_test)

        # given
        benchmark_tnr_score = 0.3
        cm = confusion_matrix(predictions, self.pipeline_v2.y_test)
        tn,fp,fn,tp = cm.ravel()

        # Specificity or true negative rate
        actual_tnr = tn/(tn+fp)

        # then
        print(f"model tnr: {round(actual_tnr, 4)}, benchmark tnr: {benchmark_tnr_score}")
        self.assertTrue(round(actual_tnr, 4) > benchmark_tnr_score)

    def test_fnr_lower_than_benchmark(self):
        
        predictions = self.pipeline_v2.predict(self.pipeline_v2.X_test)

        # given
        benchmark_fnr_score = 0.20
        cm = confusion_matrix(predictions, self.pipeline_v2.y_test)
        tn,fp,fn,tp = cm.ravel()

        # False negative rate
        actual_fnr = fn/(tp+fn)

        # then
        print(f"model fnr:{round(actual_fnr,4)}, benchmark fnr:{benchmark_fnr_score}")
        self.assertTrue(round(actual_fnr, 4) < benchmark_fnr_score)



    def test_accuracy_compared_to_basemodel(self):
        # when
        v1_accuracy = self.pipeline_v1.get_accuracy()
        v2_accuracy = self.pipeline_v2.get_accuracy()

        # Then
        # print(f'pipeline v1 accuracy: {v1_accuracy}')
        print(f'pipeline v2 accuracy: {round(v2_accuracy,4)} >= {round(v1_accuracy, 4)} pipeline v1 accuracy')
        self.assertTrue(v2_accuracy >= v1_accuracy)

In [28]:
import sys

suite = unittest.TestLoader().loadTestsFromTestCase(TestModelPredictions)
unittest.TextTestRunner(verbosity=1, stream=sys.stderr).run(suite)

....

pipeline v2 accuracy: 0.7532 >= 0.7078 pipeline v1 accuracy
model accuracy: 0.7078, benchmark accuracy: 0.65
model f1: 0.6042, benchmark f1: 0.5
model fnr:0.383, benchmark fnr:0.8


..
----------------------------------------------------------------------
Ran 6 tests in 0.445s

OK


model tnr: 0.8131, benchmark tnr: 0.3
model tpr: 0.617, benchmark tpr: 0.3


<unittest.runner.TextTestResult run=6 errors=0 failures=0>