In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import rankdata
from cryptoaml.utils import read_pickle
from scipy.stats import friedmanchisquare
from Orange.evaluation.scoring import compute_CD
from Orange.evaluation.scoring import graph_ranks

In [2]:
def friedman_test(df_perf, alpha=0.05):
    """Run a Friedman test on the per-classifier scores stored in ``df_perf``.

    Only the classifiers that have the maximum number of rows (i.e. that were
    evaluated on the largest number of datasets) take part in the test.

    Parameters
    ----------
    df_perf : pandas.DataFrame
        Long-format results with at least a ``model`` and a ``score`` column.
        When a ``dataset_name`` column is present it is used to line up the
        scores of every classifier dataset by dataset.
    alpha : float, optional
        Significance level the p-value is compared against (default 0.05).

    Returns
    -------
    bool
        ``True`` when the null hypothesis is rejected (``p <= alpha``),
        ``False`` otherwise. The statistic, the p-value and the verdict are
        also printed.

    Notes
    -----
    Any exception raised by ``scipy.stats.friedmanchisquare`` (for instance
    when too few classifiers are compared) is propagated unchanged.
    """
    # The Friedman test is a paired test: the i-th score of every classifier
    # must belong to the same dataset. A stable sort by dataset guarantees
    # that alignment no matter in which order the rows were written.
    if 'dataset_name' in df_perf.columns:
        df_perf = df_perf.sort_values('dataset_name', kind='mergesort')

    # number of tested datasets per classifier
    counts = df_perf.groupby('model').size()

    # keep the classifiers that were tested on the maximum number of datasets
    max_nb_datasets = counts.max()
    classifiers = list(counts.index[counts == max_nb_datasets])

    # test the null hypothesis using friedman before doing a post-hoc analysis
    samples = [
        df_perf.loc[df_perf['model'] == c, 'score'].to_numpy()
        for c in classifiers
    ]
    stat, p = friedmanchisquare(*samples)

    print('Statistics=%.3f, p=%.3f' % (stat, p))
    if p > alpha:
        print('Same distributions (fail to reject H0)')
        return False
    else:
        print('Different distributions (reject H0)')
        return True
        
def stat_test(results, filename):
    """Friedman test followed, when significant, by a Nemenyi critical-difference plot.

    Parameters
    ----------
    results : pandas.DataFrame
        Long-format results with ``model``, ``score`` and ``dataset_name``
        columns. When the Friedman test rejects H0 a ``ranks`` column is
        added to this frame in place.
    filename : str
        Passed to ``graph_ranks`` as the ``filename`` of the diagram.

    Returns
    -------
    None
        Nothing is computed beyond the Friedman test when H0 is not rejected.
    """
    reject_h0 = friedman_test(results)
    if not reject_h0:
        return

    # Rank the models inside every dataset (rank 1 = highest score). Only the
    # score column is ranked: ranking the whole group would also try to rank
    # the string ``model`` column and return a frame instead of a series.
    results["ranks"] = (
        results.groupby("dataset_name")["score"].rank(ascending=False)
    )

    results_avg_ranks = results.groupby("model")["ranks"].mean().to_dict()
    classifiers = list(results_avg_ranks.keys())
    avg_ranks = list(results_avg_ranks.values())
    n_datasets = results["dataset_name"].nunique()

    # NOTE(review): alpha is deliberately passed as a string, as in the
    # original code -- confirm against the installed Orange version.
    critical_diff = compute_CD(
        avg_ranks,
        n=n_datasets,
        alpha="0.05",
        test="nemenyi"
    )
    print("Critical difference used for statistically significance: {}".format(round(critical_diff,3)))
    graph_ranks(
        avg_ranks,
        classifiers,
        cd=critical_diff,
        width=10,
        textspace=1.5,
        filename=filename
    )
    plt.show()

In [3]:
# Accuracy of every model; each tuple holds the scores in the order given by
# `_accuracy_datasets`.
_accuracy_datasets = ("ell_AF", "ell_AF_NE", "noaa")
_accuracy_by_model = [
    ("XGB",            (0.978, 0.979, 0.785)),
    ("LGBM",           (0.979, 0.979, 0.795)),
    ("RF",             (0.977, 0.979, 0.784)),
    ("NCL_XGB",        (0.978, 0.979, 0.746)),
    ("SMOTE_XGB",      (0.980, 0.980, 0.781)),
    ("SMOTE_NCL_XGB",  (0.979, 0.980, 0.731)),
    ("NCL_LGBM",       (0.978, 0.979, 0.752)),
    ("SMOTE_LGBM",     (0.980, 0.980, 0.778)),
    ("NCL_SMOTE_LGBM", (0.980, 0.980, 0.736)),
    ("NCL_RF",         (0.979, 0.981, 0.746)),
    ("SMOTE_RF",       (0.978, 0.980, 0.771)),
    ("NCL_SMOTE_RF",   (0.976, 0.979, 0.733)),
]

# Expand the table into one record per (model, dataset) pair.
exp1_accuracy = [
    {"model": model_name, "score": model_score, "dataset_name": dataset}
    for model_name, model_scores in _accuracy_by_model
    for dataset, model_score in zip(_accuracy_datasets, model_scores)
]


# Friedman test on the accuracy scores; stat_test is given the name of the
# image file to use for the critical-difference diagram.
print("Accuracy Differences")
accuracy_df = pd.DataFrame(exp1_accuracy)
stat_test(results=accuracy_df, filename="exp1_nemenyi_accuracy.png")

Accuracy Differences
Statistics=13.568, p=0.258
Same distributions (fail to reject H0)


In [4]:
# Precision of every model; each tuple holds the scores in the order given by
# `_precision_datasets`.
_precision_datasets = ("ell_AF", "ell_AF_NE", "noaa")
_precision_by_model = [
    ("RF",             (0.897, 0.958, 0.748)),
    ("XGB",            (0.921, 0.986, 0.749)),
    ("NCL_XGB",        (0.913, 0.985, 0.606)),
    ("SMOTE_XGB",      (0.939, 0.975, 0.711)),
    ("SMOTE_NCL_XGB",  (0.924, 0.965, 0.585)),
    ("LGBM",           (0.932, 0.985, 0.750)),
    ("NCL_LGBM",       (0.918, 0.980, 0.613)),
    ("SMOTE_LGBM",     (0.939, 0.980, 0.708)),
    ("NCL_SMOTE_LGBM", (0.942, 0.972, 0.593)),
    ("NCL_RF",         (0.945, 0.977, 0.609)),
    ("SMOTE_RF",       (0.929, 0.965, 0.677)),
    ("NCL_SMOTE_RF",   (0.878, 0.945, 0.591)),
]

# Expand the table into one record per (model, dataset) pair.
exp1_precision = [
    {"model": model_name, "score": model_score, "dataset_name": dataset}
    for model_name, model_scores in _precision_by_model
    for dataset, model_score in zip(_precision_datasets, model_scores)
]

# Friedman test on the precision scores; stat_test is given the name of the
# image file to use for the critical-difference diagram.
print("Precision Differences")
precision_df = pd.DataFrame(exp1_precision)
stat_test(results=precision_df, filename="exp1_nemenyi_precision.png")

Precision Differences
Statistics=16.526, p=0.123
Same distributions (fail to reject H0)


In [9]:
# Recall of every model; each tuple holds the scores in the order given by
# `_recall_datasets`.
_recall_datasets = ("ell_AF", "ell_AF_NE", "noaa")
_recall_by_model = [
    ("XGB",            (0.732, 0.692, 0.596)),
    ("NCL_XGB",        (0.734, 0.687, 0.829)),
    ("SMOTE_XGB",      (0.735, 0.716, 0.652)),
    ("SMOTE_NCL_XGB",  (0.734, 0.721, 0.843)),
    ("LGBM",           (0.732, 0.695, 0.639)),
    ("NCL_LGBM",       (0.732, 0.688, 0.824)),
    ("SMOTE_LGBM",     (0.733, 0.705, 0.646)),
    ("NCL_SMOTE_LGBM", (0.732, 0.708, 0.834)),
    ("RF",             (0.721, 0.715, 0.595)),
    ("NCL_RF",         (0.726, 0.723, 0.803)),
    ("SMOTE_RF",       (0.721, 0.722, 0.686)),
    ("NCL_SMOTE_RF",   (0.725, 0.726, 0.820)),
]

# Expand the table into one record per (model, dataset) pair.
exp1_recall = [
    {"model": model_name, "score": model_score, "dataset_name": dataset}
    for model_name, model_scores in _recall_by_model
    for dataset, model_score in zip(_recall_datasets, model_scores)
]


# Friedman test on the recall scores; stat_test is given the name of the
# image file to use for the critical-difference diagram.
print("Recall Differences")
recall_df = pd.DataFrame(exp1_recall)
stat_test(results=recall_df, filename="exp1_nemenyi_recall.png")

Recall Differences
Statistics=11.000, p=0.443
Same distributions (fail to reject H0)


In [6]:
# F1 score of every model; each tuple holds the scores in the order given by
# `_f1_datasets`.
_f1_datasets = ("ell_AF", "ell_AF_NE", "noaa")
_f1_by_model = [
    ("XGB",            (0.815, 0.813, 0.664)),
    ("NCL_XGB",        (0.814, 0.809, 0.700)),
    ("SMOTE_XGB",      (0.824, 0.826, 0.680)),
    ("SMOTE_NCL_XGB",  (0.818, 0.825, 0.691)),
    ("LGBM",           (0.820, 0.815, 0.690)),
    ("NCL_LGBM",       (0.814, 0.809, 0.703)),
    ("SMOTE_LGBM",     (0.823, 0.820, 0.675)),
    ("NCL_SMOTE_LGBM", (0.824, 0.819, 0.693)),
    ("RF",             (0.800, 0.819, 0.662)),
    ("NCL_RF",         (0.821, 0.831, 0.693)),
    ("SMOTE_RF",       (0.812, 0.826, 0.682)),
    ("NCL_SMOTE_RF",   (0.794, 0.821, 0.687)),
]

# Expand the table into one record per (model, dataset) pair.
exp1_f1 = [
    {"model": model_name, "score": model_score, "dataset_name": dataset}
    for model_name, model_scores in _f1_by_model
    for dataset, model_score in zip(_f1_datasets, model_scores)
]


# Friedman test on the F1 scores. The printed label now names the metric that
# is actually tested (it previously said "Recall").
print("F1 Differences")
f1_df = pd.DataFrame(data=exp1_f1)
stat_test(f1_df, "exp1_nemenyi_f1.png")

Recall Differences
Statistics=11.775, p=0.381
Same distributions (fail to reject H0)


In [10]:
# https://machinelearningmastery.com/use-statistical-significance-tests-interpret-machine-learning-results/

# T-Test

In [58]:
from scipy.stats import normaltest
from matplotlib import pyplot

def check_dist(values):
    """Print the result of scipy's ``normaltest`` for ``values``.

    The first printed line holds the test statistic and the p-value; the
    second one reads 'likely normal' when the p-value is at least 0.05 and
    'unlikely normal' otherwise. Nothing is returned.
    """
    statistic, p_value = normaltest(values)
    print(statistic, p_value)
    verdict = 'likely normal' if p_value >= 0.05 else 'unlikely normal'
    print(verdict)

In [59]:
# Load the persisted experiment-1 results that the cells below index by
# model name, feature set and "metrics_iterations".
# NOTE(review): presumably read_pickle unpickles these files -- only load
# pickles that come from a trusted source.
exp_1_tuned_results = read_pickle("persistence/experiment_1/results/tuned_models_iter_results.pkl")
exp_1_rf_results = read_pickle("persistence/experiment_1/results/benchmark_model_results.pkl")

In [64]:
# Run check_dist on the per-iteration values of every random-forest metric,
# for both feature sets. One loop replaces eight copy-pasted print/call pairs;
# the printed output is unchanged (a separator line between consecutive
# reports, none after the last one).
rf_results_by_feature_set = exp_1_rf_results["random_forest"]
rf_normality_checks = [
    (metric_label, metric_key, feature_set)
    for metric_label, metric_key in (
        ("Accuracy", "accuracy"),
        ("Precision", "precision"),
        ("Recall", "recall"),
        ("F1", "f1"),
    )
    for feature_set in ("AF", "AF_NE")
]

for position, (metric_label, metric_key, feature_set) in enumerate(rf_normality_checks):
    if position > 0:
        print("------------------------------------------")
    print("RF ({}) - {}".format(feature_set, metric_label))
    check_dist(rf_results_by_feature_set[feature_set]["metrics_iterations"][metric_key])

RF (AF) - Accuracy
30.93953023857888 1.9123455622033972e-07
unlikely normal
------------------------------------------
RF (AF_NE) - Accuracy
5.95350895974926 0.05095795082971087
likely normal
------------------------------------------
RF (AF) - Precision
31.077043690748482 1.785277393330735e-07
unlikely normal
------------------------------------------
RF (AF_NE) - Precision
2.512281286411559 0.2847508637759317
likely normal
------------------------------------------
RF (AF) - Recall
1.2885499220937133 0.5250430808229016
likely normal
------------------------------------------
RF (AF_NE) - Recall
25.44044464777389 2.9900443918869904e-06
unlikely normal
------------------------------------------
RF (AF) - F1
24.692813743557476 4.345338608042506e-06
unlikely normal
------------------------------------------
RF (AF_NE) - F1
2.8371273631975806 0.24206144463603435
likely normal


In [70]:
from scipy.stats import ks_2samp
def ks(values1, values2):
    """Print the two-sample Kolmogorov-Smirnov result for two samples.

    The first printed line holds the statistic and the p-value returned by
    ``ks_2samp``; the second one states whether H0 (same distribution) is
    rejected, using a 0.05 threshold on the p-value. Nothing is returned.
    """
    statistic, p_value = ks_2samp(values1, values2)
    print(statistic, p_value)
    same_distribution = p_value > 0.05
    print(
        'Samples are likely drawn from the same distributions (fail to reject H0)'
        if same_distribution
        else 'Samples are likely drawn from different distributions (reject H0)'
    )

In [74]:
# Compare the per-iteration F1 values of the random forest with those of
# XGBoost on the AF feature set.
rf_af_f1_iterations = exp_1_rf_results["random_forest"]["AF"]["metrics_iterations"]["f1"]
xgb_af_f1_iterations = exp_1_tuned_results["xg_boost"]["AF"]["metrics_iterations"]["f1"]
ks(rf_af_f1_iterations, xgb_af_f1_iterations)

0.93 5.044580186155621e-47
Samples are likely drawn from different distributions (reject H0)
