In [25]:
# imports for p-value and comparison testing

import scipy
from scipy.stats import ttest_rel
import numpy as np
import pandas as pd
import pprint

In [17]:
# read the results from the csv file
# The file's first column appears to be the row index written when the table
# was saved (it loads as "Unnamed: 0" holding 0..4), so use it as the index
# rather than as a data column.
results = pd.read_csv('classifier_results.csv', index_col=0)

In [18]:
# show the loaded table to check which <model>_<metric> columns are available
results

Unnamed: 0,lg_acc,lg_f1,lg_prec,dt_acc,dt_f1,dt_prec,knn_acc,knn_f1,knn_prec,rf_acc,rf_f1,rf_prec,svm_acc,svm_f1,svm_prec,xgb_acc,xgb_f1,xgb_prec
0,0.922294,0.922294,0.922633,0.824228,0.825585,0.796403,0.885307,0.885307,0.885307,0.897184,0.893451,0.902273,0.945029,0.945029,0.945029,0.899898,0.899898,0.899898
1,0.922633,0.922633,0.922633,0.825925,0.824907,0.797421,0.876824,0.876824,0.876824,0.893451,0.892772,0.89413,0.945029,0.945029,0.945029,0.901595,0.901595,0.901595
2,0.922633,0.922633,0.922633,0.831015,0.831015,0.797421,0.851714,0.851714,0.851714,0.894469,0.901256,0.904309,0.947065,0.947065,0.947065,0.904309,0.904309,0.904309
3,0.922633,0.922633,0.922633,0.831015,0.829997,0.796403,0.887343,0.887343,0.887343,0.899898,0.890736,0.892433,0.945029,0.945029,0.945029,0.896844,0.896844,0.896844
4,0.922294,0.922294,0.922633,0.824567,0.825585,0.796403,0.887343,0.887343,0.887343,0.895487,0.89922,0.892772,0.945029,0.945029,0.945029,0.900916,0.900916,0.900916


In [19]:
# function to create the mean array for each model, pass in abbreviation of the model
# lg = logistic regression
# dt = decision tree
# knn = k nearest neighbours
# rf = random forest
# svm = support vector machine
# xgb = XGBoost (extreme gradient boosting)

def create_mean_arr(model, data=None):
    """Return the row-wise mean of a model's accuracy, F1 and precision scores.

    Parameters
    ----------
    model : str
        Model abbreviation used as the column prefix (e.g. "lg", "svm").
    data : pandas.DataFrame, optional
        Table holding the ``<model>_acc``, ``<model>_f1`` and ``<model>_prec``
        columns. Defaults to the notebook-level ``results`` frame.

    Returns
    -------
    numpy.ndarray
        1-D float array with one averaged score per row of ``data``.

    Raises
    ------
    KeyError
        If any of the three metric columns is missing from ``data``.
    """
    if data is None:
        data = results
    columns = [model + suffix for suffix in ("_acc", "_f1", "_prec")]
    # Average the three metric columns for every row in one vectorised step,
    # so the result covers however many rows the table has instead of a
    # hard-coded five.
    return data[columns].to_numpy(dtype=float).mean(axis=1)

# example: averaged scores for logistic regression ("lg")
create_mean_arr("lg")

array([0.92240697, 0.92263319, 0.92263319, 0.92263319, 0.92240697])

In [20]:
# function to compare one model to all the others based on given statistic, returns dictionary with p-values

def compare_all(model, stat, data=None):
    """Run a paired t-test of one model against every other model on one metric.

    Parameters
    ----------
    model : str
        Abbreviation of the reference model; one of
        "lg", "dt", "knn", "rf", "svm", "xgb".
    stat : str
        Metric suffix selecting the ``<model>_<stat>`` columns
        (e.g. "acc", "f1", "prec").
    data : pandas.DataFrame, optional
        Table holding the ``<model>_<stat>`` columns. Defaults to the
        notebook-level ``results`` frame.

    Returns
    -------
    dict
        Maps "<model> vs. <other> in <stat>" to the ``ttest_rel`` result
        (statistic and p-value) for that pair, in the fixed model order.
        The p-values are not corrected for multiple comparisons.

    Raises
    ------
    ValueError
        If ``model`` is not one of the known abbreviations.
    KeyError
        If a required ``<model>_<stat>`` column is missing from ``data``.
    """
    if data is None:
        data = results
    others = ["lg", "dt", "knn", "rf", "svm", "xgb"]
    others.remove(model)  # raises ValueError for an unknown abbreviation
    suffix = "_" + stat
    # Index with [] rather than .get() so a missing column fails here with a
    # KeyError instead of silently handing None to ttest_rel.
    model_val = data[model + suffix]
    return {
        model + " vs. " + other + " in " + stat: ttest_rel(model_val, data[other + suffix])
        for other in others
    }

# function that compares one model to all others based on average of scores

def compare_all_mean(model):
    """Paired t-test of one model's averaged scores against every other model.

    The averaged scores come from ``create_mean_arr``. Returns a dict mapping
    "<model> vs. <other>" to the ``ttest_rel`` result for that pair. Raises
    ValueError if ``model`` is not one of the known abbreviations.
    """
    rivals = ["lg", "dt", "knn", "rf", "svm", "xgb"]
    rivals.remove(model)
    reference_scores = create_mean_arr(model)
    return {
        model + " vs. " + rival: ttest_rel(reference_scores, create_mean_arr(rival))
        for rival in rivals
    }

# averaged-score t-tests of SVM against every other model
svm_comparisons = compare_all_mean("svm")
# accuracy t-tests of XGBoost against every other model
xg_boost_acc = compare_all("xgb", "acc")

In [36]:
def final_results(model):
    """Print every paired t-test comparison of ``model`` against the other models.

    Prints one section per metric (accuracy, F1, precision) using
    ``compare_all``, followed by a section for the averaged metrics using
    ``compare_all_mean``. Returns nothing; the output is printed.

    Parameters
    ----------
    model : str
        Abbreviation of the reference model, passed through to
        ``compare_all`` and ``compare_all_mean``.
    """
    sections = (("Accuracy:", "acc"), ("F1:", "f1"), ("Precision:", "prec"))
    for heading, stat in sections:
        print(heading)
        pprint.pprint(compare_all(model, stat))
    print("Averaged Metrics:")
    pprint.pprint(compare_all_mean(model))

# XGBoost Comparisons

In [37]:
# accuracy, F1, precision and averaged-metric t-tests for XGBoost vs. the other models
final_results('xgb')

Accuracy:
{'xgb vs. dt in acc': Ttest_relResult(statistic=37.556009261196344, pvalue=3.001812375048026e-06),
 'xgb vs. knn in acc': Ttest_relResult(statistic=2.9446549216358995, pvalue=0.042193125203644634),
 'xgb vs. lg in acc': Ttest_relResult(statistic=-18.057660742316973, pvalue=5.529399198600825e-05),
 'xgb vs. rf in acc': Ttest_relResult(statistic=2.0350709232588255, pvalue=0.11156717671568189),
 'xgb vs. svm in acc': Ttest_relResult(statistic=-47.07142853155654, pvalue=1.2184736952974837e-06)}
F1:
{'xgb vs. dt in f1': Ttest_relResult(statistic=42.943109842946704, pvalue=1.7579595958235945e-06),
 'xgb vs. knn in f1': Ttest_relResult(statistic=2.9446549216358995, pvalue=0.042193125203644634),
 'xgb vs. lg in f1': Ttest_relResult(statistic=-18.057660742316973, pvalue=5.529399198600825e-05),
 'xgb vs. rf in f1': Ttest_relResult(statistic=4.107031792405998, pvalue=0.014767907352728685),
 'xgb vs. svm in f1': Ttest_relResult(statistic=-47.07142853155654, pvalue=1.2184736952974837e-06)

# Logistic Regression Comparisons

In [38]:
# accuracy, F1, precision and averaged-metric t-tests for logistic regression vs. the other models
final_results('lg')

Accuracy:
{'lg vs. dt in acc': Ttest_relResult(statistic=65.26226761097134, pvalue=3.3023502765800975e-07),
 'lg vs. knn in acc': Ttest_relResult(statistic=6.5623742307238695, pvalue=0.0027893188188242413),
 'lg vs. rf in acc': Ttest_relResult(statistic=23.103249584593954, pvalue=2.0799553596455053e-05),
 'lg vs. svm in acc': Ttest_relResult(statistic=-60.222865669142145, pvalue=4.5531053085491875e-07),
 'lg vs. xgb in acc': Ttest_relResult(statistic=18.057660742316973, pvalue=5.529399198600825e-05)}
F1:
{'lg vs. dt in f1': Ttest_relResult(statistic=77.2983900049269, pvalue=1.6787462172071523e-07),
 'lg vs. knn in f1': Ttest_relResult(statistic=6.5623742307238695, pvalue=0.0027893188188242413),
 'lg vs. rf in f1': Ttest_relResult(statistic=13.292538414299225, pvalue=0.00018514353616195348),
 'lg vs. svm in f1': Ttest_relResult(statistic=-60.222865669142145, pvalue=4.5531053085491875e-07),
 'lg vs. xgb in f1': Ttest_relResult(statistic=18.057660742316973, pvalue=5.529399198600825e-05)}


# SVM Comparisons

In [39]:
# accuracy, F1, precision and averaged-metric t-tests for SVM vs. the other models
final_results('svm')

Accuracy:
{'svm vs. dt in acc': Ttest_relResult(statistic=89.55519234540664, pvalue=9.320243290052336e-08),
 'svm vs. knn in acc': Ttest_relResult(statistic=9.442234823450509, pvalue=0.0007015454243357284),
 'svm vs. lg in acc': Ttest_relResult(statistic=60.222865669142145, pvalue=4.5531053085491875e-07),
 'svm vs. rf in acc': Ttest_relResult(statistic=36.97938300283138, pvalue=3.1929955379803203e-06),
 'svm vs. xgb in acc': Ttest_relResult(statistic=47.07142853155654, pvalue=1.2184736952974837e-06)}
F1:
{'svm vs. dt in f1': Ttest_relResult(statistic=114.41780039385031, pvalue=3.499095092560624e-08),
 'svm vs. knn in f1': Ttest_relResult(statistic=9.442234823450509, pvalue=0.0007015454243357284),
 'svm vs. lg in f1': Ttest_relResult(statistic=60.222865669142145, pvalue=4.5531053085491875e-07),
 'svm vs. rf in f1': Ttest_relResult(statistic=28.57309922632602, pvalue=8.928609481773407e-06),
 'svm vs. xgb in f1': Ttest_relResult(statistic=47.07142853155654, pvalue=1.2184736952974837e-06)}