##### PLOTTER OF THE RESULTS OF A GRID SEARCH

In [35]:
import pickle
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
from scipy.stats import t


In [36]:
output_dir = "/home/zhanna/bachelorarbeit/zbb/experiments/output_grid_search_10_12_23"
output_dir = "/home/zhanna/bachelorarbeit/zbb/experiments/output_grid_search_000"

In [37]:
model_names = [
    "RandomForestW2V",
    # "RandomForest",
    # "LogisticRegression",
    # "LogisticRegressionW2V",
    # "NaiveSA",
    # "AFinn",
    # "Vader"
]

model_name = model_names[0]

In [64]:
import numpy as np
from scipy.stats import t

def get_model_scores(results_df, metric):
    model_scores = results_df.filter(regex=r"split\d*_test_{}".format(metric))
    return model_scores


def corrected_std(differences, n_train, n_test):
    """Corrects standard deviation using Nadeau and Bengio's approach.

    Parameters
    ----------
    differences : ndarray of shape (n_samples,)
        Vector containing the differences in the score metrics of two models.
    n_train : int
        Number of samples in the training set.
    n_test : int
        Number of samples in the testing set.

    Returns
    -------
    corrected_std : float
        Variance-corrected standard deviation of the set of differences.
    """
    # kr = k times r, r times repeated k-fold crossvalidation,
    # kr equals the number of times the model was evaluated
    kr = len(differences)
    corrected_var = np.var(differences, ddof=1) * (1 / kr + n_test / n_train)
    corrected_std = np.sqrt(corrected_var)
    return corrected_std


def compute_corrected_ttest(differences, df, n_train, n_test):
    """Computes right-tailed paired t-test with corrected variance.

    Parameters
    ----------
    differences : array-like of shape (n_samples,)
        Vector containing the differences in the score metrics of two models.
    df : int
        Degrees of freedom.
    n_train : int
        Number of samples in the training set.
    n_test : int
        Number of samples in the testing set.

    Returns
    -------
    t_stat : float
        Variance-corrected t-statistic.
    p_val : float
        Variance-corrected p-value.
    """
    mean = np.mean(differences)
    std = corrected_std(differences, n_train, n_test)
    t_stat = mean / std
    p_val = t.sf(np.abs(t_stat), df)  # right-tailed t-test
    return t_stat, p_val

def compare_models(X1, X2, n_train, n_test):
    # print(model_scores)

    differences = X1 - X2

    n = differences.shape[0]  # number of test sets
    df = n - 1
    # n_train = len(list(cv.split(X, y))[0][0])
    # n_test = len(list(cv.split(X, y))[0][1])

    t_stat, p_val = compute_corrected_ttest(differences, df, n_train, n_test)
    # print(f"Corrected t-value: {t_stat:.10f}\nCorrected p-value: {p_val:.10f}")

    # ==============================================================================

    t_stat_uncorrected = np.mean(differences) / np.sqrt(np.var(differences, ddof=1) / n)
    p_val_uncorrected = t.sf(np.abs(t_stat_uncorrected), df)

    # print(
    #    f"Uncorrected t-value: {t_stat_uncorrected:.10f}\n"
    #    f"Uncorrected p-value: {p_val_uncorrected:.10f}"
    # )
    return t_stat, p_val, t_stat_uncorrected, p_val_uncorrected

In [45]:
n_train = 45000
n_test = 5000

In [74]:
from itertools import combinations
from math import factorial

n_comparisons = factorial(len(model_scores)) / (
    factorial(2) * factorial(len(model_scores) - 2)
)

model_names = [
    "RandomForestW2V",  # no
    "RandomForest",     # no
    "LogisticRegression", # ok
    "LogisticRegressionW2V",  # ok
    "NaiveSA",                  # ok   
    "LSTM",
    "LSTM_W2V"
]

model_name = model_names[0]

result_tables = ""
for model_name in model_names:
    print("MODEL ", model_name)
    pairwise_t_test = []
    file_name = os.path.join(output_dir, 'grid_search_{}_cv_results.pkl'.format(model_name))
    with open(file_name, 'rb') as f:
        results = pickle.load(f)
    results_df = pd.DataFrame(results).head(5)

    model_scores = get_model_scores(results_df, "precision")
    for model_i, model_k in combinations(range(len(model_scores)), 2):
        model_i_scores = model_scores.iloc[model_i].values
        model_k_scores = model_scores.iloc[model_k].values
        differences = model_i_scores - model_k_scores
        n = differences.shape[0]  # number of test sets
        df = n - 1
        t_stat, p_val, t_std, p_std = compare_models(model_i_scores, model_k_scores, n_train, n_test)
        p_val *= n_comparisons  # implement Bonferroni correction
        # Bonferroni can output p-values higher than 1
        p_val = 1 if p_val > 1 else p_val
        p_std = 1 if p_std > 1 else p_std
        pairwise_t_test.append(
            [
                model_scores.index[model_i], 
                model_scores.index[model_k], 
                t_stat, 
                t_std, 
                p_val, 
                p_std
                ]
        )

    pairwise_comp_df = pd.DataFrame(
        pairwise_t_test, columns=[
            "$model_1$", 
            "$model_2$", 
            "$t^{Bonferroni}_{|1-2| > 0}$", 
            "$t^{standard}_{|1-2| > 0}$", 
            "$p^{Bonferroni}$", 
            "$p^{std}$"
            ]
    ).round(3)
    print(pairwise_comp_df.to_latex())
    result_tables += "\\begin{table}[H]\n"
    result_tables += "\\centering\n"
    result_tables += pairwise_comp_df.to_latex()
    m = model_name.replace('_', '\_')
    result_tables += "\\caption[T-Test Ergebniss für {}]{{Mietrik für Modell {}}}\n".format(m, m)
    result_tables += "\\end{table}\n\n"
print(result_tables)

MODEL  RandomForestW2V
\begin{tabular}{lrrrrrr}
\toprule
 & $model_1$ & $model_2$ & $t^{Bonferroni}_{|1-2| > 0}$ & $t^{standard}_{|1-2| > 0}$ & $p^{Bonferroni}$ & $p^{std}$ \\
\midrule
0 & 0 & 1 & -4.062000 & -14.137000 & 0.000000 & 0.000000 \\
1 & 0 & 2 & -0.871000 & -3.030000 & 0.579000 & 0.002000 \\
2 & 0 & 3 & -3.465000 & -12.059000 & 0.001000 & 0.000000 \\
3 & 0 & 4 & 17.640000 & 61.390000 & 0.000000 & 0.000000 \\
4 & 1 & 2 & 2.921000 & 10.164000 & 0.006000 & 0.000000 \\
5 & 1 & 3 & 0.282000 & 0.982000 & 1.000000 & 0.164000 \\
6 & 1 & 4 & 21.885000 & 76.163000 & 0.000000 & 0.000000 \\
7 & 2 & 3 & -3.073000 & -10.693000 & 0.004000 & 0.000000 \\
8 & 2 & 4 & 17.215000 & 59.911000 & 0.000000 & 0.000000 \\
9 & 3 & 4 & 19.521000 & 67.935000 & 0.000000 & 0.000000 \\
\bottomrule
\end{tabular}

MODEL  RandomForest
\begin{tabular}{lrrrrrr}
\toprule
 & $model_1$ & $model_2$ & $t^{Bonferroni}_{|1-2| > 0}$ & $t^{standard}_{|1-2| > 0}$ & $p^{Bonferroni}$ & $p^{std}$ \\
\midrule
0 & 0 & 1 & -7.7

In [42]:
print(pairwise_comp_df.to_latex())

\begin{tabular}{lrrrrrr}
\toprule
 & model_1 & model_2 & t_stat & p_val & t_std & p_std \\
\midrule
0 & 0 & 1 & -4.062000 & 0.001000 & -14.137000 & 0.000000 \\
1 & 0 & 2 & -0.871000 & 1.000000 & -3.030000 & 0.023000 \\
2 & 0 & 3 & -3.465000 & 0.006000 & -12.059000 & 0.000000 \\
3 & 0 & 4 & 17.640000 & 0.000000 & 61.390000 & 0.000000 \\
4 & 0 & 5 & 14.504000 & 0.000000 & 50.477000 & 0.000000 \\
5 & 1 & 2 & 2.921000 & 0.032000 & 10.164000 & 0.000000 \\
6 & 1 & 3 & 0.282000 & 1.000000 & 0.982000 & 1.000000 \\
7 & 1 & 4 & 21.885000 & 0.000000 & 76.163000 & 0.000000 \\
8 & 1 & 5 & 19.216000 & 0.000000 & 66.872000 & 0.000000 \\
9 & 2 & 3 & -3.073000 & 0.021000 & -10.693000 & 0.000000 \\
10 & 2 & 4 & 17.215000 & 0.000000 & 59.911000 & 0.000000 \\
11 & 2 & 5 & 15.546000 & 0.000000 & 54.103000 & 0.000000 \\
12 & 3 & 4 & 19.521000 & 0.000000 & 67.935000 & 0.000000 \\
13 & 3 & 5 & 17.589000 & 0.000000 & 61.211000 & 0.000000 \\
14 & 4 & 5 & -3.595000 & 0.004000 & -12.510000 & 0.000000 \\
\bottomru