# Classical LA methods
## $t$-test method

In [1]:
def nonzero_bins(bins):
    """
    Retrieves all bins for which at least one category has a non-zero value.

    Works for any number of categories (rows). If the categories differ in
    length, only the bins present in every category are considered.

    :param bins: a sequence of categories, each holding the per-bin values
        of that category.
    :return: the indexes of non-zero bins, in ascending order. An empty
        table yields an empty list.
    """
    # zip(*bins) walks the table column by column, one tuple of per-category
    # values per bin; a bin is kept as soon as any category is non-zero.
    return [
        ix
        for ix, column in enumerate(zip(*bins))
        if any(value != 0 for value in column)
    ]


In [2]:
import scipy.stats as stats
import numpy as np
import math

def calc_t(ctable):
    """
    Calculates Welch's t-test for the two categories of a contingency table.

    Mean and variance of each category are computed from its histogram, i.e.
    every bin value is weighted by the number of observations in that bin.

    Each category must hold at least two observations and the pooled
    variance must be non-zero, otherwise the divisions below fail
    (or yield nan/inf when the counts are numpy scalars).

    :param ctable: contingency table with two categories (rows) of bin counts.
    :return: A 3-tuple containing: the t value, the degrees of freedom
        (Welch-Satterthwaite), the two-sided p value.
    """
    # Value associated with every bin.
    # NOTE(review): the bin value is taken as |count_0 - count_1|; a
    # histogram-based t-test usually uses the sample value the bin stands
    # for -- confirm that this is intended.
    rg = [abs(a - b) for a, b in zip(*ctable)]

    mean = [0, 0]
    var = [0, 0]
    n = [0, 0]

    for ix_cat in range(2):
        counts = ctable[ix_cat]

        # Frequency-weighted mean.
        for ix_bin, count in enumerate(counts):
            mean[ix_cat] += count * rg[ix_bin]
            n[ix_cat] += count
        mean[ix_cat] /= n[ix_cat]

        # Frequency-weighted variance: every squared deviation counts once
        # per observation in the bin (multiplied by the count, not added).
        for ix_bin, count in enumerate(counts):
            var[ix_cat] += count * (rg[ix_bin] - mean[ix_cat]) ** 2
        var[ix_cat] /= n[ix_cat]

    # Variance of each sample mean.
    var_of_mean = [var[0] / n[0], var[1] / n[1]]

    # t-value
    mean_diff = mean[0] - mean[1]
    var_sum = var_of_mean[0] + var_of_mean[1]
    t_ret = mean_diff / math.sqrt(var_sum)

    # degree of freedom
    denom = var_of_mean[0] ** 2 / (n[0] - 1) + var_of_mean[1] ** 2 / (n[1] - 1)
    t_dof_ret = var_sum ** 2 / denom

    # Two-sided p value: twice the lower tail at -|t|.
    t_p_ret = 2 * stats.t(t_dof_ret).cdf(-abs(t_ret))

    return t_ret, t_dof_ret, t_p_ret

## $\chi^2$ method

In [3]:
def calc_chi(ctable):
    """
    Calculates the p value for rejecting H0, among others.
    Small p values give evidence to reject the null hypothesis and conclude that for the
    scenarios presented in ctable the occurrences of the observations are not independent.

    Bins that are zero in every category carry no information; they are
    left out of both the statistic and the degrees of freedom.

    :param ctable: contingency table for different categories of traces.
    :return: A 3-tuple containing: The value for chi, the degrees of freedom,
        the p value for rejecting H0. When fewer than two non-zero bins or
        fewer than two categories remain, no test is possible and
        (chi, 0, 1.0) is returned.
    """
    num_cats = len(ctable)

    # Only check non-zero bins
    nz_bins = nonzero_bins(ctable)

    # Marginal totals over the non-zero bins.
    sum_rows = [0] * num_cats
    sum_cols = {ix_bin: 0 for ix_bin in nz_bins}
    N = 0.0

    for ix_bin in nz_bins:
        for ix_cat in range(num_cats):
            # Bin from the contingency table
            c_bin = ctable[ix_cat][ix_bin]

            sum_rows[ix_cat] += c_bin
            sum_cols[ix_bin] += c_bin
            N += c_bin

    # chi**2 value: squared deviation from the frequency expected under
    # independence, relative to that expected frequency.
    chi = 0.0
    for ix_bin in nz_bins:
        for ix_cat in range(num_cats):
            E = (sum_rows[ix_cat] * sum_cols[ix_bin]) / N
            tmp = (ctable[ix_cat][ix_bin] - E)

            chi += tmp ** 2 / E

    # Degrees of freedom, counting only the bins that entered the statistic.
    dof = (len(nz_bins) - 1) * (num_cats - 1)
    if dof <= 0:
        # Degenerate table: nothing to compare, hence no evidence against H0.
        return chi, 0, 1.0

    # p-value for rejecting H0: probability of a statistic at least as
    # large as chi under H0 (upper tail, i.e. 1 - cdf).
    p = stats.chi2(dof).sf(chi)

    return chi, dof, p

In [13]:
from collections import Counter
import pandas as pd

def contingency_table(traces, trace_range):
    """
    Builds a contingency table (histogram) from a collection of traces.

    :param traces: the traces from which the contingency table should be
        built; each trace is an iterable of hashable sample values.
    :param trace_range: the range of values which the traces can take; it
        defines the order and number of bins. Values occurring in the
        traces but not in trace_range are ignored.
    :return: the contingency table as a numpy array of ints, holding one
        occurrence count per value in trace_range.
    """
    # Accumulate all traces into one counter instead of materialising a
    # (traces x distinct values) table just to sum its columns.
    totals = Counter()
    for trace in traces:
        totals.update(trace)

    # Looking up a missing key in a Counter yields 0 without inserting it.
    return np.array([totals[value] for value in trace_range], dtype=int)
