## Classical LA methods applied to the ASCAD database
###### This includes my implementations of $\chi^2$ and Welch's t-test.

In [61]:
from src.settings.ascad import *

In [62]:
import h5py

ascad_hdf = h5py.File(f"{ASCAD_DATA}{ASCAD_DATA_VAR}/{ASCADDataType.default}.h5", 'r')

keys = list(ascad_hdf.keys())
keys

['Attack_traces', 'Profiling_traces']

In [63]:
att_group = ascad_hdf["Attack_traces"]

att_keys = list(att_group.keys())
att_keys

['labels', 'metadata', 'traces']

In [64]:
att_traces = att_group["traces"]

att_traces.shape

(100000, 1400)

In [65]:
att_labels = att_group["labels"]

att_labels.shape

(100000,)

In [66]:
import tqdm

for _ in tqdm.tqdm(att_traces[1:10]):
    pass

100%|██████████| 9/9 [00:00<00:00, 27634.51it/s]


In [67]:
att_head = att_traces[1:10]

In [68]:
from src.settings.nbloader import NotebookLoader

nb = NotebookLoader("../").load_module("tvla.welch_t_test")
tp = nb.TraceProcessor()

importing Jupyter notebook from ./welch_t_test.ipynb
(0.0, 598.0, 1.0) (0.0, 99, nan)
(0.0, 300.17360193005874, 1.0) (154.0474237457329, 99, 0.00017509185909780884)
False True


In [69]:
import numpy as np

fixed_1_ix = np.where(np.array(att_labels) == 1)[0]
fixed_2_ix = np.where(np.array(att_labels) == 2)[0]
random_not_1_ix = np.array(np.where(np.array(att_labels) != 1)[0])

fixed_1_ix.shape

(385,)

In [70]:
fixed_test_size = 10

fixed_1 = np.array(att_traces[fixed_1_ix])
fixed_2 = np.array(att_traces[fixed_2_ix])

len_fixed_1 = fixed_1.shape[0]
fixed_1a = np.array(att_traces[fixed_1_ix][:round(len_fixed_1/2)])
fixed_1b = np.array(att_traces[fixed_1_ix][round(len_fixed_1/2):])

random_not_1 = np.array(att_traces[random_not_1_ix[:len_fixed_1]])

In [71]:
all_ixs = range(att_traces.shape[0])

random_a_ix = sorted(np.random.choice(all_ixs, len_fixed_1, replace=False))
random_b_ix = sorted(np.random.choice(all_ixs, len_fixed_1, replace=False))

semi_random_b_ix = sorted(np.random.choice(list(set(all_ixs).difference(set(random_a_ix))), len_fixed_1, replace=False))
rand_range_a_ix = np.array(np.where(np.array(att_labels) <= 127)[0])
rand_range_b_ix = np.array(np.where(np.array(att_labels) > 127)[0])

In [72]:
random_a = np.array(att_traces[random_a_ix])
random_b = np.array(att_traces[random_b_ix])
semi_random_b = np.array(att_traces[np.sort(semi_random_b_ix)])

rand_range_a = np.array(att_traces[rand_range_a_ix[:10000]])
rand_range_b = np.array(att_traces[rand_range_b_ix[:10000]])

# Classical LA methods
## $t$-test method

In [73]:
def nonzero_bins(bins):
    """
    Retrieves all bins for which at least one category has a non-zero value.

    :param bins: a set of categories with bins corresponding to that category.
    :return: the indexes of non-zero bins.
    """
    nz_bins = []
    for ix, a, b in zip(range(len(bins[0])), *bins):
        if a == 0 and b == 0:
            continue

        nz_bins.append(ix)

    return nz_bins

In [74]:
from collections import Counter

DEFAULT_INDEX = range(-128, 127)

def extract_ctable(traces):
    """
    Builds a contingency table from traces from the ASCAD dataset.

    :param traces: the traces from which the contingency table should be constructed.
    :return: the contingency table.
    """
    df = pd.DataFrame([Counter(bins) for bins in traces])
    res = df.sum().sort_index().reindex(DEFAULT_INDEX, fill_value=0).values

    return np.array(res, dtype=int)

In [75]:
import scipy.stats as stats
import numpy as np
import math

def calc_t(ctable):
    mean = [0, 0]
    var = [0, 0]
    n = [0, 0]

    rg = [abs(a - b) for a, b in zip(*ctable)]

    for ix_cat in range(2):
        for ix_bin in range(len(ctable[ix_cat])):
            mean[ix_cat] += ctable[ix_cat][ix_bin] * rg[ix_bin]
            n[ix_cat] += ctable[ix_cat][ix_bin]

        mean[ix_cat] /= n[ix_cat]

        for ix_bin in range(len(ctable[ix_cat])):
            tmp = (rg[ix_bin] - mean[ix_cat])
            var[ix_cat] += tmp ** 2 + ctable[ix_cat][ix_bin]

        var[ix_cat] /= n[ix_cat]

    # t-value
    mean_diff = mean[0] - mean[1]
    var_sum = (var[0] / n[0]) + (var[1] / n[1])
    t_ret = mean_diff / math.sqrt(var_sum)

    # degree of freedom
    denom = ((var[0] / n[0]) * (var[0] / n[0])) / (n[0] - 1) + ((var[1] / n[1]) * (var[1] / n[1])) / (n[1] - 1)
    t_dof_ret = var_sum ** 2 / denom

    # cdf
    t_p_ret = 2 * stats.t(t_dof_ret).cdf(-abs(t_ret))

    return t_ret, t_dof_ret, t_p_ret

calc_t([extract_ctable(fixed_1), extract_ctable(fixed_1)])

(0.0, 1077998.0, 1.0)

In [76]:
calc_t([extract_ctable(fixed_1), extract_ctable(fixed_2)])

(447.93165173122577, 1058493.3884671174, 0.0)

## $\chi^2$ method

In [77]:
def calc_chi(ctable):
    """
    Calculates the p value for rejecting H0, among others.
    Small p values give evidence to reject the null hypothesis and conclude that for the
    scenarios presented in ctable the occurrences of the observations are not independent.

    :param ctable: contingency table for different categories of traces.
    :return: A 3-tuple containing: The value for chi, the degrees of freedom,
        the p value for rejecting H0.
    """
    num_cats = len(ctable)
    num_bins = len(ctable[0])

    # chi**2 value
    sum_rows = [0] * num_cats
    sum_cols = [0] * num_bins
    N = 0.0

    # Only check non-zero bins
    nz_bins = nonzero_bins(ctable)

    for ix_bin in nz_bins:
        for ix_cat in range(num_cats):
            # Bin from the contingency table
            c_bin = ctable[ix_cat][ix_bin]

            sum_rows[ix_cat] += c_bin
            sum_cols[ix_bin] += c_bin
            N += c_bin

    chi = 0.0
    for ix_bin in nz_bins:
        for ix_cat in range(num_cats):
            E = (sum_rows[ix_cat] * sum_cols[ix_bin]) / N
            tmp = (ctable[ix_cat][ix_bin] - E)

            chi += tmp ** 2 / E

    # Degrees of freedom
    dof = (num_bins - 1) * (num_cats - 1)
    # p-value for rejecting H0
    p = stats.chi2(dof).cdf(chi)

    return chi, dof, p

In [78]:
def p_value_chi(a, b):
    print(f"p-value: {calc_chi([extract_ctable(a), extract_ctable(b)])[2]:.3f}")

In [79]:
fixed_1

array([[-23, -26, -29, ..., -84, -86, -85],
       [-21, -25, -29, ..., -85, -84, -86],
       [-20, -24, -29, ..., -86, -85, -86],
       ...,
       [-21, -21, -25, ..., -84, -84, -85],
       [-20, -26, -29, ..., -84, -85, -85],
       [-21, -21, -26, ..., -85, -85, -85]], dtype=int8)

In [80]:
random_b

array([[-20, -23, -27, ..., -85, -85, -85],
       [-21, -22, -27, ..., -85, -84, -85],
       [-21, -20, -26, ..., -84, -85, -87],
       ...,
       [-21, -21, -27, ..., -84, -85, -84],
       [-21, -26, -29, ..., -84, -86, -85],
       [-20, -20, -26, ..., -85, -85, -85]], dtype=int8)

In [81]:
cases = {
    "Fixed vs. fixed, equal traces.": (fixed_1, fixed_1),
    "Fixed vs. fixed, equal key.": (fixed_1a, fixed_1b),
    "Fixed vs. fixed, different key.": (fixed_1, fixed_2),
    "Fixed vs. semi-random (fixed key is not in random sample).": (fixed_1, random_not_1),
    "Fixed vs. random.": (fixed_1, random_b),
    "Random vs. random.": (random_a, random_b),
    "Random vs. semi-random": (random_a, semi_random_b),
    "Semi-random vs. semi-random": (rand_range_a, rand_range_b)
}

In [82]:
cases_chi = {
    "Scenario": list(cases.keys()),
    "p-value $t$": [f"{calc_t([extract_ctable(a), extract_ctable(b)])[2]:.3f}" for a, b in cases.values()],
    "p-value $\chi^2$": [f"{calc_chi([extract_ctable(a), extract_ctable(b)])[2]:.3f}" for a, b in cases.values()]
    }

### Examples for $\chi^2$ on different scenarios.
For $\chi^2$, small p-values give reason to reject $H_0$ =
"the occurrences of these observations are independent".
For $t$, large p-values give reason to reject $H_0$ =
"the samples in both sets are drawn from the same population".

In [83]:
pd.DataFrame(cases_chi)


Unnamed: 0,Scenario,p-value $t$,p-value $\chi^2$
0,"Fixed vs. fixed, equal traces.",1.0,0.0
1,"Fixed vs. fixed, equal key.",0.0,0.204
2,"Fixed vs. fixed, different key.",0.0,0.997
3,Fixed vs. semi-random (fixed key is not in ran...,0.0,0.12
4,Fixed vs. random.,0.0,1.0
5,Random vs. random.,0.0,0.463
6,Random vs. semi-random,0.0,0.582
7,Semi-random vs. semi-random,0.0,0.355
