# Statistical Analysis

In [1]:
from json import load, dump
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

In [2]:
# Values each independent variable can take. The position of a value inside
# its list is the row index that `create_data` uses for that value.
options = dict(
    lvl=[f'lvl_{n}' for n in (1, 3, 5, 9, 11, 12)],
    prompt=['base', 'cot'],
    model=['llama3', 'qwen', 'deepseek'],
    encoding=['matrix', 'text'],
)

In [3]:
def resourceful(entry):
    """Return the entry's ``contains_keyword`` field, unchanged."""
    keyword_flag = entry['contains_keyword']
    return keyword_flag

def interactive(entry):
    """Return True if at least one value in ``entry['results']`` is truthy."""
    for outcome in entry['results'].values():
        if outcome:
            return True
    return False

def accurate(truth):
    """Return whether ``truth['tp']`` is strictly greater than zero."""
    tp_count = truth['tp']
    return tp_count > 0

def complete(entry):
    """Return the entry's ``completion`` field, unchanged."""
    completion_flag = entry['completion']
    return completion_flag

In [4]:
def create_data(experiment, idp):
    """Tally the four metrics for every value of one independent variable.

    Reads the notebook-level sequences ``plan_data``, ``action_data`` and
    ``acc_data``, which are not defined in this function and must exist
    before it is called. ``plan_data`` and ``action_data`` are walked in
    lockstep and ``acc_data`` is indexed by the same position.
    NOTE(review): this assumes the three sequences are aligned entry by
    entry -- confirm where they are loaded.

    Entries are counted only if ``meta['exp']`` equals ``experiment`` and
    ``meta['prompt']`` is not ``'self-consistency'``.

    Parameters
    ----------
    experiment
        Value compared against each plan entry's ``meta['exp']``.
    idp : str
        Key into ``options`` and into each plan entry's ``meta``.

    Returns
    -------
    list of tuple
        One ``([resourceful, interactive, accurate, complete], n)`` pair per
        value in ``options[idp]``, in that order, where ``n`` is the number
        of entries counted for that value.

    Raises
    ------
    KeyError
        If ``idp`` is not in ``options`` or a counted entry's ``meta[idp]``
        is not listed in ``options[idp]``.
    """
    choices = options[idp]

    # Maps each option value to its row. Built once, outside the loop; it does
    # not depend on the entry being processed.
    option_dict = {option: row for row, option in enumerate(choices)}

    four_tuples = [[0, 0, 0, 0] for _ in choices]
    norm_vals   = [0 for _ in choices]

    for i, (e1, e2) in enumerate(zip(plan_data, action_data)):
        meta = e1['meta']
        if not (meta['exp'] == experiment and meta['prompt'] != 'self-consistency'):
            continue

        idx    = option_dict[meta[idp]]
        counts = four_tuples[idx]

        counts[0] += 1 if resourceful(e1) else 0
        counts[1] += 1 if interactive(e2) else 0
        counts[2] += 1 if accurate(acc_data[i]['truth']) else 0
        counts[3] += 1 if complete(e2) else 0

        norm_vals[idx] += 1

    return list(zip(four_tuples, norm_vals))

In [5]:
def do_comparison(
        h0_succ, h1_succ,
        h0_fail, h1_fail,
        h0_count, h1_count,
        metric_idx,
        metric,
        alpha=0.05):
    """Compare the success rates of two groups on a single metric.

    Runs a chi-square test of independence (with continuity correction) on
    the 2x2 success/failure table and, separately, a pooled two-proportion
    z-test. Significance is decided from the chi-square p-value only.

    Parameters
    ----------
    h0_succ, h1_succ : sequence
        Success counts per metric for group 1 and group 2.
    h0_fail, h1_fail : sequence
        Failure counts per metric for group 1 and group 2.
    h0_count, h1_count : number
        Number of trials in group 1 and group 2.
    metric_idx : int
        Index of the metric to read from the four sequences.
    metric : str
        Label reported under ``'Metric Name'``.
    alpha : float, default 0.05
        Significance threshold applied to the chi-square p-value.

    Returns
    -------
    dict
        Formatted rates, difference (group 2 minus group 1), both p-values,
        the significance flag and which group had the higher rate.

    Raises
    ------
    ValueError
        Propagated from ``stats.chi2_contingency`` for tables it rejects
        (for example negative counts or an empty group).
    """
    succ_0 = h0_succ[metric_idx]
    succ_1 = h1_succ[metric_idx]
    contingency_table = np.array([
        [succ_0, h0_fail[metric_idx]],
        [succ_1, h1_fail[metric_idx]]
    ])

    row_totals = contingency_table.sum(axis=1)
    col_totals = contingency_table.sum(axis=0)

    # When both groups have no successes (or no failures) an expected
    # frequency is zero and chi2_contingency raises. The observed rates are
    # identical in that case, so report "no evidence of a difference".
    identical_extreme = (
        np.all(contingency_table >= 0)
        and np.all(row_totals > 0)
        and np.any(col_totals == 0)
    )
    if identical_extreme:
        p = 1.0
    else:
        # Perform chi-square test
        _, p, _, _ = stats.chi2_contingency(
            contingency_table,
            correction=True
        )

    # Sample proportions (success rates) of each group.
    p1 = succ_0 / h0_count
    p2 = succ_1 / h1_count
    difference = p2 - p1

    # Calculate pooled proportion
    p_pooled = (succ_0 + succ_1) / (h0_count + h1_count)

    # Calculate standard error
    se = np.sqrt(p_pooled * (1 - p_pooled) * (1/h0_count + 1/h1_count))

    if se == 0:
        # Pooled rate is exactly 0 or 1: the z statistic is 0/0, treat it as
        # no detectable difference instead of producing NaN.
        p_z = 1.0
    else:
        z = difference / se
        # Two-tailed p-value; the survival function keeps precision for
        # large |z| where 1 - cdf would round to zero.
        p_z = 2 * stats.norm.sf(abs(z))

    # Determine which model performed better
    if difference > 0:
        better_model = "Model 2"
    elif difference < 0:
        better_model = "Model 1"
    else:
        better_model = "Neither (identical performance)"

    # Determine if difference is significant (plain bool, not numpy.bool_)
    is_significant = bool(p < alpha)

    return {
        'Metric Name': metric,
        'Model 1 Rate': f"{succ_0}/{h0_count} = {p1:.2%}",
        'Model 2 Rate': f"{succ_1}/{h1_count} = {p2:.2%}",
        'Difference': f"{difference:.2%}",
        'X^2 p-value': f"{p:.8f}",
        'Z-test p-val': f"{p_z:.8f}",
        'Significant': is_significant,
        'Better Model': better_model
    }

def format_data(data, idp, idx1, idx2, metric_idx, metric):
    """Pool both experiments for two values of ``idp`` and compare them.

    Tests how switching from the ``idx1`` value of ``idp`` to the ``idx2``
    value affects performance when results from ``data['exp1']`` and
    ``data['exp2']`` are combined. Each ``data[exp][idp][idx]`` is read as a
    pair: element 0 holds the per-metric success counts, element 1 the number
    of trials.

    Returns the dict produced by ``do_comparison`` for ``metric_idx``.
    """
    experiments = ('exp1', 'exp2')
    baseline_pairs = [data[name][idp][idx1] for name in experiments]
    variant_pairs = [data[name][idp][idx2] for name in experiments]

    def pool(pairs):
        # Sum trials and per-metric successes over the two experiments.
        trials = sum(pair[1] for pair in pairs)
        wins = np.array(pairs[0][0]) + np.array(pairs[1][0])
        return wins, trials - wins, trials

    base_wins, base_losses, base_trials = pool(baseline_pairs)
    var_wins, var_losses, var_trials = pool(variant_pairs)

    return do_comparison(
        base_wins, var_wins,
        base_losses, var_losses,
        base_trials, var_trials,
        metric_idx, metric
    )

In [6]:
def extract(data):
    """Concatenate every value of ``data`` into one flat list, in dict order."""
    return [item for group in data.values() for item in group]

def format_data_overall(data, metric_idx, metric):
    """Compare experiment 1 against experiment 2 over all of their entries.

    Every ``(success_counts, trials)`` pair under ``data['exp1']`` forms the
    first group and every pair under ``data['exp2']`` the second; the pooled
    totals are handed to ``do_comparison`` for ``metric_idx``.
    """
    exp1_pairs = extract(data['exp1'])
    exp2_pairs = extract(data['exp2'])

    def tally(pairs):
        # Total trials, then element-wise total of the per-metric successes.
        trials = sum(pair[1] for pair in pairs)
        wins = sum(np.array(pair[0]) for pair in pairs)
        return wins, trials - wins, trials

    exp1_wins, exp1_losses, exp1_trials = tally(exp1_pairs)
    exp2_wins, exp2_losses, exp2_trials = tally(exp2_pairs)

    return do_comparison(
        exp1_wins, exp2_wins,
        exp1_losses, exp2_losses,
        exp1_trials, exp2_trials,
        metric_idx, metric
    )

## Newly Formatted Data

The newly formatted data needs a bit of preprocessing before it can be passed to `do_comparison`.

In [None]:
# Load the preprocessed results. The encoding is given explicitly so the file
# is decoded the same way regardless of the platform's default locale.
with open('../data/preprocessed.json', encoding='utf-8') as fp:
    new_data = load(fp)

In [28]:
kk = 'byModel'
m1 = 'qwen'       # reported as "Model 1" in the output
m2 = 'deepseek'   # reported as "Model 2" in the output

# The counts are the same for every metric, so build them once rather than on
# each loop iteration.
h0_count = new_data[kk][m1]['Total']
h1_count = new_data[kk][m2]['Total']

# NOTE(review): metrics are picked by position, so this relies on the value
# order of each model's dict matching the metric order below -- confirm
# against preprocessed.json.
h0_succ = list(new_data[kk][m1].values())
h1_succ = list(new_data[kk][m2].values())
h0_fail = list(h0_count - np.array(h0_succ))
h1_fail = list(h1_count - np.array(h1_succ))

for i, metric in enumerate(['Resourcefulness', 'Interaction', 'Accuracy', 'Completion']):

    for k, v in do_comparison(h0_succ, h1_succ, h0_fail, h1_fail, h0_count, h1_count, i, metric).items():
        print(f"{k} : \t{v}")
    print('------------------------------------------------')

Metric Name : 	Resourcefulness
Model 1 Rate : 	309/384 = 80.47%
Model 2 Rate : 	247/384 = 64.32%
Difference : 	-16.15%
X^2 p-value : 	0.00000085
Z-test p-val : 	0.00000056
Significant : 	True
Better Model : 	Model 1
------------------------------------------------
Metric Name : 	Interaction
Model 1 Rate : 	79/384 = 20.57%
Model 2 Rate : 	39/384 = 10.16%
Difference : 	-10.42%
X^2 p-value : 	0.00009519
Z-test p-val : 	0.00006265
Significant : 	True
Better Model : 	Model 1
------------------------------------------------
Metric Name : 	Accuracy
Model 1 Rate : 	52/384 = 13.54%
Model 2 Rate : 	32/384 = 8.33%
Difference : 	-5.21%
X^2 p-value : 	0.02804341
Z-test p-val : 	0.02076153
Significant : 	True
Better Model : 	Model 1
------------------------------------------------
Metric Name : 	Completion
Model 1 Rate : 	27/384 = 7.03%
Model 2 Rate : 	11/384 = 2.86%
Difference : 	-4.17%
X^2 p-value : 	0.01256554
Z-test p-val : 	0.00776223
Significant : 	True
Better Model : 	Model 1
----------------