In [1]:
def two_predictions_test(predictions_file_1, predictions_file_2):
    """Compare the metrics stored in two gzipped JSON prediction files.

    Each file must contain a JSON list of documents. Every document carries
    a ``'metrics'`` mapping and may carry a ``'sampled_metrics'`` mapping,
    whose keys are exposed with a ``sampled_`` prefix. For every metric
    present in both files the function reports the two means, their
    absolute and relative difference, and the p-value of an independent
    two-sample t-test (``scipy.stats.ttest_ind``).

    Args:
        predictions_file_1: path to the first ``.json.gz`` file (baseline
            for ``difference`` / ``difference_pct``).
        predictions_file_2: path to the second ``.json.gz`` file.

    Returns:
        A ``pandas.DataFrame`` indexed by ``metric_name`` (sorted), with one
        column per input file (named after the file's basename without the
        ``.json.gz`` suffix) holding the metric mean, plus ``difference``,
        ``difference_pct``, ``p_value``, ``p_value_bonferoni`` and the
        boolean ``significant_<alpha>`` columns.

    Note:
        If both paths share the same basename the two mean columns collide
        and only the second file's mean is kept.
    """
    import os
    import gzip
    import json
    import pandas as pd
    from scipy.stats import ttest_ind

    suffix = ".json.gz"

    def display_name(path):
        # str.rstrip(".json.gz") removes any trailing run of the characters
        # ". j s o n g z", not the literal suffix, so "predictions.json.gz"
        # would become "predicti". Strip the suffix explicitly instead.
        name = os.path.basename(path)
        if name.endswith(suffix):
            return name[:-len(suffix)]
        return name

    first_name = display_name(predictions_file_1)
    second_name = display_name(predictions_file_2)

    def get_metrics(doc):
        # Copy so the loaded document is not mutated while flattening.
        result = dict(doc['metrics'])
        if 'sampled_metrics' in doc:
            for key in doc['sampled_metrics']:
                result[f"sampled_{key}"] = doc['sampled_metrics'][key]
        return result

    def read_data(filename):
        # Context manager guarantees the gzip handle is closed.
        with gzip.open(filename) as handle:
            data = json.load(handle)
        return pd.DataFrame([get_metrics(doc) for doc in data])

    df1 = read_data(predictions_file_1)
    df2 = read_data(predictions_file_2)

    overlap_columns = set(df1.columns).intersection(set(df2.columns))
    n_tests = len(overlap_columns)

    docs = []

    # Sorted iteration keeps the row order stable between runs (set order
    # depends on string hashing).
    for column_name in sorted(overlap_columns):
        df1_series = df1[column_name]
        df2_series = df2[column_name]

        mean1 = df1_series.mean()
        mean2 = df2_series.mean()
        t, pval = ttest_ind(df1_series, df2_series)

        doc = {}
        doc["metric_name"] = column_name
        doc[first_name] = mean1
        doc[second_name] = mean2
        doc["difference"] = mean2 - mean1
        doc["difference_pct"] = (mean2 - mean1) * 100 / mean1
        doc["p_value"] = pval
        # Bonferroni-adjusted p-values are probabilities, so cap them at 1.
        # The argument order keeps a NaN p-value as NaN.
        doc["p_value_bonferoni"] = min(pval * n_tests, 1.0)
        docs.append(doc)

    result = pd.DataFrame(docs)
    for alpha in ("0.05", "0.01", "0.001", "0.0001"):
        result[f"significant_{alpha}"] = result["p_value_bonferoni"] < float(alpha)
    return result.set_index('metric_name')

In [2]:
import os
def get_predictions_file(n, t):
    """Return the path of the predictions file for the given ``n`` and ``t``.

    ``n`` fills the ``samples:`` slot and ``t`` fills the ``t:`` slot of the
    file name inside the fixed results directory.
    """
    path_template = "./results/BERT4rec.ml-1m/ml1m_bert4rec_ft_2023_01_19T13_31_40/predictions/DebSASRec-samples:{}-t:{}.json.gz"
    return path_template.format(n, t)

In [4]:
import tqdm
import time
# Compare every (n, t) predictions file against the n=1, t=0.0 file and
# collect the 'ndcg@10' p-value of each comparison.
# Each row of `docs` is [n, p(t=0.0), p(t=0.25), p(t=0.75), p(t=1.0)].
n_values = [1, 4, 16, 64, 256]
t_values = [0.0, 0.25, 0.75, 1.0]

docs = []
first = get_predictions_file(1, 0.0)

# The bar total is derived from the grid (5 x 4 = 20 comparisons); the previous
# hard-coded 16 undercounted the loop. The context manager closes the bar.
with tqdm.tqdm(total=len(n_values) * len(t_values)) as pbar:
    for n in n_values:
        doc = [n]
        for t in t_values:
            second = get_predictions_file(n, t)
            result = two_predictions_test(first, second)
            # Single .loc lookup instead of chained indexing.
            pval = result.loc['ndcg@10', 'p_value']
            doc.append(pval)
            pbar.update(1)
        docs.append(doc)


 38%|██████████████████████████████████████████████████████████████▎                                                                                                       | 6/16 [02:34<04:16, 25.69s/it][A

  6%|██████████▍                                                                                                                                                           | 1/16 [00:09<02:29,  9.94s/it][A
 12%|████████████████████▊                                                                                                                                                 | 2/16 [00:20<02:20, 10.01s/it][A
 19%|███████████████████████████████▏                                                                                                                                      | 3/16 [00:28<02:03,  9.52s/it][A
 25%|█████████████████████████████████████████▌                                                                                                                            | 4

In [9]:
import pandas as pd
# One row per n value: column 0 is n itself, columns 1-4 are the 'ndcg@10'
# p-values for t = 0.0, 0.25, 0.75 and 1.0 respectively.
df = pd.DataFrame(docs)

In [12]:
# Bonferroni-style check: scale every p-value by the 20 comparisons
# (5 n values x 4 t values) and test against the 0.05 level.
# Column 0 holds n rather than a p-value, so its result carries no meaning.
df*20 < 0.05

Unnamed: 0,0,1,2,3,4
0,False,False,False,True,True
1,False,True,True,True,True
2,False,True,True,True,True
3,False,True,True,True,True
4,False,True,True,True,True
