In [6]:
from itertools import combinations

import pandas as pd
import numpy as np
from tqdm import tqdm
from scipy.stats import wilcoxon, kendalltau

from evaluation.tools import load_data, get_equal_pairs, get_valid
from evaluation.SETTINGS import investigated_metrics

In [2]:
# Load the annotated evaluation data through the project helper. Later cells index the
# result as data[campaign][system][...], reading the 'hum_annotations',
# 'hum_only_automatic_metrics' and 'automatic_metrics' entries.
data = load_data()

Annotated data loaded


Some metrics cannot be aggregated by averaging sentence-level scores (e.g., BLEU). We want to exclude these from the analysis:

In [97]:
# For every (campaign, system), compare the mean of the sentence-level metric scores
# with the corpus-level score stored for the same metric. One row per system is
# collected, with one column per metric, for both the absolute and the relative gap.
mean_to_corpus_diffs, mean_to_corpus_pct_diffs = [], []

for campaign in data:
    for system in data[campaign]:
        system_data = data[campaign][system]
        human_data = system_data['hum_annotations']

        # Use the scores under 'hum_only_automatic_metrics'; fall back to
        # 'automatic_metrics' when that entry is an empty dict.
        auto_data = system_data['hum_only_automatic_metrics']
        if isinstance(auto_data, dict) and auto_data == {}:
            auto_data = system_data['automatic_metrics']
        if isinstance(auto_data, pd.DataFrame):
            # Take column 0, keyed by the 'Unnamed: 0' values, so the frame form ends up
            # as the same flat mapping the loop below iterates over.
            auto_data = auto_data.set_index('Unnamed: 0').to_dict()[0]

        metric_columns = [col for col in human_data if col.startswith("metric")]
        human_data_means = human_data[metric_columns].mean()

        mean_diffs, mean_pct_diffs = {}, {}
        for metric, corpus_val in auto_data.items():
            try:
                sentence_mean = human_data_means[f"metric_{metric}"]
            except KeyError:
                # No sentence-level column exists for this corpus-level metric.
                continue

            diff = sentence_mean - corpus_val
            mean_diffs[metric] = diff
            # A corpus score of 0 produces inf (or nan for 0/0) and used to emit a
            # RuntimeWarning on this line. The values themselves are kept unchanged:
            # an inf can never pass a "close to zero" filter, and pandas reductions
            # skip nan by default. Only the warning is silenced.
            with np.errstate(divide="ignore", invalid="ignore"):
                mean_pct_diffs[metric] = diff / corpus_val

        mean_to_corpus_diffs.append(mean_diffs)
        mean_to_corpus_pct_diffs.append(mean_pct_diffs)

mean_to_corpus_diffs = pd.DataFrame(mean_to_corpus_diffs)
mean_to_corpus_pct_diffs = pd.DataFrame(mean_to_corpus_pct_diffs)


  mean_pct_diffs[metric] = (sentence_mean - corpus_val) / corpus_val


In [None]:
# Keep only the metrics whose largest relative gap between the sentence-level mean and
# the corpus-level score is numerically zero (within 1e-5).
# Note: this rebinds `investigated_metrics`, which the first cell also imports from
# evaluation.SETTINGS.
max_pct_diffs = mean_to_corpus_pct_diffs.abs().max()
is_mean_aggregable = np.isclose(max_pct_diffs, 0, atol=1e-5)
investigated_metrics = max_pct_diffs.index[is_mean_aggregable].tolist()
investigated_metrics

# Relative differences expressed in percent. The original expression stopped after a
# trailing dot (a SyntaxError); describe() is used as a neutral per-metric summary.
# NOTE(review): swap in a different view if another inspection was intended.
(mean_to_corpus_pct_diffs * 100).describe()

Combine the human annotations from every campaign and system into one large DataFrame

In [116]:
# Stack the human annotation frames of every (campaign, system) into one DataFrame,
# tagging each row with the campaign and system it came from.
annotation_frames = []
for campaign in data:
    for system in data[campaign]:
        human_data = data[campaign][system]['hum_annotations']
        # assign() returns a new frame, so the frames held inside `data` are left
        # untouched and this cell can be re-run without changing the loaded data.
        annotation_frames.append(human_data.assign(campaign=campaign, system=system))

data_df = pd.concat(annotation_frames, ignore_index=True).drop(columns="Unnamed: 0")

In [117]:
METRIC = "Prism_ref" # just a test

# Sentence-level column holding the scores of the metric under test.
metric_col = f"metric_{METRIC}"

# For every pair of systems within a campaign, record the difference in mean metric
# score, the difference in mean human score, the two-sided Wilcoxon p-value of the
# paired human scores, and whether metric and human differences share the same sign.
results = []
for campaign in tqdm(data_df.campaign.unique()):
    campaign_df = data_df.loc[data_df.campaign == campaign]
    systems = campaign_df.system.unique()
    for system_a, system_b in combinations(systems, 2):
        # below pulled from the original analysis.py
        # Skip the pair unless both systems have every investigated metric in their
        # 'automatic_metrics' entry.
        auto_metrics_a = data[campaign][system_a]['automatic_metrics']
        auto_metrics_b = data[campaign][system_b]['automatic_metrics']
        if any(
            metric not in auto_metrics_a or metric not in auto_metrics_b
            for metric in investigated_metrics
        ):
            continue

        system_a_df = campaign_df.loc[campaign_df.system == system_a]
        system_b_df = campaign_df.loc[campaign_df.system == system_b]

        # get automated score diff
        # Computed once, and before get_valid() filters the rows. Previously this sat
        # inside a loop over investigated_metrics that ignored its loop variable, which
        # left auto_score_diff unbound (or stale) when that list was empty.
        auto_score_diff = system_a_df[metric_col].mean() - system_b_df[metric_col].mean()

        # get human score diff
        system_a_df, system_b_df = get_valid(system_a_df), get_valid(system_b_df)
        # NOTE(review): Series subtraction aligns on index labels, so this check and the
        # score differences below rely on get_valid / get_equal_pairs returning frames
        # whose indices line up row by row -- confirm against evaluation.tools.
        if len(system_a_df) != len(system_b_df) or sum(abs(system_a_df['SegmentID'] - system_b_df['SegmentID'])) != 0:
            system_a_df, system_b_df = get_equal_pairs(system_a_df, system_b_df)

            # double check that segment ids are equal
            if sum(abs(system_a_df['SegmentID']-system_b_df['SegmentID'])) != 0:
                raise ValueError("SegmentIDs are not equal")

        human_score_diff = system_a_df["Score"].mean() - system_b_df["Score"].mean()

        differences = (system_a_df["Score"] - system_b_df["Score"]).to_list()
        if np.sum(differences) == 0:
            # A zero net difference is treated as a repeated system and the pair is
            # dropped. NOTE(review): the sum can also cancel to zero for systems that
            # genuinely differ; kept as-is to stay comparable with earlier results.
            continue

        if len(differences) < 100:
            raise ValueError("There is too few lines, something is wrong!")

        # Only the p-value is used; the test statistic is discarded.
        _, p_value_twosided = wilcoxon(differences, alternative='two-sided')

        results.append({
            "campaign": campaign,
            "system_a": system_a,
            "system_b": system_b,
            "human_score_diff": human_score_diff,
            "human_p_value": p_value_twosided,
            "auto_score_diff": auto_score_diff,
            # True when the metric and the human judgement agree on which system is better.
            "accuracy": np.sign(auto_score_diff) == np.sign(human_score_diff)
        })

results = pd.DataFrame(results)


  6%|▌         | 96/1728 [00:15<04:26,  6.13it/s]

System appears to have been repeated, dropping


 14%|█▍        | 242/1728 [00:38<04:10,  5.94it/s]

System appears to have been repeated, dropping


 43%|████▎     | 749/1728 [02:07<02:28,  6.57it/s]

System appears to have been repeated, dropping


100%|██████████| 1728/1728 [04:53<00:00,  5.89it/s]


In [118]:
# Report, per significance level, how many pairs have a human p-value at or below it
# and how often the metric agrees with the human ranking on those pairs.
for alpha in (1, 0.05, 0.01, 0.001):
    is_significant = results.human_p_value <= alpha
    acc = results.loc[is_significant, "accuracy"]
    print(f"N: {len(acc)}, {alpha:0.03f}, {acc.mean():0.03f}")

N: 3344, 1.000, 0.806
N: 1717, 0.050, 0.945
N: 1420, 0.010, 0.970
N: 1176, 0.001, 0.983


In [36]:
import pickle

# Load the previously stored statistics from ./statistical_data.pickle. Later cells wrap
# the object in a DataFrame and read columns such as 'human1' and 'human_pvalue'.
# NOTE(security): pickle.load can execute arbitrary code while deserializing; only load
# this file if it comes from a trusted source.
with open("./statistical_data.pickle", "rb") as infile:
    statistical_data = pickle.load(infile)

In [67]:
# Tabulate the stored statistics and flag the rows where the human difference
# ('human1') and the METRIC difference have the same sign.
results_original = pd.DataFrame(statistical_data).assign(
    accuracy=lambda frame: np.sign(frame['human1']) == np.sign(frame[METRIC])
)

In [119]:
# Strip the ".xlsx" suffix from string values and rename the columns so the recomputed
# results can be lined up with the columns of results_original.
column_renames = {
    "system_a": "SystemAid",
    "system_b": "SystemBid",
    "human_p_value": "human_p_value_new",
    "accuracy": "accuracy_new",
}
results = results.replace("\\.xlsx", "", regex=True).rename(columns=column_renames)

In [120]:
# Join the stored and the recomputed results on the pair identifiers. The keys are
# spelled out so the merge fails loudly if `results` has not been renamed yet, instead
# of silently joining on whatever columns the two frames happen to share.
merge_keys = ['campaign', 'SystemAid', 'SystemBid']
results_merged = results_original[merge_keys + ['accuracy', 'human_pvalue', METRIC]].merge(
    results, on=merge_keys, how='inner'
)

In [121]:
# Absolute gap between the stored METRIC difference and the one recomputed in this notebook.
metric_gap = results_merged[METRIC] - results_merged['auto_score_diff']
results_merged['diff_of_diff'] = metric_gap.abs()

In [124]:
# Column-wise maxima of the merged frame; the 'diff_of_diff' entry is the largest gap
# between stored and recomputed metric differences. max() does not depend on row
# order, so the frame is no longer sorted first.
results_merged.max()

campaign             ffdcaad19e1e58a70563371fad02c141
SystemAid            f7e17531e5935e6d6be48f6f3fba5b00
SystemBid            ffd2d560c712ac3cb62dbca7675790f5
accuracy                                         True
human_pvalue                                      1.0
Prism_ref                                    1.099608
human_score_diff                            28.482105
human_p_value_new                                 1.0
auto_score_diff                              1.099608
accuracy_new                                     True
diff_of_diff                                      0.0
dtype: object

In [115]:

# Spot check: recompute the Prism_ref mean difference for one hard-coded system pair.
in_campaign = data_df["campaign"] == 'd38a89488e5ed4e0706a849a640a99a0'
campaign_df = data_df[in_campaign]
system_a_df = campaign_df[campaign_df["system"] == '13f1d5f1838280fa11793328b00eace6.xlsx']
system_b_df = campaign_df[campaign_df["system"] == 'b91985dbc06ea031b20b09a24d6c3d43.xlsx']

mean_a = system_a_df["metric_Prism_ref"].mean()
mean_b = system_b_df["metric_Prism_ref"].mean()
mean_a - mean_b

-0.004476666706314303

# Next steps
* Probably do all analyses with a single language pair, to start.
* Randomly subsample data at multiple sizes (1%, 5%, etc.). For each size, show variance across each "seed" pool (e.g., the initial set of campaigns)
    * Subsample by campaign, but otherwise maintain same strategy (basically what happened above with the reduced release). Equivalent to running fewer system pairs.
    * Subsample by campaign, but pool data and partition to create pseudo-systems.
    * Subsample by language pair: pooled dataset is partitioned to create pseudo-systems.
    * Subsample completely at random, across languages.
* Strategically subsample: pick campaigns such that spread is maximized/minimized across, say, geom mean of the different metric types
    * Strategically partition (?) by weighing (?) examples s.t. differences between pseudo-systems show large variation (not all "close" to one another)
* Redo the above for 'precision' (this table is essentially 'recall'): when a metric reports a difference, how often is there a significant human difference?