# Demo colab for mt_metrics_eval_custom

In [None]:
# Imports

import numpy as np
import scipy.stats
from mt_metrics_eval_custom import meta_info
from mt_metrics_eval_custom import data
from mt_metrics_eval_custom import stats

In [None]:
# Load every EvalSet listed in meta_info.DATA (takes about 40s).

all_evs = {}  # '<testset>/<lp>' -> EvalSet
for testset in meta_info.DATA:
  lang_pairs = meta_info.DATA[testset]
  for lp in lang_pairs:
    # NOTE(review): the positional True presumably asks EvalSet to also read
    # stored metric scores -- confirm against data.EvalSet's signature.
    evs = data.EvalSet(testset, lp, True)
    all_evs[f'{testset}/{lp}'] = evs


In [None]:
# Print a one-line summary per EvalSet: counts of segments, systems, metrics
# and references, plus the standard sys-level human score name and the
# standard reference name.

print(f'{"name":<20}  segs sys metrics gold  refs std')
for name, evs in all_evs.items():
  # Column widths in the row below must stay in sync with the header above.
  nsegs, nsys, nmetrics = (
      len(evs.src), len(evs.sys_names), len(evs.metric_basenames))
  gold = evs.StdHumanScoreName('sys')
  nrefs, std_ref = len(evs.ref_names), evs.std_ref

  print(f'{name:<20} {nsegs:5d} {nsys:3d} {nmetrics:7d} '
        f'{gold:5} {nrefs:4d} {std_ref}')

In [None]:
# Example: sys-level Pearson MQM correlations and significance matrix for
# wmt21.news en-de, with human outputs included in scoring and only primary
# metric submissions considered. Takes about 20s, spent mostly in the
# significance tests.

# Step 1: build a map from metric name to the 'Correlation' object returned by
# data.GetCorrelations; the later steps compute everything from these objects.
evs = all_evs['wmt21.news/en-de']
level = 'sys'
main_refs = {evs.std_ref}
gold_name = evs.StdHumanScoreName(level)
corrs = data.GetCorrelations(
    evs=evs, level=level, main_refs=main_refs, close_refs={'refB'},
    include_human=True, include_outliers=False, gold_name=gold_name,
    primary_metrics=True)

# Compute each metric's Pearson correlation (element [0] of Pearson()), order
# the metrics from highest to lowest value, and print the ranking.
pearsons = {m: corr.Pearson()[0] for m, corr in corrs.items()}
by_descending_value = sorted(pearsons.items(), key=lambda kv: -kv[1])
pearsons = dict(by_descending_value)  # Insertion order is now the ranking.
print('System-level +HT Pearson correlations for wmt21.news en-de:')
for m in pearsons:
  print(f'{m:<21} {pearsons[m]: f}')
print()

# Compute and print the significance matrix.
# sig_matrix[i, j] (for j > i) holds the value returned by
# stats.PermutationSigDiff when comparing metric i against the lower-ranked
# metric j; entries on and below the diagonal are left at 0.
ranked_metrics = list(pearsons)
n = len(ranked_metrics)
sig_matrix = np.zeros((n, n))
for i, name_i in enumerate(ranked_metrics):
  corr1 = corrs[name_i]
  pearson_fcn = corr1.GenCorrFunction(scipy.stats.pearsonr, averaged=False)
  for j, name_j in enumerate(ranked_metrics[i + 1:], start=i + 1):
    corr2 = corrs[name_j]
    sig_matrix[i, j] = stats.PermutationSigDiff(corr2, corr1, pearson_fcn)

# One row per metric: '>' where the stored value is below 0.05, '=' otherwise.
# NOTE(review): the stored values are presumably p-values -- confirm against
# stats.PermutationSigDiff.
print('Significant differences in Pearson correlation:')
for i in range(n):
  row_values = sig_matrix[i, i + 1:]
  better = ['>' if value < 0.05 else '=' for value in row_values]
  print(f'{ranked_metrics[i]:<22} {" ".join(better)}')
