In [19]:
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.metrics import matthews_corrcoef

## Load COMETKiwi 2022 model

In [20]:
# Log in to the Hugging Face Hub; `notebook_login()` shows an interactive login
# widget in the cell output.
# NOTE(review): presumably required so that the model download in the following
# cell is authorised — confirm that the checkpoint is access-restricted.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [21]:
from comet import download_model, load_from_checkpoint

# Fetch the "Unbabel/wmt22-cometkiwi-da" checkpoint; the value returned by
# `download_model` is then handed to `load_from_checkpoint` to build the model.
model_path = download_model("Unbabel/wmt22-cometkiwi-da")
model = load_from_checkpoint(model_path)


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Lightning automatically upgraded your loaded checkpoint from v1.8.2 to v1.9.5. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint --file ../../../.cache/huggingface/hub/models--Unbabel--wmt22-cometkiwi-da/snapshots/b3a8aea5a5fc22db68a554b92b3d96eb6ea75cc9/checkpoints/model.ckpt`
Encoder model frozen.


## Load data

In [None]:
# Parent directory of the `ARC-MTQE` checkout.
# Set the `ARC_MTQE_MAIN_DIR` environment variable to point at your own location
# so the notebook runs without editing this cell.
# TODO: if the variable is not set, the fallback below (the original author's
# local path) must be changed to the parent directory of `ARC-MTQE`.
main_dir = os.environ.get("ARC_MTQE_MAIN_DIR", "/Users/rjersakova/Documents/")

In [None]:
# Language pairs to make predictions for. Each code is used verbatim when
# building the input, gold-label and output file names.
language_pairs = [
    "encs",
    "ende",
    "enja",
    "enzh",
]

In [32]:
# Load the blind test data and the ground truth labels for every language pair.
# After this cell `all_data[lp]` holds:
#   "data"      - list of {"src": ..., "mt": ...} dicts (COMETKiwi input style)
#   "df_data"   - DataFrame with columns idx, source, target
#   "df_labels" - DataFrame with columns lang_pair, ref, idx, label, error
data_dir = os.path.join(main_dir, "ARC-MTQE", "mlqe-pe", "data")

all_data = {}

for lp in language_pairs:
    path_data = os.path.join(data_dir, "catastrophic_errors", f"{lp}_majority_test_blind.tsv")
    path_labels = os.path.join(data_dir, "catastrophic_errors_goldlabels", f"{lp}_majority_test_goldlabels", "goldlabels.txt")

    # the file is read as tab-separated without a header row, so column names are supplied here
    df_data = pd.read_csv(path_data, sep='\t', header=None, names=["idx", "source", "target"])

    # format data into COMETKiwi input style: [{"src":"...", "mt":"..."}, {...}]
    # (one vectorised call instead of appending row by row with `iterrows`)
    data = (
        df_data[["source", "target"]]
        .rename(columns={"source": "src", "target": "mt"})
        .to_dict(orient="records")
    )

    # ground truth labels
    df_labels = pd.read_csv(path_labels, sep='\t', header=None, names=["lang_pair", "ref", "idx", "label"])
    # ERROR = 1, NOT = 0 (every label other than the string "NOT" is mapped to 1)
    df_labels["error"] = np.where(df_labels["label"]=="NOT", 0, 1)

    all_data[lp] = {"data": data, "df_data": df_data, "df_labels": df_labels}

## Make predictions

In [34]:
# Run the model on every language pair and store the returned scores alongside
# the inputs as a new `comet_score` column of that pair's `df_data`.
for lp in language_pairs:
    lp_entry = all_data[lp]
    model_output = model.predict(lp_entry["data"], batch_size=8, gpus=0)
    lp_entry["df_data"]['comet_score'] = model_output.scores

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Predicting DataLoader 0: 100%|██████████| 125/125 [02:58<00:00,  1.43s/it]
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Predicting DataLoader 0: 100%|██████████| 125/125 [03:03<00:00,  1.46s/it]
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Predicting DataLoader 0: 100%|██████████| 125/125 [05:20<00:00,  2.56s/it]
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Predicting DataLoader 0: 100%|██████████| 125/125 [03:41<00:00,  1.77s/it]


## Evaluate

In [38]:
# define range of threshold values (0.1, 0.2, ..., 0.9) for turning COMETKiwi
# output into binary labels.
# Dividing an integer range by 10 gives the closest float to each decimal value;
# stepping a float range by 0.1 accumulates rounding error and yields thresholds
# such as 0.30000000000000004.
thresholds = np.arange(1, 10) / 10

In [39]:
# directory the results are saved to; `exist_ok=True` makes this cell safe to
# re-run, and `makedirs` also creates any missing parent directory
outdir = os.path.join(main_dir, "ARC-MTQE", "results")
os.makedirs(outdir, exist_ok=True)

In [44]:
# For each language pair: join the scores with the gold labels, save them to CSV
# and print the Matthews correlation coefficient at every threshold.
for lp in language_pairs:

    print("\n", lp)

    lp_data = all_data[lp]

    # join on `idx` so that every score is paired with the gold label of the same row
    # (possibly redundant; the comet scores could instead be stored in df_labels directly)
    df_results = lp_data["df_data"].merge(lp_data["df_labels"], on="idx")
    y_true = df_results["error"]

    # save to file
    predictions_path = os.path.join(outdir, f"{lp}_predictions.csv")
    df_results[["idx", "comet_score", "label", "error"]].to_csv(predictions_path, index=False)

    # binarise the COMETKiwi scores at each threshold and compare with the gold labels
    for t in thresholds:

        # 1 indicates ERROR --> scores at or below the threshold are flagged as errors
        y_hat = df_results['comet_score'].le(t).astype(int)

        print(t, matthews_corrcoef(y_true, y_hat))


 encs
0.1 0.0
0.2 0.0
0.30000000000000004 0.1136298865897307
0.4 0.17361933066582394
0.5 0.2831032951475931
0.6 0.3697324982623221
0.7000000000000001 0.3637324741187898
0.8 0.3030024569183256
0.9 0.015273470515558312

 ende
0.1 0.0
0.2 0.0
0.30000000000000004 0.0862495427090441
0.4 0.16654776089680248
0.5 0.30371715143564143
0.6 0.3080711762589512
0.7000000000000001 0.3318425097976723
0.8 0.2844062103441502
0.9 0.0

 enja
0.1 0.0
0.2 0.0
0.30000000000000004 0.0
0.4 -0.00945589936331561
0.5 0.08305920261797595
0.6 0.1465644355209068
0.7000000000000001 0.1361687054139401
0.8 0.18906994821125991
0.9 0.018940258936452087

 enzh
0.1 0.0
0.2 0.0
0.30000000000000004 0.0
0.4 0.07303730808582047
0.5 0.10333648087524111
0.6 0.1891758611977713
0.7000000000000001 0.17787309242027585
0.8 0.19351236731818536
0.9 0.0
