## Evaluation of training-free method results

In [1]:
!pip install unbabel-comet
from comet import download_model, load_from_checkpoint



In [2]:
# Log in to the Hugging Face Hub; the call renders an interactive widget in the
# cell output.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
from google.colab import files
import io
import math
import torch
import pandas as pd

uploaded = files.upload()   # <- will open a file picker dialog

# `uploaded` is indexed by file name below and yields the file's bytes. It is
# presumably empty when the dialog is dismissed without choosing a file; fail with
# a readable message instead of a bare StopIteration from next().
if not uploaded:
    raise RuntimeError("No file was uploaded. Re-run this cell and pick the results CSV.")

# Get the name of the uploaded file (the first one if several were selected)
filename = next(iter(uploaded))
print("Uploaded file:", filename)

# Parse the uploaded bytes directly and preview the first rows.
df = pd.read_csv(io.BytesIO(uploaded[filename]))
df.head()


Saving hindi-english_idioms_results.csv to hindi-english_idioms_results.csv
Uploaded file: hindi-english_idioms_results.csv


Unnamed: 0,src,explanation_used,literal_used,final_translation
0,दिल छोटा मत करो,don’t lose hope,don’t make the heart small,Don't lose heart.
1,अंधे की लाठी छीनना,to exploit the helpless,to snatch the blind man’s staff,To prey on the helpless.
2,जुबान की मिठास रखना,to speak kindly,to keep sweetness of the tongue,Use kind words.
3,दिल का रोग,heartache/grief,disease of the heart,heartache
4,गला सूख जाना,to feel nervous or scared,the throat to dry up,My throat went dry.


In [4]:
# ==========================
# CONFIG
# ==========================
# CSV to score. `filename` is assigned by the upload cell, so that cell must run first.
INPUT_CSV = filename   # from your upload cell
# Path the scored CSV is written to.
# NOTE(review): the name says "chinese_english" while the run recorded in this notebook
# scores a hindi-english file — confirm the intended name. If you rename it, make sure
# the download cell still points at the same file.
OUTPUT_CSV = "evaluated_chinese_english_results.csv"
# Batch size handed to both COMET predict() calls.
BATCH_SIZE = 8         # safer for Colab Lightning; adjust if needed

import math
import torch
import pandas as pd
from comet import download_model, load_from_checkpoint

# ==========================
# HELPERS
# ==========================
def _nonempty(x):
    return isinstance(x, str) and len(x.strip()) > 0

def _fmt(x):
    return "nan" if (not isinstance(x, (int,float)) or math.isnan(x)) else f"{x:.3f}"


# ==========================
# LOAD DATA
# ==========================
# Read the input CSV named by INPUT_CSV.
df = pd.read_csv(INPUT_CSV)

# Fail fast, listing every text column the scoring steps need but the CSV lacks.
required_cols = ["src", "explanation_used", "literal_used", "final_translation"]
missing = []
for c in required_cols:
    if c not in df.columns:
        missing.append(c)
if len(missing) > 0:
    raise ValueError(f"Missing required columns: {missing}")

# Add a NaN-filled placeholder for each score column the CSV does not already
# carry; columns that are already present are left untouched.
for col in (
    "QE_final", "QE_literal", "QE_explanation",
    "DA_final", "DA_literal", "DA_explanation",
    "QE_final_gt_literal", "DA_final_gt_literal",
):
    if col in df.columns:
        continue
    df[col] = float("nan")


# ==========================
# LOAD COMET MODELS
# ==========================
# Use the GPU whenever torch reports one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print("Using device:", DEVICE)

# QE checkpoint: download, load, switch to eval mode, move to DEVICE.
qe_checkpoint_path = download_model("Unbabel/wmt20-comet-qe-da")
qe_model = load_from_checkpoint(qe_checkpoint_path)
qe_model = qe_model.eval().to(DEVICE)

# DA checkpoint: same steps.
da_checkpoint_path = download_model("Unbabel/wmt22-comet-da")
da_model = load_from_checkpoint(da_checkpoint_path)
da_model = da_model.eval().to(DEVICE)


# ==========================
# BUILD QE REQUESTS
# ==========================
# qe_items holds one {"src", "mt"} dict per hypothesis to score; qe_meta holds, at the
# same position, where the resulting score belongs in df.
qe_items = []
qe_meta = []   # pairs: (row index, score column)

# Score column -> text column whose content is scored as the hypothesis ("mt").
LABELS_QE = {
    "QE_final": "final_translation",
    "QE_literal": "literal_used",
    "QE_explanation": "explanation_used",
}

for idx, row in df.iterrows():
    src = row["src"]
    # An empty CSV cell is read by pandas as float NaN. Skip rows without a usable
    # source so that only strings are handed to the model; their QE columns stay NaN.
    if not _nonempty(src):
        continue
    for score_col, text_col in LABELS_QE.items():
        hyp = row.get(text_col, "")
        # Blank or missing hypotheses are not scored; their cell stays NaN.
        if _nonempty(hyp):
            qe_items.append({"src": src, "mt": hyp})
            qe_meta.append((idx, score_col))


# ==========================
# RUN QE (ONE SINGLE CALL)
# ==========================
print("Running COMET-QE...")
if qe_items:
    # All requests go through one predict() call; BATCH_SIZE is passed as its batch size.
    with torch.no_grad():
        qe_out = qe_model.predict(
            qe_items,
            batch_size=BATCH_SIZE,
            gpus=1 if DEVICE == "cuda" else 0,
            progress_bar=False,
        )
    qe_scores = qe_out["scores"]
else:
    # Nothing to score (e.g. every row was blank): do not call the model at all.
    qe_out = None
    qe_scores = []
    print("No QE requests were built; QE columns stay NaN.")

# zip() silently stops at the shorter sequence, which would leave some cells unscored
# without any warning. Make a count mismatch loud instead.
if len(qe_scores) != len(qe_meta):
    raise RuntimeError(
        f"COMET-QE returned {len(qe_scores)} scores for {len(qe_meta)} requests"
    )

# Write each score back to the (row, column) recorded when its request was built.
for (row_idx, score_col), score in zip(qe_meta, qe_scores):
    df.at[row_idx, score_col] = float(score)


# ==========================
# BUILD DA REQUESTS
# ==========================
# da_items holds one {"src", "mt", "ref"} dict per hypothesis to score; da_meta holds,
# at the same position, where the resulting score belongs in df.
da_items = []
da_meta = []

# Score column -> text column whose content is scored as the hypothesis ("mt").
# NOTE(review): the reference below is always `explanation_used`, so DA_explanation
# scores the reference against itself. Confirm that this is an intended ceiling /
# sanity value rather than a meaningful result.
LABELS_DA = {
    "DA_final": "final_translation",
    "DA_literal": "literal_used",
    "DA_explanation": "explanation_used",
}

for idx, row in df.iterrows():
    src = row["src"]
    ref = row["explanation_used"]
    # An empty CSV cell is read by pandas as float NaN. A row needs both a usable
    # source and a usable reference to be scored; otherwise its DA columns stay NaN.
    if not _nonempty(src) or not _nonempty(ref):
        continue
    for score_col, text_col in LABELS_DA.items():
        hyp = row.get(text_col, "")
        # Blank or missing hypotheses are not scored; their cell stays NaN.
        if _nonempty(hyp):
            da_items.append({"src": src, "mt": hyp, "ref": ref})
            da_meta.append((idx, score_col))


# ==========================
# RUN DA (ONE SINGLE CALL)
# ==========================
print("Running COMET-DA...")
if da_items:
    # All requests go through one predict() call; BATCH_SIZE is passed as its batch size.
    with torch.no_grad():
        da_out = da_model.predict(
            da_items,
            batch_size=BATCH_SIZE,
            gpus=1 if DEVICE == "cuda" else 0,
            progress_bar=False,
        )
    da_scores = da_out["scores"]
else:
    # Nothing to score (e.g. no row had a reference): do not call the model at all.
    da_out = None
    da_scores = []
    print("No DA requests were built; DA columns stay NaN.")

# zip() silently stops at the shorter sequence, which would leave some cells unscored
# without any warning. Make a count mismatch loud instead.
if len(da_scores) != len(da_meta):
    raise RuntimeError(
        f"COMET-DA returned {len(da_scores)} scores for {len(da_meta)} requests"
    )

# Write each score back to the (row, column) recorded when its request was built.
for (row_idx, score_col), score in zip(da_meta, da_scores):
    df.at[row_idx, score_col] = float(score)


# ==========================
# SIMPLE COMPARISONS
# ==========================
# For each metric family, flag rows where the final translation outscored the literal
# one: 1.0 if it did, 0.0 if it did not, NaN when either score is missing.
# Column-wise operations replace the per-row iterrows()/df.at loop; the values written
# are the same.
for metric in ("QE", "DA"):
    final_scores = df[f"{metric}_final"]
    literal_scores = df[f"{metric}_literal"]
    both_scored = final_scores.notna() & literal_scores.notna()
    # A comparison against NaN is False, so mask those rows back to NaN explicitly.
    df[f"{metric}_final_gt_literal"] = (
        (final_scores > literal_scores).astype(float).where(both_scored)
    )


# ==========================
# SAVE RESULT
# ==========================
# Write the scored frame to OUTPUT_CSV without the pandas index column.
df.to_csv(OUTPUT_CSV, index=False)
print("Saved:", OUTPUT_CSV)

# Last expression of the cell: preview the first rows, including the score columns.
df.head()


Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.3.5 to v2.5.6. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/huggingface/hub/models--Unbabel--wmt20-comet-qe-da/snapshots/2e7ffc84fb67d99cf92506611766463bb9230cfb/checkpoints/model.ckpt`
/usr/local/lib/python3.12/dist-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.5.6. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/2760a223ac957f30acfb18c8aa649b01cf1d75f2/checkpoints/model.ckpt`
INFO:pytorch_lightning.utilities.rank_zero:üí° Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('med

Running COMET-QE...


INFO:pytorch_lightning.utilities.rank_zero:üí° Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Running COMET-DA...
Saved: evaluated_chinese_english_results.csv


Unnamed: 0,src,explanation_used,literal_used,final_translation,QE_final,QE_literal,QE_explanation,DA_final,DA_literal,DA_explanation,QE_final_gt_literal,DA_final_gt_literal
0,दिल छोटा मत करो,don’t lose hope,don’t make the heart small,Don't lose heart.,0.164334,-0.013064,0.142702,0.914846,0.614163,0.98692,1.0,1.0
1,अंधे की लाठी छीनना,to exploit the helpless,to snatch the blind man’s staff,To prey on the helpless.,-0.305773,-0.499659,-0.152988,0.826452,0.480772,0.970427,1.0,1.0
2,जुबान की मिठास रखना,to speak kindly,to keep sweetness of the tongue,Use kind words.,0.164411,-0.075051,-0.167649,0.806217,0.477788,0.975221,1.0,1.0
3,दिल का रोग,heartache/grief,disease of the heart,heartache,0.028593,0.606006,0.113203,0.824467,0.530034,0.988624,0.0,1.0
4,गला सूख जाना,to feel nervous or scared,the throat to dry up,My throat went dry.,0.345691,0.230165,-0.703605,0.475933,0.440292,0.974721,1.0,1.0


In [5]:
from google.colab import files

# Download the file written by the evaluation cell. Reusing OUTPUT_CSV (set in the
# config section of that cell) keeps this cell in sync with the configured name
# instead of repeating the literal file name.
files.download(OUTPUT_CSV)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>