In [None]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
import yaml
from matplotlib import pyplot as plt

In [None]:
# Categories as specified in the dataset are different to the paper
# This gives the mapping between them
with open("../configs/demetr/cat_correction.yaml") as stream:
    # A YAMLError is deliberately left to propagate: printing it and carrying on
    # would leave `cat_correction` undefined and only fail later with a NameError.
    cat_correction = yaml.safe_load(stream)

In [None]:
# Map categories to error severity
# Severity is as specified in the paper
with open("../configs/demetr/cat_severity.yaml") as stream:
    # A YAMLError is deliberately left to propagate: printing it and carrying on
    # would leave `cat_severity` undefined and only fail later with a NameError.
    cat_severity = yaml.safe_load(stream)

### Load results from DEMETR paper

In [None]:
# Path to the tidy-format CSV holding the results reported in the DEMETR paper
demetr_results = "../data/demetr_paper_results_tidy.csv"

In [None]:
# Load the paper's results; later cells read its `metric`, `category` and `accuracy` columns
demetr_df = pd.read_csv(demetr_results)

In [None]:
# Paper results for the two baseline metrics, selected by exact metric name
is_bleu = demetr_df["metric"] == "Bleu"
is_comet = demetr_df["metric"] == "Comet"
bleu_baseline = demetr_df.loc[is_bleu]
comet_baseline = demetr_df.loc[is_comet]

In [None]:
# Sort and sanity check
# We should have 35 categories
bleu_baseline = bleu_baseline.sort_values(by="category")
comet_baseline = comet_baseline.sort_values(by="category")

# Row counts: Bleu first, then Comet
for baseline in (bleu_baseline, comet_baseline):
    print(baseline.shape[0])

### Load M4ST results

In [None]:
# Directory holding the M4ST DEMETR result files
m4st_res_dir = "../outputs/demetr"

In [None]:
# Names of every entry in the results directory (os.listdir returns sub-directories as well as files)
res_files = os.listdir(m4st_res_dir)

In [None]:
# Read all files into a single dataframe
# File names are underscore-separated: the first token is the metric name and the
# DEMETR category is encoded as an "id<N>" token,
# e.g. "BLASER_Ref_critical_id8_negation.json" -> metric "BLASER", category 8.
results_dataframes = []

# Sorted so the row order of the combined frame does not depend on the file system
for res_file in sorted(res_files):
    res_path = os.path.join(m4st_res_dir, res_file)
    # Skip sub-directories (other cells write into "all/" and "plots/" under this
    # directory) and any stray non-JSON file
    if not (os.path.isfile(res_path) and res_file.endswith(".json")):
        continue

    name_tokens = res_file[: -len(".json")].split("_")
    # Require "id" followed only by digits, so a token that merely contains the
    # letters "id" is never mistaken for the category
    id_tokens = [
        tok for tok in name_tokens if tok.startswith("id") and tok[2:].isdecimal()
    ]
    if not id_tokens:
        print(f"Skipping {res_file}: no 'id<N>' token in file name")
        continue

    # The JSON is read with one column per sentence, so transpose to get one row per sentence
    res_df = pd.read_json(res_path).T
    res_df["metric"] = name_tokens[0]
    res_df["sentence_id"] = res_df.index
    res_df["category"] = int(id_tokens[0][2:])
    results_dataframes.append(res_df)

# The per-file indices are kept, so index labels repeat across files
all_res = pd.concat(results_dataframes)

In [None]:
# Peek at a single row to check the columns of the combined frame
all_res.head(1)

In [None]:
# Correct categories to align with the paper
# NOTE: this overwrites `category` in place, so re-running the cell would apply the
# mapping a second time to already-corrected values
all_res["category"] = all_res["category"].replace(cat_correction)

In [None]:
# Add column indicating DEMETR accuracy
# A row is correct when mt_score is strictly greater than disfluent_score
mt_scores = all_res["mt_score"]
disfluent_scores = all_res["disfluent_score"]
all_res["correct"] = mt_scores > disfluent_scores

In [None]:
# Accuracy is reversed for category 35 (reference as translation) so need to adjust that
# Take an explicit copy: the following cell assigns a column on this frame, and doing
# that on a slice of all_res triggers pandas' SettingWithCopyWarning.
cat_to_rev = all_res.loc[all_res["category"] == 35].copy()
cat_to_rev

In [None]:
# For the reversed category a row is correct when mt_score is strictly lower than disfluent_score
rev_mt_scores = cat_to_rev["mt_score"]
rev_disfluent_scores = cat_to_rev["disfluent_score"]
cat_to_rev["correct"] = rev_mt_scores < rev_disfluent_scores
cat_to_rev

In [None]:
# Reassign values to original dataframe
# all_res is a concatenation of per-file frames, so its index labels repeat.
# cat_to_rev was selected with this same mask, so its rows are in the same order;
# assigning the raw values positionally avoids label alignment on a non-unique index.
is_cat_35 = all_res["category"] == 35
all_res.loc[is_cat_35, "correct"] = cat_to_rev["correct"].to_numpy()

In [None]:
# Check result
# Shows the category-35 rows after `correct` has been overwritten
all_res.loc[all_res["category"] == 35]

In [None]:
# Add column for severity
# Looks each category up in cat_severity; assuming that is a plain dict, categories
# missing from it become NaN
all_res["severity"] = all_res["category"].map(cat_severity)

In [None]:
# Peek at the frame now that `correct` and `severity` have been added
all_res.head(5)

In [None]:
# Save the combined results; the index is dropped (sentence IDs are also stored in
# the `sentence_id` column)
all_res_csv = "../outputs/demetr/all/all.csv"
# Create the target directory first so a fresh checkout does not fail here
os.makedirs(os.path.dirname(all_res_csv), exist_ok=True)
all_res.to_csv(all_res_csv, index=False)

In [None]:
# TODO: error bars

# Mean of `correct` per source language, pooled over every metric and category
fig, axs = plt.subplots()
by_language = all_res.groupby("source_language")["correct"].mean()
axs.plot(by_language, "x")
# One tick per language actually present, rather than assuming there are exactly 10
plt.xticks(np.arange(len(by_language)), by_language.index, rotation=45)
# The plotted values are fractions in [0, 1], so the axis is not labelled as a percentage
plt.ylabel("DEMETR accuracy")
plt.xlabel("Source language")
plt.title("Mean performance across all 35 categories")

In [None]:
# Mean of `correct` at each severity level, pooled over every metric and language
fig, axs = plt.subplots()
by_severity = all_res.groupby("severity")["correct"].mean()
# Draw on the axes created above explicitly instead of relying on the current axes
by_severity.plot(kind="bar", ax=axs)
# One tick per severity level actually present, rather than assuming there are exactly 4
plt.xticks(np.arange(len(by_severity)), by_severity.index, rotation=0)
plt.ylabel("DEMETR accuracy")
plt.xlabel("Severity")
plt.title("Mean performance for each error type")

In [None]:
# Probably skip in report
# Grouped bars: one group per source language, one bar per severity level
fig, axs = plt.subplots()
sev_by_lang = all_res.groupby(["source_language", "severity"])["correct"].mean()
sev_by_lang.unstack().plot.bar(ax=axs)
plt.xticks(rotation=45)
axs.set_ylabel("DEMETR accuracy")
axs.set_xlabel("Source language")
axs.set_title("Mean performance for each severity level, by language")
# Anchor the legend outside the right-hand edge of the axes
axs.legend(loc="right", bbox_to_anchor=(1.25, 0.5))

In [None]:
# Tidy up metric naming (COMET variants, BLEU and BLASER) for display
metric_display_names = {
    "wmt22-comet-da": "wmt22-COMET",
    "wmt22-cometkiwi-da": "wmt22-COMETKiwi",
    "Bleu": "BLEU",
    "BLASER": "BLASER-2",
}
all_res["metric"] = all_res["metric"].replace(metric_display_names)

In [None]:
# TODO: Extract this as a table

# Grouped bars: one group per source language, one bar per metric
fig, axs = plt.subplots()
sev_by_lang = all_res.groupby(["source_language", "metric"])["correct"].mean()
sev_by_lang.unstack().plot.bar(ax=axs)
plt.xticks(rotation=45)
axs.set_ylabel("DEMETR accuracy")
axs.set_xlabel("Source language")
axs.set_title("Mean performance for each metric, by language")
# Anchor the legend outside the right-hand edge of the axes
axs.legend(loc="right", bbox_to_anchor=(1.4, 0.5))

In [None]:
# Flatten the grouped means from the previous cell into ordinary columns
to_table = sev_by_lang.reset_index()

In [None]:
# Same per-language breakdown with the BLASER-2 and ChrF columns removed
fig, axs = plt.subplots()
sev_by_lang = all_res.groupby(["source_language", "metric"])["correct"].mean()
metric_columns = sev_by_lang.unstack()
metric_columns = metric_columns.drop(columns=["BLASER-2", "ChrF2", "ChrF1"])
metric_columns.plot.bar(ax=axs)
plt.xticks(rotation=45)
axs.set_ylabel("DEMETR accuracy")
axs.set_xlabel("Source language")
axs.set_title("Mean performance for COMET metrics, by language")
# Anchor the legend outside the right-hand edge of the axes
axs.legend(loc="right", bbox_to_anchor=(1.4, 0.5))

In [None]:
# Overall mean of `correct` per metric, highest first
fig, axs = plt.subplots()
overall_mean = all_res.groupby(["metric"])["correct"].mean()
sorted_overall_mean = overall_mean.sort_values(ascending=False)
axs.plot(sorted_overall_mean, "x")
plt.xticks(rotation=30)
axs.set_xlabel("Metric")
axs.set_ylabel("Accuracy")
# plt.title("Mean performance across all languages")

In [None]:
# Display the combined results frame
all_res

In [None]:
# Display the combined results frame (same expression as the previous cell)
all_res

In [None]:
# Mean of `correct` for every (metric, category) pair, as a flat frame
mean_correct = all_res.groupby(["metric", "category"])["correct"].mean()
corr_by_category = mean_correct.reset_index()
corr_by_category

In [None]:
# Per-metric medians, sorted by median `correct` in ascending order
grouped = corr_by_category.groupby("metric").median().sort_values(by="correct")

In [None]:
# Box plot of per-category accuracy for each metric; boxes follow the order of
# `grouped`, i.e. ascending median accuracy
fig, axs = plt.subplots()
g = sns.boxplot(
    corr_by_category,
    x="metric",
    y="correct",
    fill=False,
    ax=axs,
    width=0.5,
    order=grouped.index,
)
# Rotate the tick labels directly instead of re-setting them from get_xticklabels()
axs.tick_params(axis="x", labelrotation=30)
axs.set_xlabel("Metric")
axs.set_ylabel("Accuracy")
plt.tight_layout()
boxplot_path = "../outputs/demetr/plots/metrics-boxplot.png"
# Create the plots directory first so saving does not fail on a fresh checkout
os.makedirs(os.path.dirname(boxplot_path), exist_ok=True)
plt.savefig(boxplot_path)

In [None]:
# What trend would be desirable here? Include in report

# Grouped bars: one group per metric, one bar per severity level
fig, axs = plt.subplots()
sev_by_lang = all_res.groupby(["metric", "severity"])["correct"].mean()
sev_by_lang.unstack().plot(kind="bar", ax=axs)
plt.xticks(rotation=30)
plt.ylabel("DEMETR accuracy")
plt.xlabel("Metric")
# plt.title("Mean performance for each severity level by metric")
plt.legend(loc="right", bbox_to_anchor=(1.25, 0.5))
plt.tight_layout()
severity_plot_path = "../outputs/demetr/plots/demetr-by-severity.png"
# Create the plots directory first so saving does not fail on a fresh checkout
os.makedirs(os.path.dirname(severity_plot_path), exist_ok=True)
plt.savefig(severity_plot_path)

In [None]:
# Grouped bars: one group per metric, one bar per severity level
# NOTE(review): the commented-out title mentions COMET metrics, but nothing here
# filters the metrics — confirm whether a subset was intended
fig, axs = plt.subplots()
sev_by_lang = all_res.groupby(["metric", "severity"])["correct"].mean()
sev_by_lang.unstack().plot.bar(ax=axs)
plt.xticks(rotation=30)
axs.set_ylabel("DEMETR accuracy")
axs.set_xlabel("Metric")
# plt.title("Mean performance by severity for COMET metrics")
axs.legend(loc="right", bbox_to_anchor=(1.23, 0.5))

### Comparison between M4ST and original paper

In [None]:
# Per-category accuracy (as a percentage) for the metrics compared with the paper.
# An earlier cell renames "BLASER" -> "BLASER-2" and "Bleu" -> "BLEU" in all_res, so
# matching only the original spellings would select no rows when the notebook is run
# top to bottom; both spellings are accepted here.
m4st_blaser = (
    all_res[all_res.metric.isin(["BLASER", "BLASER-2"])]
    .groupby("category")
    .correct.mean()
    * 100
)
m4st_comet = all_res[all_res.metric == "COMET"].groupby("category").correct.mean() * 100
m4st_bleu = (
    all_res[all_res.metric.isin(["Bleu", "BLEU"])].groupby("category").correct.mean()
    * 100
)

In [None]:
# Element-wise BLASER minus COMET, against the paper's values and against our own.
# The subtraction is positional, so both sides must be in the same category order.
blaser_pct = np.array(m4st_blaser)
diff_paper = blaser_pct - np.array(comet_baseline.accuracy)
diff_new = blaser_pct - np.array(m4st_comet)

In [None]:
fig, axs = plt.subplots(1, 2, figsize=(10, 5))
category_ids = list(range(1, 36))

comet_panels = [
    (axs[0], diff_paper, "BLASER-2 vs. COMET (Baseline)"),
    # TODO: separate this plot so that we compare BLASER-2 vs the best COMET metric
    (axs[1], diff_new, "BLASER-2 vs. COMET (M4ST)"),
]
# Both panels share the same layout: black dots per category plus a dashed zero line
for panel_ax, panel_diff, panel_title in comet_panels:
    panel_ax.plot(category_ids, panel_diff, "ko")
    panel_ax.axhline(0, linestyle="--", c="r")
    panel_ax.set_title(panel_title)
    panel_ax.set_ylabel("Difference (BLASER-2 - COMET)")
    panel_ax.set_xlabel("DEMETR category")

In [None]:
# Element-wise BLASER minus Bleu, against the paper's values and against our own.
# The subtraction is positional, so both sides must be in the same category order.
blaser_pct = np.array(m4st_blaser)
diff_paper = blaser_pct - np.array(bleu_baseline.accuracy)
diff_new = blaser_pct - np.array(m4st_bleu)

In [None]:
fig, axs = plt.subplots(1, 2, figsize=(10, 5))
category_ids = list(range(1, 36))

bleu_panels = [
    (axs[0], diff_paper, "BLASER-2 vs. Bleu (Baseline)"),
    (axs[1], diff_new, "BLASER-2 vs. Bleu (M4ST)"),
]
# Both panels share the same layout: black dots per category plus a dashed zero line
for panel_ax, panel_diff, panel_title in bleu_panels:
    panel_ax.plot(category_ids, panel_diff, "ko")
    panel_ax.axhline(0, linestyle="--", c="r")
    panel_ax.set_title(panel_title)
    panel_ax.set_ylabel("Difference (BLASER-2 - Bleu)")
    panel_ax.set_xlabel("DEMETR category")

### Compare BLASER to Bleu/COMET

In [None]:
# Peek at a single row of the paper's results
demetr_df.head(1)

In [None]:
# Rename the paper's metric labels so they can be matched against all_res
# NOTE(review): an earlier cell renames this notebook's "Bleu" results to "BLEU", while
# the paper's "Bleu" is left unchanged here, so the BLEU rows of the two frames will not
# share a name — confirm whether that is intended
demetr_df["metric"] = demetr_df.metric.replace({"Comet": "COMET", "ChrF": "ChrF1"})

In [None]:
# Distinct metric names in our results
np.unique(all_res.metric)

In [None]:
# Distinct metric names in the paper's results, for comparison with the cell above
np.unique(demetr_df.metric)

In [None]:
# Subset DEMETR paper results to match the metrics I used
# Matching is by exact metric name
m4st_metric_names = np.unique(all_res.metric)
in_m4st = demetr_df.metric.isin(m4st_metric_names)
demetr_df_match = demetr_df[in_m4st]

In [None]:
# Display the paper rows kept for the comparison
demetr_df_match

In [None]:
# Mean of `correct` for every (category, metric) pair, as a flat frame
m4st_by_cat = (
    all_res[["metric", "category", "correct"]]
    .groupby(["category", "metric"])
    .correct.mean()
    .reset_index()
)

In [None]:
# Display the per-category, per-metric means
m4st_by_cat

In [None]:
# Adjust percentage
# Scales the mean of `correct` by 100; the column is overwritten in place, so
# re-running this cell multiplies by 100 again
m4st_by_cat["correct"] = m4st_by_cat["correct"] * 100

In [None]:
# Add column for plotting by source
# demetr_df_match is a filtered slice of demetr_df; build a new frame with assign()
# instead of setting a column on the slice, which triggers SettingWithCopyWarning
demetr_df_match = demetr_df_match.assign(source="Karpinska et al.")

In [None]:
# Tag our rows with their source, then use the same accuracy column name as the paper frame
m4st_by_cat["source"] = "ARC"
accuracy_column = {"correct": "accuracy"}
m4st_by_cat = m4st_by_cat.rename(columns=accuracy_column)

In [None]:
# Outer merge on all four shared columns, so rows from both frames are kept
merge_keys = ["category", "metric", "source", "accuracy"]
merged = demetr_df_match.merge(m4st_by_cat, on=merge_keys, how="outer")

In [None]:
# Display the combined paper + M4ST frame
merged

In [None]:
# Keep only the listed metrics for the bar charts that follow
# NOTE(review): an earlier cell renames this notebook's "Bleu" results to "BLEU", so
# "Bleu" here will not match those rows — confirm whether "BLEU" should be listed too
merged = merged[merged.metric.isin(["Bleu", "COMET", "ChrF1", "ChrF2"])]

In [None]:
# Bar chart of accuracy per category (categories 1-15), one bar per source,
# with standard-deviation error bars
low_category_rows = merged[merged.category <= 15]
low_catplot_options = dict(
    kind="bar",
    x="category",
    y="accuracy",
    hue="source",
    errorbar="sd",
    palette="dark",
    alpha=0.6,
    height=6,
    aspect=11.7 / 8.27,
)
g = sns.catplot(data=low_category_rows, **low_catplot_options)
g.despine(left=True)
g.set_axis_labels("Category", "Accuracy")
g.legend.set_title("")

In [None]:
# Bar chart of accuracy per category (categories above 15), one bar per source,
# with standard-deviation error bars
high_category_rows = merged[merged.category > 15]
high_catplot_options = dict(
    kind="bar",
    x="category",
    y="accuracy",
    hue="source",
    errorbar="sd",
    palette="dark",
    alpha=0.6,
    height=6,
    aspect=11.7 / 8.27,
)
g = sns.catplot(data=high_category_rows, **high_catplot_options)
g.despine(left=True)
g.set_axis_labels("Category", "Accuracy")
g.legend.set_title("")

### BLASER only

In [None]:
# Results directory (same value as assigned earlier in the notebook)
m4st_res_dir = "../outputs/demetr"

In [None]:
# Raw BLASER results for three individual categories (IDs 15, 8 and 6)
# NOTE(review): the first file name spells "REF" in upper case while the other two
# use "Ref" — confirm it matches the file on disk
blaser_new_15, blaser_new_8, blaser_new_6 = (
    pd.read_json(os.path.join(m4st_res_dir, blaser_file))
    for blaser_file in (
        "BLASER_REF_minor_id15_case.json",
        "BLASER_Ref_critical_id8_negation.json",
        "BLASER_Ref_critical_id6_addition.json",
    )
)

In [None]:
# Display the frame as loaded, before it is transposed
blaser_new_15

In [None]:
# Transpose each frame (rows become columns); re-running this cell would flip them back
blaser_new_15, blaser_new_8, blaser_new_6 = (
    blaser_new_15.T,
    blaser_new_8.T,
    blaser_new_6.T,
)

In [None]:
# Display the transposed frame
blaser_new_8

In [None]:
# Distinct source languages present in this file, in sorted order
np.unique(blaser_new_15.source_language)

In [None]:
# Score gap per row: mt_score minus disfluent_score, stored in a new "diff" column
for blaser_frame in (blaser_new_15, blaser_new_8, blaser_new_6):
    blaser_frame["diff"] = blaser_frame.mt_score - blaser_frame.disfluent_score

In [None]:
# Display the frame with the new "diff" column
blaser_new_15

In [None]:
# Mean score difference (mt_score - disfluent_score) per source language for three
# BLASER result files: id15 (case), id8 (negation) and id6 (addition)
fig, axs = plt.subplots()
for blaser_frame in (blaser_new_15, blaser_new_8, blaser_new_6):
    # Aggregate only the "diff" column: averaging the whole frame first would also
    # try to average every other column and fails if any of them is non-numeric.
    # The cast guards against "diff" being stored with object dtype.
    (
        blaser_frame["diff"]
        .astype(float)
        .groupby(blaser_frame["source_language"])
        .mean()
        .plot(ax=axs)
    )

fig.legend(
    labels=["Pronoun case", "Negation", "Addition"],
    loc="right",
    bbox_to_anchor=(1.15, 0.5),
)
axs.set_ylabel("Score difference")
# One tick per language actually present, rather than assuming there are exactly 10
source_languages = np.unique(blaser_new_15.source_language)
plt.xticks(np.arange(len(source_languages)), source_languages, rotation=30)