Generate examples for the paper where the scores diverge.

In [None]:
# load precomputed scores
from metrics_domain_adaptation import utils
import pickle
import numpy as np

# Load the pickled score table. Later cells index it as data_all[(domain, lang)].
# NOTE(review): unpickling can execute arbitrary code; only load files produced by
# this project's own pipeline.
with open(f"{utils.ROOT}/computed/scores_all_z.pkl", "rb") as f:
    # The context manager closes the file handle even if unpickling fails.
    data_all = pickle.load(f)

In [None]:
def feature_importance(model, domain, feature):
    """Average a feature over the worst- and best-predicted examples.

    The examples of ``domain`` for en-de, en-ru and zh-en are pooled, and the
    absolute difference between the human score and ``model``'s score is
    computed for each one.

    Args:
        model: key into each example's ``"scores"`` dict (e.g. "base", "ft").
        domain: first element of the ``data_all`` key (e.g. "general", "bio").
        feature: callable mapping one example dict to a number.

    Returns:
        Tuple ``(avg_high, avg_low)``: the mean feature value over examples
        whose absolute error is at or above the 75th percentile, and over
        examples whose absolute error is at or below the 25th percentile.
    """
    examples = [
        example
        for lang in ["en-de", "en-ru", "zh-en"]
        for example in data_all[(domain, lang)]
    ]
    predicted = np.array([example["scores"][model] for example in examples])
    human = np.array([example["scores"]["human"] for example in examples])

    # Absolute disagreement between the metric and the human judgement.
    abs_errors = np.abs(human - predicted)

    # Quartile thresholds delimiting the "low error" and "high error" groups.
    lower_quartile = np.percentile(abs_errors, q=25)
    upper_quartile = np.percentile(abs_errors, q=75)

    in_high_group = abs_errors >= upper_quartile
    in_low_group = abs_errors <= lower_quartile

    high_values = [
        feature(example)
        for example, selected in zip(examples, in_high_group)
        if selected
    ]
    low_values = [
        feature(example)
        for example, selected in zip(examples, in_low_group)
        if selected
    ]

    return np.average(high_values), np.average(low_values)


def format_cell(value, norm_min, norm_max):
    """Return a LaTeX cellcolor prefix whose gray level encodes ``value``.

    ``value`` is mapped linearly so that ``norm_min`` gives 0 and ``norm_max``
    gives 50, then clamped to the range [0, 45].

    Args:
        value: number to visualise.
        norm_min: value mapped to the lightest shade.
        norm_max: value mapped to shade 50 (before clamping).

    Returns:
        The cellcolor command followed by a single space; the cell text is
        expected to be appended by the caller.
    """
    fraction = (value - norm_min) / (norm_max - norm_min)
    shade = fraction * 50
    # Clamp: never negative, and capped at 45 (below the nominal maximum of 50).
    shade = min(max(shade, 0), 45)
    return f"\\cellcolor{{black!{shade:.2f}}} "
    

def _truncate_decimals(value, precision):
    """Format ``value`` in fixed-point notation, truncated to ``precision`` decimals.

    The number is first rendered with at least 10 decimals (rounded at that
    position) and the fractional digits are then cut, not rounded, to
    ``precision`` places. With ``precision <= 0`` only the integer part is kept.
    Values without a decimal point ("nan", "inf") are returned unchanged.
    """
    text = f"{value:.{max(precision, 10)}f}"
    whole, dot, fraction = text.partition(".")
    if not dot:
        # "nan" / "inf": nothing to truncate.
        return text
    if precision <= 0:
        return whole
    return f"{whole}.{fraction[:precision]}"


def feature_importance_all(feature, name="", precision=1):
    """Print one LaTeX table row comparing a feature across error groups.

    For each (domain, model) pair, in the order general/base, general/ft,
    bio/base, bio/ft, the row contains a shaded cell ``high/low`` where
    ``high`` and ``low`` are the feature averages returned by
    ``feature_importance`` for the high-error and low-error groups.

    This relies on the three-argument ``format_cell(value, norm_min, norm_max)``;
    a later cell in this file rebinds ``format_cell`` to a different signature,
    so this function must run before that cell.

    Args:
        feature: callable mapping one example dict to a number.
        name: row label, right-aligned to 20 characters.
        precision: number of decimals shown (truncated, not rounded).
    """
    print(f"{name:>20}", end=" & ")
    cells = []
    for domain in ["general", "bio"]:
        for model in ["base", "ft"]:
            avg_up, avg_down = feature_importance(model, domain, feature)
            txt_up = _truncate_decimals(avg_up, precision)
            txt_down = _truncate_decimals(avg_down, precision)
            # Shade by the gap relative to the smaller magnitude; the 0.01
            # offset avoids dividing by zero when one average is 0.
            relative_gap = abs(avg_up - avg_down) / (0.01 + min(abs(avg_up), abs(avg_down)))
            cells.append(
                format_cell(relative_gap, norm_min=0.0, norm_max=2) + f"{txt_up}/{txt_down}"
            )
    # Cells are joined with the column separator; the row ends with a space
    # followed by the LaTeX line break.
    print(" & ".join(cells) + " " + r"\\")

def _error_counter(field, accepted):
    """Build a feature counting entries of ``x["errors"][0]`` whose ``field`` is in ``accepted``."""
    return lambda x: len([i for i in x["errors"][0] if i[field] in accepted])


# Table rows as (row label, decimals shown, feature). ``None`` marks a
# vertical-spacing line printed between row groups.
_TABLE_ROWS = [
    ("Target length", 1, lambda x: len(x["tgt"].split())),
    ("Human score", 1, lambda x: x["scores"]["human"]),
    None,
    ("Critical sev.", 2, _error_counter("severity", ("major", "critical"))),
    ("Minor severity", 2, _error_counter("severity", ("minor",))),
    ("Neutral sev.", 2, _error_counter("severity", ("neutral",))),
    None,
    ("Spelling cat.", 2, _error_counter("category", ("spelling",))),
    ("Mistrans. cat.", 2, _error_counter("category", ("mistranslation",))),
    ("Formatting cat.", 2, _error_counter("category", ("formatting",))),
    ("Term. cat.", 2, _error_counter("category", ("terminology",))),
    ("Grammar cat.", 2, _error_counter("category", ("grammar",))),
    ("Fluency cat.", 2, _error_counter("category", ("fluency",))),
    ("Other category", 2, _error_counter("category", ("other",))),
]

for _row in _TABLE_ROWS:
    if _row is None:
        print(r"\\[-0.7em]")
        continue
    _label, _decimals, _feature = _row
    feature_importance_all(name=_label, precision=_decimals, feature=_feature)

In [None]:
import statsmodels.api as sm
import scipy.stats
from sklearn.preprocessing import StandardScaler

def _count_errors_where(field, accepted):
    """Return a feature counting entries of ``x["errors"][0]`` whose ``field`` is in ``accepted``."""
    def count(x):
        return sum(1 for error in x["errors"][0] if error[field] in accepted)
    return count


# Feature extractors keyed by their table label. Insertion order matters: it
# fixes both the column order of the feature matrix and the row order of the
# printed table.
FEATURES = {
    "Target length": lambda x: len(x["tgt"].split()),
    "Human score": lambda x: x["scores"]["human"],
    "Critical sev.": _count_errors_where("severity", ("major", "critical")),
    "Minor severity": _count_errors_where("severity", ("minor",)),
    "Neutral sev.": _count_errors_where("severity", ("neutral",)),
    "Fluency cat.": _count_errors_where("category", ("fluency",)),
    "Accuracy cat.": _count_errors_where("category", ("accuracy", "untranslated")),
    "Terminology cat.": _count_errors_where("category", ("terminology",)),
    "Locale cat.": _count_errors_where("category", ("locale",)),
    "Other category": _count_errors_where("category", ("other",)),
}

# Filled by feature_importance_sm: (model, domain) -> {feature label: correlation result}.
computed_features = {}

def feature_importance_sm(model, domain):
    """Correlate every feature in ``FEATURES`` with the absolute scoring error.

    Pools the examples of ``domain`` for en-de, en-ru and zh-en, standardises
    the feature matrix and the absolute human-vs-``model`` error, and stores
    the result of ``scipy.stats.pearsonr`` per feature in
    ``computed_features[(model, domain)]``.

    Args:
        model: key into each example's ``"scores"`` dict (e.g. "base", "ft").
        domain: first element of the ``data_all`` key (e.g. "general", "bio").

    Returns:
        None; the result is written to the module-level ``computed_features``.
    """
    examples = [
        example
        for lang in ["en-de", "en-ru", "zh-en"]
        for example in data_all[(domain, lang)]
    ]
    predicted = np.array([example["scores"][model] for example in examples])
    human = np.array([example["scores"]["human"] for example in examples])
    abs_errors = np.abs(human - predicted)

    # One row per example, one column per entry of FEATURES (in dict order).
    feature_matrix = np.array([
        [extract(example) for extract in FEATURES.values()]
        for example in examples
    ])

    feature_matrix = StandardScaler().fit_transform(feature_matrix)
    # StandardScaler expects a 2-D input, hence the round trip through a column vector.
    abs_errors = StandardScaler().fit_transform(abs_errors.reshape(-1, 1)).reshape(-1)

    correlations = {}
    for column, f_name in enumerate(FEATURES):
        correlations[f_name] = scipy.stats.pearsonr(abs_errors, feature_matrix[:, column])
    computed_features[(model, domain)] = correlations

# Populate computed_features for every (model, domain) combination.
for _model in ["base", "ft"]:
    for _domain in ["general", "bio"]:
        feature_importance_sm(_model, _domain)

# NOTE: this rebinds format_cell, replacing the earlier three-argument version
# used by feature_importance_all; that function cannot be re-run after this point.
def format_cell(x, norm_min=0, norm_max=1):
    """Render a (coefficient, p-value) pair as a shaded LaTeX table cell.

    The gray level encodes ``abs(coefficient)``, mapped linearly from
    [``norm_min``, ``norm_max``] onto [0, 50] and clamped to [0, 45]. A star
    superscript is appended when the p-value is below 1e-5; otherwise an
    equally sized blank superscript is used.

    Args:
        x: two-element sequence ``(coef, pval)``.
        norm_min: magnitude mapped to the lightest shade.
        norm_max: magnitude mapped to shade 50 (before clamping).

    Returns:
        The LaTeX source of the cell as a string.
    """
    coef, pval = x
    shade = (abs(coef) - norm_min) / (norm_max - norm_min) * 50
    shade = min(max(shade, 0), 45)
    marker = r"$^{*}$" if pval < 1e-5 else r"$^{\hspace{1mm}}$"
    return f"\\cellcolor{{black!{shade:.2f}}} {coef:.2f} {marker}"
    
# Emit one LaTeX table row per feature, with the columns ordered
# general/base, general/ft, bio/base, bio/ft.
for f_name in FEATURES:
    # Vertical-spacing line before the severity group and before the category group.
    if f_name in {"Critical sev.", "Fluency cat."}:
        print(r"\\[-1em]")
    print(f_name + " &")
    row_cells = []
    for domain in ["general", "bio"]:
        for model in ["base", "ft"]:
            row_cells.append(format_cell(computed_features[(model, domain)][f_name]))
    print(" & ".join(row_cells), end=r"\\" + "\n")

In [None]:
# find examples

def format_line(line):
    """Print one example as a two-cell LaTeX table row.

    The first cell holds the source, translation and reference texts; the
    second holds the human, base and ft scores with two decimals.

    Args:
        line: example dict with keys "src", "tgt", "ref" and a "scores" dict
            containing "human", "base" and "ft".

    Returns:
        None; the row is written to standard output.
    """
    text_cell = (
        f"\\lsrc {line['src']} \\newline\n"
        f"\\lmt {line['tgt']} \\newline\n"
        f"\\lref {line['ref']} \n"
    )
    scores = line["scores"]
    score_cell = (
        f"\\lhum {scores['human']:.2f} "
        f"\\lbase {scores['base']:.2f} "
        f"\\lft {scores['ft']:.2f} "
    )
    # "& " separates the two cells; the row ends with a LaTeX line break.
    print(text_cell + "& " + score_cell + "\\\\\n")

def _gap(example, model):
    """Absolute difference between the human score and ``model``'s score."""
    return abs(example["scores"]["human"] - example["scores"][model])


def _human(example):
    """Human score of ``example``."""
    return example["scores"]["human"]


def _n_tokens(example, side):
    """Whitespace-token count of the ``side`` ("src" or "tgt") text."""
    return len(example[side].split())


# Each search below sorts candidates by a hand-tuned key (highest first) and
# prints one hand-picked rank.

# en-de

# general: small base error and high human score
x = sorted(
    data_all[("general", "en-de")],
    key=lambda ex: -_gap(ex, "base") + _human(ex),
    reverse=True,
)
format_line(x[0])


# general, source shorter than 10 tokens: small base error and low human score
x = sorted(
    [ex for ex in data_all[("general", "en-de")] if _n_tokens(ex, "src") < 10],
    key=lambda ex: -_gap(ex, "base") - _human(ex) * 0.2,
    reverse=True,
)
format_line(x[0])


# bio, source shorter than 10 tokens: small ft error and large base error
x = sorted(
    [ex for ex in data_all[("bio", "en-de")] if _n_tokens(ex, "src") < 10],
    key=lambda ex: -_gap(ex, "ft") + _gap(ex, "base"),
    reverse=True,
)
format_line(x[1])

# zh-en

# general: small base error, high human score and small ft error
x = sorted(
    data_all[("general", "zh-en")],
    key=lambda ex: -_gap(ex, "base") + _human(ex) - _gap(ex, "ft"),
    reverse=True,
)
format_line(x[0])


# general, target shorter than 10 tokens: small base error, low human score, large ft error
x = sorted(
    [ex for ex in data_all[("general", "zh-en")] if _n_tokens(ex, "tgt") < 10],
    key=lambda ex: -_gap(ex, "base") - _human(ex) * 0.2 + _gap(ex, "ft"),
    reverse=True,
)
format_line(x[5])


# bio, target shorter than 50 tokens, negative human score:
# small ft error (weighted x3) and large base error
x = sorted(
    [
        ex
        for ex in data_all[("bio", "zh-en")]
        if _n_tokens(ex, "tgt") < 50 and _human(ex) < 0
    ],
    key=lambda ex: -_gap(ex, "ft") * 3 + _gap(ex, "base"),
    reverse=True,
)
format_line(x[0])

# en-ru

# general: small base error, high human score and small ft error
x = sorted(
    data_all[("general", "en-ru")],
    key=lambda ex: -_gap(ex, "base") + _human(ex) - _gap(ex, "ft"),
    reverse=True,
)
format_line(x[0])


# general, target shorter than 10 tokens: small base error, low human score, large ft error
x = sorted(
    [ex for ex in data_all[("general", "en-ru")] if _n_tokens(ex, "tgt") < 10],
    key=lambda ex: -_gap(ex, "base") - _human(ex) * 0.2 + _gap(ex, "ft"),
    reverse=True,
)
format_line(x[4])


# bio, target shorter than 20 tokens, negative human score:
# small ft error (weighted x2) and large base error
x = sorted(
    [
        ex
        for ex in data_all[("bio", "en-ru")]
        if _n_tokens(ex, "tgt") < 20 and _human(ex) < 0
    ],
    key=lambda ex: -_gap(ex, "ft") * 2 + _gap(ex, "base"),
    reverse=True,
)
format_line(x[0])