In [None]:
import glob
import os

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from scipy.stats.stats import kendalltau


## Audio vs. text source

In [None]:
# Input/output locations, all relative to the notebook's working directory.
# BLASER-2 result CSVs produced for WMT24:
results_dir = "../outputs/wmt24/final"

# Root of the WMT24 portion of the mt-metrics-eval (MTME) data checkout; the
# four directories below are sub-folders of it.
_MTME_WMT24_ROOT = "../../mt-metrics-eval-data/mt-metrics-eval-v2/wmt24"
hum_score_dir = _MTME_WMT24_ROOT + "/human-scores"
docs_dir = _MTME_WMT24_ROOT + "/documents"
sources_dir = _MTME_WMT24_ROOT + "/sources"
wmt24_dir = _MTME_WMT24_ROOT + "/metric-scores"

In [None]:
# Walk the per-language-pair sub-directories of the metric-scores folder.
# NOTE(review): the comprehension's condition is the empty string literal, which
# is always falsy, so `mm_mt` ends up as an empty list for every directory and is
# not used by any later cell. This looks like an unfinished filter (presumably a
# substring test on the file name) -- TODO confirm the intended predicate or
# delete this cell.
lang_dirs = os.scandir(wmt24_dir)

for lp in lang_dirs:
    if not os.path.isdir(lp):
        continue
    res_files = os.scandir(lp)
    mm_mt = [entry for entry in res_files if ""]

In [None]:
# Load every BLASER-2 result CSV found in `results_dir` and annotate each row
# with its speech source sentence, the matching `.docs` entry and the language
# pair. Two parallel lists are filled for the cells below:
#   results    -- one DataFrame per result file
#   lang_pairs -- the "xx-yy" code parsed from the corresponding file name
results = []
lang_pairs = []

# os.scandir yields entries in arbitrary order; sorting by name makes the order
# of `results` / `lang_pairs` (and of every per-language panel) reproducible.
res_files = sorted(os.scandir(results_dir), key=lambda entry: entry.name)
for res_file in res_files:
    print(res_file.name)
    if not res_file.is_file():
        continue

    res_df = pd.read_csv(res_file.path)
    # File names start with the language pair, e.g. "en-es_...".
    lang_pair = res_file.name.split('_')[0]
    from_lang, to_lang = lang_pair.split('-')[:2]
    src_list_doc = f"{docs_dir}/{from_lang}-{to_lang}.docs"
    src_doc = f"{sources_dir}/{from_lang}-{to_lang}.txt"
    print(src_list_doc)

    # Read with an explicit encoding (the locale default is not reliable for
    # multilingual text) and drop the trailing newline of every line so it does
    # not end up inside the DataFrame columns.
    with open(src_list_doc, encoding="utf-8") as input_file:
        srcs_list = [line.rstrip("\n") for line in input_file]
    # Keep (line index, .docs entry) for every segment whose entry mentions "speech".
    speech_sources = [(i, s) for i, s in enumerate(srcs_list) if "speech" in s]

    num_sentences = len(speech_sources)
    if num_sentences == 0:
        raise ValueError(f"No 'speech' segments listed in {src_list_doc}")
    wavfiles = [entry for _, entry in speech_sources]

    with open(src_doc, encoding="utf-8") as input_file:
        src_sents = [line.rstrip("\n") for line in input_file]
    speech_src_sents = [src_sents[i] for i, _ in speech_sources]

    # The result file is expected to hold the same speech segments once per MT
    # system, so the per-segment lists are tiled to the frame's length.
    # NOTE(review): this assumes rows are grouped by system with segments in
    # source-file order -- confirm against the script that wrote the CSVs.
    repeats, remainder = divmod(len(res_df), num_sentences)
    if remainder:
        raise ValueError(
            f"{res_file.name}: {len(res_df)} rows is not a multiple of the "
            f"{num_sentences} speech segments in {src_list_doc}")

    res_df['source_sent'] = speech_src_sents * repeats
    res_df['wav'] = wavfiles * repeats
    res_df['lang_pair'] = lang_pair
    res_df['mt_system'] = res_df.mt_system.str.replace(".txt", "", regex=False)
    results.append(res_df)
    lang_pairs.append(lang_pair)

print(len(results))
print(len(lang_pairs))

In [None]:
# For every language pair, rank the MT systems three ways -- by mean human
# score, by mean BLASER-2 score with text source, and by mean BLASER-2 score
# with audio source -- then report Kendall's tau between the human ranking and
# each BLASER ranking and plot the per-system rank differences.
# Outputs kept for later cells: `tau_txt`, `tau_audio` (one value per entry of
# `lang_pairs`) and `all_hum_res` (speech-only human scores per language pair).


def _ranks_from_means(mean_scores, rank_col, ascending):
    """Return a frame with `mt_system` and a 1-based rank column.

    mean_scores: Series of per-system mean scores, indexed by system name.
    rank_col: name given to the rank column.
    ascending: True when smaller scores should get the better (lower) rank.
    """
    # A stable sort gives tied systems a deterministic order.
    ordered = mean_scores.sort_values(ascending=ascending, kind="stable")
    return pd.DataFrame({'mt_system': list(ordered.index),
                         rank_col: range(1, len(ordered) + 1)})


# One panel per language pair, two panels per row (5 x 2 for ten pairs).
n_cols = 2
n_rows = max(1, -(-len(lang_pairs) // n_cols))
fig, axs_grid = plt.subplots(n_rows, n_cols, figsize=(10, 4 * n_rows),
                             squeeze=False)
axs = axs_grid.flat

tau_txt = []
tau_audio = []
all_hum_res = []

for ix, lp in enumerate(lang_pairs):
    # Several human-score files may exist for one pair; glob returns them in
    # arbitrary order, so sort to make the choice deterministic.
    # NOTE(review): the first match is used -- confirm which annotation type is wanted.
    hum_score_files = sorted(glob.glob(f"{hum_score_dir}/{lp}.*.seg.score"))
    if not hum_score_files:
        raise FileNotFoundError(
            f"No human segment scores for {lp} in {hum_score_dir}")
    hum_score_file = hum_score_files[0]
    src_list_doc = f"{docs_dir}/{lp}.docs"

    with open(src_list_doc, encoding="utf-8") as input_file:
        srcs_list = input_file.readlines()

    # Positions (line numbers) of the speech-domain segments.
    speech_sources = [i for i, s in enumerate(srcs_list) if "speech" in s]
    hum_score = pd.read_csv(hum_score_file, delimiter="\t",
                            names=["mt_system", "score"])

    # Each system's rows are sliced positionally, i.e. every system is expected
    # to list all segments in source order.
    hum_res = pd.concat([sys_df.iloc[speech_sources]
                         for _, sys_df in hum_score.groupby('mt_system')])
    all_hum_res.append(hum_res)
    blaser_res = results[ix]

    # Human ranking: mean absolute score per system; systems without any score
    # are dropped. For files with "mqm" in the name the smallest magnitude is
    # ranked first, otherwise the largest.
    hum_means = hum_res.groupby('mt_system')['score'].mean().abs().dropna()
    ascending_human = "mqm" in os.path.basename(hum_score_file)
    ranked_hum = _ranks_from_means(hum_means, 'human', ascending_human)

    # BLASER rankings (higher score = better), restricted to the systems that
    # have a human rank.
    blaser_res = blaser_res[blaser_res.mt_system.isin(ranked_hum.mt_system)]
    blaser_means = blaser_res.groupby('mt_system')[['text_source', 'audio_source']].mean()
    ranked_blaser_text = _ranks_from_means(
        blaser_means['text_source'], 'text_src', ascending=False)
    ranked_blaser_audio = _ranks_from_means(
        blaser_means['audio_source'], 'audio_src', ascending=False)

    # The rank columns are named explicitly before merging. Relying on merge's
    # automatic `_x`/`_y` suffixes is what previously led to the audio rank being
    # labelled as text and vice versa.
    merged_renamed = (ranked_hum
                      .merge(ranked_blaser_text, on='mt_system')
                      .merge(ranked_blaser_audio, on='mt_system'))

    corr_text = kendalltau(merged_renamed.human, merged_renamed.text_src)
    corr_audio = kendalltau(merged_renamed.human, merged_renamed.audio_src)
    tau_txt.append(corr_text.statistic)
    tau_audio.append(corr_audio.statistic)

    print(lp)
    print(merged_renamed)
    print(f"Audio Tau = {corr_audio.statistic}")
    print(f"Text Tau = {corr_text.statistic}")

    merged_renamed['rank_diff_txt'] = merged_renamed.human - merged_renamed.text_src
    merged_renamed['rank_diff_au'] = merged_renamed.human - merged_renamed.audio_src
    merged_renamed = merged_renamed.sort_values(by='human')

    ax = axs[ix]
    sns.scatterplot(merged_renamed, x='mt_system',
                    y='rank_diff_txt',
                    ax=ax,
                    s=70,
                    label='Text source')
    sns.scatterplot(merged_renamed, x='mt_system',
                    y='rank_diff_au',
                    ax=ax,
                    s=70,
                    label='Audio source')
    ax.set_title(lp)
    # Zero line = identical rank. The width is given once; the earlier call
    # passed both `linewidth` and its alias `lw` with different values.
    ax.axhline(y=0, linewidth=1, color='red', ls='--')
    ax.set_xlabel("Mt system")
    ax.set_ylabel("Rank difference (Human - BLASER)")
    ax.set_xticks(ticks=range(len(merged_renamed.mt_system)),
                  labels=merged_renamed.mt_system,
                  rotation=30, fontsize=8,
                  fontdict={'horizontalalignment': 'right'})
    ax.get_legend().set_visible(False)

# Hide panels that have no language pair (only when the grid is not full).
for unused_ax in axs_grid.ravel()[len(lang_pairs):]:
    unused_ax.set_visible(False)

fig.tight_layout()
plot_path = "../outputs/wmt24/plots/BLASER2-vs-hum-by-language.png"
os.makedirs(os.path.dirname(plot_path), exist_ok=True)
fig.savefig(plot_path)

In [None]:
# Per-language-pair distribution of BLASER-2 scores, text source vs. audio
# source overlaid in one panel each. Reads `lang_pairs` and `results`.

# One panel per language pair, two panels per row (5 x 2 for ten pairs), so
# the figure does not break when the number of language pairs changes.
n_cols = 2
n_rows = max(1, -(-len(lang_pairs) // n_cols))
fig, axs_grid = plt.subplots(n_rows, n_cols, figsize=(10, 4 * n_rows),
                             squeeze=False)
axs = axs_grid.flat

for ix, lp in enumerate(lang_pairs):
    blaser_res = results[ix]
    ax = axs[ix]

    sns.histplot(blaser_res.text_source,
                 ax=ax,
                 label='Text source',
                 bins=100)
    sns.histplot(blaser_res.audio_source,
                 ax=ax,
                 label='Audio source',
                 bins=100)
    ax.set_title(lp)
    ax.set_xlabel('BLASER-2 score')
    # The series labels are only visible once a legend is drawn.
    ax.legend()

# Hide panels that have no language pair (only when the grid is not full).
for unused_ax in axs_grid.ravel()[len(lang_pairs):]:
    unused_ax.set_visible(False)

fig.tight_layout()
plot_path = "../outputs/wmt24/plots/BLASER2-audio-vs-text-by-lang.png"
os.makedirs(os.path.dirname(plot_path), exist_ok=True)
fig.savefig(plot_path)

In [None]:
# Kendall's tau per language pair: one marker series for the text-source
# BLASER ranking and one for the audio-source ranking (values collected in
# `tau_txt` / `tau_audio` by the ranking cell).
fig, axs = plt.subplots()
positions = range(len(lang_pairs))
for taus, series_label in ((tau_txt, "Text source"),
                           (tau_audio, "Audio source")):
    axs.scatter(positions, taus, label=series_label)
axs.set_xticks(positions, labels=lang_pairs)
axs.set_xlabel("Language pair")
axs.set_ylabel("Tau (human rank vs. BLASER rank)")
axs.legend()

In [None]:
# Mean BLASER-2 score per language pair (audio vs. text source) as a bar chart.
all_res = pd.concat(results)
grouped_mean_lang = (
    all_res[['audio_source', 'text_source', 'lang_pair']]
    .groupby('lang_pair')
    .mean()
    .reset_index()
)
fig, axs = plt.subplots()
grouped_mean_lang.plot(kind='bar', ax=axs)
# One tick per bar group; derive the count from the data instead of assuming
# exactly ten language pairs (a mismatch with the labels raises an error).
axs.set_xticks(ticks=range(len(grouped_mean_lang)),
               labels=grouped_mean_lang.lang_pair, rotation=0)
axs.set_xlabel("Language pair")
axs.set_ylabel("Mean BLASER-2 score")

In [None]:
# Overall BLASER-2 score distribution across all language pairs, audio source
# first and text source overlaid on the same axes.
fig, axs = plt.subplots()
for score_col, series_label in (("audio_source", 'Audio source'),
                                ("text_source", "Text source")):
    g = sns.histplot(all_res[score_col], ax=axs, label=series_label)
g.set_xlabel("BLASER-2 score")
axs.legend()

In [None]:
# Stack the per-language-pair result frames row-wise into a single frame.
all_blaser_results = pd.concat(results, axis=0)

In [None]:
# Keep only the English -> Spanish rows.
is_en_es = all_blaser_results['lang_pair'] == 'en-es'
en_es_only = all_blaser_results.loc[is_en_es]

In [None]:
# Show en-es rows where both the audio-source and the text-source score exceed 4.
en_es_only.loc[en_es_only[['audio_source', 'text_source']].gt(4).all(axis=1)]