In [1]:
import pandas as pd
import os
from pathlib import Path

In [2]:
# Directory that contains the two per-condition CSV folders read below.
base_dir = Path("../../analysis/")
short_dir = base_dir.joinpath("mcif-short")
long_dir = base_dir.joinpath("mcif-long")

In [3]:
# Language tags; each one is interpolated into a CSV file name (mcif_<lang>.csv).
langs = [
    "en_de",
    "en_zh",
    "en_it",
]

In [4]:
def compute_diffs(short_file, long_file, fill_missing_with_zero=False):
    """Compute per-system metric differences (long - short) between two CSV files.

    Both files are read with ``pd.read_csv`` and outer-joined on the ``system``
    column. Every non-key column name that appears in both files produces one
    ``<column>_diff`` column holding ``long - short``, rounded to 4 decimals.

    Parameters
    ----------
    short_file, long_file : path or file-like
        Anything ``pd.read_csv`` accepts. Each file must have a ``system`` column.
    fill_missing_with_zero : bool, default False
        If True, missing or non-numeric values are replaced by 0 before
        subtracting; otherwise they propagate as NaN into the diff.

    Returns
    -------
    pandas.DataFrame
        Columns ``system`` and ``_merge`` (the merge indicator: ``both``,
        ``left_only`` or ``right_only``), followed by one ``*_diff`` column
        per shared metric.
    """
    sh_suffix, lg_suffix = '_sh', '_lg'

    short_df = pd.read_csv(short_file)
    long_df = pd.read_csv(long_file)

    merged = short_df.merge(
        long_df, on='system', how='outer',
        suffixes=(sh_suffix, lg_suffix), indicator=True,
    )

    # Pair columns by their *trailing* suffix only. Substring matching and
    # str.replace would mangle metric names that themselves contain '_sh'
    # (e.g. 'x_shift_sh' -> 'x_lgift_lg') and silently drop those metrics.
    merged_cols = set(merged.columns)
    common_pairs = []
    for col in merged.columns:
        if not col.endswith(sh_suffix):
            continue
        base = col[:-len(sh_suffix)]
        if base + lg_suffix in merged_cols:
            common_pairs.append((base, col, base + lg_suffix))

    # Result table: key column plus the merge indicator.
    out = merged[['system', '_merge']].copy()

    for base, s_col, l_col in common_pairs:
        # Non-numeric cells become NaN instead of raising.
        s_vals = pd.to_numeric(merged[s_col], errors='coerce')
        l_vals = pd.to_numeric(merged[l_col], errors='coerce')

        if fill_missing_with_zero:
            s_vals = s_vals.fillna(0)
            l_vals = l_vals.fillna(0)

        out[f'{base}_diff'] = (l_vals - s_vals).round(4)

    return out

In [5]:
# Map each language tag to its table of (long - short) metric differences.
result = {}
for lang in langs:
    # Both folders use the same file name for a given language tag.
    csv_name = f"mcif_{lang}.csv"
    short_file = short_dir / csv_name
    long_file = long_dir / csv_name

    diff = compute_diffs(short_file, long_file)
    result[lang] = diff

In [6]:
# Dump every per-language diff table, each followed by blank lines as a separator.
for lang_key, diff_table in result.items():
    print(lang_key)
    print(diff_table, '\n\n')

en_de
              system     _merge  LinguaPy_diff  metricx_qe_score_diff  \
0          canary-v2       both         1.1970                 1.2412   
1          desta2-8b       both        66.1589                10.0604   
2        owsm4.0-ctc       both        -0.2176                -0.5445   
3     phi4multimodal       both        -8.4875                -1.1253   
4      qwen2audio-7b       both        74.7552                 9.6059   
5        seamlessm4t  left_only            NaN                    NaN   
6  voxtral-small-24b       both        -0.2176                 0.0819   

   QEMetricX_24-Strict-linguapy_diff  xcomet_qe_score_diff  \
0                             1.3979               -0.0361   
1                            16.8644               -0.6078   
2                            -0.6362                0.0106   
3                            -1.0892               -0.1279   
4                            17.4091               -0.6012   
5                                NaN 