In [1]:
import pandas as pd
import os
from pathlib import Path

In [2]:
# Root folder of the analysis CSVs (relative path).
base_dir = Path("../../analysis/")
# Sub-folders holding the two sets of result files that get compared.
short_dir = base_dir.joinpath("acl6060-short")
long_dir = base_dir.joinpath("acl6060-long")

In [3]:
# Codes substituted into the acl6060_{lang}.csv file names.
langs = [
    "en_de",
    "en_zh",
    "en_fr",
    "en_pt",
]

In [4]:
def compute_diffs(short_file, long_file, fill_missing_with_zero=False):
    """Compute per-system differences between two CSV tables (short - long).

    Both inputs are read with ``pd.read_csv`` and outer-merged on the
    ``system`` column, so a system present in only one file is still kept;
    the ``_merge`` indicator column records which side(s) each row came from.

    Only columns present in both files are compared. Their values are
    coerced to numbers, with anything non-numeric becoming NaN.

    Args:
        short_file: Anything ``pd.read_csv`` accepts, for the "short" table.
        long_file: Anything ``pd.read_csv`` accepts, for the "long" table.
        fill_missing_with_zero: If True, NaN values (missing rows or
            non-numeric cells) are replaced by 0 on both sides before
            subtracting. If False, the difference is NaN wherever either
            side is missing.

    Returns:
        A DataFrame with ``system``, ``_merge`` and one ``<column>_diff``
        column per shared column, holding ``short - long`` rounded to
        4 decimals.
    """
    short_df = pd.read_csv(short_file)
    long_df = pd.read_csv(long_file)

    merged = short_df.merge(long_df, on='system', how='outer', suffixes=('_sh', '_lg'), indicator=True)

    # The merge appends the suffixes only to non-key columns that exist in
    # both frames. Deriving that set from the inputs avoids substring matching
    # on '_sh', which mangled names such as 'bleu_short' and dropped them.
    long_columns = set(long_df.columns)
    shared_columns = [
        col for col in short_df.columns
        if col != 'system' and col in long_columns
    ]

    # Result table: identifying columns first, diff columns appended below.
    out = merged[['system', '_merge']].copy()

    for col in shared_columns:
        s_col = f'{col}_sh'
        l_col = f'{col}_lg'

        # Coerce to numeric so stray text cells become NaN instead of raising.
        s_vals = pd.to_numeric(merged[s_col], errors='coerce')
        l_vals = pd.to_numeric(merged[l_col], errors='coerce')

        if fill_missing_with_zero:
            s_vals = s_vals.fillna(0)
            l_vals = l_vals.fillna(0)

        # short - long (expected higher - expected lower)
        out[f'{col}_diff'] = (s_vals - l_vals).round(4)

    return out

In [5]:
# Build one diff table per entry in `langs`, keyed by that entry.
result = {}
for lang in langs:
    # Both directories use the same file name for a given language code.
    csv_name = f"acl6060_{lang}.csv"
    short_file, long_file = short_dir / csv_name, long_dir / csv_name

    result[lang] = diff = compute_diffs(short_file, long_file)

In [7]:
# Write each diff table to diff_<key>.csv in the current working directory,
# with rows ordered by system and without the DataFrame index column.
for k, v in result.items():
    # index=False is the documented way to omit the index (the parameter is a bool).
    v.sort_values("system").to_csv(f"diff_{k}.csv", index=False)