In [1]:
import json
import itertools
from pathlib import Path
import pandas as pd

In [2]:
def load_results_summaries(base_dir, direction_pairs, system_names):
    """
    Loads all result summaries from a directory structure.

    For every (direction, system) combination the file
    ``<base_dir>/<system>/<direction>/results_summary.jsonl`` is read as
    JSON Lines: one JSON document per non-blank line.

    Args:
        base_dir (str or Path): The base directory for the evaluation outputs.
        direction_pairs (list): A list of language direction strings (e.g., 'en_de').
        system_names (list): A list of system name strings.

    Returns:
        dict: A nested dictionary containing the loaded data, structured as
              {direction: {system: [results]}}. The inner value is None when
              the file is missing or contains a line that is not valid JSON;
              a message is printed in both cases.
    """
    base_path = Path(base_dir)
    all_results = {}

    # itertools.product walks every (direction, system) combination,
    # directions in the outer position.
    for direction, system in itertools.product(direction_pairs, system_names):
        summary_path = base_path / system / direction / 'results_summary.jsonl'

        # Create the per-direction dict on first use.
        per_system = all_results.setdefault(direction, {})

        try:
            with summary_path.open('r', encoding='utf-8') as f:
                # Whitespace-only lines (typically a stray newline at the end
                # of the file) are not records; skipping them keeps one blank
                # line from invalidating an otherwise well-formed file.
                per_system[system] = [json.loads(line) for line in f if line.strip()]

        except FileNotFoundError:
            print(f"Warning: File not found, skipping: {summary_path}")
            per_system[system] = None # Or [] if you prefer an empty list
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON in {summary_path}: {e}")
            per_system[system] = None

    return all_results

In [3]:
def convert_results_to_dataframe(results_data):
    """
    Flatten the {direction: {system: [records]}} mapping into one DataFrame.

    Every record becomes one row. Each row starts with 'direction' and
    'system' keys naming where the record came from, followed by the record's
    own key/value pairs (a record key with the same name takes precedence).

    Args:
        results_data (dict): The nested dictionary produced by the
                             load_results_summaries function. Systems whose
                             value is None are skipped.

    Returns:
        pandas.DataFrame: A tidy DataFrame containing all results, with
                          'direction' and 'system' as the leading columns.
                          An empty DataFrame (after printing a notice) when
                          there is nothing to convert.
    """
    id_cols = ['direction', 'system']

    rows = []
    for direction, per_system in results_data.items():
        for system, entries in per_system.items():
            # None marks a file that could not be loaded.
            if entries is None:
                continue
            for entry in entries:
                row = {'direction': direction, 'system': system}
                row.update(entry)
                rows.append(row)

    if len(rows) == 0:
        print("No records were found to create a DataFrame.")
        return pd.DataFrame()

    frame = pd.DataFrame(rows)

    # Identifying columns first, then everything else in its existing order.
    other_cols = [name for name in frame.columns if name not in id_cols]
    return frame[id_cols + other_cols]

In [None]:
# Evaluation root and the (direction, system) grid to load.
BASE_DIR = '/hearing2translate/evaluation/output_evals/fleurs'
DIRECTION_PAIRS = [
    'en_de', 'de_en', 'en_es', 'es_en', 'en_fr', 'fr_en', 'en_it',
    'it_en', 'en_nl', 'en_pt', 'pt_en', 'en_zh', 'zh_en',
]
SYSTEM_NAMES = [
    'canary-v2', 'gemma_canary-v2', 'gemma_seamlessm4t', 'owsm4.0-ctc',
    'qwen2audio-7b', 'spirelm', 'tower_owsm4.0-ctc', 'tower_whisper', 'whisper',
    'desta2-8b', 'gemma_owsm4.0-ctc', 'gemma_whisper', 'phi4multimodal',
    'seamlessm4t', 'tower_canary-v2', 'tower_seamlessm4t', 'voxtral-small-24b',
]

# Read every summary file, then flatten to one row per summary record.
results_data = load_results_summaries(BASE_DIR, DIRECTION_PAIRS, SYSTEM_NAMES)
results_df = convert_results_to_dataframe(results_data)

# Keep the identifying columns plus the metric columns of interest.
selected_cols = [
    'direction', 'system', 'SacreBLEU', 'chrF', 'LinguaPy',
    'RefMetricX_24-Strict-linguapy', 'QEMetricX_24-Strict-linguapy',
    'XCOMET-Strict-linguapy', 'XCOMET-QE-Strict-linguapy',
]
results_df = results_df.loc[:, selected_cols]

In [5]:
# Narrow the table further to the columns used by the pivots below.
selected_cols = [
    'direction',
    'system',
    'LinguaPy',
    'QEMetricX_24-Strict-linguapy',
    'XCOMET-QE-Strict-linguapy',
    'chrF',
]
results_df = results_df.loc[:, selected_cols]

In [6]:
# Display the narrowed results table (one row per loaded summary record).
results_df

Unnamed: 0,direction,system,LinguaPy,QEMetricX_24-Strict-linguapy,XCOMET-QE-Strict-linguapy,chrF
0,en_de,canary-v2,0.0000,2.3163,0.9447,61.9113
1,en_de,gemma_canary-v2,0.0000,1.3387,0.9662,61.9612
2,en_de,gemma_seamlessm4t,0.0000,1.3670,0.9667,61.5218
3,en_de,owsm4.0-ctc,0.1560,9.4253,0.7766,54.9522
4,en_de,qwen2audio-7b,1.8721,4.1800,0.8922,55.1050
...,...,...,...,...,...,...
203,zh_en,phi4multimodal,5.7143,3.7811,0.8353,51.5074
204,zh_en,seamlessm4t,0.0000,3.4874,0.8669,52.3303
205,zh_en,tower_canary-v2,44.3386,19.0474,0.1003,9.2749
206,zh_en,tower_seamlessm4t,0.0000,1.8900,0.9228,57.4825


In [7]:
# Column order used when selecting from the pivoted tables:
# the seven en_* directions first, then the six *_en directions.
lang_pairs_order = ['en_es', 'en_fr', 'en_pt', 'en_it', 'en_de', 'en_nl', 'en_zh', 'es_en', 'fr_en', 'pt_en', 'it_en', 'de_en', 'zh_en']

In [8]:
# Wide table: one row per system, one column per direction, holding the
# 'XCOMET-QE-Strict-linguapy' value; columns follow lang_pairs_order.
pivoted_xcomet_qe = results_df.pivot(
    index='system',
    columns='direction',
    values='XCOMET-QE-Strict-linguapy',
).loc[:, lang_pairs_order]

In [9]:
# Write the table to a CSV file; the relative path resolves against the current working directory.
pivoted_xcomet_qe.to_csv('pivoted_xcomet_qe.csv')

In [10]:
# Wide table: one row per system, one column per direction, holding the
# 'QEMetricX_24-Strict-linguapy' value; columns follow lang_pairs_order.
pivoted_metricx_qe = results_df.pivot(
    index='system',
    columns='direction',
    values='QEMetricX_24-Strict-linguapy',
).loc[:, lang_pairs_order]

In [11]:
# Write the table to a CSV file; the relative path resolves against the current working directory.
pivoted_metricx_qe.to_csv('pivoted_metricx_qe.csv')

In [12]:
# Wide table: one row per system, one column per direction, holding the
# 'LinguaPy' value; columns follow lang_pairs_order.
pivoted_linguapy = results_df.pivot(
    index='system',
    columns='direction',
    values='LinguaPy',
).loc[:, lang_pairs_order]

In [13]:
# Write the table to a CSV file; the relative path resolves against the current working directory.
pivoted_linguapy.to_csv('pivoted_linguapy.csv')

### Gender Fleurs

In [14]:
def load_all_jsons(base_dir, manifests_dir, direction_pairs, system_names):
    """
    Load per-sample results for every (direction, system) combination, enrich
    them with manifest fields, and return everything as one DataFrame.

    Files read:
      * ``<base_dir>/<system>/<direction>/results.jsonl``
      * ``<manifests_dir>/<direction with '_' replaced by '-'>.jsonl``

    Result records and manifest entries are paired by line position.
    NOTE(review): this assumes both files list the same samples in the same
    order; nothing here checks sample ids against each other -- confirm.

    For every paired record:
      * 'gender' is copied from manifest['benchmark_metadata']['gender'];
      * 'linguapy_score' is the first element of metrics['linguapy_score'];
      * 'xcomet_qe_score' is metrics['xcomet_qe_score'] when linguapy_score
        equals 0, otherwise 0;
      * all manifest keys are merged into the record (manifest values win on
        key clashes).

    Args:
        base_dir (str or Path): Root directory of the evaluation outputs.
        manifests_dir (str or Path): Directory holding the manifest files.
        direction_pairs (list): Direction strings such as 'en_de'.
        system_names (list): System name strings.

    Returns:
        pandas.DataFrame: One row per result record, with 'direction' and
        'system' columns added. A combination whose results file is missing or
        unparsable contributes no rows. If only the manifest is missing or
        unparsable, the records are kept without the enrichment above. Every
        such case is reported with a printed message.
    """
    base_path = Path(base_dir)
    manifests_path = Path(manifests_dir)
    all_results = {}

    def _read_jsonl(path):
        # One JSON document per line; whitespace-only lines (e.g. a trailing
        # newline at end of file) carry no record and are skipped.
        with path.open('r', encoding='utf-8') as f:
            return [json.loads(line) for line in f if line.strip()]

    for direction, system in itertools.product(direction_pairs, system_names):
        results_path = base_path / system / direction / 'results.jsonl'
        manifest_path = manifests_path / (direction.replace('_', '-') + '.jsonl')

        # Create the per-direction dict on first use.
        per_system = all_results.setdefault(direction, {})

        try:
            records = _read_jsonl(results_path)
        except FileNotFoundError:
            print(f"Warning: File not found, skipping: {results_path}")
            continue
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON in {results_path}: {e}")
            continue

        # Stored before the manifest is read, so the records survive
        # (un-enriched) when the manifest cannot be loaded.
        per_system[system] = records

        try:
            manifests = _read_jsonl(manifest_path)
        except FileNotFoundError:
            print(f"Warning: Manifest not found, records left without manifest fields: {manifest_path}")
            continue
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON in {manifest_path}, records left without manifest fields: {e}")
            continue

        if len(records) != len(manifests):
            # zip() below stops at the shorter list; make that visible.
            print(
                f"Warning: {results_path} has {len(records)} records but "
                f"{manifest_path} has {len(manifests)}; only the first "
                f"{min(len(records), len(manifests))} are paired."
            )

        for it, it_manifests in zip(records, manifests):
            it_manifests['gender'] = it_manifests['benchmark_metadata']['gender']
            it['linguapy_score'] = it['metrics']['linguapy_score'][0]
            # The quality score is zeroed whenever linguapy_score is non-zero.
            it['xcomet_qe_score'] = it['metrics']['xcomet_qe_score'] if it['linguapy_score'] == 0 else 0
            it.update(it_manifests)

    # Flatten to a single list of records tagged with their origin.
    results = []
    for direction, per_system in all_results.items():
        for system, records in per_system.items():
            for item in records:
                item['direction'] = direction
                item['system'] = system
                results.append(item)

    results_df = pd.DataFrame(results)

    return results_df

In [None]:
# Directories passed to load_all_jsons: the manifest files and the evaluation outputs.
# Both are absolute paths, so this cell only works where that layout exists.
MANIFESTS_DIR = '/hearing2translate/manifests/fleurs'
BASE_DIR = '/hearing2translate/evaluation/output_evals/fleurs'

In [16]:
# The (direction, system) grid for the per-sample gender analysis.
DIRECTION_PAIRS = [
    'en_de', 'de_en', 'en_es', 'es_en', 'en_fr', 'fr_en', 'en_it',
    'it_en', 'en_nl', 'en_pt', 'pt_en', 'en_zh', 'zh_en',
]
SYSTEM_NAMES = [
    'canary-v2', 'gemma_canary-v2', 'gemma_seamlessm4t', 'owsm4.0-ctc',
    'qwen2audio-7b', 'spirelm', 'tower_owsm4.0-ctc', 'tower_whisper', 'whisper',
    'desta2-8b', 'gemma_owsm4.0-ctc', 'gemma_whisper', 'phi4multimodal',
    'seamlessm4t', 'tower_canary-v2', 'tower_seamlessm4t', 'voxtral-small-24b',
]

# Columns retained from the merged per-sample records.
columns_to_keep = [
    'dataset_id', 'sample_id', 'src_lang', 'tgt_lang',
    'xcomet_qe_score', 'linguapy_score', 'gender', 'direction', 'system',
]

# Load every per-sample record and keep only the columns listed above.
results_and_manifests = load_all_jsons(
    BASE_DIR, MANIFESTS_DIR, DIRECTION_PAIRS, SYSTEM_NAMES
).loc[:, columns_to_keep]

In [17]:
# Preview the first five per-sample rows.
results_and_manifests.head()

Unnamed: 0,dataset_id,sample_id,src_lang,tgt_lang,xcomet_qe_score,linguapy_score,gender,direction,system
0,fleurs,1904,en,de,0.978743,0,0,en_de,canary-v2
1,fleurs,1675,en,de,0.948608,0,0,en_de,canary-v2
2,fleurs,1950,en,de,0.977481,0,1,en_de,canary-v2
3,fleurs,1728,en,de,1.0,0,1,en_de,canary-v2
4,fleurs,1972,en,de,0.995062,0,1,en_de,canary-v2


In [58]:
import io

# gender 1 --> female
# gender 0 --> male

def analyze_gender_diff_by_system(df, target_direction, target_system):
    """
    Analyzes the average difference in xcomet_qe_score between gender=0
    and gender=1 pairs for a given direction and system.

    Rows are paired by sample_id with an inner merge, so only sample_ids that
    occur with both gender values contribute. If a sample_id occurs several
    times within one gender, the merge yields every g0 x g1 combination.

    Args:
        df (pd.DataFrame): The input DataFrame. The code reads the columns
            'direction', 'system', 'gender', 'sample_id' and 'xcomet_qe_score'.
        target_direction (str): The direction to filter by (e.g., 'en_de').
        target_system (str): The system to filter by (e.g., 'canary-v2').

    Returns:
        tuple or None: ``(avg_diff, abs_diff, E_quality)`` where
            avg_diff  -- mean over pairs of (gender-1 score - gender-0 score);
            abs_diff  -- mean over pairs of the absolute difference;
            E_quality -- 100 * (mean g1 score - mean g0 score) / mean g0 score.
        All three are NaN when rows exist for the combination but no sample_id
        has both genders (a message is printed). None is returned when no rows
        match the combination, or when any exception occurs during the
        analysis (the exception is printed, not raised).
    """
    try:
        # Filter for the specified direction and system
        df_filtered = df[(df['direction'] == target_direction) & (df['system'] == target_system)].copy()

        if df_filtered.empty:
            print(f"No data found for direction='{target_direction}' and system='{target_system}'.")
            return None

        # Separate gender 0 and gender 1
        # Coerce 'gender' to numeric so the comparisons below work on string
        # input; unparsable values become NaN and match neither group.
        df_filtered['gender'] = pd.to_numeric(df_filtered['gender'], errors='coerce')

        df_gender_0 = df_filtered[df_filtered['gender'] == 0]
        df_gender_1 = df_filtered[df_filtered['gender'] == 1]

        # Select relevant columns for merging
        cols_to_keep = ['sample_id', 'xcomet_qe_score']

        # Merge to find pairs (matching sample_id)
        # This step ensures we only get sample_ids that have *both*
        # a gender=0 and a gender=1 entry.
        df_merged = pd.merge(
            df_gender_0[cols_to_keep],
            df_gender_1[cols_to_keep],
            on='sample_id',
            suffixes=('_g0', '_g1')
        )

        # Check if any pairs were found
        num_pairs = len(df_merged)

        if num_pairs == 0:
            print(f"No matching pairs (gender 0 & 1 for the same sample_id) found for direction='{target_direction}' and system='{target_system}'.")
            # Nothing to average. Return NaNs explicitly instead of relying on
            # means of an empty frame; the combination still yields a result,
            # so callers that tabulate every combination keep their entry.
            nan = float('nan')
            return (nan, nan, nan)

        # Per-pair difference (gender 1 score - gender 0 score)
        score_diff = df_merged['xcomet_qe_score_g1'] - df_merged['xcomet_qe_score_g0']
        df_merged['abs_score_diff'] = score_diff.abs()
        df_merged['score_diff'] = score_diff  # if negative, male quality better than female quality

        # Calculate the average difference
        avg_diff = df_merged['score_diff'].mean()
        abs_diff = df_merged['abs_score_diff'].mean()

        # Disparity score as defined in

        #Giuseppe Attanasio, Beatrice Savoldi, Dennis Fucci, and Dirk Hovy. 2024.
        #Twists, Humps, and Pebbles: Multilingual Speech Recognition Models Exhibit
        #Gender Performance Gaps. In Proceedings of the 2024 Conference on Empirical
        #Methods in Natural Language Processing, pages 21318–21340, Miami, Florida,
        #USA. Association for Computational Linguistics.

        phi_g0 = df_merged['xcomet_qe_score_g0'].mean()
        phi_g1 = df_merged['xcomet_qe_score_g1'].mean()
        # NOTE(review): when phi_g0 is 0 this division is expected to yield
        # NaN/inf (NumPy float semantics) rather than raise -- confirm.
        E_quality = 100.0 * (phi_g1 - phi_g0) / phi_g0

        return (avg_diff, abs_diff, E_quality)

    except Exception as e:
        # Deliberately broad: a failure for one combination is reported and
        # signalled with None so a caller's loop can move on.
        print(f"An error occurred during analysis for {target_direction}, {target_system}: {e}")
        return None

In [None]:
DIRECTION_PAIRS = [
    'en_de', 'de_en', 'en_es', 'es_en', 'en_fr', 'fr_en', 'en_it',
    'it_en', 'en_nl', 'en_pt', 'pt_en', 'en_zh', 'zh_en',
]
SYSTEM_NAMES = [
    'canary-v2', 'gemma_canary-v2', 'gemma_seamlessm4t', 'owsm4.0-ctc',
    'qwen2audio-7b', 'spirelm', 'tower_owsm4.0-ctc', 'tower_whisper', 'whisper',
    'desta2-8b', 'gemma_owsm4.0-ctc', 'gemma_whisper', 'phi4multimodal',
    'seamlessm4t', 'tower_canary-v2', 'tower_seamlessm4t', 'voxtral-small-24b',
]

# One record per (direction, system) for which the analysis returned a result;
# combinations for which it returned None are left out.
results_diffs = []
for direction_pair, system_name in itertools.product(DIRECTION_PAIRS, SYSTEM_NAMES):
    diff_metrics = analyze_gender_diff_by_system(results_and_manifests, direction_pair, system_name)
    if diff_metrics is not None:
        diff_score, abs_diff_score, e_quality = diff_metrics
        results_diffs.append({
            'system': system_name,
            'direction': direction_pair,
            'diff_score': diff_score,
            'abs_diff_score': abs_diff_score,
            'E_quality': e_quality,
        })

df_diffs_scores = pd.DataFrame(results_diffs)

In [49]:
lang_pairs_order = [
    'en_es', 'en_fr', 'en_pt', 'en_it', 'en_de', 'en_nl', 'en_zh',
    'es_en', 'fr_en', 'pt_en', 'it_en', 'de_en', 'zh_en',
]
# System x direction table of the mean signed difference ('diff_score').
df_diffs_scores_pivoted = df_diffs_scores.pivot(
    index='system',
    columns='direction',
    values='diff_score',
).loc[:, lang_pairs_order]

In [50]:
# Round to 4 decimals and write to CSV; the relative path resolves against the current working directory.
df_diffs_scores_pivoted.round(4).to_csv('pivoted_diff_scores.csv')

In [51]:
# Display the signed-difference table (unrounded).
df_diffs_scores_pivoted

direction,en_es,en_fr,en_pt,en_it,en_de,en_nl,en_zh,es_en,fr_en,pt_en,it_en,de_en,zh_en
system,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
canary-v2,0.015879,0.033749,0.009841,0.011951,0.003265,0.003918,0.0,-0.030723,0.029803,,0.023716,,-0.005072917
desta2-8b,0.014417,0.056142,0.01244,0.031035,-0.004199,0.014174,-0.072205,-0.01837,-0.026449,,0.076392,,0.01099179
gemma_canary-v2,0.002988,0.008322,0.00115,0.000698,-0.005871,-0.002147,0.004819,-0.005263,0.006254,,-0.003601,,-0.001842131
gemma_owsm4.0-ctc,-0.006907,-0.005178,7.9e-05,-0.010728,0.000651,-0.001167,0.006825,-0.001728,0.006568,,-0.005137,,-4.071014e-05
gemma_seamlessm4t,-0.001017,0.005686,-0.003758,0.000851,0.001346,-0.001076,0.002978,0.007181,-9.1e-05,,0.001896,,-0.003414776
gemma_whisper,-0.003896,0.002697,0.001435,0.004013,0.000489,-0.005901,0.002215,0.000186,0.0047,,-0.013466,,-0.0009227105
owsm4.0-ctc,-0.009001,-0.004435,-0.260975,-0.027493,-0.001734,-0.023069,0.033044,0.007784,-0.071145,,-0.087207,,-0.01837642
phi4multimodal,0.036476,-0.00843,-0.000834,0.003683,0.005576,0.065865,0.016542,-0.007178,-0.010249,,-0.006458,,-0.007274718
qwen2audio-7b,-0.008711,0.019633,0.03134,0.006433,0.017533,0.015516,0.012967,-0.005733,-0.033348,,0.026358,,-0.003436196
seamlessm4t,0.020123,0.01252,0.016317,0.002535,0.004038,0.000625,0.022535,-0.004783,0.005112,,-0.00144,,-0.003905247


In [52]:
lang_pairs_order = [
    'en_es', 'en_fr', 'en_pt', 'en_it', 'en_de', 'en_nl', 'en_zh',
    'es_en', 'fr_en', 'pt_en', 'it_en', 'de_en', 'zh_en',
]
# System x direction table of the mean absolute difference ('abs_diff_score').
df_abs_diffs_scores_pivoted = df_diffs_scores.pivot(
    index='system',
    columns='direction',
    values='abs_diff_score',
).loc[:, lang_pairs_order]

In [53]:
# Round to 4 decimals and write to CSV; the relative path resolves against the current working directory.
df_abs_diffs_scores_pivoted.round(4).to_csv('pivoted_abs_diff_scores.csv')

In [54]:
# Display the absolute-difference table (unrounded).
df_abs_diffs_scores_pivoted

direction,en_es,en_fr,en_pt,en_it,en_de,en_nl,en_zh,es_en,fr_en,pt_en,it_en,de_en,zh_en
system,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
canary-v2,0.069597,0.101733,0.073402,0.084761,0.030401,0.042526,0.0,0.100852,0.111725,,0.19313,,0.03762541
desta2-8b,0.151148,0.205906,0.114585,0.156621,0.073496,0.107222,0.280136,0.1255,0.226413,,0.20731,,0.1745037
gemma_canary-v2,0.018375,0.022276,0.016012,0.01601,0.014587,0.014042,0.048266,0.018035,0.016655,,0.013863,,0.00771601
gemma_owsm4.0-ctc,0.026386,0.031,0.020046,0.034995,0.01166,0.016913,0.046435,0.021671,0.024873,,0.030634,,0.04061146
gemma_seamlessm4t,0.015101,0.026945,0.012977,0.020198,0.012513,0.012677,0.044902,0.022264,0.017929,,0.011246,,0.029649
gemma_whisper,0.01611,0.029476,0.01803,0.032885,0.015693,0.02326,0.048426,0.018225,0.01935,,0.02254,,0.02169933
owsm4.0-ctc,0.176542,0.163859,0.372618,0.173606,0.116238,0.130144,0.137651,0.144567,0.186353,,0.188175,,0.06783306
phi4multimodal,0.171581,0.218962,0.142262,0.134014,0.197411,0.152858,0.126016,0.074017,0.090874,,0.117607,,0.09149883
qwen2audio-7b,0.171634,0.139389,0.118448,0.161088,0.123275,0.120797,0.147903,0.116905,0.17421,,0.127176,,0.121502
seamlessm4t,0.07122,0.094179,0.07135,0.075412,0.0483,0.040832,0.103304,0.066464,0.125929,,0.048667,,0.0718947


In [55]:
lang_pairs_order = [
    'en_es', 'en_fr', 'en_pt', 'en_it', 'en_de', 'en_nl', 'en_zh',
    'es_en', 'fr_en', 'pt_en', 'it_en', 'de_en', 'zh_en',
]
# System x direction table of the relative disparity score ('E_quality').
df_E_quality_scores_pivoted = df_diffs_scores.pivot(
    index='system',
    columns='direction',
    values='E_quality',
).loc[:, lang_pairs_order]

In [56]:
# Round to 4 decimals and write to CSV; the relative path resolves against the current working directory.
df_E_quality_scores_pivoted.round(4).to_csv('pivoted_E_quality_scores.csv')

In [57]:
# Display the E_quality table (unrounded).
df_E_quality_scores_pivoted

direction,en_es,en_fr,en_pt,en_it,en_de,en_nl,en_zh,es_en,fr_en,pt_en,it_en,de_en,zh_en
system,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
canary-v2,1.776991,3.902252,1.107797,1.344291,0.345558,0.422662,,-3.634491,3.546237,,2.940353,,-11.22932
desta2-8b,1.751809,7.799422,1.512309,4.067025,-0.458246,1.641556,-13.868881,-2.186813,-3.655034,,10.709737,,1.476061
gemma_canary-v2,0.325878,0.922782,0.123834,0.075095,-0.608257,-0.225427,0.553883,-0.568238,0.686316,,-0.388071,,-5.344266
gemma_owsm4.0-ctc,-0.757743,-0.577957,0.008718,-1.174015,0.068178,-0.122545,0.790309,-0.19214,0.765932,,-0.58182,,-0.004726556
gemma_seamlessm4t,-0.109359,0.619928,-0.399633,0.091521,0.139382,-0.112155,0.331278,0.793224,-0.010145,,0.212353,,-0.3751686
gemma_whisper,-0.419456,0.296111,0.153452,0.433754,0.050731,-0.617627,0.255253,0.020316,0.522358,,-1.435322,,-0.1007425
owsm4.0-ctc,-1.559678,-1.115365,-52.676332,-5.568599,-0.224927,-4.573439,5.9378,1.499355,-14.042635,,-17.429963,,-8.600125
phi4multimodal,4.724734,-2.033466,-0.104466,0.439952,0.800623,10.236262,2.169491,-0.808585,-1.127218,,-0.738591,,-0.8595537
qwen2audio-7b,-1.10424,2.426073,3.765552,0.809122,2.022281,1.906149,1.627155,-0.685581,-4.131986,,3.226498,,-0.4076187
seamlessm4t,2.267346,1.433299,1.823799,0.280421,0.42987,0.066145,3.102886,-0.5376,0.589702,,-0.156507,,-0.4456253
