In [1]:
# Mount Google Drive at /content/drive; every input and output path used later
# in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install pandas tqdm sacrebleu

import os
import pandas as pd
from tqdm import tqdm
from sacrebleu.metrics import BLEU

Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.1.1 sacrebleu-2.4.3


In [None]:
# Pipeline configuration

# Dialect codes; each one is used in input/output paths and in the name of the
# dialect text column.
DIALECTS = [
    'EAAVE',
    'IndE',
    'JamE',
    'CollSgE',
    'ChcE',
]

# Dataset identifiers; each one is used verbatim as a directory name, a
# file-name prefix and a key of COLUMN_MAPPING.
DATASETS = [
    'WSC (659)',
    'SST-2 (1000)',
    'MultiRC (1000)',
    'COPA (500)',
    'BoolQ (1000)',
]

# Rows whose BLEU score is strictly below this value are kept in the filtered output.
THRESHOLD = 0.7

# Dataset identifier -> pair of column names. Both entries of each pair are the
# name of the column holding the SAE source text.
COLUMN_MAPPING = {
    dataset_name: (text_column, text_column)
    for dataset_name, text_column in (
        ('BoolQ (1000)', 'SAE Passage'),
        ('COPA (500)', 'Premise'),
        ('MultiRC (1000)', 'Paragraph'),
        ('SST-2 (1000)', 'Original Sentence'),
        ('WSC (659)', 'Original Paragraph'),
    )
}

def create_output_directories(dialect):
    """Create one output folder per dataset for ``dialect`` and return their parent.

    Args:
        dialect: Dialect code inserted into the output path.

    Returns:
        The dialect-level output directory path (a string); every entry of
        ``DATASETS`` gets a sub-directory beneath it.
    """
    dialect_root = f'/content/drive/MyDrive/!!Multi-AAVENUE/BLEU Score Filtered Datasets/{dialect}/GLUE + SuperGLUE'
    dataset_dirs = [f'{dialect_root}/{dataset_name}' for dataset_name in DATASETS]
    for directory in dataset_dirs:
        # exist_ok=True makes repeated runs harmless.
        os.makedirs(directory, exist_ok=True)
    return dialect_root

def calculate_bleu_score(reference, candidate):
    """Return the sentence-level BLEU of ``candidate`` against ``reference`` on a 0-1 scale.

    Args:
        reference: Reference text the candidate is compared with.
        candidate: Hypothesis text to score.

    Returns:
        The sacreBLEU sentence score divided by 100. If scoring raises an
        ordinary exception, a warning is emitted and ``0.0`` is returned.
        Callers that keep rows with a low score should be aware that such
        failed rows look like a score of zero.
    """
    try:
        # effective_order=True is the setting sacreBLEU recommends for
        # sentence-level scoring of short texts.
        bleu = BLEU(effective_order=True)
        return bleu.sentence_score(candidate, [reference]).score / 100
    except Exception as exc:
        # Only ordinary errors are absorbed; KeyboardInterrupt and SystemExit
        # propagate so a long run can still be stopped.
        import warnings

        # The message varies only by exception type, so the default warning
        # filter shows it once per type instead of once per failing row.
        warnings.warn(
            f'BLEU scoring failed ({type(exc).__name__}); returning 0.0',
            RuntimeWarning,
            stacklevel=2,
        )
        return 0.0

def process_dataset(dialect, dataset, input_path, output_base_path):
    """Score one translated dataset with sentence-level BLEU and write the results.

    Reads the CSV at ``input_path``, scores each row's dialect text against its
    SAE source text, and writes three files into
    ``{output_base_path}/{dataset}``: the full table with an added BLEU column,
    the rows scoring below ``THRESHOLD``, and a short text summary.

    Args:
        dialect: Dialect code; builds the dialect column name
            ``'{dialect} ({sae_col})'`` and appears in progress/error messages.
        dataset: Key of ``COLUMN_MAPPING``; also the output sub-directory name
            and file-name prefix.
        input_path: Path of the CSV file to read.
        output_base_path: Directory that contains (or will contain) the
            dataset's output folder.

    Returns:
        ``(total_samples, below_threshold, percentage)``. Any exception is
        printed and reported as ``(0, 0, 0)`` so the caller can carry on with
        the next dataset.
    """
    try:
        # Only the SAE column name is needed; the dialect column is derived from it.
        sae_col = COLUMN_MAPPING[dataset][0]
        dialect_col = f'{dialect} ({sae_col})'
        bleu_col = f'BLEU Score {sae_col}'

        # Read dataset
        df = pd.read_csv(input_path)

        # Missing cells become empty strings so the scorer always receives str.
        df[sae_col] = df[sae_col].fillna('').astype(str)
        df[dialect_col] = df[dialect_col].fillna('').astype(str)

        if df.empty:
            # A row-wise apply on an empty frame may not yield a Series, so an
            # empty float column is assigned directly.
            df[bleu_col] = pd.Series(dtype=float)
        else:
            # Calculate BLEU scores with progress bar
            tqdm.pandas(desc=f'Calculating BLEU scores for {dialect} - {dataset}')
            df[bleu_col] = df.progress_apply(
                lambda row: calculate_bleu_score(row[sae_col], row[dialect_col]),
                axis=1
            )

        # Make sure the target folder exists even when this function is called
        # without create_output_directories having run first.
        output_path = f'{output_base_path}/{dataset}'
        os.makedirs(output_path, exist_ok=True)

        # Save full dataset with BLEU scores
        df.to_csv(f'{output_path}/{dataset}_bleu_scores.csv', index=False)

        # Save filtered dataset (rows strictly below the threshold)
        filtered_df = df[df[bleu_col] < THRESHOLD]
        filtered_df.to_csv(f'{output_path}/{dataset}_filtered_bleu_scores.csv', index=False)

        # Calculate and save statistics; an empty input counts as 0% instead of
        # dividing by zero.
        total_samples = len(df)
        below_threshold = len(filtered_df)
        percentage = (below_threshold / total_samples) * 100 if total_samples else 0.0

        with open(f'{output_path}/{dataset}_percentage.txt', 'w') as f:
            f.write(f'Total Samples: {total_samples}\n')
            # The threshold is taken from the constant so the report cannot
            # drift from the value actually used for filtering.
            f.write(f'Samples Below Threshold (<{THRESHOLD}): {below_threshold}\n')
            f.write(f'Percentage Below Threshold: {percentage:.2f}%')

        return total_samples, below_threshold, percentage

    except Exception as e:
        print(f'\nError in process_dataset for {dialect} - {dataset}: {str(e)}')
        return 0, 0, 0

def main():
    """Run the BLEU scoring pipeline for every dialect/dataset combination.

    For each entry of ``DIALECTS`` the output folders are created, then each
    entry of ``DATASETS`` is handed to ``process_dataset``. A two-line summary
    is printed for every dataset that reports at least one row. An exception
    raised for one dataset is printed and the loop moves on to the next.
    """
    for dialect in tqdm(DIALECTS, desc='Processing dialects'):
        output_base_path = create_output_directories(dialect)
        dataset_bar = tqdm(DATASETS, desc=f'Processing datasets for {dialect}')

        for dataset in dataset_bar:
            input_path = f'/content/drive/MyDrive/!!Multi-AAVENUE/GPT 4o + Multi-VALUE Translated Datasets/GPT 4o/{dialect}/Glue + SuperGlue/{dataset}_{dialect}.csv'

            try:
                stats = process_dataset(dialect, dataset, input_path, output_base_path)
                n_rows, n_below, pct_below = stats
                if n_rows <= 0:
                    # A zero row count means there is nothing to summarise
                    # (process_dataset reports (0, 0, 0) after a failure).
                    continue
                print(f'\nProcessed {dataset} for {dialect}:')
                print(f'Total: {n_rows}, Below threshold: {n_below}, Percentage: {pct_below:.2f}%')
            except Exception as err:
                print(f'\nError processing {dataset} for {dialect}: {str(err)}')

# Start the pipeline only when this code runs as the main module, not when it is
# imported from elsewhere.
if __name__ == '__main__':
    main()

Processing dialects:   0%|          | 0/5 [00:00<?, ?it/s]
Processing datasets for EAAVE:   0%|          | 0/5 [00:00<?, ?it/s][A

Calculating BLEU scores for EAAVE - WSC (659):   0%|          | 0/658 [00:00<?, ?it/s][A[A

Calculating BLEU scores for EAAVE - WSC (659):  25%|██▍       | 164/658 [00:00<00:00, 1632.75it/s][A[A

Calculating BLEU scores for EAAVE - WSC (659):  57%|█████▋    | 374/658 [00:00<00:00, 1902.89it/s][A[A

Calculating BLEU scores for EAAVE - WSC (659): 100%|██████████| 658/658 [00:00<00:00, 1705.60it/s]

Processing datasets for EAAVE:  20%|██        | 1/5 [00:00<00:01,  2.04it/s][A


Processed WSC (659) for EAAVE:
Total: 658, Below threshold: 658, Percentage: 100.00%




Calculating BLEU scores for EAAVE - SST-2 (1000):   0%|          | 0/1000 [00:00<?, ?it/s][A[A

Calculating BLEU scores for EAAVE - SST-2 (1000):  19%|█▉        | 188/1000 [00:00<00:00, 1829.95it/s][A[A

Calculating BLEU scores for EAAVE - SST-2 (1000):  37%|███▋      | 371/1000 [00:00<00:00, 1525.69it/s][A[A

Calculating BLEU scores for EAAVE - SST-2 (1000):  63%|██████▎   | 634/1000 [00:00<00:00, 1977.40it/s][A[A

Calculating BLEU scores for EAAVE - SST-2 (1000): 100%|██████████| 1000/1000 [00:00<00:00, 2128.22it/s]

Processing datasets for EAAVE:  40%|████      | 2/5 [00:01<00:01,  1.87it/s][A


Processed SST-2 (1000) for EAAVE:
Total: 1000, Below threshold: 1000, Percentage: 100.00%




Calculating BLEU scores for EAAVE - MultiRC (1000):   0%|          | 0/1000 [00:00<?, ?it/s][A[A

Calculating BLEU scores for EAAVE - MultiRC (1000):   2%|▏         | 17/1000 [00:00<00:05, 166.92it/s][A[A

Calculating BLEU scores for EAAVE - MultiRC (1000):   3%|▎         | 34/1000 [00:00<00:06, 146.61it/s][A[A

Calculating BLEU scores for EAAVE - MultiRC (1000):   6%|▌         | 58/1000 [00:00<00:05, 183.87it/s][A[A

Calculating BLEU scores for EAAVE - MultiRC (1000):   8%|▊         | 83/1000 [00:00<00:04, 208.45it/s][A[A

Calculating BLEU scores for EAAVE - MultiRC (1000):  11%|█         | 106/1000 [00:00<00:04, 214.74it/s][A[A

Calculating BLEU scores for EAAVE - MultiRC (1000):  13%|█▎        | 128/1000 [00:00<00:04, 215.08it/s][A[A

Calculating BLEU scores for EAAVE - MultiRC (1000):  15%|█▌        | 154/1000 [00:00<00:03, 228.98it/s][A[A

Calculating BLEU scores for EAAVE - MultiRC (1000):  18%|█▊        | 181/1000 [00:00<00:03, 241.61it/s][A[A

Calculating BL


Processed MultiRC (1000) for EAAVE:
Total: 1000, Below threshold: 997, Percentage: 99.70%




Calculating BLEU scores for EAAVE - COPA (500):   0%|          | 0/500 [00:00<?, ?it/s][A[A

Calculating BLEU scores for EAAVE - COPA (500): 100%|██████████| 500/500 [00:00<00:00, 3987.06it/s]

Processing datasets for EAAVE:  80%|████████  | 4/5 [00:05<00:01,  1.33s/it]


Processed COPA (500) for EAAVE:
Total: 500, Below threshold: 497, Percentage: 99.40%


[A

Calculating BLEU scores for EAAVE - BoolQ (1000):   0%|          | 0/999 [00:00<?, ?it/s][A[A

Calculating BLEU scores for EAAVE - BoolQ (1000):  10%|▉         | 98/999 [00:00<00:00, 979.23it/s][A[A

Calculating BLEU scores for EAAVE - BoolQ (1000):  20%|█▉        | 196/999 [00:00<00:00, 907.07it/s][A[A

Calculating BLEU scores for EAAVE - BoolQ (1000):  29%|██▉       | 288/999 [00:00<00:00, 824.79it/s][A[A

Calculating BLEU scores for EAAVE - BoolQ (1000):  37%|███▋      | 372/999 [00:00<00:00, 763.61it/s][A[A

Calculating BLEU scores for EAAVE - BoolQ (1000):  45%|████▌     | 450/999 [00:00<00:00, 663.78it/s][A[A

Calculating BLEU scores for EAAVE - BoolQ (1000):  52%|█████▏    | 519/999 [00:00<00:00, 660.45it/s][A[A

Calculating BLEU scores for EAAVE - BoolQ (1000):  60%|██████    | 602/999 [00:00<00:00, 708.07it/s][A[A

Calculating BLEU scores for EAAVE - BoolQ (1000):  68%|██████▊   | 681/999 [00:00<00:00, 731.33it/s][A[A

Calculating BLEU scores for EAAVE -


Processed BoolQ (1000) for EAAVE:
Total: 999, Below threshold: 997, Percentage: 99.80%



Processing datasets for IndE:   0%|          | 0/5 [00:00<?, ?it/s][A

Calculating BLEU scores for IndE - WSC (659):   0%|          | 0/658 [00:00<?, ?it/s][A[A

Calculating BLEU scores for IndE - WSC (659):  26%|██▌       | 172/658 [00:00<00:00, 1716.96it/s][A[A

Calculating BLEU scores for IndE - WSC (659): 100%|██████████| 658/658 [00:00<00:00, 2447.40it/s]

Processing datasets for IndE:  20%|██        | 1/5 [00:00<00:01,  2.75it/s][A


Processed WSC (659) for IndE:
Total: 658, Below threshold: 511, Percentage: 77.66%




Calculating BLEU scores for IndE - SST-2 (1000):   0%|          | 0/1000 [00:00<?, ?it/s][A[A

Calculating BLEU scores for IndE - SST-2 (1000):  40%|████      | 402/1000 [00:00<00:00, 4015.10it/s][A[A

Calculating BLEU scores for IndE - SST-2 (1000): 100%|██████████| 1000/1000 [00:00<00:00, 3593.01it/s]

Processing datasets for IndE:  40%|████      | 2/5 [00:00<00:01,  2.80it/s][A


Processed SST-2 (1000) for IndE:
Total: 1000, Below threshold: 807, Percentage: 80.70%




Calculating BLEU scores for IndE - MultiRC (1000):   0%|          | 0/1000 [00:00<?, ?it/s][A[A

Calculating BLEU scores for IndE - MultiRC (1000):   2%|▏         | 24/1000 [00:00<00:04, 239.84it/s][A[A

Calculating BLEU scores for IndE - MultiRC (1000):   5%|▍         | 48/1000 [00:00<00:04, 214.03it/s][A[A

Calculating BLEU scores for IndE - MultiRC (1000):   7%|▋         | 70/1000 [00:00<00:05, 182.17it/s][A[A

Calculating BLEU scores for IndE - MultiRC (1000):   9%|▉         | 93/1000 [00:00<00:04, 197.53it/s][A[A

Calculating BLEU scores for IndE - MultiRC (1000):  12%|█▏        | 119/1000 [00:00<00:04, 215.64it/s][A[A

Calculating BLEU scores for IndE - MultiRC (1000):  14%|█▍        | 143/1000 [00:00<00:03, 222.26it/s][A[A

Calculating BLEU scores for IndE - MultiRC (1000):  17%|█▋        | 168/1000 [00:00<00:03, 228.71it/s][A[A

Calculating BLEU scores for IndE - MultiRC (1000):  20%|██        | 203/1000 [00:00<00:03, 265.23it/s][A[A

Calculating BLEU scores


Processed MultiRC (1000) for IndE:
Total: 1000, Below threshold: 376, Percentage: 37.60%




Calculating BLEU scores for IndE - COPA (500):   0%|          | 0/500 [00:00<?, ?it/s][A[A

Calculating BLEU scores for IndE - COPA (500):  44%|████▎     | 218/500 [00:00<00:00, 2177.59it/s][A[A

Calculating BLEU scores for IndE - COPA (500): 100%|██████████| 500/500 [00:00<00:00, 2259.29it/s]

Processing datasets for IndE:  80%|████████  | 4/5 [00:05<00:01,  1.45s/it][A


Processed COPA (500) for IndE:
Total: 500, Below threshold: 422, Percentage: 84.40%




Calculating BLEU scores for IndE - BoolQ (1000):   0%|          | 0/999 [00:00<?, ?it/s][A[A

Calculating BLEU scores for IndE - BoolQ (1000):   5%|▌         | 53/999 [00:00<00:01, 528.91it/s][A[A

Calculating BLEU scores for IndE - BoolQ (1000):  11%|█         | 106/999 [00:00<00:01, 504.86it/s][A[A

Calculating BLEU scores for IndE - BoolQ (1000):  16%|█▌        | 160/999 [00:00<00:01, 519.64it/s][A[A

Calculating BLEU scores for IndE - BoolQ (1000):  21%|██▏       | 213/999 [00:00<00:01, 433.57it/s][A[A

Calculating BLEU scores for IndE - BoolQ (1000):  26%|██▌       | 259/999 [00:00<00:01, 407.67it/s][A[A

Calculating BLEU scores for IndE - BoolQ (1000):  30%|███       | 303/999 [00:00<00:01, 411.37it/s][A[A

Calculating BLEU scores for IndE - BoolQ (1000):  35%|███▍      | 345/999 [00:00<00:01, 408.72it/s][A[A

Calculating BLEU scores for IndE - BoolQ (1000):  39%|███▊      | 387/999 [00:00<00:01, 409.58it/s][A[A

Calculating BLEU scores for IndE - BoolQ (1000)


Processed BoolQ (1000) for IndE:
Total: 999, Below threshold: 400, Percentage: 40.04%



Processing datasets for JamE:   0%|          | 0/5 [00:00<?, ?it/s][A

Calculating BLEU scores for JamE - WSC (659):   0%|          | 0/658 [00:00<?, ?it/s][A[A

Calculating BLEU scores for JamE - WSC (659):  16%|█▌        | 106/658 [00:00<00:00, 1050.88it/s][A[A

Calculating BLEU scores for JamE - WSC (659):  32%|███▏      | 212/658 [00:00<00:00, 1035.78it/s][A[A

Calculating BLEU scores for JamE - WSC (659):  48%|████▊     | 316/658 [00:00<00:00, 1029.41it/s][A[A

Calculating BLEU scores for JamE - WSC (659):  65%|██████▍   | 425/658 [00:00<00:00, 1052.25it/s][A[A

Calculating BLEU scores for JamE - WSC (659): 100%|██████████| 658/658 [00:00<00:00, 1166.81it/s]

Processing datasets for JamE:  20%|██        | 1/5 [00:00<00:02,  1.50it/s][A


Processed WSC (659) for JamE:
Total: 658, Below threshold: 658, Percentage: 100.00%




Calculating BLEU scores for JamE - SST-2 (1000):   0%|          | 0/1000 [00:00<?, ?it/s][A[A

Calculating BLEU scores for JamE - SST-2 (1000):  25%|██▌       | 251/1000 [00:00<00:00, 2503.88it/s][A[A

Calculating BLEU scores for JamE - SST-2 (1000):  50%|█████     | 502/1000 [00:00<00:00, 2283.71it/s][A[A

Calculating BLEU scores for JamE - SST-2 (1000):  73%|███████▎  | 732/1000 [00:00<00:00, 2163.99it/s][A[A

Calculating BLEU scores for JamE - SST-2 (1000): 100%|██████████| 1000/1000 [00:00<00:00, 2146.59it/s]

Processing datasets for JamE:  40%|████      | 2/5 [00:01<00:01,  1.65it/s][A


Processed SST-2 (1000) for JamE:
Total: 1000, Below threshold: 919, Percentage: 91.90%




Calculating BLEU scores for JamE - MultiRC (1000):   0%|          | 0/1000 [00:00<?, ?it/s][A[A

Calculating BLEU scores for JamE - MultiRC (1000):   2%|▏         | 16/1000 [00:00<00:06, 158.55it/s][A[A

Calculating BLEU scores for JamE - MultiRC (1000):   3%|▎         | 32/1000 [00:00<00:06, 146.08it/s][A[A

Calculating BLEU scores for JamE - MultiRC (1000):   5%|▍         | 47/1000 [00:00<00:06, 144.26it/s][A[A

Calculating BLEU scores for JamE - MultiRC (1000):   6%|▌         | 62/1000 [00:00<00:06, 143.86it/s][A[A

Calculating BLEU scores for JamE - MultiRC (1000):   8%|▊         | 77/1000 [00:00<00:07, 126.52it/s][A[A

Calculating BLEU scores for JamE - MultiRC (1000):   9%|▉         | 90/1000 [00:00<00:07, 123.98it/s][A[A

Calculating BLEU scores for JamE - MultiRC (1000):  10%|█         | 103/1000 [00:00<00:07, 122.83it/s][A[A

Calculating BLEU scores for JamE - MultiRC (1000):  12%|█▏        | 116/1000 [00:00<00:07, 124.76it/s][A[A

Calculating BLEU scores f


Processed MultiRC (1000) for JamE:
Total: 1000, Below threshold: 1000, Percentage: 100.00%




Calculating BLEU scores for JamE - COPA (500):   0%|          | 0/500 [00:00<?, ?it/s][A[A

Calculating BLEU scores for JamE - COPA (500): 100%|██████████| 500/500 [00:00<00:00, 3106.39it/s]

Processing datasets for JamE:  80%|████████  | 4/5 [00:05<00:01,  1.49s/it][A


Processed COPA (500) for JamE:
Total: 500, Below threshold: 499, Percentage: 99.80%




Calculating BLEU scores for JamE - BoolQ (1000):   0%|          | 0/999 [00:00<?, ?it/s][A[A

Calculating BLEU scores for JamE - BoolQ (1000):   5%|▌         | 52/999 [00:00<00:01, 510.22it/s][A[A

Calculating BLEU scores for JamE - BoolQ (1000):  10%|█         | 104/999 [00:00<00:01, 485.56it/s][A[A

Calculating BLEU scores for JamE - BoolQ (1000):  17%|█▋        | 169/999 [00:00<00:01, 556.92it/s][A[A

Calculating BLEU scores for JamE - BoolQ (1000):  23%|██▎       | 234/999 [00:00<00:01, 592.64it/s][A[A

Calculating BLEU scores for JamE - BoolQ (1000):  31%|███       | 309/999 [00:00<00:01, 647.95it/s][A[A

Calculating BLEU scores for JamE - BoolQ (1000):  38%|███▊      | 380/999 [00:00<00:00, 667.44it/s][A[A

Calculating BLEU scores for JamE - BoolQ (1000):  46%|████▌     | 460/999 [00:00<00:00, 710.12it/s][A[A

Calculating BLEU scores for JamE - BoolQ (1000):  53%|█████▎    | 532/999 [00:00<00:00, 681.35it/s][A[A

Calculating BLEU scores for JamE - BoolQ (1000)


Processed BoolQ (1000) for JamE:
Total: 999, Below threshold: 995, Percentage: 99.60%



Processing datasets for CollSgE:   0%|          | 0/5 [00:00<?, ?it/s][A

Calculating BLEU scores for CollSgE - WSC (659):   0%|          | 0/658 [00:00<?, ?it/s][A[A

Calculating BLEU scores for CollSgE - WSC (659):  40%|████      | 266/658 [00:00<00:00, 2652.17it/s][A[A

Calculating BLEU scores for CollSgE - WSC (659): 100%|██████████| 658/658 [00:00<00:00, 2188.87it/s]

Processing datasets for CollSgE:  20%|██        | 1/5 [00:00<00:01,  2.66it/s][A


Processed WSC (659) for CollSgE:
Total: 658, Below threshold: 648, Percentage: 98.48%




Calculating BLEU scores for CollSgE - SST-2 (1000):   0%|          | 0/1000 [00:00<?, ?it/s][A[A

Calculating BLEU scores for CollSgE - SST-2 (1000):  33%|███▎      | 328/1000 [00:00<00:00, 3279.93it/s][A[A

Calculating BLEU scores for CollSgE - SST-2 (1000):  66%|██████▌   | 656/1000 [00:00<00:00, 2773.26it/s][A[A

Calculating BLEU scores for CollSgE - SST-2 (1000): 100%|██████████| 1000/1000 [00:00<00:00, 2844.04it/s]

Processing datasets for CollSgE:  40%|████      | 2/5 [00:00<00:01,  2.46it/s][A


Processed SST-2 (1000) for CollSgE:
Total: 1000, Below threshold: 956, Percentage: 95.60%




Calculating BLEU scores for CollSgE - MultiRC (1000):   0%|          | 0/1000 [00:00<?, ?it/s][A[A

Calculating BLEU scores for CollSgE - MultiRC (1000):   3%|▎         | 28/1000 [00:00<00:03, 274.11it/s][A[A

Calculating BLEU scores for CollSgE - MultiRC (1000):   6%|▌         | 56/1000 [00:00<00:03, 265.74it/s][A[A

Calculating BLEU scores for CollSgE - MultiRC (1000):   8%|▊         | 83/1000 [00:00<00:03, 263.72it/s][A[A

Calculating BLEU scores for CollSgE - MultiRC (1000):  11%|█         | 110/1000 [00:00<00:03, 245.85it/s][A[A

Calculating BLEU scores for CollSgE - MultiRC (1000):  14%|█▎        | 135/1000 [00:00<00:03, 244.37it/s][A[A

Calculating BLEU scores for CollSgE - MultiRC (1000):  16%|█▌        | 160/1000 [00:00<00:03, 232.40it/s][A[A

Calculating BLEU scores for CollSgE - MultiRC (1000):  18%|█▊        | 184/1000 [00:00<00:03, 233.31it/s][A[A

Calculating BLEU scores for CollSgE - MultiRC (1000):  22%|██▏       | 223/1000 [00:00<00:02, 278.55it/s][A


Processed MultiRC (1000) for CollSgE:
Total: 1000, Below threshold: 989, Percentage: 98.90%




Calculating BLEU scores for CollSgE - COPA (500):   0%|          | 0/500 [00:00<?, ?it/s][A[A

Calculating BLEU scores for CollSgE - COPA (500):  50%|████▉     | 249/500 [00:00<00:00, 2489.87it/s][A[A

Calculating BLEU scores for CollSgE - COPA (500): 100%|██████████| 500/500 [00:00<00:00, 1801.32it/s]

Processing datasets for CollSgE:  80%|████████  | 4/5 [00:05<00:01,  1.34s/it][A


Processed COPA (500) for CollSgE:
Total: 500, Below threshold: 487, Percentage: 97.40%




Calculating BLEU scores for CollSgE - BoolQ (1000):   0%|          | 0/999 [00:00<?, ?it/s][A[A

Calculating BLEU scores for CollSgE - BoolQ (1000):   5%|▌         | 52/999 [00:00<00:01, 510.58it/s][A[A

Calculating BLEU scores for CollSgE - BoolQ (1000):  10%|█         | 104/999 [00:00<00:01, 505.22it/s][A[A

Calculating BLEU scores for CollSgE - BoolQ (1000):  16%|█▌        | 155/999 [00:00<00:01, 498.36it/s][A[A

Calculating BLEU scores for CollSgE - BoolQ (1000):  21%|██        | 205/999 [00:00<00:01, 469.43it/s][A[A

Calculating BLEU scores for CollSgE - BoolQ (1000):  25%|██▌       | 253/999 [00:00<00:01, 412.15it/s][A[A

Calculating BLEU scores for CollSgE - BoolQ (1000):  30%|██▉       | 296/999 [00:00<00:01, 402.89it/s][A[A

Calculating BLEU scores for CollSgE - BoolQ (1000):  34%|███▍      | 341/999 [00:00<00:01, 415.80it/s][A[A

Calculating BLEU scores for CollSgE - BoolQ (1000):  38%|███▊      | 384/999 [00:00<00:01, 397.74it/s][A[A

Calculating BLEU sco


Processed BoolQ (1000) for CollSgE:
Total: 999, Below threshold: 979, Percentage: 98.00%



Processing datasets for ChcE:   0%|          | 0/5 [00:00<?, ?it/s][A

Calculating BLEU scores for ChcE - WSC (659):   0%|          | 0/658 [00:00<?, ?it/s][A[A

Calculating BLEU scores for ChcE - WSC (659):  17%|█▋        | 111/658 [00:00<00:00, 1106.16it/s][A[A

Calculating BLEU scores for ChcE - WSC (659):  53%|█████▎    | 346/658 [00:00<00:00, 1835.45it/s][A[A

Calculating BLEU scores for ChcE - WSC (659): 100%|██████████| 658/658 [00:00<00:00, 1620.74it/s]

Processing datasets for ChcE:  20%|██        | 1/5 [00:00<00:01,  2.04it/s][A


Processed WSC (659) for ChcE:
Total: 658, Below threshold: 657, Percentage: 99.85%




Calculating BLEU scores for ChcE - SST-2 (1000):   0%|          | 0/1000 [00:00<?, ?it/s][A[A

Calculating BLEU scores for ChcE - SST-2 (1000):  22%|██▏       | 215/1000 [00:00<00:00, 2149.32it/s][A[A

Calculating BLEU scores for ChcE - SST-2 (1000):  43%|████▎     | 430/1000 [00:00<00:00, 2028.48it/s][A[A

Calculating BLEU scores for ChcE - SST-2 (1000):  63%|██████▎   | 634/1000 [00:00<00:00, 1885.29it/s][A[A

Calculating BLEU scores for ChcE - SST-2 (1000):  82%|████████▏ | 824/1000 [00:00<00:00, 1480.77it/s][A[A

Calculating BLEU scores for ChcE - SST-2 (1000): 100%|██████████| 1000/1000 [00:00<00:00, 1462.47it/s]

Processing datasets for ChcE:  40%|████      | 2/5 [00:01<00:02,  1.47it/s][A


Processed SST-2 (1000) for ChcE:
Total: 1000, Below threshold: 943, Percentage: 94.30%




Calculating BLEU scores for ChcE - MultiRC (1000):   0%|          | 0/1000 [00:00<?, ?it/s][A[A

Calculating BLEU scores for ChcE - MultiRC (1000):   2%|▏         | 16/1000 [00:00<00:06, 153.84it/s][A[A

Calculating BLEU scores for ChcE - MultiRC (1000):   3%|▎         | 32/1000 [00:00<00:07, 137.20it/s][A[A

Calculating BLEU scores for ChcE - MultiRC (1000):   5%|▍         | 46/1000 [00:00<00:06, 136.66it/s][A[A

Calculating BLEU scores for ChcE - MultiRC (1000):   6%|▌         | 61/1000 [00:00<00:06, 140.32it/s][A[A

Calculating BLEU scores for ChcE - MultiRC (1000):   8%|▊         | 76/1000 [00:00<00:06, 136.05it/s][A[A

Calculating BLEU scores for ChcE - MultiRC (1000):   9%|▉         | 90/1000 [00:00<00:06, 132.56it/s][A[A

Calculating BLEU scores for ChcE - MultiRC (1000):  10%|█         | 104/1000 [00:00<00:06, 134.80it/s][A[A

Calculating BLEU scores for ChcE - MultiRC (1000):  12%|█▏        | 119/1000 [00:00<00:06, 137.50it/s][A[A

Calculating BLEU scores f


Processed MultiRC (1000) for ChcE:
Total: 1000, Below threshold: 1000, Percentage: 100.00%




Calculating BLEU scores for ChcE - COPA (500):   0%|          | 0/500 [00:00<?, ?it/s][A[A

Calculating BLEU scores for ChcE - COPA (500): 100%|██████████| 500/500 [00:00<00:00, 3578.86it/s]

Processing datasets for ChcE:  80%|████████  | 4/5 [00:06<00:01,  1.61s/it][A


Processed COPA (500) for ChcE:
Total: 500, Below threshold: 481, Percentage: 96.20%




Calculating BLEU scores for ChcE - BoolQ (1000):   0%|          | 0/999 [00:00<?, ?it/s][A[A

Calculating BLEU scores for ChcE - BoolQ (1000):   9%|▉         | 92/999 [00:00<00:00, 913.77it/s][A[A

Calculating BLEU scores for ChcE - BoolQ (1000):  18%|█▊        | 184/999 [00:00<00:00, 865.73it/s][A[A

Calculating BLEU scores for ChcE - BoolQ (1000):  27%|██▋       | 271/999 [00:00<00:00, 808.98it/s][A[A

Calculating BLEU scores for ChcE - BoolQ (1000):  35%|███▌      | 353/999 [00:00<00:00, 773.29it/s][A[A

Calculating BLEU scores for ChcE - BoolQ (1000):  43%|████▎     | 431/999 [00:00<00:00, 743.26it/s][A[A

Calculating BLEU scores for ChcE - BoolQ (1000):  51%|█████     | 506/999 [00:00<00:00, 682.23it/s][A[A

Calculating BLEU scores for ChcE - BoolQ (1000):  58%|█████▊    | 575/999 [00:00<00:00, 673.35it/s][A[A

Calculating BLEU scores for ChcE - BoolQ (1000):  65%|██████▍   | 646/999 [00:00<00:00, 682.57it/s][A[A

Calculating BLEU scores for ChcE - BoolQ (1000)


Processed BoolQ (1000) for ChcE:
Total: 999, Below threshold: 994, Percentage: 99.50%



