In [None]:
# Colab-only cell: mount Google Drive at /content/drive. The input and output
# folders configured further down in this file all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# NEW CODE

import pandas as pd
import os
from tqdm import tqdm

# Root folders on the mounted Drive. process_dataset() appends
# <dialect>/[GLUE + SuperGLUE/]<dataset>/ to the two input roots and
# <dialect>/[GLUE + SuperGLUE/] to the output root.
gpt_base = "/content/drive/MyDrive/!!Multi-AAVENUE/BLEU Score Filtered Datasets/GPT 4o"
multi_value_base = "/content/drive/MyDrive/!!Multi-AAVENUE/BLEU Score Filtered Datasets/Multi-VALUE/"
output_base = "/content/drive/MyDrive/!!Multi-AAVENUE/Aligned Translations/"

# Per-dataset column configuration, keyed by the dataset's folder name.
#   original_header: CSV column holding the source text.
#   dialect_header:  optional CSV column holding the translation; the literal
#                    'DIALECT' is replaced with the dialect code. When omitted,
#                    process_dataset() uses "<dialect> (<original_header>)".
#   file_ext:        not read anywhere in the code shown here (inputs are always
#                    opened as "<dataset>_filtered_bleu_scores.csv").
datasets = {
    "FOLIO(1000)": {"original_header": "Premises", "file_ext": "csv"},
    "GSM8K(1000)": {"original_header": "Original", "file_ext": "csv"},
    "HumanEVAL(164)": {"original_header": "Prompt", "file_ext": "csv"},
    "Logic Bench MCQ(480)": {"original_header": "Context", "dialect_header": "DIALECT (context)", "file_ext": "csv"},
    "Logic Bench YN(500)": {"original_header": "Context", "dialect_header": "DIALECT (context)", "file_ext": "csv"},
    "MBPP(374)": {"original_header": "Original", "file_ext": "csv"},
    "SVAMP(700)": {"original_header": "Original", "file_ext": "csv"},
    "BoolQ (1000)": {"original_header": "SAE Passage", "file_ext": "csv"},
    "COPA (500)": {"original_header": "Premise", "file_ext": "csv"},
    "MultiRC (1000)": {"original_header": "Paragraph", "file_ext": "csv"},
    "SST-2 (1000)": {"original_header": "Original Sentence", "file_ext": "csv"},
    "WSC (659)": {"original_header": "Original Paragraph", "file_ext": "csv"}
}

# Dialect codes; each is used as a folder name and inside column headers.
dialects = ["AAVE", "IndE", "JamE", "CollSgE", "ChcE"]

def read_csv(file_path, dialect, original_header, dialect_header):
    """Load one translation CSV as a two-column Original/Translated frame.

    Args:
        file_path: Path of the CSV file to read.
        dialect: Dialect code. Not used when reading; kept so existing
            callers do not have to change.
        original_header: Name of the column holding the source text.
        dialect_header: Name of the column holding the translated text.

    Returns:
        A DataFrame with columns 'Original' and 'Translated' containing one
        row per distinct, non-missing 'Original' value (the first occurrence
        wins). An empty DataFrame is returned, after printing a message, when
        the file cannot be read or either column is absent.
    """
    try:
        df = pd.read_csv(file_path)
        for header in (original_header, dialect_header):
            if header not in df.columns:
                print(f"Column '{header}' not found in {file_path}")
                return pd.DataFrame()
        pairs = pd.DataFrame({
            'Original': df[original_header],
            'Translated': df[dialect_header]
        })
        # 'Original' is later used as a merge key, and pandas matches NaN keys
        # against each other. A blank source row in both files would therefore
        # be "aligned" as if it were real text, so drop such rows here.
        pairs = pairs.dropna(subset=['Original'])
        return pairs.drop_duplicates(subset=['Original'], keep='first')
    except Exception as e:
        # Deliberately best-effort: report the problem and let the caller skip
        # this file instead of aborting the whole run.
        print(f"Error reading file {file_path}: {str(e)}")
        return pd.DataFrame()

def process_dataset(dialect, dataset):
    """Align the GPT 4o and Multi-VALUE translations of one dataset.

    Reads the two ``<dataset>_filtered_bleu_scores.csv`` files for the given
    dialect, inner-joins them on the 'Original' text, prints row counts and
    writes the result under ``output_base`` as ``aligned_<slug>.csv``, where
    the slug is the lower-cased dataset name with spaces turned into
    underscores and parentheses removed. When either input comes back empty a
    skip message is printed and nothing is written.

    Args:
        dialect: Dialect code, used as a folder name and in column headers.
        dataset: Key into the module-level ``datasets`` mapping.
    """
    glue_tasks = ("BoolQ (1000)", "COPA (500)", "MultiRC (1000)", "SST-2 (1000)", "WSC (659)")
    # These tasks sit in an extra "GLUE + SuperGLUE" folder, for the inputs
    # and for the output alike.
    if dataset in glue_tasks:
        subfolders = (dialect, "GLUE + SuperGLUE")
    else:
        subfolders = (dialect,)

    output_dir = os.path.join(output_base, *subfolders)
    os.makedirs(output_dir, exist_ok=True)

    source_name = f"{dataset}_filtered_bleu_scores.csv"
    gpt_path = os.path.join(gpt_base, *subfolders, dataset, source_name)
    mv_path = os.path.join(multi_value_base, *subfolders, dataset, source_name)

    spec = datasets[dataset]
    original_header = spec['original_header']
    # A configured header may carry the placeholder 'DIALECT'; without one the
    # column is expected to be named "<dialect> (<original_header>)".
    header_template = spec.get('dialect_header', f"{dialect} ({original_header})")
    dialect_header = header_template.replace('DIALECT', dialect)

    gpt_df = read_csv(gpt_path, dialect, original_header, dialect_header)
    mv_df = read_csv(mv_path, dialect, original_header, dialect_header)

    if gpt_df.empty or mv_df.empty:
        print(f"Skipping {dialect} - {dataset} due to empty DataFrame")
        return

    # Keep only source texts present in both files.
    merged_df = gpt_df.merge(mv_df, on='Original', how='inner', suffixes=('_GPT', '_MV'))
    final_df = merged_df.rename(columns={
        'Translated_GPT': 'Filtered GPT 4o',
        'Translated_MV': 'Filtered Multi-VALUE',
    })[['Original', 'Filtered GPT 4o', 'Filtered Multi-VALUE']]

    print(f"\nProcessed {dialect} - {dataset}")
    print(f"Original GPT rows: {len(gpt_df)}")
    print(f"Original MV rows: {len(mv_df)}")
    print(f"Final merged rows: {len(final_df)}")
    print(f"Final unique originals: {final_df['Original'].nunique()}")

    slug = dataset.lower().translate(str.maketrans({' ': '_', '(': None, ')': None}))
    output_path = os.path.join(output_dir, f"aligned_{slug}.csv")
    final_df.to_csv(output_path, index=False)

# Driver: run every (dialect, dataset) pair. A failure in one pair is reported
# and the run continues with the next pair.
for dialect in tqdm(dialects, desc="Processing dialects"):
    dataset_bar = tqdm(datasets, desc=f"Processing datasets for {dialect}")
    for dataset in dataset_bar:
        try:
            process_dataset(dialect, dataset)
        except Exception as err:
            print(f"Error processing {dialect} - {dataset}: {err}")