In [None]:
# Mount Google Drive at /content/drive; every dataset path and output
# directory used below lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install openai==0.28

import os
import re
import csv
import pandas as pd
import openai
from tqdm import tqdm

# Prefer the OPENAI_API_KEY environment variable so a real key never has to be
# pasted into (and saved with) the notebook; fall back to the original
# placeholder string when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "API-KEY")

# System prompt for the fluency judge. `{dialect}` is the only placeholder and
# is filled in with str.format(dialect=...) by get_fluency_score(); any other
# literal curly brace added to this text would have to be doubled ({{ }}).
# The final instruction ("Fluency Score: X") is the line the score parser
# looks for in the model's reply.
FLUENCY_PROMPT = """
You are an expert linguist capable of detailed chain-of-thought reasoning.

You are given two pieces of text:
1) **Original Text (SAE)** – the standard American English version.
2) **Dialect Text** – a translated or adapted version in the {dialect} dialect.

Please evaluate the **Dialect Text** for:
1) **Fluency** in {dialect}:
   - Grammar, syntax, word choice, and overall naturalness in {dialect}.
   - Consistency, flow, and readability in {dialect}.
2) **Meaning Preservation**:
   - Does the Dialect Text retain the same meaning or intent as the Original Text (SAE)?
   - Are there changes or omissions that alter the meaning?

Use the following **1–7** scoring rubric (focused on fluency, but keep meaning in mind):

- **1**: Completely unnatural, pervasive errors, nearly unintelligible.
- **2**: Major issues in accuracy/naturalness, very awkward for {dialect}.
- **3**: Noticeable errors or unnatural phrasing, partial alignment with {dialect}.
- **4**: Average fluency, some issues; mostly understandable in {dialect}.
- **5**: Good fluency, minor errors; consistent with {dialect}.
- **6**: Very good fluency, rare issues; flows smoothly in {dialect}.
- **7**: Excellent fluency, fully natural, error-free, perfectly aligned with {dialect}.

### INSTRUCTIONS:
1. Provide a **chain-of-thought** explanation comparing meaning and evaluating fluency.
2. End with a single line: **"Fluency Score: X"** (where X is an integer 1–7).

Begin your detailed chain-of-thought analysis now.
"""

# Verdict-line pattern. Tolerates markdown emphasis / quotes between the label
# and the digit (e.g. '**Fluency Score:** 6', 'Fluency Score: **6**') and
# rejects digits that are only the start of a larger number ('10', '75').
_FLUENCY_SCORE_RE = re.compile(
    r'Fluency Score\s*:[\s*"\']*([1-7])(?!\d)',
    re.IGNORECASE,
)


def _parse_fluency_score(result_text: str) -> int:
    """Return the last 'Fluency Score: X' value (1-7) in *result_text*, or -1."""
    # The model is told to END with the score line, so when the label also
    # appears inside the reasoning the final occurrence is the verdict.
    found = _FLUENCY_SCORE_RE.findall(result_text)
    return int(found[-1]) if found else -1


def get_fluency_score(original_text: str, dialect_text: str, dialect: str) -> int:
    """
    Ask the chat model to rate the fluency of a dialect translation.

    Args:
        original_text: The Standard American English source text.
        dialect_text: The translated/adapted text to be evaluated.
        dialect: Dialect label substituted into FLUENCY_PROMPT and the user message.

    Returns:
        The integer score 1-7 parsed from the model's final
        'Fluency Score: X' line, or -1 when the request fails or no score
        can be parsed from the reply.
    """
    # Construct the user message with both the Original (SAE) and Dialect text
    user_content = (
        f"Original Text (SAE):\n{original_text}\n\n"
        f"Dialect Text ({dialect}):\n{dialect_text}\n\n"
        "Provide chain-of-thought reasoning, then end with 'Fluency Score: X'."
    )

    try:
        response = openai.ChatCompletion.create(
            model="gpt-4o",  # or whichever GPT-4 variant you have
            messages=[
                {"role": "system", "content": FLUENCY_PROMPT.format(dialect=dialect)},
                {"role": "user", "content": user_content}
            ],
            temperature=0.0
        )
        result_text = response["choices"][0]["message"]["content"].strip()
    except Exception as e:
        # Deliberate best-effort boundary: one failed request must not abort a
        # long scoring run, so report it and hand back the error sentinel.
        print(f"Error scoring text: {e}")
        return -1  # return -1 to indicate an error

    return _parse_fluency_score(result_text)

# One entry per dataset: (dataset name, path to the AAVE version of the
# BLEU-filtered CSV, logic-bench flag).
# - The dataset name is also the key used to look up column names in DATASET_COLS.
# - process_dataset() derives the CSV path for every other dialect from the
#   AAVE path given here.
# - The third element is forwarded to process_dataset() as `is_logic_bench`.
DATASET_PATHS = [
    ("FOLIO(1000)",
     "/content/drive/MyDrive/!!Multi-AAVENUE/BLEU Score Filtered Datasets/GPT 4o/AAVE/FOLIO(1000)/FOLIO(1000)_filtered_bleu_scores.csv",
     False),
    ("BoolQ (1000)",
     "/content/drive/MyDrive/!!Multi-AAVENUE/BLEU Score Filtered Datasets/GPT 4o/AAVE/GLUE + SuperGLUE/BoolQ (1000)/BoolQ (1000)_filtered_bleu_scores.csv",
     False),
    ("COPA (500)",
     "/content/drive/MyDrive/!!Multi-AAVENUE/BLEU Score Filtered Datasets/GPT 4o/AAVE/GLUE + SuperGLUE/COPA (500)/COPA (500)_filtered_bleu_scores.csv",
     False),
    ("MultiRC (1000)",
     "/content/drive/MyDrive/!!Multi-AAVENUE/BLEU Score Filtered Datasets/GPT 4o/AAVE/GLUE + SuperGLUE/MultiRC (1000)/MultiRC (1000)_filtered_bleu_scores.csv",
     False),
    ("SST-2 (1000)",
     "/content/drive/MyDrive/!!Multi-AAVENUE/BLEU Score Filtered Datasets/GPT 4o/AAVE/GLUE + SuperGLUE/SST-2 (1000)/SST-2 (1000)_filtered_bleu_scores.csv",
     False),
    ("WSC (659)",
     "/content/drive/MyDrive/!!Multi-AAVENUE/BLEU Score Filtered Datasets/GPT 4o/AAVE/GLUE + SuperGLUE/WSC (659)/WSC (659)_filtered_bleu_scores.csv",
     False),
    ("GSM8K(1000)",
     "/content/drive/MyDrive/!!Multi-AAVENUE/BLEU Score Filtered Datasets/GPT 4o/AAVE/GSM8K(1000)/GSM8K(1000)_filtered_bleu_scores.csv",
     False),
    ("HumanEVAL(164)",
     "/content/drive/MyDrive/!!Multi-AAVENUE/BLEU Score Filtered Datasets/GPT 4o/AAVE/HumanEVAL(164)/HumanEVAL(164)_filtered_bleu_scores.csv",
     False),
    ("Logic Bench MCQ(480)",
     "/content/drive/MyDrive/!!Multi-AAVENUE/BLEU Score Filtered Datasets/GPT 4o/AAVE/Logic Bench MCQ(480)/Logic Bench MCQ(480)_filtered_bleu_scores.csv",
     True),
    ("Logic Bench YN(500)",
     "/content/drive/MyDrive/!!Multi-AAVENUE/BLEU Score Filtered Datasets/GPT 4o/AAVE/Logic Bench YN(500)/Logic Bench YN(500)_filtered_bleu_scores.csv",
     True),
    ("SVAMP(700)",
     "/content/drive/MyDrive/!!Multi-AAVENUE/BLEU Score Filtered Datasets/GPT 4o/AAVE/SVAMP(700)/SVAMP(700)_filtered_bleu_scores.csv",
     False),
    ("MBPP(374)",
     "/content/drive/MyDrive/!!Multi-AAVENUE/BLEU Score Filtered Datasets/GPT 4o/AAVE/MBPP(374)/MBPP(374)_filtered_bleu_scores.csv",
     False),
]

# Dataset name -> (original SAE column, dialect column template).
# The template is written for AAVE; process_dataset() replaces "AAVE" in it
# with the dialect being scored to get the actual column name.
# NOTE(review): the Logic Bench templates use a lowercase "context" while the
# SAE column is "Context" — presumably this mirrors the CSV headers; confirm.
DATASET_COLS = {
    "FOLIO(1000)":        ("Premises", "AAVE (Premises)"),
    "BoolQ (1000)":       ("SAE Passage", "AAVE (SAE Passage)"),
    "COPA (500)":         ("Premise", "AAVE (Premise)"),
    "MultiRC (1000)":     ("Paragraph", "AAVE (Paragraph)"),
    "SST-2 (1000)":       ("Original Sentence", "AAVE (Original Sentence)"),
    "WSC (659)":          ("Original Paragraph", "AAVE (Original Paragraph)"),
    "GSM8K(1000)":        ("Original", "AAVE (Original)"),
    "HumanEVAL(164)":     ("Prompt", "AAVE (Prompt)"),
    "Logic Bench MCQ(480)": ("Context", "AAVE (context)"),
    "Logic Bench YN(500)":  ("Context", "AAVE (context)"),
    "SVAMP(700)":         ("Original", "AAVE (Original)"),
    "MBPP(374)":          ("Original", "AAVE (Original)"),
}

# Root folder for results; process_dataset() writes into
# <BASE_OUTPUT_DIR>/<dialect>/<dataset name>/.
BASE_OUTPUT_DIR = "/content/drive/MyDrive/!!Multi-AAVENUE/Metrics/Fluency"

# Dialect labels iterated by main(); each one is substituted for "AAVE" in the
# dataset paths and dialect column names.
DIALECTS = ["AAVE", "ChcE", "CollSgE", "IndE", "JamE"]

def _dialect_path(dataset_path_aave: str, dialect: str) -> str:
    """Swap every path component that is exactly 'AAVE' for *dialect*."""
    # A plain str.replace("AAVE", ...) would also rewrite the "AAVE" inside the
    # root folder name "!!Multi-AAVENUE" and point at a non-existent directory.
    return "/".join(
        dialect if part == "AAVE" else part
        for part in dataset_path_aave.split("/")
    )


def _cell_to_text(value) -> str:
    """Return a CSV cell as text, mapping missing values (None/NaN/NA) to ''."""
    if isinstance(value, str):
        return value
    # str(float('nan')) is "nan", which would otherwise be scored as real text.
    if value is None or pd.isna(value):
        return ""
    return str(value)


def process_dataset(dialect: str, dataset_name: str, dataset_path_aave: str, is_logic_bench: bool):
    """
    Score every row of one dataset for one dialect and save the results.

    1) Build the dataset path by replacing the 'AAVE' directory component of
       the path with the current dialect.
    2) Read the CSV.
    3) Identify 'original_col' and 'dialect_col_template' from DATASET_COLS,
       then replace 'AAVE' with the actual dialect in the dialect column name.
    4) For each row, call get_fluency_score() to rate the translation.
       - Save each row's result immediately to a CSV file.
    5) After finishing all rows, compute and save average fluency in a .txt file.

    Args:
        dialect: Dialect label to score (e.g. one of DIALECTS).
        dataset_name: Key into DATASET_COLS; also used for output file names.
        dataset_path_aave: Path to the AAVE version of the dataset CSV.
        is_logic_bench: Accepted for interface compatibility; not used here.

    Returns:
        None. The dataset is skipped (with a printed message) when the CSV is
        missing, when no column config exists, or when the dialect column is
        absent from the CSV. Rows with empty/missing dialect text get score -1
        without an API call; -1 scores are excluded from the average.
    """

    if dataset_name == "FOLIO(1000)" and dialect == "AAVE":
        print("Skipping FOLIO(1000) for AAVE (already done).")
        return

    dataset_path = _dialect_path(dataset_path_aave, dialect)

    try:
        df = pd.read_csv(dataset_path, encoding="utf-8")
    except FileNotFoundError:
        print(f"File not found for: {dataset_path}, skipping...")
        return

    if dataset_name not in DATASET_COLS:
        print(f"ERROR: No column config for dataset: {dataset_name}")
        return

    original_col, dialect_col_template = DATASET_COLS[dataset_name]
    dialect_col = dialect_col_template.replace("AAVE", dialect)

    if original_col not in df.columns:
        # Best-effort: fluency can still be rated with an empty original text.
        print(f"WARNING: Original col '{original_col}' not found in {dataset_path}.")
    if dialect_col not in df.columns:
        print(f"WARNING: Dialect col '{dialect_col}' not found in {dataset_path}.")
        # Nothing can be scored; bail out BEFORE the output CSV is truncated so
        # results from an earlier successful run are not replaced by rows of -1.
        return

    out_folder = os.path.join(BASE_OUTPUT_DIR, dialect, dataset_name)
    os.makedirs(out_folder, exist_ok=True)
    csv_out_path = os.path.join(out_folder, f"{dataset_name}_fluency.csv")
    txt_out_path = os.path.join(out_folder, f"{dataset_name}_average_fluency.txt")

    all_scores = []

    fieldnames = ["Original Text", f"{dialect} Text", "Fluency Score"]
    with open(csv_out_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()

    print(f"Processing {dataset_name} ({dialect}) - Path: {dataset_path}")
    for i, row in tqdm(df.iterrows(), total=len(df), desc=f"Scoring {dataset_name} - {dialect}"):
        original_text = _cell_to_text(row.get(original_col, ""))
        dialect_text = _cell_to_text(row.get(dialect_col, ""))

        if not dialect_text.strip():
            score = -1
        else:
            score = get_fluency_score(original_text, dialect_text, dialect)

        all_scores.append(score)

        # Re-open in append mode per row so each result is closed out to disk
        # right away and survives an interrupted run.
        with open(csv_out_path, "a", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writerow({
                "Original Text": original_text,
                f"{dialect} Text": dialect_text,
                "Fluency Score": score
            })

        print(f"{dataset_name} | {dialect} | Row {i} => Score: {score}")

    # -1 marks rows that were skipped or failed; keep them out of the mean.
    valid_scores = [s for s in all_scores if s != -1]
    avg_score = sum(valid_scores) / len(valid_scores) if valid_scores else 0.0
    with open(txt_out_path, "w", encoding="utf-8") as f:
        f.write(f"Average Fluency Score: {avg_score:.2f}\n")
        f.write(f"Number of Samples: {len(valid_scores)}\n")

def main():
    """
    Run fluency scoring for every (dialect, dataset) combination.

    Ensures the base output folder and one sub-folder per dialect exist, then
    hands each DATASET_PATHS entry to process_dataset() for each dialect in
    DIALECTS, and prints a completion message at the end.
    """
    os.makedirs(BASE_OUTPUT_DIR, exist_ok=True)

    for dialect in DIALECTS:
        os.makedirs(os.path.join(BASE_OUTPUT_DIR, dialect), exist_ok=True)

        for entry in DATASET_PATHS:
            name, aave_csv_path, logic_bench_flag = entry
            process_dataset(dialect, name, aave_csv_path, logic_bench_flag)

    print("All done! Check your output folders for results.")


# Run the full scoring pipeline only when this file is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    main()