In [None]:
#GPT-4o

from google.colab import drive
drive.mount('/content/drive')

!pip install openai==0.28

import os
import re
import csv
import pandas as pd
import openai
from tqdm import tqdm


# NOTE(review): placeholder credential - supply the real key at runtime and
# avoid saving it with the notebook.
openai.api_key = "API-KEY"

# Dataset label -> CSV path relative to BASE_INPUT_DIR/<dialect>/.
# The label is also used as the output sub-folder name and as the prefix of
# the output file names in process_dataset().
# (The number in each label is presumably the row count - TODO confirm.)
DATASET_PATHS = {
    "FOLIO(1000)":               "aligned_folio1000.csv",
    "BoolQ (1000)":              "GLUE + SuperGLUE/aligned_boolq_1000.csv",
    "COPA (500)":                "GLUE + SuperGLUE/aligned_copa_500.csv",
    "MultiRC (1000)":            "GLUE + SuperGLUE/aligned_multirc_1000.csv",
    "SST-2 (1000)":              "GLUE + SuperGLUE/aligned_sst-2_1000.csv",
    "WSC (659)":                 "GLUE + SuperGLUE/aligned_wsc_659.csv",
    "GSM8K(1000)":               "aligned_gsm8k1000.csv",
    "HumanEVAL(164)":            "aligned_humaneval164.csv",
    "Logic Bench MCQ(480)":      "aligned_logic_bench_mcq480.csv",
    "Logic Bench YN(500)":       "aligned_logic_bench_yn500.csv",
    "MBPP(374)":                 "aligned_mbpp374.csv",
    "SVAMP(700)":                "aligned_svamp700.csv",
}

# Dialect codes. Each one is used as an input/output sub-folder name and is
# substituted into the prompts sent to the model.
DIALECTS = ["AAVE", "ChcE", "CollSgE", "IndE", "JamE"]

# Root folder holding one sub-folder of input CSVs per dialect.
BASE_INPUT_DIR = "/content/drive/MyDrive/!!Multi-AAVENUE/Aligned Translations"
# Root folder under which results are written as <dialect>/<dataset label>/.
BASE_OUTPUT_DIR = "/content/drive/MyDrive/!!Multi-AAVENUE/Metrics/Preference Scores/GPT 4o"

# System-prompt template; every {dialect} placeholder is filled in with
# str.format() for each request. get_preference_score() extracts the verdict
# by searching the reply for the "Final preference score: X" line requested
# here, so this wording and that regex must stay in sync.
COT_PROMPT = """
You are an expert linguist with a strong command of {dialect}.

You are given:
1) **Original Text (SAE)** – a standard American English version for reference.
2) **Translation A** – a version in the {dialect} dialect.
3) **Translation B** – another version in the {dialect} dialect.

Your task: Decide which translation is better **in the context of the {dialect} dialect** with respect to:
- Fluency (grammar, syntax, word choice, overall naturalness in {dialect})
- Accuracy (faithfulness to the original meaning, but expressed naturally in {dialect})
- Readability (cohesion, clarity, and flow in {dialect})
- Cultural appropriateness (if relevant to {dialect})

Provide a detailed chain-of-thought (reasoning) as to how you weigh these factors.
Then conclude with one final line in the exact format:
**"Final preference score: X"**
(where **X = 1** if you prefer Translation A, or **X = 2** if you prefer Translation B).

Make sure you **reveal** your full thought process, then **end** with:
Final preference score: X
"""

def get_preference_score(original_text: str, trans_a: str, trans_b: str, dialect: str):
    """Ask GPT-4o which of two dialect translations it prefers.

    COT_PROMPT (formatted with ``dialect``) is sent as the system message and
    the three texts as the user message, at temperature 0.0.

    The verdict is read from the *last* "Final preference score" statement in
    the reply. Matching is case-insensitive and tolerates Markdown emphasis or
    quotes around the digit (e.g. ``Final preference score: **2**``), because
    the prompt itself displays that line in bold and quoted.

    Args:
        original_text: The SAE reference text.
        trans_a: Translation A (reported as preference 1).
        trans_b: Translation B (reported as preference 2).
        dialect: Dialect name inserted into both prompts.

    Returns:
        (model_full_response, preference_int)
           model_full_response: the entire chain-of-thought + final line
           preference_int: 1 or 2 if found, else -1

        Any exception raised while calling the API or reading the response is
        printed and reported as ("", -1) instead of being propagated.
    """
    user_content = f"""Original Text (SAE):
{original_text}

Translation A ({dialect}):
{trans_a}

Translation B ({dialect}):
{trans_b}

Please show your detailed reasoning focusing on {dialect} usage, then conclude with:
Final preference score: X
"""

    try:
        response = openai.ChatCompletion.create(
            model="gpt-4o",  # or another GPT-4 variant
            messages=[
                {"role": "system", "content": COT_PROMPT.format(dialect=dialect)},
                {"role": "user", "content": user_content}
            ],
            temperature=0.0
        )
        result_text = response["choices"][0]["message"]["content"].strip()
    except Exception as e:
        # Best-effort: one failed request must not abort a long batch run.
        print(f"Error retrieving preference: {e}")
        return ("", -1)

    # Collect every verdict-looking statement and keep the last one: the model
    # is told to *end* with the verdict, and earlier mentions may only be
    # restating the instructions. (?!\d) rejects numbers such as "12".
    verdicts = re.findall(
        r"Final preference score\s*:[\s*_\"'`]*([12])(?!\d)",
        result_text,
        flags=re.IGNORECASE,
    )
    pref = int(verdicts[-1]) if verdicts else -1

    return (result_text, pref)

def process_dataset(dialect: str, ds_name: str):
    """Score every row of one (dialect, dataset) pair and save the results.

    Steps:
    1) Build the input CSV path from BASE_INPUT_DIR, the dialect, and the
       subpath in DATASET_PATHS[ds_name].
    2) Read the CSV; the columns used are "Original", "Filtered GPT 4o"
       (Translation A) and "Filtered Multi-VALUE" (Translation B).
    3) For each row, call get_preference_score() and keep the full
       chain-of-thought plus the parsed preference.
    4) Write one result row per input row to
       <BASE_OUTPUT_DIR>/<dialect>/<ds_name>/<ds_name>_preference.csv with the
       columns [Original, Translation A, Translation B,
       Chain-of-Thought & Decision, Preference Score].
    5) Write the counts of preference 1 vs 2 to
       <ds_name>_preference_summary.txt in the same folder.

    Rows whose Translation A or Translation B is empty are not sent to the
    model; they are recorded with preference -1.

    Args:
        dialect: Dialect sub-folder name, also passed to the model prompt.
        ds_name: Key of DATASET_PATHS identifying the dataset.

    Returns:
        None. Prints a message and returns early when ds_name is unknown or
        the input file does not exist.
    """
    if ds_name not in DATASET_PATHS:
        print(f"[!] No path mapped for dataset: {ds_name}")
        return

    input_path = os.path.join(BASE_INPUT_DIR, dialect, DATASET_PATHS[ds_name])
    if not os.path.isfile(input_path):
        print(f"[!] File not found: {input_path} - skipping.")
        return

    # Read every cell as literal text. With pandas' defaults an empty cell
    # becomes NaN, which str() turns into the text "nan": the missing-data
    # check below would never fire and "nan" would be sent to the model.
    # The defaults would also rewrite text such as "NA" or "007".
    df = pd.read_csv(
        input_path, encoding="utf-8", dtype=str, keep_default_na=False
    ).fillna("")

    out_folder = os.path.join(BASE_OUTPUT_DIR, dialect, ds_name)
    os.makedirs(out_folder, exist_ok=True)

    csv_out_path = os.path.join(out_folder, f"{ds_name}_preference.csv")
    txt_out_path = os.path.join(out_folder, f"{ds_name}_preference_summary.txt")

    fieldnames = [
        "Original",
        "Translation A",
        "Translation B",
        "Chain-of-Thought & Decision",
        "Preference Score"
    ]
    all_prefs = []

    with open(csv_out_path, "w", newline="", encoding="utf-8") as f_csv:
        writer = csv.DictWriter(f_csv, fieldnames=fieldnames)
        writer.writeheader()

        for _, row in tqdm(df.iterrows(), total=len(df), desc=f"{ds_name} - {dialect}"):
            original_text = str(row.get("Original", "")).strip()
            trans_a = str(row.get("Filtered GPT 4o", "")).strip()
            trans_b = str(row.get("Filtered Multi-VALUE", "")).strip()

            if not trans_a or not trans_b:
                chain_of_thought = "(N/A: missing data)"
                pref = -1
            else:
                chain_of_thought, pref = get_preference_score(
                    original_text, trans_a, trans_b, dialect
                )

            all_prefs.append(pref)

            writer.writerow({
                "Original": original_text,
                "Translation A": trans_a,
                "Translation B": trans_b,
                "Chain-of-Thought & Decision": chain_of_thought,
                "Preference Score": pref
            })
            # Push each finished row out of Python's buffer so completed work
            # is not lost if a long run is interrupted.
            f_csv.flush()

    # Only 1 and 2 count as valid; -1 marks missing data or a failed request.
    num_1 = all_prefs.count(1)
    num_2 = all_prefs.count(2)
    total_valid = num_1 + num_2

    with open(txt_out_path, "w", encoding="utf-8") as f_txt:
        f_txt.write(f"Number of times preference=1 (Translation A): {num_1}\n")
        f_txt.write(f"Number of times preference=2 (Translation B): {num_2}\n")
        f_txt.write(f"Total valid comparisons: {total_valid}\n")

    print(f"[DONE] {ds_name} - {dialect} => preference=1: {num_1}, preference=2: {num_2}")

def main():
    """Run process_dataset() for every dialect/dataset combination.

    Dialects vary slowest: all datasets of one dialect are processed before
    the next dialect starts. A completion message is printed at the end.
    """
    jobs = [
        (dialect, ds_name)
        for dialect in DIALECTS
        for ds_name in DATASET_PATHS.keys()
    ]
    for dialect, ds_name in jobs:
        process_dataset(dialect, ds_name)

    print("All preference comparisons complete!")


# Start the full batch when this cell/script is executed directly.
if __name__ == "__main__":
    main()

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Gemini 1.5 Pro

from google.colab import drive
drive.mount('/content/drive')

!pip install openai==0.28

import os
import re
import csv
import pandas as pd
import openai
from tqdm import tqdm

# NOTE(review): placeholder credential - supply the real key at runtime and
# avoid saving it with the notebook. Passed as api_key on every request made
# by get_preference_score().
GEMINI_API_KEY = "API-KEY"

# Dataset label -> CSV path relative to BASE_INPUT_DIR/<dialect>/.
# The label is also used as the output sub-folder name and as the prefix of
# the output file names in process_dataset().
# (The number in each label is presumably the row count - TODO confirm.)
DATASET_PATHS = {
    "FOLIO(1000)":               "aligned_folio1000.csv",
    "BoolQ (1000)":              "GLUE + SuperGLUE/aligned_boolq_1000.csv",
    "COPA (500)":                "GLUE + SuperGLUE/aligned_copa_500.csv",
    "MultiRC (1000)":            "GLUE + SuperGLUE/aligned_multirc_1000.csv",
    "SST-2 (1000)":              "GLUE + SuperGLUE/aligned_sst-2_1000.csv",
    "WSC (659)":                 "GLUE + SuperGLUE/aligned_wsc_659.csv",
    "GSM8K(1000)":               "aligned_gsm8k1000.csv",
    "HumanEVAL(164)":            "aligned_humaneval164.csv",
    "Logic Bench MCQ(480)":      "aligned_logic_bench_mcq480.csv",
    "Logic Bench YN(500)":       "aligned_logic_bench_yn500.csv",
    "MBPP(374)":                 "aligned_mbpp374.csv",
    "SVAMP(700)":                "aligned_svamp700.csv",
}

# Dialect codes. Each one is used as an input/output sub-folder name and is
# substituted into the prompts sent to the model.
DIALECTS = ["AAVE", "ChcE", "CollSgE", "IndE", "JamE"]

# Root folder holding one sub-folder of input CSVs per dialect.
BASE_INPUT_DIR = "/content/drive/MyDrive/!!Multi-AAVENUE/Aligned Translations"
# Root folder under which results are written as <dialect>/<dataset label>/.
BASE_OUTPUT_DIR = "/content/drive/MyDrive/!!Multi-AAVENUE/Metrics/Preference Scores/Gemini 1.5"

# System-prompt template; every {dialect} placeholder is filled in with
# str.format() for each request. get_preference_score() extracts the verdict
# by searching the reply for the "Final preference score: X" line requested
# here, so this wording and that regex must stay in sync.
COT_PROMPT = """
You are an expert linguist with a strong command of {dialect}.

You are given:
1) **Original Text (SAE)** – a standard American English version for reference.
2) **Translation A** – a version in the {dialect} dialect.
3) **Translation B** – another version in the {dialect} dialect.

Your task: Decide which translation is better **in the context of the {dialect} dialect** with respect to:
- Fluency (grammar, syntax, word choice, overall naturalness in {dialect})
- Accuracy (faithfulness to the original meaning, but expressed naturally in {dialect})
- Readability (cohesion, clarity, and flow in {dialect})
- Cultural appropriateness (if relevant to {dialect})

Provide a detailed chain-of-thought (reasoning) as to how you weigh these factors.
Then conclude with one final line in the exact format:
**"Final preference score: X"**
(where **X = 1** if you prefer Translation A, or **X = 2** if you prefer Translation B).

Make sure you **reveal** your full thought process, then **end** with:
Final preference score: X
"""

# Google's OpenAI-compatible endpoint for Gemini models. Without an explicit
# api_base the openai client sends requests to OpenAI's own servers, which
# accept neither a Gemini key nor a Gemini model name.
GEMINI_API_BASE = "https://generativelanguage.googleapis.com/v1beta/openai"


def get_preference_score(original_text: str, trans_a: str, trans_b: str, dialect: str):
    """Ask Gemini 1.5 Pro which of two dialect translations it prefers.

    The request goes through the openai client, pointed at GEMINI_API_BASE
    and authenticated with GEMINI_API_KEY on a per-request basis, so the
    client's global settings are left untouched. COT_PROMPT (formatted with
    ``dialect``) is the system message and the three texts form the user
    message, at temperature 0.0.

    The verdict is read from the *last* "Final preference score" statement in
    the reply. Matching is case-insensitive and tolerates Markdown emphasis or
    quotes around the digit (e.g. ``Final preference score: **2**``), because
    the prompt itself displays that line in bold and quoted.

    Args:
        original_text: The SAE reference text.
        trans_a: Translation A (reported as preference 1).
        trans_b: Translation B (reported as preference 2).
        dialect: Dialect name inserted into both prompts.

    Returns:
        (model_full_response, preference_int)
           model_full_response: the entire chain-of-thought + final line
           preference_int: 1 or 2 if found, else -1

        Any exception raised while calling the API or reading the response is
        printed and reported as ("", -1) instead of being propagated.
    """
    user_content = f"""Original Text (SAE):
{original_text}

Translation A ({dialect}):
{trans_a}

Translation B ({dialect}):
{trans_b}

Please show your detailed reasoning focusing on {dialect} usage, then conclude with:
Final preference score: X
"""

    try:
        response = openai.ChatCompletion.create(
            model="gemini-1.5-pro",
            messages=[
                {"role": "system", "content": COT_PROMPT.format(dialect=dialect)},
                {"role": "user", "content": user_content}
            ],
            temperature=0.0,
            api_key=GEMINI_API_KEY,
            api_base=GEMINI_API_BASE
        )
        result_text = response["choices"][0]["message"]["content"].strip()
    except Exception as e:
        # Best-effort: one failed request must not abort a long batch run.
        print(f"Error retrieving preference: {e}")
        return ("", -1)

    # Collect every verdict-looking statement and keep the last one: the model
    # is told to *end* with the verdict, and earlier mentions may only be
    # restating the instructions. (?!\d) rejects numbers such as "12".
    verdicts = re.findall(
        r"Final preference score\s*:[\s*_\"'`]*([12])(?!\d)",
        result_text,
        flags=re.IGNORECASE,
    )
    pref = int(verdicts[-1]) if verdicts else -1

    return (result_text, pref)

def process_dataset(dialect: str, ds_name: str):
    """Score every row of one (dialect, dataset) pair and save the results.

    Steps:
    1) Build the input CSV path from BASE_INPUT_DIR, the dialect, and the
       subpath in DATASET_PATHS[ds_name].
    2) Read the CSV; the columns used are "Original", "Filtered GPT 4o"
       (Translation A) and "Filtered Multi-VALUE" (Translation B).
    3) For each row, call get_preference_score() and keep the full
       chain-of-thought plus the parsed preference.
    4) Write one result row per input row to
       <BASE_OUTPUT_DIR>/<dialect>/<ds_name>/<ds_name>_preference.csv with the
       columns [Original, Translation A, Translation B,
       Chain-of-Thought & Decision, Preference Score].
    5) Write the counts of preference 1 vs 2 to
       <ds_name>_preference_summary.txt in the same folder.

    Rows whose Translation A or Translation B is empty are not sent to the
    model; they are recorded with preference -1.

    Args:
        dialect: Dialect sub-folder name, also passed to the model prompt.
        ds_name: Key of DATASET_PATHS identifying the dataset.

    Returns:
        None. Prints a message and returns early when ds_name is unknown or
        the input file does not exist.
    """
    # Resolve the dataset's subpath; unknown labels are reported and skipped.
    if ds_name not in DATASET_PATHS:
        print(f"[!] No path mapped for dataset: {ds_name}")
        return

    input_path = os.path.join(BASE_INPUT_DIR, dialect, DATASET_PATHS[ds_name])
    if not os.path.isfile(input_path):
        print(f"[!] File not found: {input_path} - skipping.")
        return

    # Read every cell as literal text. With pandas' defaults an empty cell
    # becomes NaN, which str() turns into the text "nan": the missing-data
    # check below would never fire and "nan" would be sent to the model.
    # The defaults would also rewrite text such as "NA" or "007".
    df = pd.read_csv(
        input_path, encoding="utf-8", dtype=str, keep_default_na=False
    ).fillna("")

    # Output folder
    out_folder = os.path.join(BASE_OUTPUT_DIR, dialect, ds_name)
    os.makedirs(out_folder, exist_ok=True)

    csv_out_path = os.path.join(out_folder, f"{ds_name}_preference.csv")
    txt_out_path = os.path.join(out_folder, f"{ds_name}_preference_summary.txt")

    fieldnames = [
        "Original",
        "Translation A",
        "Translation B",
        "Chain-of-Thought & Decision",
        "Preference Score"
    ]
    all_prefs = []

    with open(csv_out_path, "w", newline="", encoding="utf-8") as f_csv:
        writer = csv.DictWriter(f_csv, fieldnames=fieldnames)
        writer.writeheader()

        for _, row in tqdm(df.iterrows(), total=len(df), desc=f"{ds_name} - {dialect}"):
            original_text = str(row.get("Original", "")).strip()
            trans_a = str(row.get("Filtered GPT 4o", "")).strip()
            trans_b = str(row.get("Filtered Multi-VALUE", "")).strip()

            if not trans_a or not trans_b:
                chain_of_thought = "(N/A: missing data)"
                pref = -1
            else:
                chain_of_thought, pref = get_preference_score(
                    original_text, trans_a, trans_b, dialect
                )

            all_prefs.append(pref)

            writer.writerow({
                "Original": original_text,
                "Translation A": trans_a,
                "Translation B": trans_b,
                "Chain-of-Thought & Decision": chain_of_thought,
                "Preference Score": pref
            })
            # Push each finished row out of Python's buffer so completed work
            # is not lost if a long run is interrupted.
            f_csv.flush()

    # Only 1 and 2 count as valid; -1 marks missing data or a failed request.
    num_1 = all_prefs.count(1)
    num_2 = all_prefs.count(2)
    total_valid = num_1 + num_2

    with open(txt_out_path, "w", encoding="utf-8") as f_txt:
        f_txt.write(f"Number of times preference=1 (Translation A): {num_1}\n")
        f_txt.write(f"Number of times preference=2 (Translation B): {num_2}\n")
        f_txt.write(f"Total valid comparisons: {total_valid}\n")

    print(f"[DONE] {ds_name} - {dialect} => preference=1: {num_1}, preference=2: {num_2}")

def main():
    """Process each dataset for each dialect, then announce completion.

    The dialect loop is the outer one, so a dialect's datasets are all handled
    before the next dialect begins.
    """
    for dialect in DIALECTS:
        # Snapshot of the dataset labels, in the mapping's insertion order.
        dataset_names = list(DATASET_PATHS.keys())
        for ds_name in dataset_names:
            process_dataset(dialect, ds_name)

    print("All preference comparisons complete!")


# Start the full batch when this cell/script is executed directly.
if __name__ == "__main__":
    main()

In [None]:
# Claude 3.5 Sonnet

from google.colab import drive
drive.mount('/content/drive')

!pip install anthropic

import os
import re
import csv
import pandas as pd
import anthropic
from tqdm import tqdm

# Client used for every messages.create() call in get_preference_score().
# NOTE(review): placeholder credential - supply the real key at runtime and
# avoid saving it with the notebook.
anthropic_client = anthropic.Anthropic(api_key="API-KEY")

# Dataset label -> CSV path relative to BASE_INPUT_DIR/<dialect>/.
# The label is also used as the output sub-folder name and as the prefix of
# the output file names in process_dataset().
# (The number in each label is presumably the row count - TODO confirm.)
DATASET_PATHS = {
    "FOLIO(1000)":               "aligned_folio1000.csv",
    "BoolQ (1000)":              "GLUE + SuperGLUE/aligned_boolq_1000.csv",
    "COPA (500)":                "GLUE + SuperGLUE/aligned_copa_500.csv",
    "MultiRC (1000)":            "GLUE + SuperGLUE/aligned_multirc_1000.csv",
    "SST-2 (1000)":              "GLUE + SuperGLUE/aligned_sst-2_1000.csv",
    "WSC (659)":                 "GLUE + SuperGLUE/aligned_wsc_659.csv",
    "GSM8K(1000)":               "aligned_gsm8k1000.csv",
    "HumanEVAL(164)":            "aligned_humaneval164.csv",
    "Logic Bench MCQ(480)":      "aligned_logic_bench_mcq480.csv",
    "Logic Bench YN(500)":       "aligned_logic_bench_yn500.csv",
    "MBPP(374)":                 "aligned_mbpp374.csv",
    "SVAMP(700)":                "aligned_svamp700.csv",
}

# Dialect codes. Each one is used as an input/output sub-folder name and is
# substituted into the prompts sent to the model.
DIALECTS = ["AAVE", "ChcE", "CollSgE", "IndE", "JamE"]

# Root folder holding one sub-folder of input CSVs per dialect.
BASE_INPUT_DIR = "/content/drive/MyDrive/!!Multi-AAVENUE/Aligned Translations"
# Root folder under which results are written as <dialect>/<dataset label>/.
BASE_OUTPUT_DIR = "/content/drive/MyDrive/!!Multi-AAVENUE/Metrics/Preference Scores/Claude 3.5 Sonnet"

# System-prompt template; every {dialect} placeholder is filled in with
# str.format() for each request. get_preference_score() extracts the verdict
# by searching the reply for the "Final preference score: X" line requested
# here, so this wording and that regex must stay in sync.
COT_PROMPT = """
You are an expert linguist with a strong command of {dialect}.

You are given:
1) **Original Text (SAE)** – a standard American English version for reference.
2) **Translation A** – a version in the {dialect} dialect.
3) **Translation B** – another version in the {dialect} dialect.

Your task: Decide which translation is better **in the context of the {dialect} dialect** with respect to:
- Fluency (grammar, syntax, word choice, overall naturalness in {dialect})
- Accuracy (faithfulness to the original meaning, but expressed naturally in {dialect})
- Readability (cohesion, clarity, and flow in {dialect})
- Cultural appropriateness (if relevant to {dialect})

Provide a detailed chain-of-thought (reasoning) as to how you weigh these factors.
Then conclude with one final line in the exact format:
**"Final preference score: X"**
(where **X = 1** if you prefer Translation A, or **X = 2** if you prefer Translation B).

Make sure you **reveal** your full thought process, then **end** with:
Final preference score: X
"""

def get_preference_score(original_text: str, trans_a: str, trans_b: str, dialect: str):
    """Ask Claude 3.5 Sonnet which of two dialect translations it prefers.

    COT_PROMPT (formatted with ``dialect``) is sent as the system prompt and
    the three texts as the single user message, at temperature 0.0 with a
    1000-token output limit.

    The verdict is read from the *last* "Final preference score" statement in
    the reply. Matching is case-insensitive and tolerates Markdown emphasis or
    quotes around the digit (e.g. ``Final preference score: **2**``), because
    the prompt itself displays that line in bold and quoted.

    If no verdict is found and the API reports that generation stopped at the
    token limit, a warning is printed: the reasoning was cut off before the
    final line, which is a different problem from a malformed reply.

    Args:
        original_text: The SAE reference text.
        trans_a: Translation A (reported as preference 1).
        trans_b: Translation B (reported as preference 2).
        dialect: Dialect name inserted into both prompts.

    Returns:
        (model_full_response, preference_int)
           model_full_response: the entire chain-of-thought + final line
           preference_int: 1 or 2 if found, else -1

        Any exception raised while calling the API or reading the response is
        printed and reported as ("", -1) instead of being propagated.
    """
    user_content = f"""Original Text (SAE):
{original_text}

Translation A ({dialect}):
{trans_a}

Translation B ({dialect}):
{trans_b}

Please show your detailed reasoning focusing on {dialect} usage, then conclude with:
Final preference score: X
"""

    try:
        # Call Anthropic API
        response = anthropic_client.messages.create(
            model="claude-3-5-sonnet-latest",
            system=COT_PROMPT.format(dialect=dialect),
            messages=[
                {"role": "user", "content": user_content}
            ],
            max_tokens=1000,
            temperature=0.0
        )
        result_text = response.content[0].text.strip()
        stop_reason = getattr(response, "stop_reason", None)
    except Exception as e:
        # Best-effort: one failed request must not abort a long batch run.
        print(f"Error retrieving preference: {e}")
        return ("", -1)

    # Collect every verdict-looking statement and keep the last one: the model
    # is told to *end* with the verdict, and earlier mentions may only be
    # restating the instructions. (?!\d) rejects numbers such as "12".
    verdicts = re.findall(
        r"Final preference score\s*:[\s*_\"'`]*([12])(?!\d)",
        result_text,
        flags=re.IGNORECASE,
    )
    pref = int(verdicts[-1]) if verdicts else -1

    if pref == -1 and stop_reason == "max_tokens":
        print("Warning: response reached max_tokens before a final preference score was given.")

    return (result_text, pref)

def process_dataset(dialect: str, ds_name: str):
    """Score every row of one (dialect, dataset) pair and save the results.

    Steps:
    1) Build the input CSV path from BASE_INPUT_DIR, the dialect, and the
       subpath in DATASET_PATHS[ds_name].
    2) Read the CSV; the columns used are "Original", "Filtered GPT 4o"
       (Translation A) and "Filtered Multi-VALUE" (Translation B).
    3) For each row, call get_preference_score() and keep the full
       chain-of-thought plus the parsed preference.
    4) Write one result row per input row to
       <BASE_OUTPUT_DIR>/<dialect>/<ds_name>/<ds_name>_preference.csv with the
       columns [Original, Translation A, Translation B,
       Chain-of-Thought & Decision, Preference Score].
    5) Write the counts of preference 1 vs 2 to
       <ds_name>_preference_summary.txt in the same folder.

    Rows whose Translation A or Translation B is empty are not sent to the
    model; they are recorded with preference -1.

    Args:
        dialect: Dialect sub-folder name, also passed to the model prompt.
        ds_name: Key of DATASET_PATHS identifying the dataset.

    Returns:
        None. Prints a message and returns early when ds_name is unknown or
        the input file does not exist.
    """
    if ds_name not in DATASET_PATHS:
        print(f"[!] No path mapped for dataset: {ds_name}")
        return

    input_path = os.path.join(BASE_INPUT_DIR, dialect, DATASET_PATHS[ds_name])
    if not os.path.isfile(input_path):
        print(f"[!] File not found: {input_path} - skipping.")
        return

    # Read every cell as literal text. With pandas' defaults an empty cell
    # becomes NaN, which str() turns into the text "nan": the missing-data
    # check below would never fire and "nan" would be sent to the model.
    # The defaults would also rewrite text such as "NA" or "007".
    df = pd.read_csv(
        input_path, encoding="utf-8", dtype=str, keep_default_na=False
    ).fillna("")

    # Output folder
    out_folder = os.path.join(BASE_OUTPUT_DIR, dialect, ds_name)
    os.makedirs(out_folder, exist_ok=True)

    csv_out_path = os.path.join(out_folder, f"{ds_name}_preference.csv")
    txt_out_path = os.path.join(out_folder, f"{ds_name}_preference_summary.txt")

    fieldnames = [
        "Original",
        "Translation A",
        "Translation B",
        "Chain-of-Thought & Decision",
        "Preference Score"
    ]
    all_prefs = []

    with open(csv_out_path, "w", newline="", encoding="utf-8") as f_csv:
        writer = csv.DictWriter(f_csv, fieldnames=fieldnames)
        writer.writeheader()

        for _, row in tqdm(df.iterrows(), total=len(df), desc=f"{ds_name} - {dialect}"):
            original_text = str(row.get("Original", "")).strip()
            trans_a = str(row.get("Filtered GPT 4o", "")).strip()
            trans_b = str(row.get("Filtered Multi-VALUE", "")).strip()

            if not trans_a or not trans_b:
                chain_of_thought = "(N/A: missing data)"
                pref = -1
            else:
                chain_of_thought, pref = get_preference_score(
                    original_text, trans_a, trans_b, dialect
                )

            all_prefs.append(pref)

            writer.writerow({
                "Original": original_text,
                "Translation A": trans_a,
                "Translation B": trans_b,
                "Chain-of-Thought & Decision": chain_of_thought,
                "Preference Score": pref
            })
            # Push each finished row out of Python's buffer so completed work
            # is not lost if a long run is interrupted.
            f_csv.flush()

    # Only 1 and 2 count as valid; -1 marks missing data or a failed request.
    num_1 = all_prefs.count(1)
    num_2 = all_prefs.count(2)
    total_valid = num_1 + num_2

    with open(txt_out_path, "w", encoding="utf-8") as f_txt:
        f_txt.write(f"Number of times preference=1 (Translation A): {num_1}\n")
        f_txt.write(f"Number of times preference=2 (Translation B): {num_2}\n")
        f_txt.write(f"Total valid comparisons: {total_valid}\n")

    print(f"[DONE] {ds_name} - {dialect} => preference=1: {num_1}, preference=2: {num_2}")

def main():
    """Drive the whole evaluation: every dialect crossed with every dataset.

    Calls process_dataset() once per pair, dialect-major, and prints a final
    message when all pairs have been handled.
    """
    dialect_iter = iter(DIALECTS)
    while True:
        try:
            dialect = next(dialect_iter)
        except StopIteration:
            break
        for ds_name in DATASET_PATHS.keys():
            process_dataset(dialect, ds_name)

    print("All preference comparisons complete!")


# Start the full batch when this cell/script is executed directly.
if __name__ == "__main__":
    main()