In [None]:
## NON-COT EVALS

!pip install openai==0.28 pandas numpy regex tqdm

import openai
import pandas as pd
import numpy as np
import json
import os
import re
import csv
import time
from tqdm import tqdm
from openai.error import RateLimitError, APIError

# Prefer the OPENAI_API_KEY environment variable so the secret does not have to
# be stored in the notebook; the "API-KEY" placeholder remains the fallback for
# users who paste their key in directly.
openai.api_key = os.environ.get("OPENAI_API_KEY", "API-KEY")
# Sampling temperature passed to every chat completion request.
TEMPERATURE = 0.7

# Root folders on the mounted drive: inputs are read from
# <input_base_dir>/<dialect>/<relative dataset path>, results are written
# below output_base_dir.
input_base_dir = "/content/drive/MyDrive/!!Multi-AAVENUE/BLEU Score Filtered Datasets/GPT 4o"
output_base_dir = "/content/drive/MyDrive/!!Multi-AAVENUE/Evaluation Results"

# Dialect identifiers; each one is both a sub-folder of input_base_dir and the
# prefix of the dialect-specific column/key inside every dataset.
dialects = ["IndE", "JamE", "AAVE", "CollSgE", "ChcE"]

# Dataset name -> file path relative to <input_base_dir>/<dialect>.
# The two LogicBench datasets are JSON; everything else is CSV.
datasets = {
    "SVAMP": "SVAMP(700)/SVAMP(700)_filtered_bleu_scores.csv",
    "MBPP": "MBPP(374)/MBPP(374)_filtered_bleu_scores.csv",
    "LogicBenchYN": "Logic Bench YN(500)/Logic Bench YN(500)_filtered_bleu_scores.json",
    "LogicBenchMCQ": "Logic Bench MCQ(480)/Logic Bench MCQ(480)_filtered_bleu_scores.json",
    "HumanEVAL": "HumanEVAL(164)/HumanEVAL(164)_filtered_bleu_scores.csv",
    "GSM8K": "GSM8K(1000)/GSM8K(1000)_filtered_bleu_scores.csv",
    "FOLIO": "FOLIO(1000)/FOLIO(1000)_filtered_bleu_scores.csv",
    "WSC": "GLUE + SuperGLUE/WSC (659)/WSC (659)_filtered_bleu_scores.csv",
    "SST-2": "GLUE + SuperGLUE/SST-2 (1000)/SST-2 (1000)_filtered_bleu_scores.csv",
    "MultiRC": "GLUE + SuperGLUE/MultiRC (1000)/MultiRC (1000)_filtered_bleu_scores.csv",
    "COPA": "GLUE + SuperGLUE/COPA (500)/COPA (500)_filtered_bleu_scores.csv",
    "BoolQ": "GLUE + SuperGLUE/BoolQ (1000)/BoolQ (1000)_filtered_bleu_scores.csv"
}

# Chat models to evaluate.
models = ["gpt-4o", "gpt-4o-mini"]

def write_row_to_csv(row: dict, filename: str):
    """Append a single record to a CSV file, writing the header only once.

    Args:
        row: Mapping of column name to value for one result row.
        filename: Path of the CSV file; it is created when missing.
    """
    with open(filename, 'a', newline='', encoding="utf-8") as handle:
        single_row_frame = pd.DataFrame([row])
        # A position of 0 right after opening in append mode means the file
        # has no content yet, so this row must carry the column header.
        file_is_empty = handle.tell() == 0
        single_row_frame.to_csv(handle, index=False, header=file_is_empty)
        handle.flush()

def prompt_gpt_model(model_name: str, system_message: str, user_message: str, retries=5, backoff_factor=2) -> str:
    """Send one system+user chat request and return the reply text.

    Rate-limit and API errors are retried with exponential backoff
    (``backoff_factor ** attempt`` seconds between attempts).

    Args:
        model_name: Chat model identifier passed to the API.
        system_message: Content of the system message.
        user_message: Content of the user message.
        retries: Maximum number of attempts.
        backoff_factor: Base of the exponential wait between attempts.

    Returns:
        The ``content`` of the first choice in the response.

    Raises:
        Exception: When every attempt failed; the last API error is attached
            as ``__cause__`` so the real reason is not lost.
    """
    last_error = None
    for attempt in range(retries):
        try:
            response = openai.ChatCompletion.create(
                model=model_name,
                messages=[
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": user_message}
                ],
                temperature=TEMPERATURE
            )
            return response['choices'][0]['message']['content']
        except (RateLimitError, APIError) as e:
            last_error = e
            if attempt == retries - 1:
                # No further attempt follows, so sleeping here would only
                # delay the failure.
                print(f"API error: {e}. No retries left.")
                break
            wait_time = backoff_factor ** attempt
            print(f"API error: {e}. Retrying in {wait_time} seconds...")
            time.sleep(wait_time)
    raise Exception("Maximum retries exceeded.") from last_error

def clean_code(generated_code: str) -> str:
    """Remove Markdown code-fence markers from model output.

    Every triple-backtick marker is deleted, together with a directly
    following ``python`` tag when present, and the result is trimmed of
    surrounding whitespace.
    """
    # One pass is enough: each fence is consumed whole, so no triple backtick
    # can be left behind for a second substitution to find.
    fence_marker = re.compile(r"```(?:python)?")
    without_fences = fence_marker.sub("", generated_code)
    return without_fences.strip()

def extract_response(response: str, pattern: str) -> str:
    """Return the stripped first capture group of *pattern* found in *response*.

    The search is case-insensitive and ``.`` also matches newlines. An empty
    string is returned when the pattern does not match.
    """
    found = re.compile(pattern, re.DOTALL | re.IGNORECASE).search(response)
    if found is None:
        return ""
    return found.group(1).strip()

def evaluate_response(model_answer: str, expected_answer: str) -> bool:
    """Return True when both answers are equal ignoring case and outer whitespace."""
    normalized_model, normalized_expected = (
        text.strip().lower() for text in (model_answer, expected_answer)
    )
    return normalized_model == normalized_expected

def run_test_cases(generated_code: str, test_cases: str) -> "tuple[bool, str]":
    """Execute generated code followed by its test cases in a fresh namespace.

    Args:
        generated_code: Python source produced by the model.
        test_cases: Python source (typically ``assert`` statements) run in the
            same namespace after the generated code.

    Returns:
        ``(True, "")`` when both snippets run without raising, otherwise
        ``(False, message)`` where the message is prefixed with
        ``AssertionError``, ``SyntaxError`` or ``RuntimeError``.
    """
    # SECURITY NOTE(review): exec() runs model-generated code inside this
    # process with full privileges and no timeout. Only use it in a disposable
    # environment; an infinite loop in the generated code will still hang.
    try:
        exec_globals = {}
        exec(generated_code, exec_globals)
        exec(test_cases, exec_globals)
        return True, ""
    except AssertionError as e:
        return False, f"AssertionError: {str(e)}"
    except SyntaxError as e:
        return False, f"SyntaxError: {str(e)}"
    except Exception as e:
        return False, f"RuntimeError: {str(e)}"
    except SystemExit as e:
        # exit()/sys.exit() in generated code raises SystemExit, which is not
        # an Exception subclass; without this clause it would terminate the
        # whole evaluation run instead of failing one sample.
        return False, f"RuntimeError: SystemExit: {e}"

# Per-model root folder for Non-CoT results. Models missing from this mapping
# fall back to a folder derived from the model name in process_dataset.
model_output_mapping = {
    model_id: os.path.join(output_base_dir, folder_name)
    for model_id, folder_name in (
        ("gpt-4o", "GPT4o_NonCoT"),
        ("gpt-4o-mini", "GPT4oMini_NonCoT"),
    )
}

# System message sent with every request in this cell.
_SYSTEM_PROMPT = "You are a helpful assistant."

# Pattern that pulls the generated code out of a response (coding datasets).
_CODE_PATTERN = r"Answer:\s*(.+)"

# Pattern for the datasets whose answer is a single digit label.
_DIGIT_PATTERN = r"Answer:\s*(\d)"

# Datasets stored as JSON lists of task dicts; all others are CSV files.
_JSON_DATASETS = ("LogicBenchYN", "LogicBenchMCQ")


def _answer_row(fields: dict, expected, model_answer: str, is_correct: bool) -> dict:
    """Build a results row: input fields followed by the grading columns."""
    row = dict(fields)
    row["Expected Answer"] = expected
    row["Model Answer"] = model_answer
    row["Correct"] = is_correct
    return row


def _score_answer(model_name: str, user_prompt: str, pattern: str, expected: str, fields: dict):
    """Query the model once, extract its answer with *pattern* and grade it."""
    response = prompt_gpt_model(model_name, _SYSTEM_PROMPT, user_prompt)
    model_answer = extract_response(response, pattern)
    is_correct = evaluate_response(model_answer, expected)
    return _answer_row(fields, expected, model_answer, is_correct), is_correct


def _score_code(model_name: str, user_prompt: str, test_cases, fields: dict):
    """Query the model once and grade the returned code against *test_cases*."""
    response = prompt_gpt_model(model_name, _SYSTEM_PROMPT, user_prompt)
    code = extract_response(clean_code(response), _CODE_PATTERN)
    is_correct, error_msg = run_test_cases(code, test_cases)
    row = dict(fields)
    row["Code"] = code
    row["Correct"] = int(is_correct)
    row["Error Message"] = error_msg
    return row, is_correct


# Each handler below is a generator taking (model_name, dialect, record) and
# yielding one (row_dict, is_correct) pair per graded question. Generators are
# used so every row is written before the next API call is made.

def _eval_svamp(model_name, dialect, row):
    """SVAMP: math word problem with a separate question."""
    problem = row[f"{dialect} (Original)"]
    question = row["Question"]
    user_prompt = (
        "Given a math word problem, provide the final numeric answer.\n"
        f"Context: {problem}\nQuestion: {question}\n"
        "Answer:"
    )
    fields = {f"{dialect} (Original)": problem, "Question": question}
    yield _score_answer(model_name, user_prompt, r"Answer:\s*(.+)", str(row["Answer"]), fields)


def _eval_mbpp(model_name, dialect, row):
    """MBPP: generate a Python function and run the dataset's test cases."""
    problem = row[f"{dialect} (Original)"]
    test_cases = row["Test_Cases"]
    user_prompt = (
        "Given a coding problem, produce a Python function.\n"
        "Start it with 'Answer:' on its own line.\n"
        f"Problem: {problem}\nTest Cases: {test_cases}\nAnswer:"
    )
    yield _score_code(model_name, user_prompt, test_cases, {f"{dialect} (Original)": problem})


def _eval_logicbench_yn(model_name, dialect, task):
    """LogicBench yes/no: up to four question/answer pairs per context."""
    context = task[f"{dialect} (context)"]
    for i in range(1, 5):
        question = task.get(f"Question {i}", "")
        expected = task.get(f"Answer {i}", "")
        if not question or not expected:
            continue
        # JSON values are not guaranteed to be strings; grading needs str.
        expected = str(expected)
        user_prompt = (
            "Given this context, answer yes or no.\n"
            f"Context: {context}\nQuestion: {question}\n"
            "Answer:"
        )
        response = prompt_gpt_model(model_name, _SYSTEM_PROMPT, user_prompt)
        # Word boundaries stop "no"/"yes" from matching inside other words
        # such as "know", "cannot" or "not".
        match = re.search(r"\b(yes|no)\b", response, re.IGNORECASE)
        model_answer = match.group(1).lower() if match else ""
        is_correct = evaluate_response(model_answer, expected)
        fields = {f"{dialect} (Context)": context, "Question": question}
        yield _answer_row(fields, expected, model_answer, is_correct), is_correct


def _eval_logicbench_mcq(model_name, dialect, task):
    """LogicBench multiple choice: pick one of four choices."""
    context = task[f"{dialect} (context)"]
    choices = [task.get(f"Choice {i}", "") for i in range(1, 5)]
    expected = task.get("Answer", "")
    if not all(choices) or not expected:
        return
    # JSON values are not guaranteed to be strings; .lower() needs str.
    expected = str(expected)
    user_prompt = (
        "Select the correct choice from 1, 2, 3, or 4.\n"
        f"Context: {context}\nChoice 1: {choices[0]}\nChoice 2: {choices[1]}\n"
        f"Choice 3: {choices[2]}\nChoice 4: {choices[3]}\n"
        "Answer:"
    )
    response = prompt_gpt_model(model_name, _SYSTEM_PROMPT, user_prompt)
    model_answer = extract_response(response, r"Answer:\s*(choice_\d|\d)")
    # NOTE(review): a bare digit from the model never equals an expected value
    # of the form "choice_N" (and vice versa) - confirm the label format.
    is_correct = (model_answer.lower() == expected.lower())
    fields = {f"{dialect} (Context)": context}
    for number, choice in enumerate(choices, start=1):
        fields[f"Choice {number}"] = choice
    yield _answer_row(fields, expected, model_answer, is_correct), is_correct


def _eval_humaneval(model_name, dialect, row):
    """HumanEVAL: generate Python code and run the dataset's test cases."""
    prompt_text = row[f"{dialect} (Prompt)"]
    test_cases = row["Test_Cases"]
    user_prompt = (
        "Produce Python code. Start with 'Answer:'.\n"
        f"Problem: {prompt_text}\nTest Cases: {test_cases}\n"
    )
    yield _score_code(model_name, user_prompt, test_cases, {f"{dialect} (Prompt)": prompt_text})


def _eval_gsm8k(model_name, dialect, row):
    """GSM8K: math problem with a numeric answer."""
    problem = row[f"{dialect} (Original)"]
    user_prompt = (
        "Given the math problem:\n"
        f"{problem}\n"
        "Provide your final numeric answer.\n"
        "Answer:"
    )
    fields = {f"{dialect} (Original)": problem}
    yield _score_answer(model_name, user_prompt, r"Answer:\s*([0-9\.\-]+)", str(row["Answer"]), fields)


def _eval_folio(model_name, dialect, row):
    """FOLIO: decide whether a conclusion follows from the premises."""
    premises = row[f"{dialect} (Premises)"]
    conclusion = row["Conclusion"]
    # str() guards against pandas parsing the label column as bool, which
    # would make the string comparison raise AttributeError.
    expected = str(row["Label"])
    user_prompt = (
        f"Determine if the conclusion follows from the premises.\n"
        f"Premises: {premises}\nConclusion: {conclusion}\n"
        "Answer: True, False, or Uncertain.\n"
    )
    fields = {f"{dialect} (Premises)": premises, "Conclusion": conclusion}
    yield _score_answer(model_name, user_prompt, r"Answer:\s*(True|False|Uncertain)", expected, fields)


def _eval_wsc(model_name, dialect, row):
    """WSC: does Span 2 refer to Span 1?"""
    paragraph = row[f"{dialect} (Original Paragraph)"]
    span1 = row["Span 1"]
    span2 = row["Span 2"]
    user_prompt = (
        f"Check if Span 2 refers to Span 1 in the paragraph.\n"
        f"Paragraph: {paragraph}\nSpan 1: {span1}\nSpan 2: {span2}\n"
        "Answer (1 if same, 0 if not):"
    )
    fields = {
        f"{dialect} (Original Paragraph)": paragraph,
        "Span 1": span1,
        "Span 2": span2,
    }
    yield _score_answer(model_name, user_prompt, _DIGIT_PATTERN, str(row["Actual Label"]), fields)


def _eval_sst2(model_name, dialect, row):
    """SST-2: binary sentiment classification."""
    sentence = row[f"{dialect} (Original Sentence)"]
    user_prompt = (
        f"Is the sentiment of this sentence positive (1) or negative (0)?\n"
        f"Sentence: \"{sentence}\"\n"
        "Answer:"
    )
    fields = {f"{dialect} (Original Sentence)": sentence}
    yield _score_answer(model_name, user_prompt, _DIGIT_PATTERN, str(row["Actual Label"]), fields)


def _eval_multirc(model_name, dialect, row):
    """MultiRC: is the given answer choice correct for the question?"""
    paragraph = row[f"{dialect} (Paragraph)"]
    question = row["Question"]
    answer_choice = row["Answer Choice"]
    user_prompt = (
        f"Given a paragraph, a question, and an answer choice, is the choice correct (1) or incorrect (0)?\n"
        f"Paragraph: {paragraph}\nQuestion: {question}\nAnswer Choice: {answer_choice}\n"
        "Answer:"
    )
    fields = {
        f"{dialect} (Paragraph)": paragraph,
        "Question": question,
        "Answer Choice": answer_choice,
    }
    yield _score_answer(model_name, user_prompt, _DIGIT_PATTERN, str(row["Actual Label"]), fields)


def _eval_copa(model_name, dialect, row):
    """COPA: choose the more plausible of two alternatives."""
    premise = row[f"{dialect} (Premise)"]
    choice1 = row["Choice 1"]
    choice2 = row["Choice 2"]
    user_prompt = (
        f"Given a premise and two choices, pick which is more plausible (0 or 1).\n"
        f"Premise: {premise}\n"
        f"Choice 1: {choice1}\n"
        f"Choice 2: {choice2}\n"
        "Answer:"
    )
    fields = {
        f"{dialect} (Premise)": premise,
        "Choice 1": choice1,
        "Choice 2": choice2,
    }
    yield _score_answer(model_name, user_prompt, _DIGIT_PATTERN, str(row["Actual Answer"]), fields)


def _eval_boolq(model_name, dialect, row):
    """BoolQ: TRUE/FALSE question about a passage."""
    passage = row[f"{dialect} (SAE Passage)"]
    question = row["SAE Question"]
    user_prompt = (
        f"Passage: \"{passage}\"\n"
        f"Question: \"{question}\"\n"
        "Is the answer TRUE or FALSE?\nAnswer:"
    )
    fields = {f"{dialect} (SAE Passage)": passage, "SAE Question": question}
    yield _score_answer(model_name, user_prompt, r"Answer:\s*(TRUE|FALSE)", str(row["Actual Label"]), fields)


# Dataset name -> handler generator.
_DATASET_HANDLERS = {
    "SVAMP": _eval_svamp,
    "MBPP": _eval_mbpp,
    "LogicBenchYN": _eval_logicbench_yn,
    "LogicBenchMCQ": _eval_logicbench_mcq,
    "HumanEVAL": _eval_humaneval,
    "GSM8K": _eval_gsm8k,
    "FOLIO": _eval_folio,
    "WSC": _eval_wsc,
    "SST-2": _eval_sst2,
    "MultiRC": _eval_multirc,
    "COPA": _eval_copa,
    "BoolQ": _eval_boolq,
}


def process_dataset(model_name: str, dataset_name: str, file_path: str, dialect: str):
    """Evaluate one model on one dialect-specific dataset file.

    Every graded item is appended to ``<dataset_name>_results.csv`` as soon as
    it is scored, and a summary is written to ``<dataset_name>_accuracy.txt``
    once the dataset is finished. Both files live in
    ``<model output root>/<dialect>/<dataset_name>``.

    Args:
        model_name: Chat model identifier; also selects the output root via
            ``model_output_mapping`` (with a name-derived fallback folder).
        dataset_name: Key selecting the prompt/grading handler. Unknown names
            are reported and skipped.
        file_path: Path of the dataset file (JSON for the LogicBench sets,
            CSV otherwise).
        dialect: Dialect identifier used in the dialect-specific column/key.
    """
    if dataset_name in _JSON_DATASETS:
        with open(file_path, 'r', encoding="utf-8") as f:
            data = json.load(f)
    else:
        data = pd.read_csv(file_path)
        # BLEU score columns are filtering metadata, not model input.
        data = data.drop(columns=[col for col in data.columns if "BLEU Score" in col], errors="ignore")

    if model_name in model_output_mapping:
        model_root = model_output_mapping[model_name]
    else:
        model_root = os.path.join(output_base_dir, model_name.replace("-", ""))
    model_output_dir = os.path.join(model_root, dialect, dataset_name)
    os.makedirs(model_output_dir, exist_ok=True)
    csv_file = os.path.join(model_output_dir, f"{dataset_name}_results.csv")

    handler = _DATASET_HANDLERS.get(dataset_name)
    if handler is None:
        print(f"Dataset {dataset_name} not recognized for processing.")
        return

    correct_count = 0
    total = 0
    # Resume support is deliberately limited to this single combination; for
    # every other run an existing results file is simply appended to.
    if dataset_name == "FOLIO" and model_name == "gpt-4o" and dialect == "AAVE" and os.path.exists(csv_file):
        try:
            processed = pd.read_csv(csv_file)
            offset = len(processed)
            previously_correct = int(
                processed["Correct"].astype(str).str.strip().str.lower().isin(["true", "1"]).sum()
            )
        except Exception as e:
            print(f"Could not determine processed count for FOLIO: {e}")
        else:
            print(f"Resuming FOLIO processing for {dialect} ({model_name}): skipping first {offset} datapoints.")
            data = data.iloc[offset:]
            # Seed the counters with the rows already on disk so the accuracy
            # file covers the whole dataset, not just the resumed part.
            total = offset
            correct_count = previously_correct

    if isinstance(data, pd.DataFrame):
        records = (row for _, row in data.iterrows())
    else:
        records = data

    with tqdm(records, total=len(data), desc=f"{model_name} | {dataset_name} | {dialect}", unit="row") as pbar:
        for record in pbar:
            for row_dict, is_correct in handler(model_name, dialect, record):
                total += 1
                if is_correct:
                    correct_count += 1
                write_row_to_csv(row_dict, csv_file)
                if total % 2 == 0:
                    acc_percentage = (correct_count / total) * 100
                    pbar.set_postfix({"Acc": f"{acc_percentage:.2f}%"})

    accuracy = (correct_count / total * 100) if total > 0 else 0
    with open(os.path.join(model_output_dir, f"{dataset_name}_accuracy.txt"), "w", encoding="utf-8") as f:
        f.write(f"Total instances: {total}\n")
        f.write(f"Correct answers: {correct_count}\n")
        f.write(f"Accuracy: {accuracy:.2f}%\n")

# Driver: evaluate every model on every dialect/dataset combination, skipping
# (and reporting) dataset files that are not present on disk.
for model in models:
    for dialect in dialects:
        for dataset_name, rel_path in datasets.items():
            full_path = os.path.join(input_base_dir, dialect, rel_path)
            if not os.path.exists(full_path):
                print(f"File not found: {full_path}")
                continue
            process_dataset(model, dataset_name, full_path, dialect)


print("Non-CoT evaluation complete! Results have been saved.")

In [None]:
## COT Evals

!pip install openai==0.28 pandas numpy regex tqdm

import openai
import pandas as pd
import numpy as np
import json
import os
import re
import csv
import time
from tqdm import tqdm
from openai.error import RateLimitError, APIError

# Prefer the OPENAI_API_KEY environment variable so the secret does not have to
# be stored in the notebook; the "API-KEY" placeholder remains the fallback for
# users who paste their key in directly.
openai.api_key = os.environ.get("OPENAI_API_KEY", "API-KEY")
# Sampling temperature passed to every chat completion request.
TEMPERATURE = 0.7

# Root folders on the mounted drive for dataset inputs and evaluation results.
input_base_dir = "/content/drive/MyDrive/!!Multi-AAVENUE/BLEU Score Filtered Datasets/GPT 4o"
output_base_dir = "/content/drive/MyDrive/!!Multi-AAVENUE/Evaluation Results"

# Dialect identifiers; each is used as the prefix of the dialect-specific
# column inside a dataset (presumably also as the input sub-folder, as in the
# Non-CoT cell - the CoT driver loop is not shown here).
dialects = ["IndE", "JamE", "AAVE", "CollSgE", "ChcE"]

# Dataset name -> relative file path. The two LogicBench datasets are JSON;
# everything else is CSV.
datasets = {
    "SVAMP": "SVAMP(700)/SVAMP(700)_filtered_bleu_scores.csv",
    "MBPP": "MBPP(374)/MBPP(374)_filtered_bleu_scores.csv",
    "LogicBenchYN": "Logic Bench YN(500)/Logic Bench YN(500)_filtered_bleu_scores.json",
    "LogicBenchMCQ": "Logic Bench MCQ(480)/Logic Bench MCQ(480)_filtered_bleu_scores.json",
    "HumanEVAL": "HumanEVAL(164)/HumanEVAL(164)_filtered_bleu_scores.csv",
    "GSM8K": "GSM8K(1000)/GSM8K(1000)_filtered_bleu_scores.csv",
    "FOLIO": "FOLIO(1000)/FOLIO(1000)_filtered_bleu_scores.csv",
    "WSC": "GLUE + SuperGLUE/WSC (659)/WSC (659)_filtered_bleu_scores.csv",
    "SST-2": "GLUE + SuperGLUE/SST-2 (1000)/SST-2 (1000)_filtered_bleu_scores.csv",
    "MultiRC": "GLUE + SuperGLUE/MultiRC (1000)/MultiRC (1000)_filtered_bleu_scores.csv",
    "COPA": "GLUE + SuperGLUE/COPA (500)/COPA (500)_filtered_bleu_scores.csv",
    "BoolQ": "GLUE + SuperGLUE/BoolQ (1000)/BoolQ (1000)_filtered_bleu_scores.csv"
}

# Chat models to evaluate.
models = ["gpt-4o", "gpt-4o-mini"]

def write_row_to_csv(row: dict, filename: str):
    """Append one result row to *filename*, adding the header for a new file.

    Args:
        row: Column name to value mapping for a single record.
        filename: Target CSV path; created on first use.
    """
    with open(filename, 'a', newline='', encoding="utf-8") as out_stream:
        record_frame = pd.DataFrame([row])
        # Offset 0 directly after opening for append means nothing has been
        # written to the file yet, so the header row is still missing.
        include_header = out_stream.tell() == 0
        record_frame.to_csv(out_stream, index=False, header=include_header)
        out_stream.flush()

def prompt_gpt_model(model_name: str, system_message: str, user_message: str, retries=5, backoff_factor=2) -> str:
    """Send one system+user chat request and return the reply text.

    Rate-limit and API errors are retried with exponential backoff
    (``backoff_factor ** attempt`` seconds between attempts).

    Args:
        model_name: Chat model identifier passed to the API.
        system_message: Content of the system message.
        user_message: Content of the user message.
        retries: Maximum number of attempts.
        backoff_factor: Base of the exponential wait between attempts.

    Returns:
        The ``content`` of the first choice in the response.

    Raises:
        Exception: When every attempt failed; the last API error is attached
            as ``__cause__`` so the real reason is not lost.
    """
    last_error = None
    for attempt in range(retries):
        try:
            response = openai.ChatCompletion.create(
                model=model_name,
                messages=[
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": user_message}
                ],
                temperature=TEMPERATURE
            )
            return response['choices'][0]['message']['content']
        except (RateLimitError, APIError) as e:
            last_error = e
            if attempt == retries - 1:
                # No further attempt follows, so sleeping here would only
                # delay the failure.
                print(f"API error: {e}. No retries left.")
                break
            wait_time = backoff_factor ** attempt
            print(f"API error: {e}. Retrying in {wait_time} seconds...")
            time.sleep(wait_time)
    raise Exception("Maximum retries exceeded.") from last_error

def clean_code(generated_code: str) -> str:
    """Strip Markdown code-fence markers from model output.

    Each triple-backtick marker is removed, along with a ``python`` tag that
    immediately follows it, and surrounding whitespace is trimmed.
    """
    # A single substitution suffices: fences are consumed whole, so none can
    # remain for a second pass.
    fence_marker = re.compile(r"```(?:python)?")
    return fence_marker.sub("", generated_code).strip()

def extract_response(response: str, pattern: str) -> str:
    """Return the stripped first capture group of *pattern* in *response*.

    Matching ignores case and lets ``.`` span newlines; an empty string is
    returned when nothing matches.
    """
    hit = re.search(pattern, response, flags=re.IGNORECASE | re.DOTALL)
    if hit is None:
        return ""
    captured = hit.group(1)
    return captured.strip()

def evaluate_response(model_answer: str, expected_answer: str) -> bool:
    """Compare two answers, ignoring letter case and surrounding whitespace."""
    candidate = model_answer.strip().lower()
    reference = expected_answer.strip().lower()
    return candidate == reference

def run_test_cases(generated_code: str, test_cases: str) -> "tuple[bool, str]":
    """Execute generated code followed by its test cases in a fresh namespace.

    Args:
        generated_code: Python source produced by the model.
        test_cases: Python source (typically ``assert`` statements) run in the
            same namespace after the generated code.

    Returns:
        ``(True, "")`` when both snippets run without raising, otherwise
        ``(False, message)`` where the message is prefixed with
        ``AssertionError``, ``SyntaxError`` or ``RuntimeError``.
    """
    # SECURITY NOTE(review): exec() runs model-generated code inside this
    # process with full privileges and no timeout. Only use it in a disposable
    # environment; an infinite loop in the generated code will still hang.
    try:
        exec_globals = {}
        exec(generated_code, exec_globals)
        exec(test_cases, exec_globals)
        return True, ""
    except AssertionError as e:
        return False, f"AssertionError: {str(e)}"
    except SyntaxError as e:
        return False, f"SyntaxError: {str(e)}"
    except Exception as e:
        return False, f"RuntimeError: {str(e)}"
    except SystemExit as e:
        # exit()/sys.exit() in generated code raises SystemExit, which is not
        # an Exception subclass; without this clause it would terminate the
        # whole evaluation run instead of failing one sample.
        return False, f"RuntimeError: SystemExit: {e}"

# Per-model root folder for CoT results. Models missing from this mapping fall
# back to a folder derived from the model name in process_dataset.
model_output_mapping = {
    model_id: os.path.join(output_base_dir, folder_name)
    for model_id, folder_name in (
        ("gpt-4o", "GPT4o_CoT"),
        ("gpt-4o-mini", "GPT4oMini_CoT"),
    )
}

# System message sent with every evaluation prompt.
_SYSTEM_PROMPT = "You are a helpful assistant."
# Pattern that pulls the text following "Answer:" out of a code-generation reply.
_CODE_PATTERN = r"Answer:\s*(.+)"
# Datasets stored as JSON task lists; every other dataset is read as CSV.
_JSON_DATASETS = ("LogicBenchYN", "LogicBenchMCQ")


def _answer_item(user_prompt, columns, expected, pattern):
    """Describe one question scored by comparing an extracted answer to `expected`."""
    return {"prompt": user_prompt, "columns": columns, "expected": expected, "pattern": pattern}


def _code_item(user_prompt, columns, test_cases):
    """Describe one coding task scored by executing `test_cases` against the reply."""
    return {"prompt": user_prompt, "columns": columns, "test_cases": test_cases}


def _svamp_items(row, dialect):
    """Yield the single SVAMP math word problem held in a CSV row."""
    problem = row[f"{dialect} (Original)"]
    question = row["Question"]
    expected = str(row["Answer"])
    user_prompt = (
        "Let's think carefully about the math word problem step by step.\n"
        f"Context: {problem}\nQuestion: {question}\n"
        "Finally, provide your numeric answer in the format: Answer: <number>\n"
    )
    columns = {f"{dialect} (Original)": problem, "Question": question}
    yield _answer_item(user_prompt, columns, expected, r"Answer:\s*(.+)")


def _mbpp_items(row, dialect):
    """Yield the single MBPP coding task held in a CSV row."""
    problem = row[f"{dialect} (Original)"]
    test_cases = row["Test_Cases"]
    user_prompt = (
        "Let's break down the coding problem step by step. Then write a Python function.\n"
        "Start with 'Answer:' and no markdown.\n"
        f"Problem: {problem}\nTest Cases: {test_cases}"
    )
    yield _code_item(user_prompt, {f"{dialect} (Original)": problem}, test_cases)


def _logicbench_yn_items(task, dialect):
    """Yield one yes/no item per populated "Question i"/"Answer i" pair (i = 1..4)."""
    context = task[f"{dialect} (context)"]
    for i in range(1, 5):
        question = task.get(f"Question {i}", "")
        expected = task.get(f"Answer {i}", "")
        if not question or not expected:
            continue
        user_prompt = (
            "Let's reason about the question step by step.\n"
            f"Context: {context}\nQuestion: {question}\n"
            "Finally, respond EXACTLY as Answer: yes or Answer: no."
        )
        columns = {f"{dialect} (Context)": context, "Question": question}
        yield _answer_item(user_prompt, columns, expected, r"Answer:\s*(yes|no)")


def _logicbench_mcq_items(task, dialect):
    """Yield the task's multiple-choice item, or nothing if a choice or the answer is missing."""
    context = task[f"{dialect} (context)"]
    choices = [task.get(f"Choice {i+1}", "") for i in range(4)]
    expected = task.get("Answer", "")
    if not all(choices) or not expected:
        return
    user_prompt = (
        "Let's analyze the context and each choice step by step.\n"
        "Finally, provide EXACTLY one line like: Answer: choice_2.\n"
        f"Context: {context}\n"
        f"Choice 1: {choices[0]}\nChoice 2: {choices[1]}\nChoice 3: {choices[2]}\nChoice 4: {choices[3]}\n"
    )
    columns = {
        f"{dialect} (Context)": context,
        "Choice 1": choices[0],
        "Choice 2": choices[1],
        "Choice 3": choices[2],
        "Choice 4": choices[3],
    }
    yield _answer_item(user_prompt, columns, expected, r"Answer:\s*(choice_\d)")


def _humaneval_items(row, dialect):
    """Yield the single HumanEVAL coding task held in a CSV row."""
    prompt_text = row[f"{dialect} (Prompt)"]
    test_cases = row["Test_Cases"]
    user_prompt = (
        "Let's think step by step about the coding task.\n"
        "Finally, provide code beginning with 'Answer:'.\n\n"
        f"Problem: {prompt_text}\nTest Cases: {test_cases}"
    )
    yield _code_item(user_prompt, {f"{dialect} (Prompt)": prompt_text}, test_cases)


def _gsm8k_items(row, dialect):
    """Yield the single GSM8K math problem held in a CSV row."""
    problem = row[f"{dialect} (Original)"]
    expected = str(row["Answer"])
    user_prompt = (
        "Let's think step by step to solve this math problem.\n"
        "Finally, provide a numeric result as: Answer: <number>.\n\n"
        f"Problem: {problem}"
    )
    # NOTE(review): the character class includes '.', so a reply ending in
    # "Answer: 42." is captured as "42."; left unchanged so scores stay
    # comparable with earlier runs -- confirm whether that is intended.
    yield _answer_item(user_prompt, {f"{dialect} (Original)": problem}, expected, r"Answer:\s*([0-9.\-]+)")


def _folio_items(row, dialect):
    """Yield the single FOLIO entailment question held in a CSV row."""
    premises = row[f"{dialect} (Premises)"]
    conclusion = row["Conclusion"]
    # Coerced to str like every other CSV dataset so a non-string label
    # cannot break the string comparison done during scoring.
    expected = str(row["Label"])
    user_prompt = (
        "Let's analyze whether the conclusion follows from the premises step by step.\n"
        "Finally, provide: Answer: True, False, or Uncertain.\n\n"
        f"Premises: {premises}\nConclusion: {conclusion}"
    )
    columns = {f"{dialect} (Premises)": premises, "Conclusion": conclusion}
    yield _answer_item(user_prompt, columns, expected, r"Answer:\s*(True|False|Uncertain)")


def _wsc_items(row, dialect):
    """Yield the single WSC coreference question held in a CSV row."""
    paragraph = row[f"{dialect} (Original Paragraph)"]
    span1 = row["Span 1"]
    span2 = row["Span 2"]
    expected = str(row["Actual Label"])
    user_prompt = (
        "Let's analyze the reference step by step.\n"
        "Finally, provide: Answer: 1 if Span 2 refers to Span 1, else 0.\n\n"
        f"Paragraph: {paragraph}\nSpan 1: {span1}\nSpan 2: {span2}"
    )
    columns = {f"{dialect} (Original Paragraph)": paragraph, "Span 1": span1, "Span 2": span2}
    yield _answer_item(user_prompt, columns, expected, r"Answer:\s*(\d)")


def _sst2_items(row, dialect):
    """Yield the single SST-2 sentiment question held in a CSV row."""
    sentence = row[f"{dialect} (Original Sentence)"]
    expected = str(row["Actual Label"])
    user_prompt = (
        "Let's assess the sentiment step by step.\n"
        "Finally, provide: Answer: 1 if positive, 0 if negative.\n\n"
        f"Sentence: \"{sentence}\""
    )
    yield _answer_item(user_prompt, {f"{dialect} (Original Sentence)": sentence}, expected, r"Answer:\s*(\d)")


def _multirc_items(row, dialect):
    """Yield the single MultiRC answer-verification question held in a CSV row."""
    paragraph = row[f"{dialect} (Paragraph)"]
    question = row["Question"]
    answer_choice = row["Answer Choice"]
    expected = str(row["Actual Label"])
    user_prompt = (
        "Let's analyze the paragraph and question step by step.\n"
        "Finally, provide: Answer: 1 if correct, 0 if incorrect.\n\n"
        f"Paragraph: {paragraph}\nQuestion: {question}\nAnswer Choice: {answer_choice}"
    )
    columns = {
        f"{dialect} (Paragraph)": paragraph,
        "Question": question,
        "Answer Choice": answer_choice,
    }
    yield _answer_item(user_prompt, columns, expected, r"Answer:\s*(\d)")


def _copa_items(row, dialect):
    """Yield the single COPA plausibility question held in a CSV row."""
    premise = row[f"{dialect} (Premise)"]
    choice1 = row["Choice 1"]
    choice2 = row["Choice 2"]
    expected = str(row["Actual Answer"])
    user_prompt = (
        "Let's compare the plausibility of these two choices step by step.\n"
        "Finally, provide: Answer: 0 for the first, or 1 for the second.\n\n"
        f"Premise: {premise}\nChoice 1: {choice1}\nChoice 2: {choice2}"
    )
    columns = {f"{dialect} (Premise)": premise, "Choice 1": choice1, "Choice 2": choice2}
    yield _answer_item(user_prompt, columns, expected, r"Answer:\s*(\d)")


def _boolq_items(row, dialect):
    """Yield the single BoolQ true/false question held in a CSV row."""
    passage = row[f"{dialect} (SAE Passage)"]
    question = row["SAE Question"]
    expected = str(row["Actual Label"])
    user_prompt = (
        "Let's review the passage and question step by step.\n"
        "Finally, provide: Answer: TRUE or FALSE.\n\n"
        f"Passage: \"{passage}\"\nQuestion: \"{question}\""
    )
    columns = {f"{dialect} (SAE Passage)": passage, "SAE Question": question}
    yield _answer_item(user_prompt, columns, expected, r"Answer:\s*(TRUE|FALSE)")


# Maps each supported dataset name to the generator that turns one input
# record (CSV row or JSON task) into zero or more items to evaluate.
_ITEM_BUILDERS = {
    "SVAMP": _svamp_items,
    "MBPP": _mbpp_items,
    "LogicBenchYN": _logicbench_yn_items,
    "LogicBenchMCQ": _logicbench_mcq_items,
    "HumanEVAL": _humaneval_items,
    "GSM8K": _gsm8k_items,
    "FOLIO": _folio_items,
    "WSC": _wsc_items,
    "SST-2": _sst2_items,
    "MultiRC": _multirc_items,
    "COPA": _copa_items,
    "BoolQ": _boolq_items,
}


def _score_item(model_name, item):
    """Query the model for one item and return ``(result_row, is_correct)``."""
    response = prompt_gpt_model(model_name, _SYSTEM_PROMPT, item["prompt"])
    row_dict = dict(item["columns"])
    if "test_cases" in item:
        # Coding task: strip code fences, take the text after "Answer:", run the tests.
        code = extract_response(clean_code(response), _CODE_PATTERN)
        is_correct, error_msg = run_test_cases(code, item["test_cases"])
        row_dict.update({
            "Code": code,
            "COT Response": response,
            "Correct": int(is_correct),
            "Error Message": error_msg,
        })
        return row_dict, is_correct
    model_answer = extract_response(response, item["pattern"])
    is_correct = evaluate_response(model_answer, item["expected"])
    row_dict.update({
        "Expected Answer": item["expected"],
        "COT Response": response,
        "Model Answer": model_answer,
        "Correct": is_correct,
    })
    return row_dict, is_correct


def process_dataset(model_name: str, dataset_name: str, file_path: str, dialect: str):
    """Evaluate one model on one dataset/dialect pair and write the results.

    Each evaluated item is appended as a row to ``<dataset_name>_results.csv``
    as soon as it is scored, and ``<dataset_name>_accuracy.txt`` is written at
    the end. Both files live in a per-model/dialect/dataset directory that is
    created if needed.

    Args:
        model_name: Chat model to query; also selects the output root via
            ``model_output_mapping`` (falling back to the model name with
            hyphens removed, under ``output_base_dir``).
        dataset_name: One of the keys of ``_ITEM_BUILDERS``. An unrecognized
            name prints a message and returns without evaluating anything.
        file_path: Input file: JSON for the LogicBench datasets, CSV otherwise.
        dialect: Dialect label used to select the dialect-specific input
            column/key and to name the output directory.
    """
    if dataset_name in _JSON_DATASETS:
        with open(file_path, 'r', encoding="utf-8") as f:
            data = json.load(f)
    else:
        data = pd.read_csv(file_path)
        # BLEU score columns are filtering metadata, not evaluation inputs.
        data = data.drop(columns=[col for col in data.columns if "BLEU Score" in col], errors="ignore")

    if model_name in model_output_mapping:
        model_output_dir = os.path.join(model_output_mapping[model_name], dialect, dataset_name)
    else:
        model_output_dir = os.path.join(output_base_dir, model_name.replace("-", ""), dialect, dataset_name)
    os.makedirs(model_output_dir, exist_ok=True)
    csv_file = os.path.join(model_output_dir, f"{dataset_name}_results.csv")

    correct_count = 0
    total = 0

    # One-off resume path: only this exact run continues from an existing
    # results file; every other run appends to whatever is already there.
    if dataset_name == "FOLIO" and model_name == "gpt-4o" and dialect == "AAVE" and os.path.exists(csv_file):
        try:
            processed = pd.read_csv(csv_file)
            offset = len(processed)
            print(f"Resuming FOLIO processing for {dialect} ({model_name}): skipping first {offset} datapoints.")
            data = data.iloc[offset:]
            if "Correct" in processed.columns:
                # Seed the counters from the rows already on disk so the
                # accuracy file covers the whole run, not just the tail.
                prior_correct = processed["Correct"].astype(str).str.strip().str.lower()
                correct_count = int((prior_correct == "true").sum())
                total = offset
        except Exception as e:
            print(f"Could not determine processed count for FOLIO: {e}")

    build_items = _ITEM_BUILDERS.get(dataset_name)
    if build_items is None:
        print(f"Dataset {dataset_name} not recognized for processing.")
        return

    if isinstance(data, pd.DataFrame):
        records = (row for _, row in data.iterrows())
    else:
        records = data

    # The context manager closes the progress bar even if an item raises.
    with tqdm(records, total=len(data), desc=f"{model_name} | {dataset_name} | {dialect}", unit="row") as pbar:
        for record in pbar:
            for item in build_items(record, dialect):
                row_dict, is_correct = _score_item(model_name, item)
                total += 1
                if is_correct:
                    correct_count += 1
                write_row_to_csv(row_dict, csv_file)
                if total % 2 == 0:
                    acc_percentage = (correct_count / total) * 100
                    pbar.set_postfix({"Acc": f"{acc_percentage:.2f}%"})

    accuracy = (correct_count / total * 100) if total > 0 else 0
    with open(os.path.join(model_output_dir, f"{dataset_name}_accuracy.txt"), "w", encoding="utf-8") as f:
        f.write(f"Total instances: {total}\n")
        f.write(f"Correct answers: {correct_count}\n")
        f.write(f"Accuracy: {accuracy:.2f}%\n")

# Evaluate every model on every dialect/dataset pair whose input file exists;
# missing input files are reported and skipped.
for model in models:
    for dialect in dialects:
        for dataset_name, rel_path in datasets.items():
            full_path = os.path.join(input_base_dir, dialect, rel_path)
            if not os.path.exists(full_path):
                print(f"File not found: {full_path}")
                continue
            process_dataset(model, dataset_name, full_path, dialect)


print("CoT evaluation complete! Results have been saved.")
