In [None]:
## NON-COT EVALS

from google.colab import drive
drive.mount('/content/drive')

!pip install together pandas numpy regex tqdm

import pandas as pd
import numpy as np
import json
import os
import re
import csv
import time
from tqdm import tqdm

from together import Together
from together.error import RateLimitError, APIError

# Together API credentials and the shared client used by prompt_together_model.
# NOTE(review): "API-KEY" is presumably a placeholder to be replaced before running — confirm.
TOGETHER_API_KEY = "API-KEY"
client = Together(api_key=TOGETHER_API_KEY)

# Sampling temperature passed to every chat-completion request.
TEMPERATURE = 0.7

# Root folder of the input datasets; main() joins it with a dialect and a dataset-relative path.
input_base_dir = "/content/drive/MyDrive/!!Multi-AAVENUE/BLEU Score Filtered Datasets/GPT 4o"
# Root folder for non-CoT evaluation results and the folder for this model run.
output_base_dir = "/content/drive/MyDrive/!!Multi-AAVENUE/Evaluation Results/NonCOT_Evals"
noncot_output_dir = os.path.join(output_base_dir, "LLaMa_3_8b_Instruct_Test_NonCOT")

# Dialect codes; each one is used as an input sub-folder name and as a column-name prefix.
dialects = ["IndE", "JamE", "AAVE", "CollSgE", "ChcE"]

# Dataset name -> path of its data file relative to "<input_base_dir>/<dialect>/".
datasets = {
    "SVAMP": "SVAMP(700)/SVAMP(700)_filtered_bleu_scores.csv",
    "MBPP": "MBPP(374)/MBPP(374)_filtered_bleu_scores.csv",
    "LogicBenchYN": "Logic Bench YN(500)/Logic Bench YN(500)_filtered_bleu_scores.json",
    "LogicBenchMCQ": "Logic Bench MCQ(480)/Logic Bench MCQ(480)_filtered_bleu_scores.json",
    "HumanEVAL": "HumanEVAL(164)/HumanEVAL(164)_filtered_bleu_scores.csv",
    "GSM8K": "GSM8K(1000)/GSM8K(1000)_filtered_bleu_scores.csv",
    "FOLIO": "FOLIO(1000)/FOLIO(1000)_filtered_bleu_scores.csv",
    "WSC": "GLUE + SuperGLUE/WSC (659)/WSC (659)_filtered_bleu_scores.csv",
    "SST-2": "GLUE + SuperGLUE/SST-2 (1000)/SST-2 (1000)_filtered_bleu_scores.csv",
    "MultiRC": "GLUE + SuperGLUE/MultiRC (1000)/MultiRC (1000)_filtered_bleu_scores.csv",
    "COPA": "GLUE + SuperGLUE/COPA (500)/COPA (500)_filtered_bleu_scores.csv",
    "BoolQ": "GLUE + SuperGLUE/BoolQ (1000)/BoolQ (1000)_filtered_bleu_scores.csv"
}

# Model identifiers evaluated by main(); each must also be a key of model_output_mapping.
models = ["meta-llama/Meta-Llama-3-8B-Instruct-Turbo"]

def write_row_to_csv(row: dict, filename: str):
    """Append *row* to the CSV at *filename* as a single record.

    The column header is emitted only when the append-mode handle reports
    offset 0, i.e. when nothing has been written to the file yet.
    """
    with open(filename, 'a', newline='', encoding="utf-8") as handle:
        needs_header = handle.tell() == 0
        pd.DataFrame([row]).to_csv(handle, index=False, header=needs_header)
        # Flush right away so each finished row reaches the file promptly.
        handle.flush()

def prompt_together_model(model_name: str, system_message: str, user_message: str, retries=5, backoff_factor=2):
    """Send one system + user exchange to the Together chat API and return the reply text.

    Args:
        model_name: Model identifier passed to the API.
        system_message: Content of the "system" message.
        user_message: Content of the "user" message.
        retries: Maximum number of attempts.
        backoff_factor: Base of the exponential wait (``backoff_factor ** attempt``
            seconds) between attempts.

    Returns:
        The content of the first choice's message ("" if the API returned no content).

    Raises:
        Exception: "Maximum retries exceeded." once every attempt has failed with a
            RateLimitError or APIError; the last API error is chained as ``__cause__``.
    """
    last_error = None
    for attempt in range(retries):
        try:
            response = client.chat.completions.create(
                model=model_name,
                messages=[
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": user_message}
                ],
                temperature=TEMPERATURE,
                max_tokens=2048
            )
            # Guard so the regex helpers downstream always receive a str.
            return response.choices[0].message.content or ""
        except (RateLimitError, APIError) as e:
            last_error = e
            if attempt == retries - 1:
                # No attempt follows, so sleeping here would only delay the failure.
                print(f"API error: {e}. No retries left.")
                break
            wait_time = backoff_factor ** attempt
            print(f"API error: {e}. Retrying in {wait_time} seconds...")
            time.sleep(wait_time)
    # Chain the last API error so the root cause stays visible in the traceback.
    raise Exception("Maximum retries exceeded.") from last_error

def clean_code(generated_code: str) -> str:
    """Strip Markdown code fences from a model reply and trim surrounding whitespace.

    Every triple-backtick fence is removed. A Python language tag directly after
    the fence (``python``, ``python3``, ``py``, ``py3``, any letter case) is
    removed with it, so the tag cannot leak into the code that is later executed.

    Args:
        generated_code: Raw text returned by the model.

    Returns:
        The text without fences, stripped of leading/trailing whitespace.
    """
    # \b keeps the tag match from eating the start of a longer word.
    fence = r"```(?:(?:python3?|py3?)\b)?"
    return re.sub(fence, "", generated_code, flags=re.IGNORECASE).strip()

def extract_response(response: str, pattern: str) -> str:
    """Return the stripped first capture group of *pattern* found in *response*.

    The search is case-insensitive and lets ``.`` span newlines. An empty
    string is returned when the pattern does not occur.
    """
    found = re.compile(pattern, re.IGNORECASE | re.DOTALL).search(response)
    if found is None:
        return ""
    return found.group(1).strip()

def evaluate_response(model_answer: str, expected_answer: str) -> bool:
    """Decide whether the model's answer matches the expected answer.

    Both values are converted to ``str``, stripped and lower-cased. They match
    when the resulting strings are equal, or when both parse as numbers with the
    same value (so "18", "18." and "18.0" are treated as the same answer).

    Args:
        model_answer: Answer extracted from the model reply.
        expected_answer: Reference answer; non-string values (e.g. a bool or
            number read from a data file) are accepted and stringified.

    Returns:
        True if the answers match, False otherwise.
    """
    predicted = str(model_answer).strip().lower()
    target = str(expected_answer).strip().lower()
    if predicted == target:
        return True
    # Fall back to numeric comparison; non-numeric text raises ValueError.
    try:
        return float(predicted) == float(target)
    except ValueError:
        return False

def run_test_cases(generated_code: str, test_cases: str):
    """Execute generated code followed by its test cases in a fresh namespace.

    SECURITY NOTE: this runs model-generated code with ``exec`` inside the current
    process, with no sandbox and no timeout. Only use it in a disposable
    environment.

    Args:
        generated_code: Source code produced by the model.
        test_cases: Source code of the tests (typically ``assert`` statements),
            executed in the same namespace as the generated code.

    Returns:
        ``(True, "")`` when both snippets run without raising, otherwise
        ``(False, message)`` where the message is prefixed with
        "AssertionError", "SyntaxError", "SystemExit" or "RuntimeError".
    """
    try:
        exec_globals = {}
        exec(generated_code, exec_globals)
        exec(test_cases, exec_globals)
    except AssertionError as e:
        return False, f"AssertionError: {str(e)}"
    except SyntaxError as e:
        return False, f"SyntaxError: {str(e)}"
    except SystemExit as e:
        # exit()/sys.exit() in generated code is not an Exception subclass and
        # would otherwise terminate the whole evaluation run.
        return False, f"SystemExit: {str(e)}"
    except Exception as e:
        return False, f"RuntimeError: {str(e)}"
    return True, ""

# Model identifier -> root folder under which process_dataset creates
# "<dialect>/<dataset>/" result folders.
# NOTE(review): noncot_output_dir already ends with "LLaMa_3_8b_Instruct_Test_NonCOT",
# so this join nests a second folder of the same name — confirm that is intended.
model_output_mapping = {
    "meta-llama/Meta-Llama-3-8B-Instruct-Turbo": os.path.join(noncot_output_dir, "LLaMa_3_8b_Instruct_Test_NonCOT")
}

# System message sent with every request.
_SYSTEM_PROMPT = "You are a helpful assistant."
# Datasets loaded with json.load; every other dataset is read with pandas.read_csv.
_JSON_DATASETS = ("LogicBenchYN", "LogicBenchMCQ")
# Answer-extraction patterns shared by several datasets.
_NUMERIC_ANSWER_PATTERN = r"Final Numeric Answer:\s*([0-9.\-]+)"
_DIGIT_ANSWER_PATTERN = r"Answer:\s*(\d)"


# Each "*_items" builder turns one input record into zero or more tuples
# (user_prompt, answer_pattern, expected_answer, echoed_input_columns).

def _svamp_items(row, dialect):
    """Build the SVAMP prompt for one row."""
    problem = row.get(f"{dialect} (Original)", "")
    question = row.get("Question", "")
    # Non-CoT prompt: no "step by step" or "chain-of-thought" mention
    user_prompt = (
        "Given a mathematics problem, determine the answer. "
        "Simplify your answer as much as possible and provide a final numeric answer.\n"
        f"Context: {problem}\n"
        f"Question: {question}\n"
        "Final Numeric Answer: "
    )
    echoed = {f"{dialect} (Original)": problem, "Question": question}
    yield user_prompt, _NUMERIC_ANSWER_PATTERN, row.get("Answer", ""), echoed


def _wsc_items(row, dialect):
    """Build the WSC pronoun-resolution prompt for one row."""
    paragraph = row.get(f"{dialect} (Original Paragraph)", "")
    span1 = row.get("Span 1", "")
    span2 = row.get("Span 2", "")
    user_prompt = (
        "Given a pronoun resolution problem, determine whether Span 2 refers to Span 1. "
        "Provide 1 if they refer, 0 if they do not, in 'Final Answer:'.\n"
        f"Paragraph: {paragraph}\n"
        f"Span 1: {span1}\n"
        f"Span 2: {span2}\n"
        "Final Answer:"
    )
    echoed = {f"{dialect} (Original Paragraph)": paragraph, "Span 1": span1, "Span 2": span2}
    yield user_prompt, r"Final Answer:\s*(\d)", row.get("Actual Label", ""), echoed


def _gsm8k_items(row, dialect):
    """Build the GSM8K prompt for one row."""
    problem = row.get(f"{dialect} (Original)", "")
    user_prompt = (
        "Given a mathematics problem, provide a final numeric answer without decimals.\n"
        f"Problem: {problem}\n"
        "Final Numeric Answer:"
    )
    echoed = {f"{dialect} (Original)": problem}
    yield user_prompt, _NUMERIC_ANSWER_PATTERN, row.get("Answer", ""), echoed


def _folio_items(row, dialect):
    """Build the FOLIO true/false prompt for one row."""
    premises = row.get(f"{dialect} (Premises)", "")
    conclusion = row.get("Conclusion", "")
    user_prompt = (
        "Given premises and a conclusion, determine whether the conclusion is True or False. "
        "Provide your final answer in 'Final Answer: True' or 'Final Answer: False'.\n"
        f"Premises: {premises}\n"
        f"Conclusion: {conclusion}\n"
        "Final Answer:"
    )
    echoed = {f"{dialect} (Premises)": premises, "Conclusion": conclusion}
    yield user_prompt, r"Final Answer:\s*(True|False)", row.get("Label", ""), echoed


def _logicbench_yn_items(task, dialect):
    """Build one yes/no prompt per populated "Question i"/"Answer i" pair (i = 1..4)."""
    context = task.get(f"{dialect} (context)", "")
    for i in range(1, 5):
        question = task.get(f"Question {i}", "")
        expected = task.get(f"Answer {i}", "")
        if not question or not expected:
            continue
        user_prompt = (
            "Given a yes/no question, answer yes or no. Provide your final answer as 'Final Answer: yes' or 'Final Answer: no'.\n"
            f"Context: {context}\n"
            f"Question: {question}\n"
            "Final Answer:"
        )
        echoed = {f"{dialect} (context)": context, "Question": question}
        yield user_prompt, r"Final Answer:\s*(yes|no)", expected, echoed


def _sst2_items(row, dialect):
    """Build the SST-2 sentiment prompt for one row."""
    sentence = row.get(f"{dialect} (Original Sentence)", "")
    user_prompt = (
        "Given a sentence, determine its sentiment (1 for positive, 0 for negative). "
        "Provide your final answer as: Answer: 1 or Answer: 0.\n"
        f"Sentence: {sentence}\n"
        "Answer:"
    )
    echoed = {f"{dialect} (Original Sentence)": sentence}
    yield user_prompt, _DIGIT_ANSWER_PATTERN, row.get("Actual Label", ""), echoed


def _multirc_items(row, dialect):
    """Build the MultiRC answer-verification prompt for one row."""
    paragraph = row.get(f"{dialect} (Paragraph)", "")
    question = row.get("Question", "")
    answer_choice = row.get("Answer Choice", "")
    user_prompt = (
        "Given a paragraph, question, and an answer choice, determine if the answer choice is correct (1) or incorrect (0). "
        "Provide your final answer as: Answer: 1 or Answer: 0.\n"
        f"Paragraph: {paragraph}\n"
        f"Question: {question}\n"
        f"Answer Choice: {answer_choice}\n"
        "Answer:"
    )
    echoed = {f"{dialect} (Paragraph)": paragraph, "Question": question, "Answer Choice": answer_choice}
    yield user_prompt, _DIGIT_ANSWER_PATTERN, row.get("Actual Label", ""), echoed


def _copa_items(row, dialect):
    """Build the COPA plausibility prompt for one row."""
    premise = row.get(f"{dialect} (Premise)", "")
    choice1 = row.get("Choice 1", "")
    choice2 = row.get("Choice 2", "")
    user_prompt = (
        "Given a premise and two choices, pick which choice (0 or 1) is more plausible. "
        "Provide your final answer as: Answer: 0 or Answer: 1.\n"
        f"Premise: {premise}\n"
        f"Choice 1: {choice1}\n"
        f"Choice 2: {choice2}\n"
        "Answer:"
    )
    echoed = {f"{dialect} (Premise)": premise, "Choice 1": choice1, "Choice 2": choice2}
    yield user_prompt, _DIGIT_ANSWER_PATTERN, row.get("Actual Answer", ""), echoed


def _boolq_items(row, dialect):
    """Build the BoolQ TRUE/FALSE prompt for one row."""
    passage = row.get(f"{dialect} (SAE Passage)", "")
    question = row.get("SAE Question", "")
    user_prompt = (
        "Given a passage and a yes/no question, label it as TRUE or FALSE. "
        "Provide your final answer in 'Answer: TRUE' or 'Answer: FALSE'.\n"
        f"Passage: {passage}\n"
        f"Question: {question}\n"
        "Answer:"
    )
    echoed = {f"{dialect} (SAE Passage)": passage, "SAE Question": question}
    yield user_prompt, r"Answer:\s*(TRUE|FALSE)", row.get("Actual Label", ""), echoed


# Datasets scored by extracting a short answer and comparing it with the expected one.
_ANSWER_ITEM_BUILDERS = {
    "SVAMP": _svamp_items,
    "WSC": _wsc_items,
    "GSM8K": _gsm8k_items,
    "FOLIO": _folio_items,
    "LogicBenchYN": _logicbench_yn_items,
    "SST-2": _sst2_items,
    "MultiRC": _multirc_items,
    "COPA": _copa_items,
    "BoolQ": _boolq_items,
}

# Code-generation datasets: (suffix of the dialect problem column, instruction sentence).
_CODE_DATASETS = {
    "MBPP": ("Original", "Provide your entire code. Start it with 'Answer:' on its own line.\n"),
    "HumanEVAL": ("Prompt", "Provide your entire code in 'Answer:'.\n"),
}


def _score_answer_rows(rows, model_name, dialect, build_items):
    """Yield (result_row, is_correct) for every prompt built from *rows*."""
    for row in rows:
        for user_prompt, pattern, expected, echoed in build_items(row, dialect):
            # Always compare strings: labels read from data files may be bools or numbers.
            expected = str(expected)
            full_response = prompt_together_model(model_name, _SYSTEM_PROMPT, user_prompt)
            model_answer = extract_response(full_response, pattern) or "N/A"
            is_correct = evaluate_response(model_answer, expected)
            result = dict(echoed)
            result["Expected Answer"] = expected
            result["Model Answer"] = model_answer
            result["Correct"] = is_correct
            yield result, is_correct


def _score_code_rows(rows, model_name, dialect, column_suffix, instruction):
    """Yield (result_row, passed) for every code-generation problem in *rows*."""
    problem_key = f"{dialect} ({column_suffix})"
    for row in rows:
        problem = row.get(problem_key, "")
        test_cases = row.get("Test_Cases", "")
        user_prompt = (
            "Given a coding problem, produce a Python function that solves the problem. "
            f"{instruction}"
            f"Problem: {problem}\n"
            f"Test Cases: {test_cases}\n"
            "Answer:"
        )
        full_response = prompt_together_model(model_name, _SYSTEM_PROMPT, user_prompt)
        code = extract_response(clean_code(full_response), r"Answer:\s*(.+)")
        passed, err_msg = run_test_cases(code, test_cases)
        result = {
            problem_key: problem,
            "Code": code,
            "Correct": int(passed),
            "Error Message": err_msg,
        }
        yield result, passed


def _parse_choice_number(ground_truth: str) -> str:
    """Return the digits of "choice_<n>", else the first digit run, else ""."""
    match = re.search(r"choice_(\d+)", ground_truth, re.IGNORECASE) or re.search(r"(\d+)", ground_truth)
    return match.group(1) if match else ""


def _score_mcq_tasks(tasks, model_name, dialect):
    """Yield (result_row, is_correct) for LogicBenchMCQ tasks.

    Tasks with a missing context, choice or answer are yielded with
    ``is_correct=None`` so they are written to the results but not scored.
    """
    for task in tasks:
        context = task.get(f"{dialect} (context)", "")
        choices = [task.get(f"Choice {i}", "") for i in range(1, 5)]
        ground_truth_raw = task.get("Answer", "")
        result = {
            "Dialect": dialect,
            "Context": context,
            "Choice 1": choices[0],
            "Choice 2": choices[1],
            "Choice 3": choices[2],
            "Choice 4": choices[3],
        }
        if not context or not all(choices) or not ground_truth_raw:
            result["Expected Answer"] = "N/A (incomplete data)"
            result["Model Answer"] = "N/A (incomplete data)"
            result["Correct"] = False
            yield result, None
            continue
        gt_digit = _parse_choice_number(str(ground_truth_raw))
        user_prompt = (
            "Given a context and four choices, pick the correct choice number (1, 2, 3, or 4). "
            "Provide your final answer as: Final Answer: X\n\n"
            f"Context:\n{context}\n\n"
            "Choices:\n"
            f"1) {choices[0]}\n"
            f"2) {choices[1]}\n"
            f"3) {choices[2]}\n"
            f"4) {choices[3]}\n"
            "Final Answer:"
        )
        response_text = prompt_together_model(model_name, _SYSTEM_PROMPT, user_prompt)
        predicted_digit = extract_response(response_text, r"Final Answer:\s*(\d+)") or "N/A"
        # An unparsable ground truth can never count as a correct prediction.
        is_correct = bool(gt_digit) and predicted_digit == gt_digit
        result["Expected Answer"] = ground_truth_raw
        result["Model Answer"] = predicted_digit
        result["Correct"] = is_correct
        yield result, is_correct


def process_dataset(model_name: str, dataset_name: str, file_path: str, dialect: str):
    """Evaluate one model on one dataset/dialect pair and persist the results.

    Each result row is appended to ``<dataset_name>_results.csv`` as soon as it
    is available; a summary is written to ``<dataset_name>_accuracy.txt`` at the
    end. Both files live under
    ``model_output_mapping[model_name]/<dialect>/<dataset_name>/``.

    Args:
        model_name: Model identifier; must be a key of ``model_output_mapping``.
        dataset_name: Dataset key selecting the prompt and scoring logic. An
            unknown name prints a message and returns without evaluating.
        file_path: Path of the dataset file (JSON for the LogicBench datasets,
            CSV otherwise).
        dialect: Dialect code used as the prefix of the dialect-specific column.
    """
    if dataset_name in _JSON_DATASETS:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        records = data
    else:
        data = pd.read_csv(file_path)
        data = data.drop(columns=[col for col in data.columns if "BLEU Score" in col], errors="ignore")
        records = (row for _, row in data.iterrows())

    model_output_dir = os.path.join(model_output_mapping[model_name], dialect, dataset_name)
    os.makedirs(model_output_dir, exist_ok=True)
    csv_file = os.path.join(model_output_dir, f"{dataset_name}_results.csv")

    if dataset_name in _ANSWER_ITEM_BUILDERS:
        build_items = _ANSWER_ITEM_BUILDERS[dataset_name]

        def score(rows):
            return _score_answer_rows(rows, model_name, dialect, build_items)
    elif dataset_name in _CODE_DATASETS:
        column_suffix, instruction = _CODE_DATASETS[dataset_name]

        def score(rows):
            return _score_code_rows(rows, model_name, dialect, column_suffix, instruction)
    elif dataset_name == "LogicBenchMCQ":
        def score(rows):
            return _score_mcq_tasks(rows, model_name, dialect)
    else:
        print(f"Dataset {dataset_name} not recognized for processing.")
        return

    correct_count = 0
    total = 0
    progress_label = f"{model_name} | {dataset_name} | {dialect}"
    with tqdm(records, total=len(data), desc=progress_label, unit="row") as pbar:
        for row_dict, is_correct in score(pbar):
            write_row_to_csv(row_dict, csv_file)
            if is_correct is None:
                # Incomplete record: logged in the CSV but excluded from accuracy.
                continue
            total += 1
            if is_correct:
                correct_count += 1
            if total % 2 == 0:
                acc_percentage = (correct_count / total) * 100
                pbar.set_postfix({"Acc": f"{acc_percentage:.2f}%"})

    accuracy = (correct_count / total * 100) if total > 0 else 0
    with open(os.path.join(model_output_dir, f"{dataset_name}_accuracy.txt"), "w", encoding="utf-8") as f:
        f.write(f"Total instances: {total}\n")
        f.write(f"Correct answers: {correct_count}\n")
        f.write(f"Accuracy: {accuracy:.2f}%\n")

def main():
    """Evaluate every configured model on each dialect/dataset combination.

    Combinations whose input file does not exist are reported and skipped.
    """
    for model_id in models:
        for dialect_code in dialects:
            for name, relative_path in datasets.items():
                source_path = os.path.join(input_base_dir, dialect_code, relative_path)
                if not os.path.exists(source_path):
                    print(f"⚠️ Missing file: {source_path}")
                    continue
                process_dataset(model_id, name, source_path, dialect_code)
    print("✅ Evaluation complete.")

if __name__ == "__main__":
    main()



In [None]:
## COT EVALS

from google.colab import drive
drive.mount('/content/drive')

!pip install together pandas numpy regex tqdm

import pandas as pd
import numpy as np
import json
import os
import re
import csv
import time
from tqdm import tqdm

from together import Together
from together.error import RateLimitError, APIError

# Together API credentials and the shared client used by prompt_together_model.
# NOTE(review): "API-KEY" is presumably a placeholder to be replaced before running — confirm.
TOGETHER_API_KEY = "API-KEY"
client = Together(api_key=TOGETHER_API_KEY)

# Sampling temperature passed to every chat-completion request.
TEMPERATURE = 0.7

# Root folder of the input datasets.
input_base_dir = "/content/drive/MyDrive/!!Multi-AAVENUE/BLEU Score Filtered Datasets/GPT 4o"
# Root folder for chain-of-thought evaluation results and the folder for this model run.
output_base_dir = "/content/drive/MyDrive/!!Multi-AAVENUE/Evaluation Results/COT Evals"
cot_output_dir = os.path.join(output_base_dir, "LLaMa_3_8b_Instruct_Test_CoT")

# Dialect codes; each one is used as a column-name prefix in the dataset files.
dialects = ["IndE", "JamE", "AAVE", "CollSgE", "ChcE"]

# Dataset name -> relative path of its data file.
datasets = {
    "SVAMP": "SVAMP(700)/SVAMP(700)_filtered_bleu_scores.csv",
    "MBPP": "MBPP(374)/MBPP(374)_filtered_bleu_scores.csv",
    "LogicBenchYN": "Logic Bench YN(500)/Logic Bench YN(500)_filtered_bleu_scores.json",
    "LogicBenchMCQ": "Logic Bench MCQ(480)/Logic Bench MCQ(480)_filtered_bleu_scores.json",
    "HumanEVAL": "HumanEVAL(164)/HumanEVAL(164)_filtered_bleu_scores.csv",
    "GSM8K": "GSM8K(1000)/GSM8K(1000)_filtered_bleu_scores.csv",
    "FOLIO": "FOLIO(1000)/FOLIO(1000)_filtered_bleu_scores.csv",
    "WSC": "GLUE + SuperGLUE/WSC (659)/WSC (659)_filtered_bleu_scores.csv",
    "SST-2": "GLUE + SuperGLUE/SST-2 (1000)/SST-2 (1000)_filtered_bleu_scores.csv",
    "MultiRC": "GLUE + SuperGLUE/MultiRC (1000)/MultiRC (1000)_filtered_bleu_scores.csv",
    "COPA": "GLUE + SuperGLUE/COPA (500)/COPA (500)_filtered_bleu_scores.csv",
    "BoolQ": "GLUE + SuperGLUE/BoolQ (1000)/BoolQ (1000)_filtered_bleu_scores.csv"
}

# Model identifiers to evaluate; each must also be a key of model_output_mapping.
models = ["meta-llama/Meta-Llama-3-8B-Instruct-Turbo"]

def write_row_to_csv(row: dict, filename: str):
    """Append *row* to the CSV at *filename* as a single record.

    The column header is emitted only when the append-mode handle reports
    offset 0, i.e. when nothing has been written to the file yet.
    """
    with open(filename, 'a', newline='', encoding="utf-8") as handle:
        needs_header = handle.tell() == 0
        pd.DataFrame([row]).to_csv(handle, index=False, header=needs_header)
        # Flush right away so each finished row reaches the file promptly.
        handle.flush()

def prompt_together_model(model_name: str, system_message: str, user_message: str, retries=5, backoff_factor=2):
    """Send one system + user exchange to the Together chat API and return the reply text.

    Args:
        model_name: Model identifier passed to the API.
        system_message: Content of the "system" message.
        user_message: Content of the "user" message.
        retries: Maximum number of attempts.
        backoff_factor: Base of the exponential wait (``backoff_factor ** attempt``
            seconds) between attempts.

    Returns:
        The content of the first choice's message ("" if the API returned no content).

    Raises:
        Exception: "Maximum retries exceeded." once every attempt has failed with a
            RateLimitError or APIError; the last API error is chained as ``__cause__``.
    """
    last_error = None
    for attempt in range(retries):
        try:
            response = client.chat.completions.create(
                model=model_name,
                messages=[
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": user_message}
                ],
                temperature=TEMPERATURE,
                max_tokens=2048
            )
            # Guard so the regex helpers downstream always receive a str.
            return response.choices[0].message.content or ""
        except (RateLimitError, APIError) as e:
            last_error = e
            if attempt == retries - 1:
                # No attempt follows, so sleeping here would only delay the failure.
                print(f"API error: {e}. No retries left.")
                break
            wait_time = backoff_factor ** attempt
            print(f"API error: {e}. Retrying in {wait_time} seconds...")
            time.sleep(wait_time)
    # Chain the last API error so the root cause stays visible in the traceback.
    raise Exception("Maximum retries exceeded.") from last_error

def clean_code(generated_code: str) -> str:
    """Strip Markdown code fences from a model reply and trim surrounding whitespace.

    Every triple-backtick fence is removed. A Python language tag directly after
    the fence (``python``, ``python3``, ``py``, ``py3``, any letter case) is
    removed with it, so the tag cannot leak into the code that is later executed.

    Args:
        generated_code: Raw text returned by the model.

    Returns:
        The text without fences, stripped of leading/trailing whitespace.
    """
    # \b keeps the tag match from eating the start of a longer word.
    fence = r"```(?:(?:python3?|py3?)\b)?"
    return re.sub(fence, "", generated_code, flags=re.IGNORECASE).strip()

def extract_response(response: str, pattern: str) -> str:
    """Return the stripped first capture group of *pattern* found in *response*.

    The search is case-insensitive and lets ``.`` span newlines. An empty
    string is returned when the pattern does not occur.
    """
    found = re.compile(pattern, re.IGNORECASE | re.DOTALL).search(response)
    if found is None:
        return ""
    return found.group(1).strip()

def evaluate_response(model_answer: str, expected_answer: str) -> bool:
    """Decide whether the model's answer matches the expected answer.

    Both values are converted to ``str``, stripped and lower-cased. They match
    when the resulting strings are equal, or when both parse as numbers with the
    same value (so "18", "18." and "18.0" are treated as the same answer).

    Args:
        model_answer: Answer extracted from the model reply.
        expected_answer: Reference answer; non-string values (e.g. a bool or
            number read from a data file) are accepted and stringified.

    Returns:
        True if the answers match, False otherwise.
    """
    predicted = str(model_answer).strip().lower()
    target = str(expected_answer).strip().lower()
    if predicted == target:
        return True
    # Fall back to numeric comparison; non-numeric text raises ValueError.
    try:
        return float(predicted) == float(target)
    except ValueError:
        return False

def run_test_cases(generated_code: str, test_cases: str):
    """Execute generated code followed by its test cases in a fresh namespace.

    SECURITY NOTE: this runs model-generated code with ``exec`` inside the current
    process, with no sandbox and no timeout. Only use it in a disposable
    environment.

    Args:
        generated_code: Source code produced by the model.
        test_cases: Source code of the tests (typically ``assert`` statements),
            executed in the same namespace as the generated code.

    Returns:
        ``(True, "")`` when both snippets run without raising, otherwise
        ``(False, message)`` where the message is prefixed with
        "AssertionError", "SyntaxError", "SystemExit" or "RuntimeError".
    """
    try:
        exec_globals = {}
        exec(generated_code, exec_globals)
        exec(test_cases, exec_globals)
    except AssertionError as e:
        return False, f"AssertionError: {str(e)}"
    except SyntaxError as e:
        return False, f"SyntaxError: {str(e)}"
    except SystemExit as e:
        # exit()/sys.exit() in generated code is not an Exception subclass and
        # would otherwise terminate the whole evaluation run.
        return False, f"SystemExit: {str(e)}"
    except Exception as e:
        return False, f"RuntimeError: {str(e)}"
    return True, ""

# Model identifier -> root folder under which process_dataset creates
# "<dialect>/<dataset>/" result folders.
# NOTE(review): cot_output_dir already ends with "LLaMa_3_8b_Instruct_Test_CoT",
# so this join nests a second folder of the same name — confirm that is intended.
model_output_mapping = {
    "meta-llama/Meta-Llama-3-8B-Instruct-Turbo": os.path.join(cot_output_dir, "LLaMa_3_8b_Instruct_Test_CoT")
}

# Datasets stored as a JSON list of task dicts; every other dataset is a CSV.
_JSON_DATASETS = ("LogicBenchYN", "LogicBenchMCQ")
_SYSTEM_PROMPT = "You are a helpful assistant."
# Recorded as the model answer when no final answer can be parsed.
_NO_ANSWER = "N/A"


# --- Prompt builders for answer-matching datasets ---------------------------
# Each builder returns a 4-tuple:
#   (inputs echoed into the results row, expected answer as text,
#    user prompt, regex whose first group captures the model's final answer)

def _svamp_task(row, dialect: str):
    """Build the SVAMP task for one CSV row."""
    problem = row.get(f"{dialect} (Original)", "")
    question = row.get("Question", "")
    user_prompt = (
        "Let's think about this step by step to solve the math problem. "
        "Then provide:\nFinal Numeric Answer: <the final integer or decimal>\n\n"
        f"Context: {problem}\n"
        f"Question: {question}\n"
        "No extra text after the numeric answer."
    )
    inputs = {f"{dialect} (Original)": problem, "Question": question}
    return inputs, str(row.get("Answer", "")), user_prompt, r"Final Numeric Answer:\s*([0-9.\-]+)"


def _wsc_task(row, dialect: str):
    """Build the WSC task for one CSV row."""
    paragraph = row.get(f"{dialect} (Original Paragraph)", "")
    span1 = row.get("Span 1", "")
    span2 = row.get("Span 2", "")
    user_prompt = (
        "Let's analyze whether Span 2 refers to Span 1 step by step. "
        "Finally, provide 'Final Answer: 1' if yes, 'Final Answer: 0' if no.\n"
        f"Paragraph: {paragraph}\n"
        f"Span 1: {span1}\n"
        f"Span 2: {span2}\n"
    )
    inputs = {
        f"{dialect} (Original Paragraph)": paragraph,
        "Span 1": span1,
        "Span 2": span2,
    }
    return inputs, str(row.get("Actual Label", "")), user_prompt, r"Final Answer:\s*(\d)"


def _gsm8k_task(row, dialect: str):
    """Build the GSM8K task for one CSV row."""
    problem = row.get(f"{dialect} (Original)", "")
    user_prompt = (
        "Let's think about the math problem step by step. "
        "Finally, provide:\nFinal Numeric Answer: <the integer>\n\n"
        f"Problem: {problem}\n"
    )
    inputs = {f"{dialect} (Original)": problem}
    return inputs, str(row.get("Answer", "")), user_prompt, r"Final Numeric Answer:\s*([0-9.\-]+)"


def _folio_task(row, dialect: str):
    """Build the FOLIO task for one CSV row."""
    premises = row.get(f"{dialect} (Premises)", "")
    conclusion = row.get("Conclusion", "")
    user_prompt = (
        "Let's evaluate whether the conclusion follows from the premises step by step. "
        "Finally, provide:\nFinal Answer: True or False\n\n"
        f"Premises: {premises}\n"
        f"Conclusion: {conclusion}\n"
    )
    inputs = {f"{dialect} (Premises)": premises, "Conclusion": conclusion}
    # str(): the label may not be a string (e.g. parsed as a boolean).
    return inputs, str(row.get("Label", "")), user_prompt, r"Final Answer:\s*(True|False)"


def _sst2_task(row, dialect: str):
    """Build the SST-2 task for one CSV row."""
    sentence = row.get(f"{dialect} (Original Sentence)", "")
    user_prompt = (
        "Let's analyze the sentiment of the sentence step by step. "
        "Finally, provide:\nAnswer: 1 for positive, 0 for negative\n\n"
        f"Sentence: {sentence}\n"
    )
    inputs = {f"{dialect} (Original Sentence)": sentence}
    return inputs, str(row.get("Actual Label", "")), user_prompt, r"Answer:\s*(\d)"


def _multirc_task(row, dialect: str):
    """Build the MultiRC task for one CSV row."""
    paragraph = row.get(f"{dialect} (Paragraph)", "")
    question = row.get("Question", "")
    answer_choice = row.get("Answer Choice", "")
    user_prompt = (
        "Let's analyze the paragraph and question step by step. "
        "Finally, provide:\nAnswer: 1 if correct, 0 if incorrect\n\n"
        f"Paragraph: {paragraph}\n"
        f"Question: {question}\n"
        f"Answer Choice: {answer_choice}\n"
    )
    inputs = {
        f"{dialect} (Paragraph)": paragraph,
        "Question": question,
        "Answer Choice": answer_choice,
    }
    return inputs, str(row.get("Actual Label", "")), user_prompt, r"Answer:\s*(\d)"


def _copa_task(row, dialect: str):
    """Build the COPA task for one CSV row."""
    premise = row.get(f"{dialect} (Premise)", "")
    choice1 = row.get("Choice 1", "")
    choice2 = row.get("Choice 2", "")
    user_prompt = (
        "Let's think step by step which choice is more plausible. "
        "Finally, provide: Answer: 0 if the first is correct or 1 if the second is correct.\n"
        f"Premise: {premise}\n"
        f"Choice 1: {choice1}\n"
        f"Choice 2: {choice2}\n"
    )
    inputs = {
        f"{dialect} (Premise)": premise,
        "Choice 1": choice1,
        "Choice 2": choice2,
    }
    return inputs, str(row.get("Actual Answer", "")), user_prompt, r"Answer:\s*(\d)"


def _boolq_task(row, dialect: str):
    """Build the BoolQ task for one CSV row."""
    passage = row.get(f"{dialect} (SAE Passage)", "")
    question = row.get("SAE Question", "")
    user_prompt = (
        "Let's consider the passage and question carefully step by step. "
        "Finally, provide: Answer: TRUE or Answer: FALSE\n"
        f"Passage: {passage}\n"
        f"Question: {question}\n"
    )
    inputs = {f"{dialect} (SAE Passage)": passage, "SAE Question": question}
    return inputs, str(row.get("Actual Label", "")), user_prompt, r"Answer:\s*(TRUE|FALSE)"


def _logicbench_yn_tasks(task: dict, dialect: str) -> list:
    """Build up to four yes/no tasks from one LogicBenchYN entry.

    Question/answer pairs with a missing question or answer are skipped.
    """
    context = task.get(f"{dialect} (context)", "")
    built = []
    for i in range(1, 5):
        question = task.get(f"Question {i}", "")
        expected = task.get(f"Answer {i}", "")
        if not question or not expected:
            continue
        user_prompt = (
            "Let's think step by step about the yes/no question. "
            "Finally, provide:\nFinal Answer: yes or no\n\n"
            f"Context: {context}\n"
            f"Question: {question}\n"
        )
        inputs = {f"{dialect} (context)": context, "Question": question}
        built.append((inputs, str(expected), user_prompt, r"Final Answer:\s*(yes|no)"))
    return built


# --- Prompt builders for code-generation datasets ---------------------------
# Each builder returns (problem column name, problem text, test cases, user prompt).

def _mbpp_task(row, dialect: str):
    """Build the MBPP task for one CSV row."""
    column = f"{dialect} (Original)"
    problem = row.get(column, "")
    test_cases = row.get("Test_Cases", "")
    user_prompt = (
        "Let's think step by step about how to solve this coding task. "
        "Then write the final Python function. "
        "Output only Python code starting with 'Answer:'\n"
        f"Problem: {problem}\n"
        f"Test Cases: {test_cases}\n"
    )
    return column, problem, test_cases, user_prompt


def _humaneval_task(row, dialect: str):
    """Build the HumanEVAL task for one CSV row."""
    column = f"{dialect} (Prompt)"
    prompt_text = row.get(column, "")
    test_cases = row.get("Test_Cases", "")
    user_prompt = (
        "Let's break down the coding problem step by step, then provide the final Python code. "
        "Code should start with 'Answer:'.\n"
        f"Problem: {prompt_text}\n"
        f"Test Cases: {test_cases}\n"
    )
    return column, prompt_text, test_cases, user_prompt


_QA_TASK_BUILDERS = {
    "SVAMP": _svamp_task,
    "WSC": _wsc_task,
    "GSM8K": _gsm8k_task,
    "FOLIO": _folio_task,
    "SST-2": _sst2_task,
    "MultiRC": _multirc_task,
    "COPA": _copa_task,
    "BoolQ": _boolq_task,
}

_CODE_TASK_BUILDERS = {
    "MBPP": _mbpp_task,
    "HumanEVAL": _humaneval_task,
}


def _show_accuracy(pbar, correct: int, total: int) -> None:
    """Display the running accuracy on the progress bar (requires total > 0)."""
    pbar.set_postfix({"Acc": f"{(correct / total) * 100:.2f}%"})


def _run_qa_dataset(model_name: str, task_groups, total_rows: int, desc: str, csv_file: str):
    """Query the model for every answer-matching task and log one CSV row each.

    ``task_groups`` yields, per source row, a list of task tuples as produced
    by the answer-matching prompt builders. Returns ``(correct, total)``.
    """
    correct = 0
    total = 0
    pbar = tqdm(task_groups, total=total_rows, desc=desc, unit="row")
    for tasks in pbar:
        for inputs, expected, user_prompt, pattern in tasks:
            full_response = prompt_together_model(model_name, _SYSTEM_PROMPT, user_prompt)
            # An unparseable response is recorded as "N/A", never as an empty
            # string, so it cannot compare equal to an empty expected label.
            model_answer = extract_response(full_response, pattern) or _NO_ANSWER
            is_correct = evaluate_response(model_answer, expected)
            total += 1
            if is_correct:
                correct += 1
            row_dict = dict(inputs)
            row_dict.update({
                "Expected Answer": expected,
                "COT Response": full_response,
                "Model Answer": model_answer,
                "Correct": is_correct,
            })
            write_row_to_csv(row_dict, csv_file)
            _show_accuracy(pbar, correct, total)
    pbar.close()
    return correct, total


def _run_code_dataset(model_name: str, data, dialect: str, build_task, desc: str, csv_file: str):
    """Generate code for every row, run its test cases and log one CSV row each.

    Returns ``(correct, total)`` where ``correct`` counts rows whose generated
    code passed the row's test cases.
    """
    correct = 0
    total = 0
    pbar = tqdm(data.iterrows(), total=len(data), desc=desc, unit="row")
    for _, row in pbar:
        column, problem, test_cases, user_prompt = build_task(row, dialect)
        response = prompt_together_model(model_name, _SYSTEM_PROMPT, user_prompt)
        # Everything after the "Answer:" marker is treated as the solution.
        code = extract_response(clean_code(response), r"Answer:\s*(.+)")
        passed, err_msg = run_test_cases(code, test_cases)
        total += 1
        if passed:
            correct += 1
        write_row_to_csv(
            {
                column: problem,
                "Code": code,
                "COT Response": response,
                "Correct": int(passed),
                "Error Message": err_msg,
            },
            csv_file,
        )
        _show_accuracy(pbar, correct, total)
    pbar.close()
    return correct, total


def _parse_choice_number(ground_truth) -> str:
    """Return the choice number in a ground-truth label such as ``choice_2``.

    Falls back to the first run of digits; returns "" when there is none.
    """
    text = str(ground_truth)
    labelled = re.search(r"choice_(\d+)", text, re.IGNORECASE)
    if labelled:
        return labelled.group(1)
    any_digits = re.search(r"\d+", text)
    if any_digits:
        return any_digits.group(0)
    return ""


def _run_logicbench_mcq(model_name: str, data, dialect: str, desc: str, csv_file: str):
    """Evaluate LogicBenchMCQ entries and log one CSV row per entry.

    Entries missing the context, any of the four choices, or the answer are
    logged as incomplete and excluded from the accuracy counts.
    Returns ``(correct, total)``.
    """
    correct = 0
    total = 0
    pbar = tqdm(data, total=len(data), desc=desc, unit="row")
    for task in pbar:
        context = task.get(f"{dialect} (context)", "")
        choices = [task.get(f"Choice {i}", "") for i in range(1, 5)]
        ground_truth_raw = task.get("Answer", "")
        row_dict = {
            "Dialect": dialect,
            "Context": context,
            "Choice 1": choices[0],
            "Choice 2": choices[1],
            "Choice 3": choices[2],
            "Choice 4": choices[3],
        }
        if not context or not all(choices) or not ground_truth_raw:
            row_dict.update({
                "Expected Answer": "N/A (incomplete data)",
                "COT Response": "N/A",
                "Model Answer": "N/A",
                "Correct": False,
            })
            write_row_to_csv(row_dict, csv_file)
            continue
        gt_digit = _parse_choice_number(ground_truth_raw)
        user_prompt = (
            "Let's analyze this context and each choice step by step. "
            "Finally, provide: 'Final Answer: X' with no extra text.\n"
            f"Context:\n{context}\n\n"
            "Choices:\n"
            f"1) {choices[0]}\n"
            f"2) {choices[1]}\n"
            f"3) {choices[2]}\n"
            f"4) {choices[3]}\n"
        )
        response_text = prompt_together_model(model_name, _SYSTEM_PROMPT, user_prompt)
        predicted_digit = extract_response(response_text, r"Final Answer:\s*(\d+)") or _NO_ANSWER
        # A label without any digit can never be counted as answered correctly.
        is_correct = bool(gt_digit) and predicted_digit == gt_digit
        total += 1
        if is_correct:
            correct += 1
        row_dict.update({
            "Expected Answer": ground_truth_raw,
            "COT Response": response_text,
            "Model Answer": predicted_digit,
            "Correct": is_correct,
        })
        write_row_to_csv(row_dict, csv_file)
        _show_accuracy(pbar, correct, total)
    pbar.close()
    return correct, total


def process_dataset(model_name: str, dataset_name: str, file_path: str, dialect: str):
    """Evaluate one model on one dataset/dialect pair with chain-of-thought prompts.

    Loads the dataset (JSON for the LogicBench sets, CSV otherwise, with any
    "BLEU Score" columns dropped), queries the model once per task, and appends
    one row per task to
    ``<model root>/<dialect>/<dataset>/<dataset>_results.csv``. A summary with
    the totals and accuracy is written to ``<dataset>_accuracy.txt`` in the
    same directory.

    Args:
        model_name: Together model identifier; must be a key of
            ``model_output_mapping``.
        dataset_name: One of the supported dataset names. An unsupported name
            prints a message and returns without doing anything.
        file_path: Path of the dataset file.
        dialect: Dialect code used to select the dialect-specific columns/keys.
    """
    recognized = (
        dataset_name in _JSON_DATASETS
        or dataset_name in _QA_TASK_BUILDERS
        or dataset_name in _CODE_TASK_BUILDERS
    )
    if not recognized:
        print(f"Dataset {dataset_name} not recognized for processing.")
        return

    if dataset_name in _JSON_DATASETS:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    else:
        data = pd.read_csv(file_path)
        data = data.drop(columns=[col for col in data.columns if "BLEU Score" in col], errors="ignore")

    model_output_dir = os.path.join(model_output_mapping[model_name], dialect, dataset_name)
    os.makedirs(model_output_dir, exist_ok=True)
    csv_file = os.path.join(model_output_dir, f"{dataset_name}_results.csv")

    desc = f"{model_name} | {dataset_name} | {dialect}"
    total_rows = len(data)

    if dataset_name == "LogicBenchMCQ":
        correct_count, total = _run_logicbench_mcq(model_name, data, dialect, desc, csv_file)
    elif dataset_name == "LogicBenchYN":
        task_groups = (_logicbench_yn_tasks(task, dialect) for task in data)
        correct_count, total = _run_qa_dataset(model_name, task_groups, total_rows, desc, csv_file)
    elif dataset_name in _CODE_TASK_BUILDERS:
        build_task = _CODE_TASK_BUILDERS[dataset_name]
        correct_count, total = _run_code_dataset(model_name, data, dialect, build_task, desc, csv_file)
    else:
        build_task = _QA_TASK_BUILDERS[dataset_name]
        task_groups = ([build_task(row, dialect)] for _, row in data.iterrows())
        correct_count, total = _run_qa_dataset(model_name, task_groups, total_rows, desc, csv_file)

    accuracy = (correct_count / total * 100) if total > 0 else 0
    with open(os.path.join(model_output_dir, f"{dataset_name}_accuracy.txt"), "w", encoding="utf-8") as f:
        f.write(f"Total instances: {total}\n")
        f.write(f"Correct answers: {correct_count}\n")
        f.write(f"Accuracy: {accuracy:.2f}%\n")

def main():
    """Evaluate every configured model on every dialect/dataset combination.

    Combinations whose input file does not exist are reported and skipped.
    """
    for model_id in models:
        for dialect_code in dialects:
            for name in datasets:
                source_path = os.path.join(input_base_dir, dialect_code, datasets[name])
                if not os.path.exists(source_path):
                    print(f"⚠️ Missing file: {source_path}")
                    continue
                process_dataset(model_id, name, source_path, dialect_code)
    print("✅ Evaluation complete.")

if __name__ == "__main__":
    main()