In [None]:
## NON-COT EVALS

# Mount Google Drive; the input and output directories configured further down
# live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Notebook-only shell escape that installs the third-party dependencies.
!pip install openai pandas numpy regex tqdm

from openai import OpenAI
import pandas as pd
import numpy as np
import json
import os
import re
import csv
import time
import signal
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, TimeoutError

# Prefer the DEEPSEEK_API_KEY environment variable so a real secret never has to
# be pasted into (and saved with) the notebook; the literal placeholder is kept
# as the fallback so the previous edit-in-place workflow still works.
DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY", "API-KEY")
client = OpenAI(api_key=DEEPSEEK_API_KEY, base_url="https://api.deepseek.com/")
# Sampling temperature sent with every chat-completion request.
TEMPERATURE = 0.7

# Root folder of the input files; the driver loop joins it with a dialect name
# and one of the relative paths from `datasets`.
input_base_dir = "/content/drive/MyDrive/!!Multi-AAVENUE/BLEU Score Filtered Datasets/GPT 4o"
# Root folder under which per-model result directories are created.
output_base_dir = "/content/drive/MyDrive/!!Multi-AAVENUE/Evaluation Results"
# Dialect identifiers; each one is used as a folder name and as the prefix of
# the dialect-specific column/key read from every dataset.
dialects = ["IndE", "AAVE", "JamE", "ChcE", "CollSgE"]

# Dataset name -> path of its data file relative to `<input_base_dir>/<dialect>/`.
# The two LogicBench entries are JSON files; every other entry is a CSV file.
datasets = {
    "SVAMP": "SVAMP(700)/SVAMP(700)_filtered_bleu_scores.csv",
    "MBPP": "MBPP(374)/MBPP(374)_filtered_bleu_scores.csv",
    "LogicBenchYN": "Logic Bench YN(500)/Logic Bench YN(500)_filtered_bleu_scores.json",
    "LogicBenchMCQ": "Logic Bench MCQ(480)/Logic Bench MCQ(480)_filtered_bleu_scores.json",
    "HumanEVAL": "HumanEVAL(164)/HumanEVAL(164)_filtered_bleu_scores.csv",
    "GSM8K": "GSM8K(1000)/GSM8K(1000)_filtered_bleu_scores.csv",
    "FOLIO": "FOLIO(1000)/FOLIO(1000)_filtered_bleu_scores.csv",
    "WSC": "GLUE + SuperGLUE/WSC (659)/WSC (659)_filtered_bleu_scores.csv",
    "SST-2": "GLUE + SuperGLUE/SST-2 (1000)/SST-2 (1000)_filtered_bleu_scores.csv",
    "MultiRC": "GLUE + SuperGLUE/MultiRC (1000)/MultiRC (1000)_filtered_bleu_scores.csv",
    "COPA": "GLUE + SuperGLUE/COPA (500)/COPA (500)_filtered_bleu_scores.csv",
    "BoolQ": "GLUE + SuperGLUE/BoolQ (1000)/BoolQ (1000)_filtered_bleu_scores.csv"
}

# Model identifiers to evaluate; each is passed as the `model` field of the API request.
models = ["deepseek-chat"]

def write_row_to_csv(row: dict, filename: str):
    """Append a single result row to ``filename``.

    The column header is emitted only when the file is still empty, so
    repeated calls build up one CSV with a single header line.

    Args:
        row: Mapping of column name to value for the row being appended.
        filename: Path of the CSV file; it is created if it does not exist.
    """
    with open(filename, "a", newline="", encoding="utf-8") as handle:
        # An append-mode stream is positioned at the end of the file, so an
        # offset of 0 means nothing has been written to it yet.
        file_is_empty = handle.tell() == 0
        single_row_frame = pd.DataFrame([row])
        single_row_frame.to_csv(handle, index=False, header=file_is_empty)
        # Push the row to disk immediately so partial results survive a crash.
        handle.flush()

def prompt_deepseek_model_with_timeout(model_name: str, system_message: str, user_message: str, timeout: int = 25, retries=5, backoff_factor=2) -> str:
    """Send one chat-completion request with a wall-clock timeout and retries.

    Args:
        model_name: Model identifier passed to the API.
        system_message: Content of the system message.
        user_message: Content of the user message.
        timeout: Seconds to wait for a single attempt before giving up.
        retries: Maximum number of attempts when the API raises an error.
        backoff_factor: Base of the exponential delay (``backoff_factor ** attempt``
            seconds) slept between failed attempts.

    Returns:
        The text of the first choice's message, or ``None`` when the request
        timed out or every attempt raised an error.
    """
    def prompt():
        return client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": user_message}
            ],
            temperature=TEMPERATURE,
            max_tokens=4096
        ).choices[0].message.content

    for attempt in range(retries):
        # The executor is managed by hand on purpose: leaving a
        # `with ThreadPoolExecutor(...)` block calls shutdown(wait=True), which
        # blocks until the still-running request returns and therefore defeats
        # the timeout. shutdown(wait=False) lets the caller move on while the
        # abandoned worker thread finishes in the background.
        executor = ThreadPoolExecutor(max_workers=1)
        try:
            future = executor.submit(prompt)
            return future.result(timeout=timeout)
        except TimeoutError:
            print(f"Timeout: Skipping this row after {timeout} seconds.")
            return None
        except Exception as e:
            if attempt == retries - 1:
                # No further attempt follows, so do not sleep for nothing.
                print(f"API error: {e}.")
                break
            wait_time = backoff_factor ** attempt
            print(f"API error: {e}. Retrying in {wait_time} seconds...")
            time.sleep(wait_time)
        finally:
            executor.shutdown(wait=False)
    print("Maximum retries exceeded. Skipping this row.")
    return None

def clean_code(generated_code: str) -> str:
    """Remove Markdown code-fence markers from model output.

    Args:
        generated_code: Raw text that may contain triple-backtick fences,
            optionally tagged with ``python``.

    Returns:
        The text with the fence markers removed and surrounding whitespace
        trimmed.
    """
    # First pass drops each fence together with an optional "python" tag ...
    without_tagged_fences = re.sub(r"```(?:python)?", "", generated_code)
    # ... second pass removes any plain triple-backtick run that is left.
    without_fences = without_tagged_fences.replace("```", "")
    return without_fences.strip()

def extract_response(response: str, pattern: str) -> str:
    """Return the first capture group of ``pattern`` found in ``response``.

    The search is case-insensitive and ``.`` also matches newlines.

    Args:
        response: Text to search; a falsy value (``None`` or ``""``) yields ``""``.
        pattern: Regular expression containing at least one capture group.

    Returns:
        The whitespace-trimmed text of group 1, or ``""`` when nothing matches.
    """
    found = re.search(pattern, response, re.DOTALL | re.IGNORECASE) if response else None
    if found is None:
        return ""
    return found.group(1).strip()

def evaluate_response(model_answer: str, expected_answer: str) -> bool:
    """Decide whether the model's answer matches the expected answer.

    Both strings are trimmed and lower-cased and compared for equality. When
    that fails and both parse as numbers, they are compared numerically so
    that equivalent spellings such as ``"18"`` and ``"18.0"`` (which arise
    when a numeric answer column is stringified from floats) count as equal.

    Args:
        model_answer: Answer extracted from the model response.
        expected_answer: Ground-truth answer.

    Returns:
        ``True`` when the answers match, ``False`` otherwise.
    """
    got = model_answer.strip().lower()
    want = expected_answer.strip().lower()
    if got == want:
        return True
    try:
        return float(got) == float(want)
    except ValueError:
        # At least one side is not numeric, so the textual mismatch stands.
        return False

class TimeoutException(Exception):
    """Signals that a timed operation ran past its deadline."""

def alarm_handler(signum, frame):
    """Signal handler that interrupts the running code with TimeoutException.

    Args:
        signum: Signal number supplied by the signal machinery (unused).
        frame: Interrupted stack frame supplied by the signal machinery (unused).

    Raises:
        TimeoutException: Always.
    """
    timeout_error = TimeoutException("Execution timed out!")
    raise timeout_error

def run_test_cases(generated_code: str, test_cases: str, timeout=5) -> tuple:
    """Execute generated code followed by its test cases under a time limit.

    Both snippets run with ``exec`` in one shared, initially empty namespace.
    A SIGALRM-based alarm aborts execution after ``timeout`` seconds.

    SECURITY NOTE: this executes model-generated code inside the current
    process with no sandboxing; only run it in a disposable environment.

    Args:
        generated_code: Source code produced by the model.
        test_cases: Source code of the assertions to run afterwards.
        timeout: Whole seconds allowed before the alarm fires.

    Returns:
        ``(True, "")`` when both snippets run without raising, otherwise
        ``(False, message)`` where ``message`` describes the failure.
    """
    exec_globals = {}
    previous_handler = None
    try:
        # signal.signal returns the handler that was installed before, which
        # is put back in the `finally` clause below.
        previous_handler = signal.signal(signal.SIGALRM, alarm_handler)
        signal.alarm(timeout)
        exec(generated_code, exec_globals)
        exec(test_cases, exec_globals)
        return True, ""
    except TimeoutException as e:
        return False, str(e)
    except AssertionError as e:
        return False, f"AssertionError: {str(e)}"
    except SyntaxError as e:
        return False, f"SyntaxError: {str(e)}"
    except SystemExit as e:
        # Generated code calling exit()/sys.exit() must fail this one test
        # rather than terminate the whole evaluation run.
        return False, f"RuntimeError: SystemExit({e.code})"
    except Exception as e:
        return False, f"RuntimeError: {str(e)}"
    finally:
        # Always cancel the pending alarm, whichever path was taken, and only
        # then restore the handler that was active before this call.
        signal.alarm(0)
        if previous_handler is not None:
            signal.signal(signal.SIGALRM, previous_handler)

def parse_gt_to_int(gt: str) -> str:
    """Extract the choice number from a ground-truth label as a digit string.

    A ``choice_<n>`` token (matched case-insensitively) takes precedence;
    otherwise the first run of digits anywhere in the text is used.

    Args:
        gt: Raw ground-truth label.

    Returns:
        The digits as a string, or ``""`` when the label contains none.
    """
    # (pattern, group to return, regex flags), tried in order of priority.
    attempts = (
        (r"choice_(\d+)", 1, re.IGNORECASE),
        (r"\d+", 0, 0),
    )
    for pattern, group_index, flags in attempts:
        found = re.search(pattern, gt, flags)
        if found:
            return found.group(group_index)
    return ""

def extract_integer_from_response(response_text: str) -> str:
    """Return the digits that follow a case-sensitive ``Answer:`` label.

    Args:
        response_text: Model response to search.

    Returns:
        The first run of digits after ``Answer:`` (optional whitespace in
        between), or ``""`` when the label or the digits are missing.
    """
    found = re.search(r"Answer:\s*(\d+)", response_text)
    if found is None:
        return ""
    return found.group(1).strip()

def check_mcq_correctness(model_digit: str, gt_digit: str) -> bool:
    """Tell whether the predicted choice equals the ground-truth choice.

    An empty prediction is never correct, even if the ground truth is empty too.

    Args:
        model_digit: Digit string predicted by the model (``""`` if none).
        gt_digit: Digit string of the correct choice.

    Returns:
        ``True`` only for a non-empty prediction equal to ``gt_digit``.
    """
    if not model_digit:
        return False
    return model_digit == gt_digit

# Per-model override of the results directory. process_dataset uses the entry
# for a model when present and otherwise derives a folder from the model name.
model_output_mapping = {
    "deepseek-chat": os.path.join(output_base_dir, "DeepSeek-V3_NonCoT")
}

# System message sent with every request.
_SYSTEM_PROMPT = "You are a helpful assistant."


def _extract_label(response, label, value):
    """Extract an answer matching ``value`` from a model response.

    The answer is first looked up directly after ``label`` anywhere in the
    text. The prompts end with that label, so a model often replies with the
    bare value only; when the labelled search finds nothing and the whole
    trimmed response is itself a valid ``value``, that bare reply is accepted.

    ``label`` and ``value`` are regex fragments; ``value`` must not contain
    capture groups. Returns ``""`` when no answer can be found.
    """
    answer = extract_response(response, f"{label}({value})")
    if answer:
        return answer
    bare = re.fullmatch(value, response.strip(), re.DOTALL | re.IGNORECASE)
    return bare.group(0) if bare else ""


def _label_grader(columns, expected, label, value):
    """Build a grader that extracts a labelled answer and compares it to ``expected``.

    The grader returns ``(result_row, is_correct)``; the row consists of
    ``columns`` followed by the expected answer, model answer and verdict.
    """
    def grade(response):
        model_answer = _extract_label(response, label, value)
        is_correct = evaluate_response(model_answer, expected)
        row = dict(columns)
        row["Expected Answer"] = expected
        row["Model Answer"] = model_answer
        row["Correct"] = is_correct
        return row, is_correct
    return grade


def _code_grader(columns, test_cases):
    """Build a grader that runs the generated code against ``test_cases``.

    The grader returns ``None`` (sample skipped, not counted) when no code
    follows an ``Answer:`` label, otherwise ``(result_row, is_correct)``.
    """
    def grade(response):
        code = extract_response(clean_code(response), r"Answer:\s*(.+)")
        if not code:
            return None
        is_correct, error_msg = run_test_cases(code, test_cases, timeout=5)
        row = dict(columns)
        row["Code"] = code
        row["Correct"] = int(is_correct)
        row["Error Message"] = error_msg
        return row, is_correct
    return grade


# Each builder below receives one record and the dialect and yields
# (user_prompt, grader) pairs, one per question asked about that record.

def _svamp_queries(row, dialect):
    """Yield the single SVAMP query for one CSV row."""
    problem = row[f"{dialect} (Original)"]
    question = row["Question"]
    expected = str(row["Answer"])
    user_prompt = (
        "Given a mathematics problem, determine the answer.\n"
        f"Context: {problem}\nQuestion: {question}\n"
        "Final Numeric Answer:"
    )
    columns = {f"{dialect} (Original)": problem, "Question": question}
    yield user_prompt, _label_grader(columns, expected, r"Final Numeric Answer:\s*", r".+")


def _mbpp_queries(row, dialect):
    """Yield the single MBPP code-generation query for one CSV row."""
    problem = row[f"{dialect} (Original)"]
    test_cases = row["Test_Cases"]
    user_prompt = (
        "Given a coding problem, produce a Python function that solves it.\n"
        "Output the code starting with 'Answer:' on its own line.\n"
        f"Problem: {problem}\nTest Cases: {test_cases}\nAnswer:"
    )
    yield user_prompt, _code_grader({f"{dialect} (Original)": problem}, test_cases)


def _logicbench_yn_queries(task, dialect):
    """Yield one yes/no query per populated question (1-4) of a LogicBench task."""
    context = task[f"{dialect} (context)"]
    for i in range(1, 5):
        question = task.get(f"Question {i}", "")
        expected = task.get(f"Answer {i}", "")
        if not question or not expected:
            continue
        user_prompt = (
            "Given the context below, answer yes or no.\n"
            f"Context: {context}\nQuestion: {question}\n"
            "Final Answer (yes/no):"
        )
        columns = {f"{dialect} (Context)": context, "Question": question}
        # Word boundaries keep "no" from matching inside "not", "know", "cannot", ...
        yield user_prompt, _label_grader(columns, expected, "", r"\b(?:yes|no)\b")


def _logicbench_mcq_queries(task, dialect):
    """Yield the single four-way multiple-choice query of a LogicBench task."""
    context = task[f"{dialect} (context)"]
    choices = [task.get(f"Choice {i}", "") for i in range(1, 5)]
    ground_truth_raw = task.get("Answer", "")
    gt_digit = parse_gt_to_int(ground_truth_raw)
    user_prompt = (
        f"Context:\n{context}\n\n"
        "Choices:\n"
        f"1) {choices[0]}\n"
        f"2) {choices[1]}\n"
        f"3) {choices[2]}\n"
        f"4) {choices[3]}\n"
        "Final Answer (1,2,3,4):"
    )

    def grade(response):
        predicted_digit = extract_integer_from_response(response)
        if not predicted_digit:
            # Accept a reply that consists of the choice number alone.
            bare = re.fullmatch(r"\d+", response.strip())
            predicted_digit = bare.group(0) if bare else ""
        was_correct = check_mcq_correctness(predicted_digit, gt_digit)
        row = {
            f"{dialect} (Context)": context,
            "Choice 1": choices[0],
            "Choice 2": choices[1],
            "Choice 3": choices[2],
            "Choice 4": choices[3],
            "Expected Answer": ground_truth_raw,
            "Model Answer": response,
            "Correct": was_correct
        }
        return row, was_correct

    yield user_prompt, grade


def _humaneval_queries(row, dialect):
    """Yield the single HumanEval code-generation query for one CSV row."""
    prompt_text = row[f"{dialect} (Prompt)"]
    test_cases = row["Test_Cases"]
    user_prompt = (
        "Given a coding problem, produce Python code.\n"
        "Start with 'Answer:' on its own line.\n"
        f"Problem: {prompt_text}\nTest Cases: {test_cases}\nAnswer:"
    )
    yield user_prompt, _code_grader({f"{dialect} (Prompt)": prompt_text}, test_cases)


def _gsm8k_queries(row, dialect):
    """Yield the single GSM8K query for one CSV row."""
    problem = row[f"{dialect} (Original)"]
    expected = str(row["Answer"])
    user_prompt = (
        "Given the math problem:\n"
        f"{problem}\n"
        "Provide a single numeric answer.\n"
        "Answer:"
    )
    columns = {f"{dialect} (Original)": problem}
    yield user_prompt, _label_grader(columns, expected, r"Answer:\s*", r"[0-9.\-]+")


def _folio_queries(row, dialect):
    """Yield the single FOLIO query for one CSV row."""
    premises = row[f"{dialect} (Premises)"]
    conclusion = row["Conclusion"]
    expected = row["Label"]
    user_prompt = (
        f"Determine if the conclusion is True, False, or Uncertain.\n"
        f"Premises: {premises}\nConclusion: {conclusion}\n"
        "Answer:"
    )
    columns = {f"{dialect} (Premises)": premises, "Conclusion": conclusion}
    yield user_prompt, _label_grader(columns, expected, r"Answer:\s*", r"True|False|Uncertain")


def _wsc_queries(row, dialect):
    """Yield the single WSC query for one CSV row."""
    paragraph = row[f"{dialect} (Original Paragraph)"]
    span1 = row["Span 1"]
    span2 = row["Span 2"]
    expected = str(row["Actual Label"])
    user_prompt = (
        f"Determine if Span 2 refers to Span 1.\n"
        f"Paragraph: {paragraph}\nSpan 1: {span1}\nSpan 2: {span2}\n"
        "Answer (1 for same, 0 for not):"
    )
    columns = {
        f"{dialect} (Original Paragraph)": paragraph,
        "Span 1": span1,
        "Span 2": span2,
    }
    # The prompt's label carries a parenthesised hint before the colon; allow
    # the model to echo it back.
    yield user_prompt, _label_grader(columns, expected, r"Answer(?:\s*\([^)]*\))?:\s*", r"\d")


def _sst2_queries(row, dialect):
    """Yield the single SST-2 query for one CSV row."""
    sentence = row[f"{dialect} (Original Sentence)"]
    expected = str(row["Actual Label"])
    user_prompt = (
        f"Given the sentence: \"{sentence}\", is it positive (1) or negative (0)?\n"
        "Answer:"
    )
    columns = {f"{dialect} (Original Sentence)": sentence}
    yield user_prompt, _label_grader(columns, expected, r"Answer:\s*", r"\d")


def _multirc_queries(row, dialect):
    """Yield the single MultiRC query for one CSV row."""
    paragraph = row[f"{dialect} (Paragraph)"]
    question = row["Question"]
    answer_choice = row["Answer Choice"]
    expected = str(row["Actual Label"])
    user_prompt = (
        f"Paragraph: {paragraph}\n"
        f"Question: {question}\n"
        f"Answer Choice: {answer_choice}\n"
        "Is it correct (1) or incorrect (0)?\nAnswer:"
    )
    columns = {
        f"{dialect} (Paragraph)": paragraph,
        "Question": question,
        "Answer Choice": answer_choice,
    }
    yield user_prompt, _label_grader(columns, expected, r"Answer:\s*", r"\d")


def _copa_queries(row, dialect):
    """Yield the single COPA query for one CSV row."""
    premise = row[f"{dialect} (Premise)"]
    choice1 = row["Choice 1"]
    choice2 = row["Choice 2"]
    expected = str(row["Actual Answer"])
    user_prompt = (
        f"Given the premise and two choices:\nPremise: {premise}\n"
        f"Choice 1: {choice1}\nChoice 2: {choice2}\n"
        "Which is more plausible, 0 or 1?\nAnswer:"
    )
    columns = {
        f"{dialect} (Premise)": premise,
        "Choice 1": choice1,
        "Choice 2": choice2,
    }
    yield user_prompt, _label_grader(columns, expected, r"Answer:\s*", r"\d")


def _boolq_queries(row, dialect):
    """Yield the single BoolQ query for one CSV row."""
    passage = row[f"{dialect} (SAE Passage)"]
    question = row["SAE Question"]
    expected = str(row["Actual Label"])
    user_prompt = (
        f"Passage: \"{passage}\"\n"
        f"Question: \"{question}\"\n"
        "Is the answer TRUE or FALSE?\nAnswer:"
    )
    columns = {f"{dialect} (SAE Passage)": passage, "SAE Question": question}
    yield user_prompt, _label_grader(columns, expected, r"Answer:\s*", r"TRUE|FALSE")


# Dataset name -> (query builder, progress-bar unit, error-message prefix).
_DATASET_SPECS = {
    "SVAMP": (_svamp_queries, "row", "Error processing row"),
    "MBPP": (_mbpp_queries, "row", "Error processing row"),
    "LogicBenchYN": (_logicbench_yn_queries, "task", "Error processing task"),
    "LogicBenchMCQ": (_logicbench_mcq_queries, "task", "Error processing MCQ"),
    "HumanEVAL": (_humaneval_queries, "row", "Error processing row"),
    "GSM8K": (_gsm8k_queries, "row", "Error processing row"),
    "FOLIO": (_folio_queries, "row", "Error processing row"),
    "WSC": (_wsc_queries, "row", "Error processing row"),
    "SST-2": (_sst2_queries, "row", "Error processing row"),
    "MultiRC": (_multirc_queries, "row", "Error processing row"),
    "COPA": (_copa_queries, "row", "Error processing row"),
    "BoolQ": (_boolq_queries, "row", "Error processing row"),
}


def process_dataset(model_name: str, dataset_name: str, file_path: str, dialect: str):
    """Evaluate one model on one dataset for one dialect (non-CoT prompts).

    Every record is turned into one or more prompts, sent to the model and
    graded. Each graded sample is appended to ``<dataset_name>_results.csv``
    and a summary is written to ``<dataset_name>_accuracy.txt`` in the model's
    output directory.

    Samples whose request returned no response, and code samples from which
    no code could be extracted, are skipped and not counted in the totals. A
    record that raises while being processed is reported and skipped.

    Args:
        model_name: Model identifier passed to the API.
        dataset_name: Key selecting the prompt/grading logic; unknown names
            are reported and ignored.
        file_path: Path of the dataset file (JSON for the LogicBench
            datasets, CSV otherwise).
        dialect: Dialect identifier used in column/key names and output paths.
    """
    spec = _DATASET_SPECS.get(dataset_name)
    if spec is None:
        print(f"Dataset {dataset_name} not recognized for processing.")
        return
    build_queries, unit, error_label = spec

    is_json = dataset_name in ("LogicBenchYN", "LogicBenchMCQ")
    try:
        if is_json:
            with open(file_path, 'r', encoding="utf-8") as f:
                data = json.load(f)
        else:
            data = pd.read_csv(file_path)
    except Exception as e:
        print(f"Failed to load dataset {dataset_name} for dialect {dialect}: {e}")
        return

    if is_json:
        records = data
    else:
        data = data.drop(columns=[col for col in data.columns if "BLEU Score" in col], errors="ignore")
        records = (row for _, row in data.iterrows())

    if model_name in model_output_mapping:
        model_output_dir = os.path.join(model_output_mapping[model_name], dialect, dataset_name)
    else:
        model_output_dir = os.path.join(output_base_dir, model_name.replace("-", ""), dialect, dataset_name)
    os.makedirs(model_output_dir, exist_ok=True)
    csv_file = os.path.join(model_output_dir, f"{dataset_name}_results.csv")

    correct_count = 0
    total = 0

    # NON-COT Prompts
    pbar = tqdm(records, total=len(data), desc=f"{model_name} | {dataset_name} | {dialect}", unit=unit)
    for record in pbar:
        try:
            for user_prompt, grade in build_queries(record, dialect):
                response = prompt_deepseek_model_with_timeout(model_name, _SYSTEM_PROMPT, user_prompt)
                if response is None:
                    continue
                graded = grade(response)
                if graded is None:
                    continue
                row_dict, is_correct = graded
                total += 1
                if is_correct:
                    correct_count += 1
                write_row_to_csv(row_dict, csv_file)
                if total % 2 == 0:
                    acc_percentage = (correct_count / total) * 100
                    pbar.set_postfix({"Acc": f"{acc_percentage:.2f}%"})
        except Exception as e:
            # Best effort: one malformed record must not abort the whole run.
            print(f"{error_label}: {e}")
    pbar.close()

    try:
        accuracy = (correct_count / total * 100) if total > 0 else 0
        with open(os.path.join(model_output_dir, f"{dataset_name}_accuracy.txt"), "w", encoding="utf-8") as f:
            f.write(f"Total instances: {total}\n")
            f.write(f"Correct answers: {correct_count}\n")
            f.write(f"Accuracy: {accuracy:.2f}%\n")
    except Exception as e:
        print(f"Error writing accuracy file for dataset {dataset_name}: {e}")

# Run every (model, dialect, dataset) combination whose input file exists.
for model in models:
    for dialect in dialects:
        for dataset_name, rel_path in datasets.items():
            full_path = os.path.join(input_base_dir, dialect, rel_path)
            if not os.path.exists(full_path):
                print(f"File not found: {full_path}")
                continue
            process_dataset(model, dataset_name, full_path, dialect)

print("Non-CoT evaluation complete! Results have been saved.")

# Release the Colab runtime once finished; outside Colab the import fails and
# this step is skipped.
try:
    from google.colab import runtime
    runtime.unassign()
except ImportError:
    pass

In [None]:
## COT EVALS

# Mount Google Drive; the input and output directories configured further down
# live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Notebook-only shell escape that installs the third-party dependencies.
!pip install openai pandas numpy regex tqdm

from openai import OpenAI
import pandas as pd
import numpy as np
import json
import os
import re
import csv
import time
import signal
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, TimeoutError

# Prefer the DEEPSEEK_API_KEY environment variable so a real secret never has to
# be pasted into (and saved with) the notebook; the literal placeholder is kept
# as the fallback so the previous edit-in-place workflow still works.
DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY", "API-KEY")
client = OpenAI(api_key=DEEPSEEK_API_KEY, base_url="https://api.deepseek.com/")
# Sampling temperature sent with every chat-completion request.
TEMPERATURE = 0.7

# Root folder of the input benchmark files on the mounted Drive.
input_base_dir = "/content/drive/MyDrive/!!Multi-AAVENUE/BLEU Score Filtered Datasets/GPT 4o"
# Root folder for evaluation results on the mounted Drive.
output_base_dir = "/content/drive/MyDrive/!!Multi-AAVENUE/Evaluation Results"
# Dialect identifiers to evaluate.
dialects = ["IndE", "AAVE", "JamE", "ChcE", "CollSgE"]

# Dataset name -> relative path of its data file (presumably resolved against
# input_base_dir and a dialect folder, as in the non-CoT cell; confirm against
# the code that consumes it). The two LogicBench entries are JSON files; every
# other entry is a CSV file.
datasets = {
    "SVAMP": "SVAMP(700)/SVAMP(700)_filtered_bleu_scores.csv",
    "MBPP": "MBPP(374)/MBPP(374)_filtered_bleu_scores.csv",
    "LogicBenchYN": "Logic Bench YN(500)/Logic Bench YN(500)_filtered_bleu_scores.json",
    "LogicBenchMCQ": "Logic Bench MCQ(480)/Logic Bench MCQ(480)_filtered_bleu_scores.json",
    "HumanEVAL": "HumanEVAL(164)/HumanEVAL(164)_filtered_bleu_scores.csv",
    "GSM8K": "GSM8K(1000)/GSM8K(1000)_filtered_bleu_scores.csv",
    "FOLIO": "FOLIO(1000)/FOLIO(1000)_filtered_bleu_scores.csv",
    "WSC": "GLUE + SuperGLUE/WSC (659)/WSC (659)_filtered_bleu_scores.csv",
    "SST-2": "GLUE + SuperGLUE/SST-2 (1000)/SST-2 (1000)_filtered_bleu_scores.csv",
    "MultiRC": "GLUE + SuperGLUE/MultiRC (1000)/MultiRC (1000)_filtered_bleu_scores.csv",
    "COPA": "GLUE + SuperGLUE/COPA (500)/COPA (500)_filtered_bleu_scores.csv",
    "BoolQ": "GLUE + SuperGLUE/BoolQ (1000)/BoolQ (1000)_filtered_bleu_scores.csv"
}

# Model identifiers to evaluate.
models = ["deepseek-chat"]

def write_row_to_csv(row: dict, filename: str):
    """Append a single result row to ``filename``.

    The column header is emitted only when the file is still empty, so
    repeated calls build up one CSV with a single header line.

    Args:
        row: Mapping of column name to value for the row being appended.
        filename: Path of the CSV file; it is created if it does not exist.
    """
    with open(filename, "a", newline="", encoding="utf-8") as handle:
        # An append-mode stream is positioned at the end of the file, so an
        # offset of 0 means nothing has been written to it yet.
        file_is_empty = handle.tell() == 0
        single_row_frame = pd.DataFrame([row])
        single_row_frame.to_csv(handle, index=False, header=file_is_empty)
        # Push the row to disk immediately so partial results survive a crash.
        handle.flush()

def prompt_deepseek_model_with_timeout(model_name: str, system_message: str, user_message: str, timeout: int = 25, retries=5, backoff_factor=2) -> str:
    """Send one chat-completion request with a bounded wait and retries.

    Args:
        model_name: Model identifier passed to the chat-completions API.
        system_message: Content of the system message.
        user_message: Content of the user message.
        timeout: Maximum number of seconds to wait for a single attempt.
        retries: Maximum number of attempts when the API call raises.
        backoff_factor: Base of the exponential back-off; attempt ``k``
            (0-based) sleeps ``backoff_factor ** k`` seconds before retrying.

    Returns:
        The message content of the first choice, or ``None`` when an attempt
        timed out (no retry is made in that case) or when every attempt
        raised an error.
    """
    def prompt():
        return client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": user_message}
            ],
            temperature=TEMPERATURE,
            max_tokens=4096
        ).choices[0].message.content

    for attempt in range(retries):
        # Deliberately not a `with` block: leaving one calls
        # shutdown(wait=True), which would block on the very request that
        # just timed out and so defeat the timeout.
        executor = ThreadPoolExecutor(max_workers=1)
        try:
            future = executor.submit(prompt)
            return future.result(timeout=timeout)
        except TimeoutError:
            print(f"Timeout: Skipping this row after {timeout} seconds.")
            return None
        except Exception as e:
            wait_time = backoff_factor ** attempt
            print(f"API error: {e}. Retrying in {wait_time} seconds...")
            time.sleep(wait_time)
        finally:
            # Do not wait for a still-running request; the worker thread is
            # left to finish (or fail) on its own in the background.
            executor.shutdown(wait=False)
    print("Maximum retries exceeded. Skipping this row.")
    return None

def clean_code(generated_code: str) -> str:
    """Remove Markdown code-fence markers from model output.

    Both opening fences (with or without a ``python`` tag) and closing fences
    are deleted; the remaining text is returned with surrounding whitespace
    trimmed.
    """
    # First pass drops fences together with an optional "python" language tag.
    without_tagged_fences = re.sub(r"```(?:python)?", "", generated_code)
    # Second pass removes any bare fence that might remain.
    without_fences = without_tagged_fences.replace("```", "")
    return without_fences.strip()

def extract_response(response: str, pattern: str) -> str:
    """Return the first capture group of `pattern` found in `response`.

    The search is case-insensitive and lets ``.`` span newlines. An empty
    string is returned when `response` is empty/None or nothing matches;
    otherwise the captured text is returned with surrounding whitespace
    removed.
    """
    if not response:
        return ""
    found = re.compile(pattern, re.DOTALL | re.IGNORECASE).search(response)
    if found is None:
        return ""
    return found.group(1).strip()

def evaluate_response(model_answer: str, expected_answer: str) -> bool:
    """Decide whether the model's answer matches the expected answer.

    Both values are coerced to text (``None`` counts as empty), stripped and
    lower-cased. They match when the normalised strings are equal, or when
    both parse as numbers with the same value, so ``"5"`` matches ``"5.0"``
    (labels read from a float-typed CSV column stringify with a trailing
    ``.0``).

    Args:
        model_answer: Answer extracted from the model response.
        expected_answer: Ground-truth answer from the dataset; non-string
            values (e.g. bool or numeric labels) are accepted.

    Returns:
        True when the answers are considered equal, False otherwise.
    """
    got = "" if model_answer is None else str(model_answer).strip().lower()
    want = "" if expected_answer is None else str(expected_answer).strip().lower()
    if got == want:
        return True
    # Fall back to numeric equality so that formatting differences between
    # equal numbers ("5" vs "5.0", "0.50" vs "0.5") are not counted as errors.
    try:
        return float(got) == float(want)
    except ValueError:
        return False

class TimeoutException(Exception):
    """Raised by `alarm_handler` when the SIGALRM timer expires."""


def alarm_handler(signum, frame):
    """SIGALRM handler that aborts the running code by raising TimeoutException."""
    raise TimeoutException("Execution timed out!")


def run_test_cases(generated_code: str, test_cases: str, timeout=5) -> (bool, str):
    """Execute generated code followed by its test cases under a time limit.

    The code and the tests are run with ``exec`` in one shared, initially
    empty namespace. A SIGALRM timer bounds the total run time.

    SECURITY NOTE(review): ``exec`` runs model-generated code inside this
    process with no sandboxing; only use this in a disposable environment.

    Args:
        generated_code: Python source expected to define the solution.
        test_cases: Python source (typically ``assert`` statements) that
            exercises the solution.
        timeout: Time limit in whole seconds for code plus tests.

    Returns:
        ``(True, "")`` when everything ran without raising, otherwise
        ``(False, message)`` where the message is the timeout text or is
        prefixed with ``AssertionError:``, ``SyntaxError:`` or
        ``RuntimeError:`` (any other exception).
    """
    exec_globals = {}
    previous_handler = None
    try:
        # signal.signal returns the handler it replaces so it can be restored.
        previous_handler = signal.signal(signal.SIGALRM, alarm_handler)
        signal.alarm(timeout)
        exec(generated_code, exec_globals)
        exec(test_cases, exec_globals)
        signal.alarm(0)
        return True, ""
    except TimeoutException as e:
        return False, str(e)
    except AssertionError as e:
        return False, f"AssertionError: {str(e)}"
    except SyntaxError as e:
        return False, f"SyntaxError: {str(e)}"
    except Exception as e:
        return False, f"RuntimeError: {str(e)}"
    finally:
        # Always cancel the timer, even when something that is not an
        # Exception subclass (KeyboardInterrupt, SystemExit) escapes;
        # otherwise a stale alarm would fire later in unrelated code.
        signal.alarm(0)
        # Put back whatever handler was installed before this call. None
        # means it was not installed from Python and cannot be re-installed.
        if previous_handler is not None:
            signal.signal(signal.SIGALRM, previous_handler)

def parse_gt_to_int(gt: str) -> str:
    """Extract the choice number from a ground-truth label, as a digit string.

    A ``choice_<n>`` token (case-insensitive) takes precedence; otherwise the
    first run of digits anywhere in the label is used. Returns ``""`` when
    the label contains no digits.
    """
    labelled = re.search(r"choice_(\d+)", gt, re.IGNORECASE)
    if labelled is not None:
        return labelled.group(1)
    any_number = re.search(r"\d+", gt)
    return any_number.group(0) if any_number is not None else ""

def extract_integer_from_response(response_text: str) -> str:
    """Return the digits following the first ``Answer:`` marker in the text.

    The marker is matched case-sensitively and must be followed (after
    optional whitespace) by at least one digit. Returns ``""`` when no such
    marker is present.
    """
    numbers = re.findall(r"Answer:\s*(\d+)", response_text)
    if not numbers:
        return ""
    return numbers[0].strip()

def check_mcq_correctness(model_digit: str, gt_digit: str) -> bool:
    """Tell whether the predicted choice equals the ground-truth choice.

    An empty prediction is never correct, even if the ground truth is empty
    as well.
    """
    if not model_digit:
        return False
    return model_digit == gt_digit

# Model identifier -> directory that receives that model's CoT results.
# Models missing from this mapping fall back to a directory derived from the
# model name (see process_dataset).
model_output_mapping = {
    "deepseek-chat": os.path.join(output_base_dir, "DeepSeek-V3_CoT")
}

# System message sent with every chain-of-thought request.
_COT_SYSTEM_PROMPT = "You are a helpful assistant."


def _cot_match_item(fields: dict, prompt: str, pattern: str, expected) -> dict:
    """Describe a question graded by regex-extracting the answer and comparing it to `expected`."""
    return {"kind": "match", "fields": fields, "prompt": prompt, "pattern": pattern, "expected": expected}


def _cot_code_item(fields: dict, prompt: str, test_cases) -> dict:
    """Describe a coding task graded by executing the extracted code against `test_cases`."""
    return {"kind": "code", "fields": fields, "prompt": prompt, "test_cases": test_cases}


def _cot_svamp_items(row, dialect: str):
    """Yield the single SVAMP question contained in a CSV row."""
    problem = row[f"{dialect} (Original)"]
    question = row["Question"]
    expected = str(row["Answer"])
    user_prompt = (
        "Let's carefully reason about this math problem step by step.\n"
        f"Context: {problem}\nQuestion: {question}\n"
        "Finally, provide your numeric answer as: Answer: <number>\n"
    )
    # NOTE(review): extract_response lets "." span newlines, so this pattern
    # captures everything after the first "Answer:" up to the end of the reply.
    yield _cot_match_item(
        {f"{dialect} (Original)": problem, "Question": question},
        user_prompt, r"Answer:\s*(.+)", expected,
    )


def _cot_mbpp_items(row, dialect: str):
    """Yield the single MBPP coding task contained in a CSV row."""
    problem = row[f"{dialect} (Original)"]
    test_cases = row["Test_Cases"]
    user_prompt = (
        "Let's break down the coding problem step by step.\n"
        "Then write a Python function. Output code starting with 'Answer:'\n"
        f"Problem: {problem}\nTest Cases: {test_cases}"
    )
    yield _cot_code_item({f"{dialect} (Original)": problem}, user_prompt, test_cases)


def _cot_logicbench_yn_items(task, dialect: str):
    """Yield up to four yes/no questions ("Question 1".."Question 4") of a LogicBench task.

    Question/answer pairs that are missing or empty are skipped.
    """
    context = task[f"{dialect} (context)"]
    for i in range(1, 5):
        question = task.get(f"Question {i}", "")
        expected = task.get(f"Answer {i}", "")
        if not question or not expected:
            continue
        user_prompt = (
            "Let's think step by step about whether the answer is yes or no.\n"
            f"Context: {context}\nQuestion: {question}\n"
            "Finally, respond as: Answer: yes or Answer: no.\n"
        )
        yield _cot_match_item(
            {f"{dialect} (Context)": context, "Question": question},
            user_prompt, r"Answer:\s*(yes|no)", expected,
        )


def _cot_logicbench_mcq_items(task, dialect: str):
    """Yield the single four-way multiple-choice question of a LogicBench task."""
    context = task[f"{dialect} (context)"]
    choices = [task.get(f"Choice {i+1}", "") for i in range(4)]
    ground_truth_raw = task.get("Answer", "")
    gt_digit = parse_gt_to_int(ground_truth_raw)
    user_prompt = (
        "Let's analyze each choice step by step.\n"
        "Finally, provide EXACTLY one line: Answer: 1,2,3,or4\n\n"
        f"Context:\n{context}\n\n"
        "Choices:\n"
        f"1) {choices[0]}\n"
        f"2) {choices[1]}\n"
        f"3) {choices[2]}\n"
        f"4) {choices[3]}\n"
    )
    yield {
        "kind": "mcq",
        "fields": {
            f"{dialect} (Context)": context,
            "Choice 1": choices[0],
            "Choice 2": choices[1],
            "Choice 3": choices[2],
            "Choice 4": choices[3],
        },
        "prompt": user_prompt,
        # The raw label is what gets written to the CSV; the digit is what gets graded.
        "expected": ground_truth_raw,
        "gt_digit": gt_digit,
    }


def _cot_humaneval_items(row, dialect: str):
    """Yield the single HumanEval coding task contained in a CSV row."""
    prompt_text = row[f"{dialect} (Prompt)"]
    test_cases = row["Test_Cases"]
    user_prompt = (
        "Let's consider this coding task step by step.\n"
        "Finally, provide Python code starting with 'Answer:'.\n\n"
        f"Problem: {prompt_text}\nTest Cases: {test_cases}"
    )
    yield _cot_code_item({f"{dialect} (Prompt)": prompt_text}, user_prompt, test_cases)


def _cot_gsm8k_items(row, dialect: str):
    """Yield the single GSM8K word problem contained in a CSV row."""
    problem = row[f"{dialect} (Original)"]
    expected = str(row["Answer"])
    user_prompt = (
        "Let's solve this math word problem step by step.\n"
        "Finally, provide the numeric result as: Answer: <integer or decimal>\n\n"
        f"Problem: {problem}"
    )
    yield _cot_match_item(
        {f"{dialect} (Original)": problem},
        user_prompt, r"Answer:\s*([0-9.\-]+)", expected,
    )


def _cot_folio_items(row, dialect: str):
    """Yield the single FOLIO premises/conclusion item contained in a CSV row."""
    premises = row[f"{dialect} (Premises)"]
    conclusion = row["Conclusion"]
    expected = row["Label"]
    user_prompt = (
        "Let's analyze whether the conclusion follows.\n"
        "Finally, provide: Answer: True, False, or Uncertain.\n\n"
        f"Premises: {premises}\nConclusion: {conclusion}"
    )
    yield _cot_match_item(
        {f"{dialect} (Premises)": premises, "Conclusion": conclusion},
        user_prompt, r"Answer:\s*(True|False|Uncertain)", expected,
    )


def _cot_wsc_items(row, dialect: str):
    """Yield the single WSC coreference item contained in a CSV row."""
    paragraph = row[f"{dialect} (Original Paragraph)"]
    span1 = row["Span 1"]
    span2 = row["Span 2"]
    expected = str(row["Actual Label"])
    user_prompt = (
        "Let's analyze whether Span 2 refers to Span 1 step by step.\n"
        "Finally, provide Answer: 1 if same, 0 if not.\n\n"
        f"Paragraph: {paragraph}\nSpan 1: {span1}\nSpan 2: {span2}"
    )
    yield _cot_match_item(
        {f"{dialect} (Original Paragraph)": paragraph, "Span 1": span1, "Span 2": span2},
        user_prompt, r"Answer:\s*(\d)", expected,
    )


def _cot_sst2_items(row, dialect: str):
    """Yield the single SST-2 sentiment item contained in a CSV row."""
    sentence = row[f"{dialect} (Original Sentence)"]
    expected = str(row["Actual Label"])
    user_prompt = (
        "Let's analyze the sentiment step by step.\n"
        "Finally, provide: Answer: 1 (positive) or 0 (negative).\n\n"
        f"Sentence: \"{sentence}\""
    )
    yield _cot_match_item(
        {f"{dialect} (Original Sentence)": sentence},
        user_prompt, r"Answer:\s*(\d)", expected,
    )


def _cot_multirc_items(row, dialect: str):
    """Yield the single MultiRC answer-verification item contained in a CSV row."""
    paragraph = row[f"{dialect} (Paragraph)"]
    question = row["Question"]
    answer_choice = row["Answer Choice"]
    expected = str(row["Actual Label"])
    user_prompt = (
        "Let's think step by step about the paragraph and question.\n"
        "Finally, provide: Answer: 1 if correct, 0 if incorrect.\n\n"
        f"Paragraph: {paragraph}\nQuestion: {question}\nAnswer Choice: {answer_choice}"
    )
    yield _cot_match_item(
        {f"{dialect} (Paragraph)": paragraph, "Question": question, "Answer Choice": answer_choice},
        user_prompt, r"Answer:\s*(\d)", expected,
    )


def _cot_copa_items(row, dialect: str):
    """Yield the single COPA plausibility item contained in a CSV row."""
    premise = row[f"{dialect} (Premise)"]
    choice1 = row["Choice 1"]
    choice2 = row["Choice 2"]
    expected = str(row["Actual Answer"])
    user_prompt = (
        "Let's think step by step which choice is more plausible.\n"
        "Finally, provide: Answer: 0 (first) or 1 (second).\n\n"
        f"Premise: {premise}\nChoice 1: {choice1}\nChoice 2: {choice2}"
    )
    yield _cot_match_item(
        {f"{dialect} (Premise)": premise, "Choice 1": choice1, "Choice 2": choice2},
        user_prompt, r"Answer:\s*(\d)", expected,
    )


def _cot_boolq_items(row, dialect: str):
    """Yield the single BoolQ passage/question item contained in a CSV row."""
    passage = row[f"{dialect} (SAE Passage)"]
    question = row["SAE Question"]
    expected = str(row["Actual Label"])
    user_prompt = (
        "Let's analyze this passage carefully.\n"
        "Finally, provide: Answer: TRUE or FALSE.\n\n"
        f"Passage: \"{passage}\"\nQuestion: \"{question}\""
    )
    yield _cot_match_item(
        {f"{dialect} (SAE Passage)": passage, "SAE Question": question},
        user_prompt, r"Answer:\s*(TRUE|FALSE)", expected,
    )


# Dataset name -> (item builder, tqdm unit label, noun used in the per-record error message).
_COT_DATASET_SPECS = {
    "SVAMP": (_cot_svamp_items, "row", "row"),
    "MBPP": (_cot_mbpp_items, "row", "row"),
    "LogicBenchYN": (_cot_logicbench_yn_items, "task", "task"),
    "LogicBenchMCQ": (_cot_logicbench_mcq_items, "task", "MCQ"),
    "HumanEVAL": (_cot_humaneval_items, "row", "row"),
    "GSM8K": (_cot_gsm8k_items, "row", "row"),
    "FOLIO": (_cot_folio_items, "row", "row"),
    "WSC": (_cot_wsc_items, "row", "row"),
    "SST-2": (_cot_sst2_items, "row", "row"),
    "MultiRC": (_cot_multirc_items, "row", "row"),
    "COPA": (_cot_copa_items, "row", "row"),
    "BoolQ": (_cot_boolq_items, "row", "row"),
}


def _cot_grade_item(item: dict, response: str):
    """Grade one model response and build the CSV row that records it.

    Returns:
        ``(row_dict, is_correct)``, or ``None`` when the item is a coding task
        and no code could be extracted after an ``Answer:`` marker.
    """
    row_dict = dict(item["fields"])
    if item["kind"] == "code":
        code = extract_response(clean_code(response), r"Answer:\s*(.+)")
        if not code:
            return None
        is_correct, error_msg = run_test_cases(code, item["test_cases"], timeout=5)
        row_dict.update({
            "Code": code,
            "COT Response": response,
            "Correct": int(is_correct),
            "Error Message": error_msg,
        })
        return row_dict, is_correct

    if item["kind"] == "mcq":
        model_answer = extract_integer_from_response(response)
        is_correct = check_mcq_correctness(model_answer, item["gt_digit"])
    else:
        model_answer = extract_response(response, item["pattern"])
        is_correct = evaluate_response(model_answer, item["expected"])
    row_dict.update({
        "Expected Answer": item["expected"],
        "COT Response": response,
        "Model Answer": model_answer,
        "Correct": is_correct,
    })
    return row_dict, is_correct


def process_dataset(model_name: str, dataset_name: str, file_path: str, dialect: str):
    """Run the chain-of-thought evaluation of one model on one dataset/dialect.

    Every record of the dataset is turned into one or more prompts, sent to
    the model, graded, and appended to ``<dataset_name>_results.csv`` in the
    model's output directory. When the loop finishes, a summary is written to
    ``<dataset_name>_accuracy.txt`` in the same directory.

    Items are left out of both the results file and the accuracy figures when
    the API call returns ``None`` (timeout or retries exhausted), when a
    coding response contains no extractable code, or when processing the
    record raises (the error is printed and the loop moves on).
    NOTE(review): because skipped items are not counted as wrong, the reported
    accuracy is over answered items only.

    Args:
        model_name: Model identifier passed to the chat API; also selects the
            output directory.
        dataset_name: Key identifying the dataset (e.g. "SVAMP", "BoolQ").
        file_path: Path of the dataset file (JSON for the LogicBench sets,
            CSV otherwise).
        dialect: Dialect code used as prefix of the dialect-specific
            column/key names and as an output sub-directory.
    """
    is_json = dataset_name in ["LogicBenchYN", "LogicBenchMCQ"]
    try:
        if is_json:
            with open(file_path, 'r', encoding="utf-8") as f:
                data = json.load(f)
        else:
            data = pd.read_csv(file_path)
    except Exception as e:
        print(f"Failed to load dataset {dataset_name} for dialect {dialect}: {e}")
        return

    if not is_json:
        data = data.drop(columns=[col for col in data.columns if "BLEU Score" in col], errors="ignore")

    if model_name in model_output_mapping:
        model_output_dir = os.path.join(model_output_mapping[model_name], dialect, dataset_name)
    else:
        model_output_dir = os.path.join(output_base_dir, model_name.replace("-", ""), dialect, dataset_name)
    os.makedirs(model_output_dir, exist_ok=True)
    csv_file = os.path.join(model_output_dir, f"{dataset_name}_results.csv")

    spec = _COT_DATASET_SPECS.get(dataset_name)
    if spec is None:
        print(f"Dataset {dataset_name} not recognized for processing.")
        return
    build_items, unit, error_noun = spec

    correct_count = 0
    total = 0

    # CSV datasets are iterated row by row; JSON datasets are iterated as loaded.
    records = data if is_json else (row for _, row in data.iterrows())
    pbar = tqdm(records, total=len(data), desc=f"{model_name} | {dataset_name} | {dialect}", unit=unit)
    for record in pbar:
        try:
            # An exception anywhere in here abandons the remaining items of this record.
            for item in build_items(record, dialect):
                response = prompt_deepseek_model_with_timeout(model_name, _COT_SYSTEM_PROMPT, item["prompt"])
                if response is None:
                    continue
                graded = _cot_grade_item(item, response)
                if graded is None:
                    continue
                row_dict, is_correct = graded
                total += 1
                if is_correct:
                    correct_count += 1
                write_row_to_csv(row_dict, csv_file)
                # Refresh the running accuracy on every second graded item.
                if total % 2 == 0:
                    acc_percentage = (correct_count / total) * 100 if total > 0 else 0
                    pbar.set_postfix({"Acc": f"{acc_percentage:.2f}%"})
        except Exception as e:
            print(f"Error processing {error_noun}: {e}")
            continue
    pbar.close()

    try:
        accuracy = (correct_count / total * 100) if total > 0 else 0
        with open(os.path.join(model_output_dir, f"{dataset_name}_accuracy.txt"), "w", encoding="utf-8") as f:
            f.write(f"Total instances: {total}\n")
            f.write(f"Correct answers: {correct_count}\n")
            f.write(f"Accuracy: {accuracy:.2f}%\n")
    except Exception as e:
        print(f"Error writing accuracy file for dataset {dataset_name}: {e}")

# Evaluate every model on every dialect/dataset combination whose input file
# exists; missing files are reported and skipped.
for model in models:
    for dialect in dialects:
        for dataset_name, rel_path in datasets.items():
            full_path = os.path.join(input_base_dir, dialect, rel_path)
            if not os.path.exists(full_path):
                print(f"File not found: {full_path}")
                continue
            process_dataset(model, dataset_name, full_path, dialect)

print("CoT evaluation complete! Results have been saved.")

# When running on Colab, release the runtime once the evaluation is finished;
# elsewhere the import fails and nothing happens.
try:
    from google.colab import runtime
    runtime.unassign()
except ImportError:
    pass