In [None]:
!pip install groq nltk rouge-score bert-score transformers sentencepiece -q


  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.8/130.8 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m47.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m63.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m37.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.1 MB/s[0m eta [36m0:



# Track 2: MedArabiQ 2025 (General Health)


# Sub-Task 1: Fill-in-the-blank with Choices




# llama-3.3-70b-versatile -> 69%

In [None]:
import os
import re
import pandas as pd
import groq
import time
from getpass import getpass
import numpy as np
from sklearn.metrics import accuracy_score

# --- Groq API Configuration ---
# Your Groq API key will be accessed securely from Colab's secrets
try:
    from google.colab import userdata
    GROQ_API_KEY = userdata.get('GROQ_API_KEY')
except Exception:
    # Deliberately broad: this is a top-level fallback boundary.
    # NOTE(review): google.colab's userdata.get appears to signal a missing or
    # inaccessible secret with its own exception classes (SecretNotFoundError /
    # NotebookAccessError) rather than KeyError, so catching only
    # (ImportError, KeyError) would skip the prompt below — confirm against the
    # installed google.colab version.
    print("Secret 'GROQ_API_KEY' not found. Please add it to the Colab secrets manager.")
    GROQ_API_KEY = getpass('Please enter your Groq API key: ')

# Initialize the Groq client
client = groq.Client(api_key=GROQ_API_KEY)


# --- File paths and column names for the MCQ task ---
# NOTE: The input CSV name suggests a "fill-in-the-blank" task, but the code logic
# is designed for Multiple Choice. Ensure this is the correct file.
INPUT_CSV = '/content/drive/MyDrive/AraHealthQA/fill-in-the-blank-choices.csv'
OUTPUT_CSV = '/content/drive/MyDrive/AraHealthQA/predictions_fitb_choices.csv'

# --- UPDATED: Correct column names as per your specification ---
QUESTION_COLUMN = 'Question - Arabic'
ANSWER_COLUMN = 'Answer - Arabic'


# --- Function to Generate Answers for MCQ Task ---
def generate_answer(question):
    """
    Send one MCQ to the Groq chat API and return the option letter it chose.

    Args:
        question: Question text (with its options) sent as the user message.

    Returns:
        A single Arabic letter on success, "" when the reply contains no
        Arabic letter, or "API_ERROR" when every attempt raised.
    """
    system_prompt = """You are a medical exam answering machine. Your only task is to answer the following multiple-choice medical question. Read the question and the provided options (أ, ب, ج, د, ه). Your response must be ONLY the single Arabic letter corresponding to the correct answer. Do not provide any explanation, reasoning, or any other text. For example, if option 'ب' is correct, your entire response must be 'ب'."""

    arabic_letter = r'[\u0621-\u064A]'
    # A letter that stands on its own (not inside a word), optionally followed
    # by a tatweel (U+0640). Preferring this avoids returning the first
    # character of a word such as "الإجابة" when the model adds extra text.
    standalone_letter = re.compile(
        r'(?<!' + arabic_letter + r')(' + arabic_letter + r')\u0640?(?!' + arabic_letter + r')'
    )
    # Alef written without hamza-above (آ, إ, ا) is mapped to 'أ', the form
    # used in the option list of the prompt.
    alef_variants = '\u0622\u0625\u0627'
    hamza_alef = '\u0623'

    max_retries = 3
    retry_delay = 5
    for attempt in range(max_retries):
        try:
            chat_completion = client.chat.completions.create(
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": question},
                ],
                # NOTE(review): the notebook heading for this cell names
                # llama-3.3-70b-versatile; confirm which model was intended.
                model="llama3-70b-8192",  # Using a standard available model
                temperature=0.0,
                max_tokens=5,
            )
            # content may be None; treat that as an empty reply instead of
            # burning retries on an AttributeError.
            response_text = (chat_completion.choices[0].message.content or "").strip()

            isolated = standalone_letter.search(response_text)
            if isolated:
                letter = isolated.group(1)
                return hamza_alef if letter in alef_variants else letter

            # Fallback (original behaviour): first Arabic character anywhere.
            embedded = re.search(arabic_letter, response_text)
            if embedded:
                return embedded.group(0)

            print(f"  -> Warning: Model returned an unexpected response: '{response_text}'. Recording as empty.")
            return ""  # Return empty if no letter is found

        except Exception as e:
            if attempt < max_retries - 1:
                print(f"  -> An error occurred: {e}. Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
            else:
                print(f"  -> API Error after multiple retries: {e}")
                return "API_ERROR"
    return "FAILED_ATTEMPTS"


# --- Function to Evaluate MCQ Accuracy ---
def evaluate_mcq_accuracy(predictions, ground_truths):
    """
    Print an accuracy report for the MCQ predictions.

    Predictions equal to "API_ERROR" or "FAILED_ATTEMPTS" are excluded from
    the accuracy figure and reported separately. If the two sequences differ
    in length, only the common prefix is scored.

    Args:
        predictions: Sequence of predicted letters (or failure markers).
        ground_truths: Sequence of expected letters, aligned by position.

    Returns:
        None. The report is written to stdout.
    """
    banner = "=" * 50
    print("\n" + banner)
    print("🚀 Starting Evaluation...")
    print(banner)

    if len(predictions) != len(ground_truths):
        print("Warning: Prediction and ground truth lists have different lengths. Evaluation might be inaccurate.")

    # zip() stops at the shorter sequence, which is the truncation we want.
    total_questions = min(len(predictions), len(ground_truths))
    failure_markers = ("API_ERROR", "FAILED_ATTEMPTS")
    scored_pairs = [
        (predicted, expected)
        for predicted, expected in zip(predictions, ground_truths)
        if predicted not in failure_markers
    ]

    if not scored_pairs:
        print("No valid predictions to evaluate. Check for widespread API errors.")
        return

    # Single source of truth for both the count and the ratio.
    total_valid_predictions = len(scored_pairs)
    correct_predictions = sum(predicted == expected for predicted, expected in scored_pairs)
    accuracy = correct_predictions / total_valid_predictions
    api_errors = total_questions - total_valid_predictions

    print(f"Total Questions Attempted: {total_questions}")
    print(f"API Errors/Failed Attempts: {api_errors}")
    print(f"Valid Predictions to Evaluate: {total_valid_predictions}")
    print("-" * 20)
    print(f"Correct Predictions: {correct_predictions} / {total_valid_predictions}")
    print(f"📊 Accuracy (on valid responses): {accuracy * 100:.2f}%")
    print(banner + "\n✅ Evaluation Complete.\n" + banner)


# --- Main Execution for MCQ Task ---
def main():
    """
    Run the MCQ pipeline: load questions, obtain predictions, score them.

    If OUTPUT_CSV already exists its contents are used as the predictions;
    otherwise every question is sent to generate_answer() and the results are
    written to OUTPUT_CSV (one prediction per row, no header). The first
    character of each ANSWER_COLUMN value is used as the ground-truth letter.
    """
    try:
        # Use encoding='utf-8' to handle Arabic characters properly
        df = pd.read_csv(INPUT_CSV, encoding='utf-8')
    except FileNotFoundError:
        print(f"Error: The file '{INPUT_CSV}' was not found. Please check the path.")
        return
    except Exception as e:
        print(f"An error occurred while reading the CSV: {e}")
        return

    # Check that the configured column names exist in the DataFrame
    if QUESTION_COLUMN not in df.columns or ANSWER_COLUMN not in df.columns:
        print(f"Error: Required columns ('{QUESTION_COLUMN}', '{ANSWER_COLUMN}') not found in the CSV.")
        print(f"Available columns are: {df.columns.tolist()}")
        return

    # Drop rows where the question or answer is missing
    df.dropna(subset=[QUESTION_COLUMN, ANSWER_COLUMN], inplace=True)
    df.reset_index(drop=True, inplace=True)

    if os.path.exists(OUTPUT_CSV):
        print(f"✅ Found existing prediction file: '{OUTPUT_CSV}'.")
        print("Skipping generation and loading predictions from file for evaluation.")
        # keep_default_na=False: an empty prediction is saved as "" and must
        # come back as "" rather than NaN (which astype(str) turns into 'nan').
        predictions_df = pd.read_csv(
            OUTPUT_CSV,
            header=None,
            encoding='utf-8',
            dtype=str,
            keep_default_na=False,
        )
        predictions = predictions_df[0].astype(str).tolist()
    else:
        print(f"'{OUTPUT_CSV}' not found. Starting prediction generation process...")
        predictions = []
        total_questions = len(df)
        for index, row in df.iterrows():
            question = row[QUESTION_COLUMN]
            print(f"Processing question {index + 1}/{total_questions}...")
            answer_letter = generate_answer(question)
            predictions.append(answer_letter)
            # [:1] instead of [0]: a whitespace-only answer yields '' rather
            # than raising IndexError.
            print(f"  -> Ground Truth: {str(row[ANSWER_COLUMN]).strip()[:1]} | Model's Prediction: {answer_letter}")

            # Delay for Groq API rate limit (30 RPM limit)
            if index < total_questions - 1:
                time.sleep(2.1)  # Sleep for slightly over 2 seconds

        predictions_df = pd.DataFrame(predictions)
        predictions_df.to_csv(OUTPUT_CSV, header=False, index=False, encoding='utf-8')
        print(f"\nSuccessfully generated predictions and saved them to '{OUTPUT_CSV}'.")
        try:
            from google.colab import files
            files.download(OUTPUT_CSV)
        except ImportError:
            print(f"To download '{OUTPUT_CSV}', see the file browser on the left.")

    # The first character of the Answer column is the ground-truth letter
    ground_truths = [str(ans).strip()[:1] for ans in df[ANSWER_COLUMN].tolist()]

    evaluate_mcq_accuracy(predictions, ground_truths)


if __name__ == "__main__":
    main()

'/content/drive/MyDrive/AraHealthQA/predictions_fitb_choices.csv' not found. Starting prediction generation process...
Processing question 1/100...
  -> Ground Truth: أ | Model's Prediction: ج
Processing question 2/100...
  -> Ground Truth: ج | Model's Prediction: ج
Processing question 3/100...
  -> Ground Truth: أ | Model's Prediction: أ
Processing question 4/100...
  -> Ground Truth: ج | Model's Prediction: ج
Processing question 5/100...
  -> Ground Truth: د | Model's Prediction: د
Processing question 6/100...
  -> Ground Truth: أ | Model's Prediction: ب
Processing question 7/100...
  -> Ground Truth: ب | Model's Prediction: ب
Processing question 8/100...
  -> Ground Truth: د | Model's Prediction: ج
Processing question 9/100...
  -> Ground Truth: ج | Model's Prediction: ج
Processing question 10/100...
  -> Ground Truth: أ | Model's Prediction: ب
Processing question 11/100...
  -> Ground Truth: ج | Model's Prediction: ج
Processing question 12/100...
  -> Ground Truth: ج | Model's Pred

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


🚀 Starting Evaluation...
Total Questions Attempted: 100
API Errors/Failed Attempts: 0
Valid Predictions to Evaluate: 100
--------------------
Correct Predictions: 69 / 100
📊 Accuracy (on valid responses): 69.00%
✅ Evaluation Complete.



# Track 2: MedArabiQ 2025 (General Health)


# Sub-Task 1: Fill-in-the-blank with Choices




# deepseek -> 17%

In [None]:
import os
import re
import pandas as pd
import groq
import time
from getpass import getpass
import numpy as np
from sklearn.metrics import accuracy_score

# --- Groq API Configuration ---
try:
    from google.colab import userdata
    GROQ_API_KEY = userdata.get('GROQ_API_KEY')
except Exception:
    # Deliberately broad: this is a top-level fallback boundary.
    # NOTE(review): google.colab's userdata.get appears to signal a missing or
    # inaccessible secret with its own exception classes (SecretNotFoundError /
    # NotebookAccessError) rather than KeyError, so catching only
    # (ImportError, KeyError) would skip the prompt below — confirm against the
    # installed google.colab version.
    print("Secret 'GROQ_API_KEY' not found. Please add it to the Colab secrets manager.")
    GROQ_API_KEY = getpass('Please enter your Groq API key: ')

# Initialize the Groq client
client = groq.Client(api_key=GROQ_API_KEY)


# --- File paths and column names for the MCQ task ---
INPUT_CSV = '/content/drive/MyDrive/AraHealthQA/fill-in-the-blank-choices.csv'
OUTPUT_CSV = '/content/drive/MyDrive/AraHealthQA/predictions_fitb_choices_deepseek.csv'

# Column names from your CSV
QUESTION_COLUMN = 'Question - Arabic'
ANSWER_COLUMN = 'Answer - Arabic'


# --- Function to Generate Answers for MCQ Task using DeepSeek ---
def generate_answer(question):
    """
    Send one MCQ to the DeepSeek model on Groq and return the chosen letter.

    Any <think> reasoning block is removed before the letter is extracted,
    including a block left unterminated because the reply hit max_tokens.

    Args:
        question: Question text (with its options) sent as the user message.

    Returns:
        A single Arabic letter on success, "" when nothing usable remains
        after cleaning, or "API_ERROR" when every attempt raised.
    """
    system_prompt = """You are a medical exam answering machine. Your only task is to answer the following multiple-choice medical question. Read the question and the provided options (أ, ب, ج, د, ه). Your response must be ONLY the single Arabic letter corresponding to the correct answer. Do not provide any explanation, reasoning, or any other text. Do not use <think> tags. For example, if option 'ب' is correct, your entire response must be 'ب'."""

    arabic_letter = r'[\u0621-\u064A]'
    # A letter that stands on its own (not inside a word), optionally followed
    # by a tatweel (U+0640).
    standalone_letter = re.compile(
        r'(?<!' + arabic_letter + r')(' + arabic_letter + r')\u0640?(?!' + arabic_letter + r')'
    )
    # Alef written without hamza-above (آ, إ, ا) is mapped to 'أ', the form
    # used in the option list of the prompt.
    alef_variants = '\u0622\u0625\u0627'
    hamza_alef = '\u0623'

    max_retries = 3
    retry_delay = 5
    for attempt in range(max_retries):
        try:
            chat_completion = client.chat.completions.create(
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": question},
                ],
                model="deepseek-r1-distill-llama-70b",
                temperature=0.0,
                max_tokens=150,
            )
            # content may be None; treat that as an empty reply.
            raw_response_text = (chat_completion.choices[0].message.content or "").strip()

            # Remove the reasoning block. The `|\Z` alternative also removes a
            # <think> block that was cut off before its closing tag; without
            # it a truncated reply is left untouched and a letter from the
            # reasoning text would be returned as the answer.
            cleaned_text = re.sub(
                r'<think>.*?(?:</think>|\Z)', '', raw_response_text, flags=re.DOTALL
            ).strip()

            isolated = standalone_letter.search(cleaned_text)
            if isolated:
                letter = isolated.group(1)
                return hamza_alef if letter in alef_variants else letter

            # Fallback (original behaviour): first Arabic character anywhere.
            embedded = re.search(arabic_letter, cleaned_text)
            if embedded:
                return embedded.group(0)

            print(f"  -> Warning: No Arabic letter found after cleaning. Cleaned response: '{cleaned_text}'. Recording as empty.")
            return ""

        except Exception as e:
            if attempt < max_retries - 1:
                print(f"  -> An error occurred: {e}. Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
            else:
                print(f"  -> API Error after multiple retries: {e}")
                return "API_ERROR"
    return "FAILED_ATTEMPTS"


# --- Function to Evaluate MCQ Accuracy (Unchanged) ---
def evaluate_mcq_accuracy(predictions, ground_truths):
    """
    Print an accuracy report for the MCQ predictions.

    Predictions equal to "API_ERROR" or "FAILED_ATTEMPTS" are excluded from
    the accuracy figure and reported separately. If the two sequences differ
    in length, only the common prefix is scored.

    Args:
        predictions: Sequence of predicted letters (or failure markers).
        ground_truths: Sequence of expected letters, aligned by position.

    Returns:
        None. The report is written to stdout.
    """
    banner = "=" * 50
    print("\n" + banner)
    print("🚀 Starting Evaluation...")
    print(banner)

    if len(predictions) != len(ground_truths):
        print("Warning: Prediction and ground truth lists have different lengths. Evaluation might be inaccurate.")

    # zip() stops at the shorter sequence, which is the truncation we want.
    total_questions = min(len(predictions), len(ground_truths))
    failure_markers = ("API_ERROR", "FAILED_ATTEMPTS")
    scored_pairs = [
        (predicted, expected)
        for predicted, expected in zip(predictions, ground_truths)
        if predicted not in failure_markers
    ]

    if not scored_pairs:
        print("No valid predictions to evaluate. Check for widespread API errors.")
        return

    # Single source of truth for both the count and the ratio.
    total_valid_predictions = len(scored_pairs)
    correct_predictions = sum(predicted == expected for predicted, expected in scored_pairs)
    accuracy = correct_predictions / total_valid_predictions
    api_errors = total_questions - total_valid_predictions

    print(f"Total Questions Attempted: {total_questions}")
    print(f"API Errors/Failed Attempts: {api_errors}")
    print(f"Valid Predictions to Evaluate: {total_valid_predictions}")
    print("-" * 20)
    print(f"Correct Predictions: {correct_predictions} / {total_valid_predictions}")
    print(f"📊 Accuracy (on valid responses): {accuracy * 100:.2f}%")
    print(banner + "\n✅ Evaluation Complete.\n" + banner)


# --- Main Execution for MCQ Task (Unchanged) ---
def main():
    """
    Run the MCQ pipeline with the DeepSeek model: load, predict, score.

    If OUTPUT_CSV already exists its contents are used as the predictions;
    otherwise every question is sent to generate_answer() and the results are
    written to OUTPUT_CSV (one prediction per row, no header). The first
    character of each ANSWER_COLUMN value is used as the ground-truth letter.
    """
    try:
        df = pd.read_csv(INPUT_CSV, encoding='utf-8')
    except FileNotFoundError:
        print(f"Error: The file '{INPUT_CSV}' was not found. Please check the path.")
        return
    except Exception as e:
        print(f"An error occurred while reading the CSV: {e}")
        return

    if QUESTION_COLUMN not in df.columns or ANSWER_COLUMN not in df.columns:
        print(f"Error: Required columns ('{QUESTION_COLUMN}', '{ANSWER_COLUMN}') not found in the CSV.")
        print(f"Available columns are: {df.columns.tolist()}")
        return

    # Rows without a question or an answer cannot be asked or scored.
    df.dropna(subset=[QUESTION_COLUMN, ANSWER_COLUMN], inplace=True)
    df.reset_index(drop=True, inplace=True)

    if os.path.exists(OUTPUT_CSV):
        print(f"✅ Found existing prediction file: '{OUTPUT_CSV}'.")
        print("Skipping generation and loading predictions from file for evaluation.")
        # keep_default_na=False: an empty prediction is saved as "" and must
        # come back as "" rather than NaN (which astype(str) turns into 'nan').
        predictions_df = pd.read_csv(
            OUTPUT_CSV,
            header=None,
            encoding='utf-8',
            dtype=str,
            keep_default_na=False,
        )
        predictions = predictions_df[0].astype(str).tolist()
    else:
        print(f"'{OUTPUT_CSV}' not found. Starting prediction generation process with DeepSeek...")
        predictions = []
        total_questions = len(df)
        for index, row in df.iterrows():
            question = row[QUESTION_COLUMN]
            print(f"Processing question {index + 1}/{total_questions} with DeepSeek (MCQ Mode)...")
            answer_letter = generate_answer(question)
            predictions.append(answer_letter)
            # [:1] instead of [0]: a whitespace-only answer yields '' rather
            # than raising IndexError.
            print(f"  -> Ground Truth: {str(row[ANSWER_COLUMN]).strip()[:1]} | Model's Prediction: {answer_letter}")

            # Pause between requests (skipped after the final question).
            if index < total_questions - 1:
                time.sleep(2.1)

        predictions_df = pd.DataFrame(predictions)
        predictions_df.to_csv(OUTPUT_CSV, header=False, index=False, encoding='utf-8')
        print(f"\nSuccessfully generated predictions and saved them to '{OUTPUT_CSV}'.")
        try:
            from google.colab import files
            files.download(OUTPUT_CSV)
        except ImportError:
            print(f"To download '{OUTPUT_CSV}', see the file browser on the left.")

    ground_truths = [str(ans).strip()[:1] for ans in df[ANSWER_COLUMN].tolist()]
    evaluate_mcq_accuracy(predictions, ground_truths)


if __name__ == "__main__":
    main()

'/content/drive/MyDrive/AraHealthQA/predictions_fitb_choices_deepseek.csv' not found. Starting prediction generation process with DeepSeek...
Processing question 1/100 with DeepSeek (MCQ Mode)...
  -> Ground Truth: أ | Model's Prediction: أ
Processing question 2/100 with DeepSeek (MCQ Mode)...
  -> Ground Truth: ج | Model's Prediction: ا
Processing question 3/100 with DeepSeek (MCQ Mode)...
Okay, so I've got this medical question here about pleural effusion. I'm a bit rusty on my respiratory system, but let me try to think this through.

The question is about filling in the blanks in a sentence. It says that in cases of pleural effusion, the decrease or absence of tactile fremitus on the affected side indicates something, which is caused by something else.

First, I need to remember what pleural effusion is. From what I recall, it's when there's an abnormal accumulation of fluid in the pleural space, which is the area between the lungs and the chest wall. This fluid buildup can be due 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


🚀 Starting Evaluation...
Total Questions Attempted: 100
API Errors/Failed Attempts: 0
Valid Predictions to Evaluate: 100
--------------------
Correct Predictions: 17 / 100
📊 Accuracy (on valid responses): 17.00%
✅ Evaluation Complete.


# DeepSeek chain of thought

In [None]:
import os
import re
import pandas as pd
import groq
import time
from getpass import getpass
import numpy as np
from sklearn.metrics import accuracy_score

# --- Groq API Configuration ---
# It's recommended to use environment variables or a secret manager for API keys.
try:
    # Used when running in Google Colab
    from google.colab import userdata
    GROQ_API_KEY = userdata.get('GROQ_API_KEY')
    print("Successfully loaded GROQ_API_KEY from Colab secrets.")
except Exception:
    # Deliberately broad: this is a top-level fallback boundary.
    # NOTE(review): google.colab's userdata.get appears to signal a missing or
    # inaccessible secret with its own exception classes rather than KeyError
    # — confirm against the installed google.colab version.
    # Fallback for local execution; an empty variable counts as "not set".
    GROQ_API_KEY = os.environ.get('GROQ_API_KEY')
    if GROQ_API_KEY:
        print("Successfully loaded GROQ_API_KEY from environment variables.")
    else:
        print("GROQ_API_KEY not found in Colab secrets or environment variables.")
        GROQ_API_KEY = getpass('Please enter your Groq API key: ')

# Initialize the Groq client
client = groq.Client(api_key=GROQ_API_KEY)


# --- File paths and column names for the MCQ task ---
# Ensure this path points to your actual file location
INPUT_CSV = '/content/drive/MyDrive/AraHealthQA/fill-in-the-blank-choices.csv'
OUTPUT_CSV = '/content/drive/MyDrive/AraHealthQA/predictions_fitb_choices_deepseek_cot.csv' # Changed output file name

# Column names from your CSV
QUESTION_COLUMN = 'Question - Arabic'
ANSWER_COLUMN = 'Answer - Arabic'


# --- MODIFIED Function to Generate Answers using Chain of Thought ---
def generate_answer(question):
    """
    Ask the DeepSeek model (via Groq) to reason step by step, then return the
    letter it gives on its `Final Answer:` line.

    Args:
        question: Question text (with its options) sent as the user message.

    Returns:
        A single Arabic letter on success, "" when the reply contains no
        Arabic letter at all, or "API_ERROR" when every attempt raised.
    """
    # This prompt guides the model to first "think" and then provide a final answer.
    system_prompt = """You are an expert medical professional and a meticulous test-taker. Your task is to answer the following multiple-choice medical question.

Follow these steps precisely:
1.  First, carefully analyze the question and all the provided options (أ, ب, ج, د, ه).
2.  Engage in a step-by-step reasoning process to determine the correct answer. Explain the medical context, evaluate each option, and state why you are choosing one and eliminating the others.
3.  Enclose this entire thought process within `<thinking>` and `</thinking>` tags.
4.  After the closing `</thinking>` tag, on a new line, state the final answer clearly and concisely. The final answer line must be in the format: `Final Answer: [letter]`, where `[letter]` is the single Arabic letter corresponding to the correct option.

Example Response Format:
<thinking>
[Your detailed, step-by-step analysis of the question and options goes here...]
</thinking>
Final Answer: ب
"""

    arabic_letter = r"[\u0621-\u064A]"
    # "Final Answer:" followed by an isolated letter. Brackets, parentheses,
    # asterisks, quotes and backticks around the letter are tolerated because
    # the prompt itself shows the letter as `[letter]`.
    final_answer_pattern = re.compile(
        r"Final Answer:[\s\[(*'`]*(" + arabic_letter + r")\u0640?(?!" + arabic_letter + r")",
        re.IGNORECASE,
    )
    # A letter that stands on its own (not inside a word).
    standalone_letter = re.compile(
        r"(?<!" + arabic_letter + r")(" + arabic_letter + r")\u0640?(?!" + arabic_letter + r")"
    )
    # Alef written without hamza-above (آ, إ, ا) is mapped to 'أ', the form
    # used in the option list of the prompt.
    alef_variants = "\u0622\u0625\u0627"
    hamza_alef = "\u0623"

    max_retries = 3
    retry_delay = 5  # seconds
    for attempt in range(max_retries):
        try:
            chat_completion = client.chat.completions.create(
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": question}
                ],
                model="deepseek-r1-distill-llama-70b",
                temperature=0.0,
                max_tokens=1024,
            )
            # content may be None; treat that as an empty reply.
            raw_response_text = (chat_completion.choices[0].message.content or "").strip()

            # Take the LAST "Final Answer:" so that a mention of the format
            # inside the reasoning cannot shadow the real answer line.
            declared = final_answer_pattern.findall(raw_response_text)
            if declared:
                letter = declared[-1]
                return hamza_alef if letter in alef_variants else letter

            # Fallback 1: last isolated letter anywhere in the reply.
            isolated = standalone_letter.findall(raw_response_text)
            if isolated:
                print(f"  -> Warning: Model did not use the 'Final Answer:' format. Using last found Arabic letter.")
                letter = isolated[-1]
                return hamza_alef if letter in alef_variants else letter

            # Fallback 2 (original behaviour): last Arabic character of any kind.
            arabic_letters = re.findall(arabic_letter, raw_response_text)
            if arabic_letters:
                print(f"  -> Warning: Model did not use the 'Final Answer:' format. Using last found Arabic letter.")
                return arabic_letters[-1]

            print(f"  -> Warning: No final answer or Arabic letter found. Full response: '{raw_response_text}'. Recording as empty.")
            return ""

        except Exception as e:
            if attempt < max_retries - 1:
                print(f"  -> An error occurred: {e}. Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
            else:
                print(f"  -> API Error after multiple retries: {e}")
                return "API_ERROR"
    return "FAILED_ATTEMPTS"


# --- Function to Evaluate MCQ Accuracy (Unchanged) ---
def evaluate_mcq_accuracy(predictions, ground_truths):
    """
    Print an accuracy report for the model's predictions.

    Predictions equal to "API_ERROR", "FAILED_ATTEMPTS" or "" are excluded
    from the accuracy figure and reported separately. If the two sequences
    differ in length, only the common prefix is scored.

    Args:
        predictions: Sequence of predicted letters (or failure markers).
        ground_truths: Sequence of expected letters, aligned by position.

    Returns:
        None. The report is written to stdout.
    """
    banner = "=" * 50
    print("\n" + banner)
    print("🚀 Starting Evaluation...")
    print(banner)

    if len(predictions) != len(ground_truths):
        print("Warning: Prediction and ground truth lists have different lengths. Evaluation might be inaccurate.")

    # zip() stops at the shorter sequence, which is the truncation we want.
    total_questions = min(len(predictions), len(ground_truths))
    unusable = ("API_ERROR", "FAILED_ATTEMPTS", "")
    scored_pairs = [
        (predicted, expected)
        for predicted, expected in zip(predictions, ground_truths)
        if predicted not in unusable
    ]

    if not scored_pairs:
        print("No valid predictions to evaluate. Check for widespread API or formatting errors.")
        return

    # Single source of truth for both the count and the ratio.
    total_valid_predictions = len(scored_pairs)
    correct_predictions = sum(predicted == expected for predicted, expected in scored_pairs)
    accuracy = correct_predictions / total_valid_predictions
    failed_or_empty = total_questions - total_valid_predictions

    print(f"Total Questions Attempted: {total_questions}")
    print(f"API Errors/Failed/Empty: {failed_or_empty}")
    print(f"Valid Predictions to Evaluate: {total_valid_predictions}")
    print("-" * 20)
    print(f"Correct Predictions: {correct_predictions} / {total_valid_predictions}")
    print(f"📊 Accuracy (on valid responses): {accuracy * 100:.2f}%")
    print(banner + "\n✅ Evaluation Complete.\n" + banner)


# --- Main Execution for MCQ Task (Unchanged) ---
def main():
    """
    Run the MCQ prediction and evaluation process (DeepSeek, CoT mode).

    If OUTPUT_CSV already exists its contents are used as the predictions;
    otherwise every question is sent to generate_answer() and the results are
    written to OUTPUT_CSV (one prediction per row, no header). The first
    character of each ANSWER_COLUMN value is used as the ground-truth letter.
    """
    try:
        df = pd.read_csv(INPUT_CSV, encoding='utf-8')
    except FileNotFoundError:
        print(f"Error: The file '{INPUT_CSV}' was not found. Please check the path.")
        return
    except Exception as e:
        print(f"An error occurred while reading the CSV: {e}")
        return

    if QUESTION_COLUMN not in df.columns or ANSWER_COLUMN not in df.columns:
        print(f"Error: Required columns ('{QUESTION_COLUMN}', '{ANSWER_COLUMN}') not found.")
        print(f"Available columns are: {df.columns.tolist()}")
        return

    # Rows without a question or an answer cannot be asked or scored.
    df.dropna(subset=[QUESTION_COLUMN, ANSWER_COLUMN], inplace=True)
    df.reset_index(drop=True, inplace=True)

    if os.path.exists(OUTPUT_CSV):
        print(f"✅ Found existing prediction file: '{OUTPUT_CSV}'.")
        print("Skipping generation and loading predictions from file for evaluation.")
        # keep_default_na=False: an empty prediction is saved as "" and must
        # come back as "". Read with the defaults it becomes NaN -> 'nan',
        # which the evaluator would score as a (wrong) valid answer instead of
        # excluding it, so a re-run from file would disagree with the fresh run.
        predictions_df = pd.read_csv(
            OUTPUT_CSV,
            header=None,
            encoding='utf-8',
            dtype=str,
            keep_default_na=False,
        )
        predictions = predictions_df[0].astype(str).tolist()
    else:
        print(f"'{OUTPUT_CSV}' not found. Starting prediction generation with DeepSeek (CoT Mode)...")
        predictions = []
        total_questions = len(df)
        for index, row in df.iterrows():
            question = row[QUESTION_COLUMN]
            print(f"Processing question {index + 1}/{total_questions} with DeepSeek (CoT Mode)...")
            answer_letter = generate_answer(question)
            predictions.append(answer_letter)
            # [:1] instead of [0]: a whitespace-only answer yields '' rather
            # than raising IndexError.
            ground_truth_letter = str(row[ANSWER_COLUMN]).strip()[:1]
            print(f"  -> Ground Truth: {ground_truth_letter} | Model's Prediction: {answer_letter}")

            # Optional: a delay between API calls to avoid rate limiting
            if index < total_questions - 1:
                time.sleep(1)  # A small delay can be helpful

        predictions_df = pd.DataFrame(predictions)
        predictions_df.to_csv(OUTPUT_CSV, header=False, index=False, encoding='utf-8')
        print(f"\nSuccessfully generated predictions and saved them to '{OUTPUT_CSV}'.")
        try:
            from google.colab import files
            files.download(OUTPUT_CSV)
        except ImportError:
            print(f"To download '{OUTPUT_CSV}', please use the file browser.")

    # Reduce each ground-truth answer to its leading letter
    ground_truths = [str(ans).strip()[:1] for ans in df[ANSWER_COLUMN].tolist()]
    evaluate_mcq_accuracy(predictions, ground_truths)


if __name__ == "__main__":
    main()

Successfully loaded GROQ_API_KEY from Colab secrets.
'/content/drive/MyDrive/AraHealthQA/predictions_fitb_choices_deepseek_cot.csv' not found. Starting prediction generation with DeepSeek (CoT Mode)...
Processing question 1/100 with DeepSeek (CoT Mode)...
  -> Ground Truth: أ | Model's Prediction: د
Processing question 2/100 with DeepSeek (CoT Mode)...
  -> Ground Truth: ج | Model's Prediction: ج
Processing question 3/100 with DeepSeek (CoT Mode)...
  -> Ground Truth: أ | Model's Prediction: أ
Processing question 4/100 with DeepSeek (CoT Mode)...
  -> Ground Truth: ج | Model's Prediction: ج
Processing question 5/100 with DeepSeek (CoT Mode)...
  -> Ground Truth: د | Model's Prediction: د
Processing question 6/100 with DeepSeek (CoT Mode)...
  -> Ground Truth: أ | Model's Prediction: د
Processing question 7/100 with DeepSeek (CoT Mode)...
  -> Ground Truth: ب | Model's Prediction: ب
Processing question 8/100 with DeepSeek (CoT Mode)...
  -> Ground Truth: د | Model's Prediction: د
Proces

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


🚀 Starting Evaluation...
Total Questions Attempted: 100
API Errors/Failed/Empty: 1
Valid Predictions to Evaluate: 99
--------------------
Correct Predictions: 70 / 99
📊 Accuracy (on valid responses): 70.71%
✅ Evaluation Complete.


# DeepSeek, no hallucination

In [None]:
import os
import re
import pandas as pd
import groq
import time
from getpass import getpass
from sklearn.metrics import accuracy_score

# --- Groq API Configuration ---
try:
    from google.colab import userdata
    GROQ_API_KEY = userdata.get('GROQ_API_KEY')
    print("✅ Successfully loaded GROQ_API_KEY from Colab secrets.")
except Exception:
    # Deliberately broad: this is a top-level fallback boundary.
    # NOTE(review): google.colab's userdata.get appears to signal a missing or
    # inaccessible secret with its own exception classes rather than KeyError
    # — confirm against the installed google.colab version.
    # Fallback for local execution; an empty variable counts as "not set".
    GROQ_API_KEY = os.environ.get('GROQ_API_KEY')
    if GROQ_API_KEY:
        print("✅ Successfully loaded GROQ_API_KEY from environment variables.")
    else:
        print("⚠️ GROQ_API_KEY not found in Colab secrets or environment variables.")
        GROQ_API_KEY = getpass('Please enter your Groq API key: ')

# Initialize the Groq client
client = groq.Client(api_key=GROQ_API_KEY)


# --- File paths and column names for the MCQ task ---
INPUT_CSV = '/content/drive/MyDrive/AraHealthQA/fill-in-the-blank-choices.csv'
OUTPUT_CSV = '/content/drive/MyDrive/AraHealthQA/predictions_fitb_choices_deepseek_cot_validated.csv' # Changed output file name

QUESTION_COLUMN = 'Question - Arabic'
ANSWER_COLUMN = 'Answer - Arabic'


# --- NEW: Helper function to find valid choice letters in the question ---
def extract_choice_letters(question_text):
    """
    Extract the Arabic letters used as choice labels in the question text.

    A choice label is a single Arabic letter that is not the tail of a word,
    followed by ')', '.' or '-' and then whitespace (e.g. 'أ) ', 'ب- ', 'ج. ').

    Args:
        question_text: Full question string including its options.

    Returns:
        List of unique label letters in order of first appearance.
    """
    # The negative lookbehind rejects the last letter of an ordinary word that
    # merely ends a sentence ("... التنفس. "), which would otherwise be
    # reported as a choice label.
    letters = re.findall(
        r'(?<![\u0621-\u064A])([\u0621-\u064A])[.)-]\s', question_text
    )
    # dict.fromkeys keeps first-seen order while dropping duplicates.
    return list(dict.fromkeys(letters))


# --- MODIFIED Function to Generate Answers using a Constrained Chain of Thought ---
def _select_answer_letter(response_text, valid_letters):
    """Return the choice letter stated in a model response, or None if there is none."""
    # An explicit "Final Answer:" statement wins. The last occurrence is used
    # so a draft answer quoted inside the reasoning cannot shadow the real
    # one, and wrappers such as "[ب]" or "**ب**" are tolerated. The lookahead
    # requires a standalone letter rather than the first letter of a word.
    stated = re.findall(
        r"Final Answer:[\s\[(*`]*([\u0621-\u064A])(?![\u0621-\u0652])",
        response_text,
    )
    if stated:
        letter = stated[-1]
        if letter in valid_letters:
            return letter
        print(f"  -> Warning: Model returned a letter '{letter}' not in the valid list {valid_letters}.")
        return None

    # Fallback: take the last standalone valid letter that appears after the
    # reasoning block. Letters inside Arabic words are ignored, because nearly
    # every sentence contains some choice letter and matching those would turn
    # the fallback into an arbitrary guess.
    answer_region = re.split(r"</think(?:ing)?>", response_text)[-1]
    standalone_letters = re.findall(
        r"(?<![\u0621-\u0652])[\u0621-\u064A](?![\u0621-\u0652])",
        answer_region,
    )
    for letter in reversed(standalone_letters):
        if letter in valid_letters:
            print(f"  -> Info: Using fallback. Found valid letter '{letter}' in response.")
            return letter
    return None


def generate_answer(question, valid_letters):
    """
    Ask the model to answer one multiple-choice question.

    The system prompt requests chain-of-thought reasoning followed by a
    `Final Answer: <letter>` line, and restricts the letter to the choices
    found in the question.

    Args:
        question: Question text, sent as the user message.
        valid_letters: Choice letters the model is allowed to answer with.

    Returns:
        One of `valid_letters` on success, "INVALID_LETTER" when the response
        contains no usable letter, or "API_ERROR" when the request still fails
        after the last retry. "FAILED_ATTEMPTS" is a safety net that is only
        reached if the retry loop runs zero times.
    """
    # --- NEW: Dynamic System Prompt ---
    # It now dynamically includes the list of valid letters to constrain the model.
    system_prompt = f"""You are an expert medical professional and a meticulous test-taker. Your task is to answer the following multiple-choice medical question.

Follow these steps precisely:
1.  First, carefully analyze the question and all the provided options.
2.  Engage in a step-by-step reasoning process to determine the correct answer. Enclose this entire thought process within `<thinking>` and `</thinking>` tags.
3.  After the closing `</thinking>` tag, on a new line, state the final answer clearly.
4.  The final answer must be in the format: `Final Answer: [letter]`.
5.  **Crucially, the `[letter]` you choose MUST be one of these valid options ONLY: {', '.join(valid_letters)}.** Do not invent any other letters.

Example Response Format:
<thinking>
[Your detailed, step-by-step analysis of the question and options goes here...]
</thinking>
Final Answer: ب
"""

    max_retries = 3
    retry_delay = 5 # seconds
    for attempt in range(max_retries):
        try:
            chat_completion = client.chat.completions.create(
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": question}
                ],
                model="deepseek-r1-distill-llama-70b",
                temperature=0.0,
                max_tokens=1024,
            )
            # The content may be None (no text returned); treat it as empty
            # text instead of letting .strip() fail and trigger a retry.
            raw_response_text = (chat_completion.choices[0].message.content or "").strip()
        except Exception as e:
            # Broad on purpose: any request or response-shape failure is
            # retried, and only the final failure is reported to the caller.
            if attempt < max_retries - 1:
                print(f"  -> An error occurred: {e}. Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
                continue
            print(f"  -> API Error after multiple retries: {e}")
            return "API_ERROR"

        # Parsing happens outside the try so a parsing bug is never mistaken
        # for an API failure and silently retried.
        predicted_letter = _select_answer_letter(raw_response_text, valid_letters)
        if predicted_letter:
            return predicted_letter
        print(f"  -> Warning: No valid answer letter found in response. Full response: '{raw_response_text}'.")
        return "INVALID_LETTER"
    return "FAILED_ATTEMPTS"


# --- Function to Evaluate MCQ Accuracy (Slightly modified for new error code) ---
def evaluate_mcq_accuracy(predictions, ground_truths):
    """
    Print the accuracy of the predictions against the ground truths.

    Predictions are compared to ground truths by position. Entries that are
    error or placeholder codes rather than answer letters are excluded from
    the accuracy and reported as failures instead.

    Args:
        predictions: Predicted letters or status codes, one per question.
        ground_truths: Reference letters, one per question.

    Returns:
        None. The report is written to standard output.
    """
    print("\n" + "="*50)
    print("🚀 Starting Evaluation...")
    print("="*50)

    # Status codes produced by generate_answer/main, plus the placeholders an
    # empty cell turns into when predictions are reloaded from the CSV.
    non_answers = {"API_ERROR", "FAILED_ATTEMPTS", "INVALID_LETTER", "INVALID_QUESTION", "nan", ""}

    if len(predictions) != len(ground_truths):
        print(f"⚠️ Warning: got {len(predictions)} predictions for {len(ground_truths)} ground truths; only positions present in both are evaluated.")

    # zip stops at the shorter list, so surplus predictions cannot index past
    # the end of the ground truths.
    scored_pairs = [(p, g) for p, g in zip(predictions, ground_truths) if p not in non_answers]

    if not scored_pairs:
        print("No valid predictions to evaluate. Check for widespread API or formatting errors.")
        return

    correct_predictions = sum(p == g for p, g in scored_pairs)
    total_valid_predictions = len(scored_pairs)
    # Exact-match accuracy follows directly from the count above.
    accuracy = correct_predictions / total_valid_predictions
    total_questions = len(ground_truths)
    failed_or_invalid = total_questions - total_valid_predictions

    print(f"Total Questions Attempted: {total_questions}")
    print(f"Errors or Invalid Letters: {failed_or_invalid}")
    print(f"Valid Predictions to Evaluate: {total_valid_predictions}")
    print("-" * 20)
    print(f"Correct Predictions: {correct_predictions} / {total_valid_predictions}")
    print(f"📊 Accuracy (on valid responses): {accuracy * 100:.2f}%")
    print("="*50 + "\n✅ Evaluation Complete.\n" + "="*50)


# --- MODIFIED Main Execution Logic ---
def main():
    """
    Run the constrained MCQ prediction and evaluation.

    Loads the questions from INPUT_CSV, then either reloads the predictions
    cached in OUTPUT_CSV or generates them with generate_answer and saves
    them, and finally prints the accuracy report.
    """
    try:
        df = pd.read_csv(INPUT_CSV, encoding='utf-8')
    except FileNotFoundError:
        print(f"❌ Error: The file '{INPUT_CSV}' was not found. Please check the path.")
        return
    except Exception as e:
        print(f"❌ An error occurred while reading the CSV: {e}")
        return

    # Basic data validation
    df.dropna(subset=[QUESTION_COLUMN, ANSWER_COLUMN], inplace=True)
    # A whitespace-only answer survives dropna but has no first character to
    # use as ground truth, so such rows are dropped as well.
    has_answer = df[ANSWER_COLUMN].astype(str).str.strip() != ""
    df = df[has_answer].reset_index(drop=True)
    ground_truths = [str(ans).strip()[0] for ans in df[ANSWER_COLUMN].tolist()]

    # --- NEW: Print all unique ground truth letters found in the answer key ---
    unique_ground_truth_letters = sorted(set(ground_truths))
    print("="*50)
    print(f"Found {len(unique_ground_truth_letters)} unique ground truth letters in the answer key:")
    print(f"➡️  {', '.join(unique_ground_truth_letters)}")
    print("="*50)


    if os.path.exists(OUTPUT_CSV):
        print(f"✅ Found existing prediction file: '{OUTPUT_CSV}'. Loading for evaluation.")
        # Read every cell as text and keep empty cells as "" so they are
        # recognised as missing predictions instead of becoming 'nan'.
        predictions_df = pd.read_csv(
            OUTPUT_CSV, header=None, encoding='utf-8', dtype=str, keep_default_na=False
        )
        predictions = predictions_df[0].tolist()
        # Predictions are matched to questions by position, so a cache with a
        # different row count cannot be scored meaningfully.
        if len(predictions) != len(ground_truths):
            print(f"❌ Error: '{OUTPUT_CSV}' holds {len(predictions)} predictions but the input has {len(ground_truths)} questions. Delete the file to regenerate predictions.")
            return
    else:
        print(f"ℹ️ '{OUTPUT_CSV}' not found. Starting prediction generation...")
        predictions = []
        total_questions = len(df)
        # The index was reset above, so the position doubles as the row label.
        for index, question in enumerate(df[QUESTION_COLUMN]):
            ground_truth_letter = ground_truths[index]

            # 1. Extract valid letters from the question
            valid_letters = extract_choice_letters(question)
            if not valid_letters:
                print(f"Processing question {index + 1}/{total_questions}...")
                print("  -> ⚠️ Warning: Could not extract any choice letters from the question. Skipping.")
                # Keep a placeholder so predictions stay aligned with questions.
                predictions.append("INVALID_QUESTION")
                continue

            print(f"Processing question {index + 1}/{total_questions}... (Valid choices: {', '.join(valid_letters)})")

            # 2. Generate answer with the context of valid letters
            answer_letter = generate_answer(question, valid_letters)
            predictions.append(answer_letter)
            print(f"  -> Ground Truth: {ground_truth_letter} | Model's Prediction: {answer_letter}")

            time.sleep(1) # Small delay between API calls

        predictions_df = pd.DataFrame(predictions)
        predictions_df.to_csv(OUTPUT_CSV, header=False, index=False, encoding='utf-8')
        print(f"\n✅ Successfully generated predictions and saved them to '{OUTPUT_CSV}'.")
        try:
            from google.colab import files
            files.download(OUTPUT_CSV)
        except (ImportError, NameError):
            print(f"ℹ️ To download '{OUTPUT_CSV}', please use the file browser.")

    evaluate_mcq_accuracy(predictions, ground_truths)


if __name__ == "__main__":
    main()

✅ Successfully loaded GROQ_API_KEY from Colab secrets.
Found 5 unique ground truth letters in the answer key:
➡️  أ, إ, ب, ج, د
ℹ️ '/content/drive/MyDrive/AraHealthQA/predictions_fitb_choices_deepseek_cot_validated.csv' not found. Starting prediction generation...
Processing question 1/100... (Valid choices: ة, أ, ب, ج, د)
  -> Info: Using fallback. Found valid letter 'د' in response.
  -> Ground Truth: أ | Model's Prediction: د
Processing question 2/100... (Valid choices: ه, أ, ب, ج, د)
  -> Ground Truth: ج | Model's Prediction: ب
Processing question 3/100... (Valid choices: أ, ب, ج, د)
  -> Ground Truth: أ | Model's Prediction: أ
Processing question 4/100... (Valid choices: أ, ب, ج, د)
  -> Ground Truth: ج | Model's Prediction: ج
Processing question 5/100... (Valid choices: ة, أ, ب, ج, د)
  -> Ground Truth: د | Model's Prediction: د
Processing question 6/100... (Valid choices: ل, ض, أ, ب, ج, د)
  -> Ground Truth: أ | Model's Prediction: أ
Processing question 7/100... (Valid choices: 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


🚀 Starting Evaluation...
Total Questions Attempted: 100
Errors or Invalid Letters: 1
Valid Predictions to Evaluate: 99
--------------------
Correct Predictions: 76 / 99
📊 Accuracy (on valid responses): 76.77%
✅ Evaluation Complete.


# Normalized letters

In [None]:
import os
import re
import pandas as pd
import groq
import time
from getpass import getpass
from sklearn.metrics import accuracy_score

# --- Groq API Configuration ---
# The key is taken from the first source that yields a non-empty value:
# Colab secrets, then the GROQ_API_KEY environment variable, then a prompt.
GROQ_API_KEY = None
try:
    from google.colab import userdata
    GROQ_API_KEY = userdata.get('GROQ_API_KEY')
except Exception:
    # Deliberately broad: outside Colab the import itself fails, and inside
    # Colab a missing or inaccessible secret is reported through Colab's own
    # exception types (e.g. SecretNotFoundError), which `except KeyError`
    # would not catch. Any failure here simply moves on to the next source.
    GROQ_API_KEY = None

if GROQ_API_KEY:
    print("✅ Successfully loaded GROQ_API_KEY from Colab secrets.")
elif os.environ.get('GROQ_API_KEY'):
    GROQ_API_KEY = os.environ['GROQ_API_KEY']
    print("✅ Successfully loaded GROQ_API_KEY from environment variables.")
else:
    print("⚠️ GROQ_API_KEY not found in Colab secrets or environment variables.")
    GROQ_API_KEY = getpass('Please enter your Groq API key: ')

# Initialize the Groq client
client = groq.Client(api_key=GROQ_API_KEY)


# --- File paths and configuration ---
# INPUT_CSV is the question file read by main(). OUTPUT_CSV is where main()
# stores the predictions; when that file already exists, main() loads it and
# only runs the evaluation instead of calling the model again.
INPUT_CSV = '/content/drive/MyDrive/AraHealthQA/fill-in-the-blank-choices.csv'
OUTPUT_CSV = '/content/drive/MyDrive/AraHealthQA/predictions_fitb_choices_deepseek_cot_normalized.csv'

# Columns of INPUT_CSV holding the question text and the reference answer.
# Only the first character of the answer cell is used as the ground truth.
QUESTION_COLUMN = 'Question - Arabic'
ANSWER_COLUMN = 'Answer - Arabic'

# --- UPDATED: Whitelist including all letter variants before normalization ---
# extract_choice_letters keeps only matched letters found in this set. The
# set lists the raw spellings (all three Alef forms, both ه and ة) because
# the filter is applied before normalize_arabic_letter collapses them.
VALID_MCQ_LETTERS = {'أ', 'إ', 'ا', 'ب', 'ج', 'د', 'ه', 'ة', 'و', 'ز'}

# --- NEW: Function to normalize Arabic letters ---
def normalize_arabic_letter(char):
    """
    Map an Arabic letter variant onto its canonical choice letter.

    The Alef spellings 'إ' and 'ا' become 'أ', and Taa Marbutah 'ة' becomes
    Haa 'ه'. Any other value is returned unchanged.
    """
    canonical_forms = {
        'إ': 'أ',
        'ا': 'أ',
        'ة': 'ه',
    }
    return canonical_forms.get(char, char)

# --- Helper function to find and normalize valid choice letters ---
def extract_choice_letters(question_text):
    """
    Collect the canonical choice letters listed in a question.

    A choice label is an Arabic letter at the start of the text or of a line
    (leading whitespace allowed), followed by '.', ')' or '-'. Labels outside
    VALID_MCQ_LETTERS are discarded; the remaining ones are normalized,
    de-duplicated and returned in sorted order.
    """
    label_matches = re.finditer(r'(?:^|\n)\s*([\u0621-\u064A])[.)-]\s*', question_text)

    # Whitelist check comes first, on the raw letter; normalization second.
    canonical_labels = {
        normalize_arabic_letter(found.group(1))
        for found in label_matches
        if found.group(1) in VALID_MCQ_LETTERS
    }

    return sorted(canonical_labels)


# --- Function to Generate Answers ---
def _select_normalized_letter(response_text, valid_letters):
    """Return the normalized choice letter stated in a model response, or None."""
    # An explicit "Final Answer:" statement wins. The last occurrence is used
    # so a draft answer quoted inside the reasoning cannot shadow the real
    # one, and wrappers such as "[ب]" or "**ب**" are tolerated. The lookahead
    # requires a standalone letter rather than the first letter of a word.
    stated = re.findall(
        r"Final Answer:[\s\[(*`]*([\u0621-\u064A])(?![\u0621-\u0652])",
        response_text,
    )
    if stated:
        raw_letter = stated[-1]
        # Normalize the predicted letter before validation
        letter = normalize_arabic_letter(raw_letter)
        if letter in valid_letters:
            return letter
        print(f"  -> ⚠️ Warning: Model returned letter '{raw_letter}' which normalizes to '{letter}' and is not in the valid list {valid_letters}.")
        return None

    # Fallback: take the last standalone valid letter that appears after the
    # reasoning block. Letters inside Arabic words are ignored, because nearly
    # every sentence contains some choice letter and matching those would turn
    # the fallback into an arbitrary guess.
    answer_region = re.split(r"</think(?:ing)?>", response_text)[-1]
    standalone_letters = re.findall(
        r"(?<![\u0621-\u0652])[\u0621-\u064A](?![\u0621-\u0652])",
        answer_region,
    )
    for raw_letter in reversed(standalone_letters):
        # Normalize the letter from fallback before validation
        letter = normalize_arabic_letter(raw_letter)
        if letter in valid_letters:
            print(f"  -> ℹ️ Info: Using fallback. Found valid letter '{letter}' in response.")
            return letter
    return None


def generate_answer(question, valid_letters):
    """
    Ask the model to answer one multiple-choice question.

    The system prompt requests chain-of-thought reasoning followed by a
    `Final Answer: <letter>` line, and restricts the letter to the normalized
    choices found in the question.

    Args:
        question: Question text, sent as the user message.
        valid_letters: Normalized choice letters the model may answer with.

    Returns:
        One of `valid_letters` on success, "INVALID_LETTER" when the response
        contains no usable letter, or "API_ERROR" when the request still fails
        after the last retry. "FAILED_ATTEMPTS" is a safety net that is only
        reached if the retry loop runs zero times.
    """
    system_prompt = f"""You are an expert medical professional and a meticulous test-taker. Your task is to answer the following multiple-choice medical question.

Follow these steps precisely:
1.  First, carefully analyze the question and all the provided options.
2.  Engage in a step-by-step reasoning process to determine the correct answer. Your entire detailed analysis MUST be enclosed within `<thinking>` and `</thinking>` tags.
3.  After the closing `</thinking>` tag, you MUST provide the final answer on a new line.
4.  This final line must ONLY contain the text in the format: `Final Answer: [letter]`.
5.  **CRITICAL:** The `[letter]` you choose **MUST** be one of these valid options and nothing else: **{', '.join(valid_letters)}**.

There should be absolutely no other text outside the `<thinking>` block except for the 'Final Answer:' line.
"""
    max_retries = 3
    retry_delay = 5
    for attempt in range(max_retries):
        try:
            chat_completion = client.chat.completions.create(
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": question}
                ],
                model="deepseek-r1-distill-llama-70b",
                temperature=0.0,
                max_tokens=1024,
            )
            # The content may be None (no text returned); treat it as empty
            # text instead of letting .strip() fail and trigger a retry.
            raw_response_text = (chat_completion.choices[0].message.content or "").strip()
        except Exception as e:
            # Broad on purpose: any request or response-shape failure is
            # retried, and only the final failure is reported to the caller.
            if attempt < max_retries - 1:
                print(f"  -> ‼️ An error occurred: {e}. Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
                continue
            print(f"  -> ‼️ API Error after multiple retries: {e}")
            return "API_ERROR"

        # Parsing happens outside the try so a parsing bug is never mistaken
        # for an API failure and silently retried.
        predicted_letter = _select_normalized_letter(raw_response_text, valid_letters)
        return predicted_letter if predicted_letter else "INVALID_LETTER"
    return "FAILED_ATTEMPTS"

# --- Evaluation Function (Unchanged) ---
def evaluate_mcq_accuracy(predictions, ground_truths):
    """
    Print the accuracy of the predictions against the ground truths.

    Predictions are compared to ground truths by position. Entries that are
    error or placeholder codes rather than answer letters are excluded from
    the accuracy and reported as failures instead.

    Args:
        predictions: Predicted letters or status codes, one per question.
        ground_truths: Reference letters, one per question.

    Returns:
        None. The report is written to standard output.
    """
    print("\n" + "="*50)
    print("🚀 Starting Evaluation...")
    print("="*50)

    # Status codes produced by generate_answer/main, plus the placeholders an
    # empty cell turns into when predictions are reloaded from the CSV.
    non_answers = {"API_ERROR", "FAILED_ATTEMPTS", "INVALID_LETTER", "INVALID_QUESTION", "nan", ""}

    if len(predictions) != len(ground_truths):
        print(f"⚠️ Warning: got {len(predictions)} predictions for {len(ground_truths)} ground truths; only positions present in both are evaluated.")

    # zip stops at the shorter list, so surplus predictions cannot index past
    # the end of the ground truths.
    scored_pairs = [(p, g) for p, g in zip(predictions, ground_truths) if p not in non_answers]

    if not scored_pairs:
        print("No valid predictions to evaluate. Check for widespread API or formatting errors.")
        return

    correct_predictions = sum(p == g for p, g in scored_pairs)
    total_valid_predictions = len(scored_pairs)
    # Exact-match accuracy follows directly from the count above.
    accuracy = correct_predictions / total_valid_predictions
    total_questions = len(ground_truths)
    failed_or_invalid = total_questions - total_valid_predictions

    print(f"Total Questions Attempted: {total_questions}")
    print(f"Errors or Invalid Responses: {failed_or_invalid}")
    print(f"Valid Predictions to Evaluate: {total_valid_predictions}")
    print("-" * 20)
    print(f"Correct Predictions: {correct_predictions} / {total_valid_predictions}")
    print(f"📊 Accuracy (on valid responses): {accuracy * 100:.2f}%")
    print("="*50 + "\n✅ Evaluation Complete.\n" + "="*50)


# --- Main Execution Logic ---
def main():
    """
    Run the constrained and normalized MCQ prediction and evaluation.

    Loads the questions from INPUT_CSV, normalizes the ground-truth letters,
    then either reloads the predictions cached in OUTPUT_CSV or generates
    them with generate_answer and saves them, and finally prints the
    accuracy report.
    """
    try:
        df = pd.read_csv(INPUT_CSV, encoding='utf-8')
    except FileNotFoundError:
        print(f"❌ Error: The file '{INPUT_CSV}' was not found. Please check the path.")
        return
    except Exception as e:
        print(f"❌ An error occurred while reading the CSV: {e}")
        return

    df.dropna(subset=[QUESTION_COLUMN, ANSWER_COLUMN], inplace=True)
    # A whitespace-only answer survives dropna but has no first character to
    # use as ground truth, so such rows are dropped as well.
    has_answer = df[ANSWER_COLUMN].astype(str).str.strip() != ""
    df = df[has_answer].reset_index(drop=True)

    # --- UPDATED: Normalize ground truths at the start ---
    ground_truths = [normalize_arabic_letter(str(ans).strip()[0]) for ans in df[ANSWER_COLUMN].tolist()]

    unique_ground_truth_letters = sorted(set(ground_truths))
    print("="*50)
    print(f"Found {len(unique_ground_truth_letters)} unique ground truth letters after normalization:")
    print(f"➡️  {', '.join(unique_ground_truth_letters)}")
    print("="*50)

    if os.path.exists(OUTPUT_CSV):
        print(f"✅ Found existing prediction file: '{OUTPUT_CSV}'. Loading for evaluation.")
        # Read every cell as text and keep empty cells as "" so they are
        # recognised as missing predictions instead of becoming 'nan'.
        predictions_df = pd.read_csv(
            OUTPUT_CSV, header=None, encoding='utf-8', dtype=str, keep_default_na=False
        )
        predictions = predictions_df[0].tolist()
        # Predictions are matched to questions by position, so a cache with a
        # different row count cannot be scored meaningfully.
        if len(predictions) != len(ground_truths):
            print(f"❌ Error: '{OUTPUT_CSV}' holds {len(predictions)} predictions but the input has {len(ground_truths)} questions. Delete the file to regenerate predictions.")
            return
    else:
        print(f"ℹ️ '{OUTPUT_CSV}' not found. Starting prediction generation...")
        predictions = []
        total_questions = len(df)
        # The index was reset above, so the position doubles as the row label.
        for index, question in enumerate(df[QUESTION_COLUMN]):
            ground_truth_letter = ground_truths[index]

            valid_letters = extract_choice_letters(question)
            if not valid_letters:
                print(f"Processing question {index + 1}/{total_questions}...")
                print("  -> ⚠️ Warning: Could not extract any valid choice letters from the question. Skipping.")
                # Keep a placeholder so predictions stay aligned with questions.
                predictions.append("INVALID_QUESTION")
                continue

            print(f"Processing question {index + 1}/{total_questions}... (Normalized choices: {', '.join(valid_letters)})")

            answer_letter = generate_answer(question, valid_letters)
            predictions.append(answer_letter)
            print(f"  -> Ground Truth: {ground_truth_letter} | Model's Prediction: {answer_letter}")
            time.sleep(1)  # Small delay between API calls

        predictions_df = pd.DataFrame(predictions)
        predictions_df.to_csv(OUTPUT_CSV, header=False, index=False, encoding='utf-8')
        print(f"\n✅ Successfully generated predictions and saved them to '{OUTPUT_CSV}'.")
        try:
            from google.colab import files
            files.download(OUTPUT_CSV)
        except (ImportError, NameError):
            print(f"ℹ️ To download '{OUTPUT_CSV}', please use the file browser.")

    evaluate_mcq_accuracy(predictions, ground_truths)


if __name__ == "__main__":
    main()

✅ Successfully loaded GROQ_API_KEY from Colab secrets.
Found 4 unique ground truth letters after normalization:
➡️  أ, ب, ج, د
ℹ️ '/content/drive/MyDrive/AraHealthQA/predictions_fitb_choices_deepseek_cot_normalized.csv' not found. Starting prediction generation...
Processing question 1/100... (Normalized choices: أ, ب, ج, د)
  -> ℹ️ Info: Using fallback. Found valid letter 'د' in response.
  -> Ground Truth: أ | Model's Prediction: د
Processing question 2/100... (Normalized choices: أ, ب, ج, د)
  -> Ground Truth: ج | Model's Prediction: ج
Processing question 3/100... (Normalized choices: أ, ب, ج, د)
  -> Ground Truth: أ | Model's Prediction: أ
Processing question 4/100... (Normalized choices: أ, ب, ج, د)
  -> Ground Truth: ج | Model's Prediction: ج
Processing question 5/100... (Normalized choices: أ, ب, ج, د)
  -> Ground Truth: د | Model's Prediction: د
Processing question 6/100... (Normalized choices: أ, ب, ج, د)
  -> Ground Truth: أ | Model's Prediction: أ
Processing question 7/100..

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


🚀 Starting Evaluation...
Total Questions Attempted: 100
Errors or Invalid Responses: 1
Valid Predictions to Evaluate: 99
--------------------
Correct Predictions: 73 / 99
📊 Accuracy (on valid responses): 73.74%
✅ Evaluation Complete.


In [None]:
# Mount Google Drive at /content/drive. The input and prediction CSV paths
# used in this notebook live under /content/drive/MyDrive, so this cell has
# to run before any cell that reads or writes those files.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
