


# Track 2: MedArabiQ 2025 (General Health)


# Sub-Task 1: Multiple Choice QA


# BiMediX2 -> 25%

In [None]:
# Main script for Multiple Choice Question Answering with BiMediX2

import os
import re
import pandas as pd
from openai import OpenAI
import time
import numpy as np
from sklearn.metrics import accuracy_score # Import accuracy_score for evaluation

# --- Local Server Configuration ---
# OpenAI-compatible client pointed at a server listening on localhost:8000.
client = OpenAI(
    base_url="http://localhost:8000/v1/",
    api_key="DUMMY_KEY",  # placeholder; presumably the local server does not validate it — confirm
)

# --- File paths and column names for the MCQ task ---
# INPUT_CSV holds the questions and gold answers; OUTPUT_CSV doubles as a cache:
# main() skips generation and scores this file directly when it already exists.
INPUT_CSV = '/content/drive/MyDrive/AraHealthQA/multiple-choice-questions.csv'
OUTPUT_CSV = '/content/drive/MyDrive/AraHealthQA/predictions_mcq.csv' # one prediction per line, no header
QUESTION_COLUMN = 'Question'
ANSWER_COLUMN = 'Answer'


# --- Function to Generate Answers for MCQ Task ---
def generate_answer(question):
    """
    Ask BiMediX2 (through the module-level OpenAI-compatible ``client``) to
    answer one multiple-choice question and return the chosen option letter.

    Args:
        question: Full question text, including its lettered options.

    Returns:
        A single Arabic letter when one can be extracted from the reply, ""
        when the reply contains no Arabic letter, or a human-readable error
        string when every attempt raised an exception.
    """
    system_prompt = """You are a medical exam answering machine. Your only task is to answer the following multiple-choice medical question. Read the question and the provided options (أ, ب, ج, د, ه). Your response must be ONLY the single Arabic letter corresponding to the correct answer. Do not provide any explanation, reasoning, or any other text. For example, if option 'ب' is correct, your entire response must be 'ب'."""

    # Any Arabic letter except tatweel (U+0640), so "هـ" still reads as a lone "ه".
    letter = r'[\u0621-\u063A\u0641-\u064A]'
    # A letter that is not glued to another letter, i.e. an option label rather
    # than the first character of a word such as "الإجابة".
    standalone_letter = re.compile(rf'(?<!{letter}){letter}(?!{letter})')
    # Fallback: the very first Arabic character, whatever surrounds it.
    first_letter = re.compile(r'[\u0621-\u064A]')

    max_retries = 3
    retry_delay = 5
    for attempt in range(max_retries):
        try:
            completion = client.chat.completions.create(
                model="MBZUAI/BiMediX2-8B-hf",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": question},
                ],
                temperature=0.0,  # deterministic output for a classification task
                max_tokens=5,     # a single letter needs very few tokens
            )
            # The content field may be None; treat that as an empty reply
            # instead of failing on .strip() and burning the remaining retries.
            response_text = (completion.choices[0].message.content or "").strip()

            # Prefer a free-standing option letter; otherwise fall back to the
            # first Arabic character found.
            match = standalone_letter.search(response_text) or first_letter.search(response_text)
            return match.group(0) if match else ""

        except Exception as e:
            if attempt < max_retries - 1:
                print(f"  -> An error occurred: {e}. Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
            else:
                return f"API Error after multiple retries: {e}"
    # Only reachable when max_retries is 0.
    return f"Failed to get a response after {max_retries} attempts."


# --- NEW: Function to Evaluate MCQ Accuracy ---
def evaluate_mcq_accuracy(predictions, ground_truths):
    """
    Print the exact-match accuracy of the MCQ predictions and return it.

    Args:
        predictions: Predicted option letters, one per question.
        ground_truths: Gold option letters, aligned with ``predictions``.

    Returns:
        The fraction of positions where prediction == gold (0.0-1.0), or
        None when there is nothing to score.
    """
    print("\n" + "="*50)
    print("🚀 Starting Evaluation...")
    print("="*50)

    # On a length mismatch, compare only the overlapping prefix (and say so).
    if len(predictions) != len(ground_truths):
        print("Warning: Prediction and ground truth lists have different lengths. Evaluation might be inaccurate.")
        min_len = min(len(predictions), len(ground_truths))
        predictions = predictions[:min_len]
        ground_truths = ground_truths[:min_len]

    total_predictions = len(ground_truths)
    if total_predictions == 0:
        print("No predictions to evaluate.")
        return None

    # Derive the accuracy from the counted matches so the two printed figures
    # always agree with each other.
    correct_predictions = sum(p == g for p, g in zip(predictions, ground_truths))
    accuracy = correct_predictions / total_predictions

    print(f"Correct Predictions: {correct_predictions} / {total_predictions}")
    print(f"📊 Accuracy: {accuracy * 100:.2f}%")
    print("="*50 + "\n✅ Evaluation Complete.\n" + "="*50)
    return accuracy

# --- Main Execution for MCQ Task ---
def main():
    """
    Run the MCQ pipeline: load the questions, obtain predictions, score them.

    Predictions are read from OUTPUT_CSV when that file exists; otherwise every
    question is sent to generate_answer() and the results are written to
    OUTPUT_CSV (one prediction per line, no header) before evaluation.
    """
    try:
        df = pd.read_csv(INPUT_CSV)
    except FileNotFoundError:
        print(f"Error: The file '{INPUT_CSV}' was not found. Please upload it first.")
        return

    if QUESTION_COLUMN not in df.columns or ANSWER_COLUMN not in df.columns:
        print(f"Error: Required columns ('{QUESTION_COLUMN}', '{ANSWER_COLUMN}') not in the CSV file.")
        return

    if os.path.exists(OUTPUT_CSV):
        print(f"✅ Found existing prediction file: '{OUTPUT_CSV}'.")
        print("Skipping generation and loading predictions from file for evaluation.")
        try:
            # dtype=str + keep_default_na=False keep an empty prediction as ""
            # instead of turning it into a float NaN.
            predictions_df = pd.read_csv(OUTPUT_CSV, header=None, dtype=str, keep_default_na=False)
            predictions = predictions_df[0].tolist()
        except pd.errors.EmptyDataError:
            # A zero-byte cache file means no predictions were stored.
            predictions = []
    else:
        print(f"'{OUTPUT_CSV}' not found. Starting prediction generation process...")
        predictions = []
        total_questions = len(df)
        # Count by position so the progress line is right for any index labels.
        for position, (_, row) in enumerate(df.iterrows(), start=1):
            question = row[QUESTION_COLUMN]
            print(f"Processing question {position}/{total_questions} with BiMediX2 (MCQ Mode)...")
            answer_letter = generate_answer(question)
            predictions.append(answer_letter)
            print(f"  -> Generated Answer: {answer_letter}")

        predictions_df = pd.DataFrame(predictions)
        predictions_df.to_csv(OUTPUT_CSV, header=False, index=False)
        print(f"\nSuccessfully generated predictions and saved them to '{OUTPUT_CSV}'.")
        try:
            from google.colab import files
            files.download(OUTPUT_CSV)
        except ImportError:
            print(f"To download '{OUTPUT_CSV}', see the file browser on the left.")

    # The gold letter is taken as the first character of the answer text
    # (e.g. "د. ..."). A missing or blank answer gets a marker that can never
    # equal a prediction, instead of raising IndexError on [0].
    missing_gold = "<missing>"
    ground_truths = []
    for ans in df[ANSWER_COLUMN].tolist():
        text = "" if pd.isna(ans) else str(ans).strip()
        ground_truths.append(text[0] if text else missing_gold)

    evaluate_mcq_accuracy(predictions, ground_truths)


if __name__ == "__main__":
    main()

'/content/drive/MyDrive/AraHealthQA/predictions_mcq.csv' not found. Starting prediction generation process...
Processing question 1/100 with BiMediX2 (MCQ Mode)...
  -> Generated Answer: ج
Processing question 2/100 with BiMediX2 (MCQ Mode)...
  -> Generated Answer: ب
Processing question 3/100 with BiMediX2 (MCQ Mode)...
  -> Generated Answer: ب
Processing question 4/100 with BiMediX2 (MCQ Mode)...
  -> Generated Answer: ه
Processing question 5/100 with BiMediX2 (MCQ Mode)...
  -> Generated Answer: ب
Processing question 6/100 with BiMediX2 (MCQ Mode)...
  -> Generated Answer: ب
Processing question 7/100 with BiMediX2 (MCQ Mode)...
  -> Generated Answer: ج
Processing question 8/100 with BiMediX2 (MCQ Mode)...
  -> Generated Answer: ج
Processing question 9/100 with BiMediX2 (MCQ Mode)...
  -> Generated Answer: د
Processing question 10/100 with BiMediX2 (MCQ Mode)...
  -> Generated Answer: ب
Processing question 11/100 with BiMediX2 (MCQ Mode)...
  -> Generated Answer: ب
Processing question

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


🚀 Starting Evaluation...
Correct Predictions: 25 / 100
📊 Accuracy: 25.00%
✅ Evaluation Complete.




# Track 2: MedArabiQ 2025 (General Health)


# Sub-Task 1: Multiple Choice QA



# DeepSeek (deepseek-r1-distill-llama-70b) -> 61%

In [None]:
# Step 1: Install all necessary libraries
!pip install groq openai nltk rouge-score bert-score transformers sentencepiece -q

import os
import re
import pandas as pd
import groq
import time
from getpass import getpass
import numpy as np
from sklearn.metrics import accuracy_score

# --- Groq API Configuration ---
# Prefer the key stored in Colab's secrets; fall back to an interactive prompt.
try:
    from google.colab import userdata
    GROQ_API_KEY = userdata.get('GROQ_API_KEY')
except Exception:
    # Broad on purpose: besides ImportError (not running in Colab), userdata.get
    # reports a missing or inaccessible secret with its own exception types,
    # which an (ImportError, KeyError) clause does not cover.
    GROQ_API_KEY = None

if not GROQ_API_KEY:
    print("Secret 'GROQ_API_KEY' not found. Please add it to the Colab secrets manager.")
    GROQ_API_KEY = getpass('Please enter your Groq API key: ')

# Initialize the Groq client
client = groq.Client(api_key=GROQ_API_KEY)


# --- File paths and column names for the MCQ task ---
INPUT_CSV = '/content/drive/MyDrive/AraHealthQA/multiple-choice-questions.csv'
OUTPUT_CSV = '/content/drive/MyDrive/AraHealthQA/predictions_mcq_deepseek.csv' # per-model file so runs do not overwrite each other
QUESTION_COLUMN = 'Question'
ANSWER_COLUMN = 'Answer'


# --- Function to Generate Answers for MCQ Task with DeepSeek ---
def generate_answer(question):
    """
    Ask DeepSeek (through the module-level Groq ``client``) to answer one
    multiple-choice question and return the chosen option letter.

    The model's ``<think>`` reasoning is removed before the letter is
    extracted, so letters that occur inside the reasoning are never returned.

    Args:
        question: Full question text, including its lettered options.

    Returns:
        A single Arabic letter when one can be extracted from the reply, ""
        when no letter remains after removing the reasoning, or a
        human-readable error string when every attempt raised an exception.
    """
    system_prompt = """You are a medical exam answering machine. Your only task is to answer the following multiple-choice medical question. Read the question and the provided options (أ, ب, ج, د, ه). Your response must be ONLY the single Arabic letter corresponding to the correct answer. Do not provide any explanation, reasoning, or any other text. For example, if option 'ب' is correct, your entire response must be 'ب'."""

    # Any Arabic letter except tatweel (U+0640), so "هـ" still reads as a lone "ه".
    letter = r'[\u0621-\u063A\u0641-\u064A]'
    # A letter that is not glued to another letter, i.e. an option label rather
    # than the first character of a word such as "الإجابة".
    standalone_letter = re.compile(rf'(?<!{letter}){letter}(?!{letter})')
    # Fallback: the very first Arabic character, whatever surrounds it.
    first_letter = re.compile(r'[\u0621-\u064A]')

    max_retries = 3
    retry_delay = 5
    for attempt in range(max_retries):
        try:
            chat_completion = client.chat.completions.create(
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": question},
                ],
                model="deepseek-r1-distill-llama-70b",
                temperature=0.0,  # deterministic output
                max_tokens=1024,  # leaves room for the <think> block
            )
            # The content field may be None; treat that as an empty reply.
            raw_response_text = (chat_completion.choices[0].message.content or "").strip()

            # Step 1: remove every complete <think>...</think> block.
            visible_text = re.sub(r'<think>.*?</think>', '', raw_response_text, flags=re.DOTALL)
            # Step 2: remove an unterminated <think> left behind when the reply
            # was cut off at max_tokens; everything after it is reasoning, not
            # an answer.
            visible_text = re.sub(r'<think>.*', '', visible_text, flags=re.DOTALL)

            # Step 3: prefer a free-standing option letter; otherwise fall back
            # to the first Arabic character found.
            match = standalone_letter.search(visible_text) or first_letter.search(visible_text)
            return match.group(0) if match else ""

        except Exception as e:
            if attempt < max_retries - 1:
                print(f"  -> An error occurred: {e}. Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
            else:
                return f"API Error after multiple retries: {e}"
    # Only reachable when max_retries is 0.
    return f"Failed to get a response after {max_retries} attempts."


# --- Function to Evaluate MCQ Accuracy ---
def evaluate_mcq_accuracy(predictions, ground_truths):
    """
    Print the exact-match accuracy of the MCQ predictions and return it.

    Args:
        predictions: Predicted option letters, one per question.
        ground_truths: Gold option letters, aligned with ``predictions``.

    Returns:
        The fraction of positions where prediction == gold (0.0-1.0), or
        None when there is nothing to score.
    """
    print("\n" + "="*50)
    print("🚀 Starting Evaluation...")
    print("="*50)

    # On a length mismatch, compare only the overlapping prefix (and say so).
    if len(predictions) != len(ground_truths):
        print("Warning: Prediction and ground truth lists have different lengths. Evaluation might be inaccurate.")
        min_len = min(len(predictions), len(ground_truths))
        predictions = predictions[:min_len]
        ground_truths = ground_truths[:min_len]

    total_predictions = len(ground_truths)
    if total_predictions == 0:
        print("No predictions to evaluate.")
        return None

    # Derive the accuracy from the counted matches so the two printed figures
    # always agree with each other.
    correct_predictions = sum(p == g for p, g in zip(predictions, ground_truths))
    accuracy = correct_predictions / total_predictions

    print(f"Correct Predictions: {correct_predictions} / {total_predictions}")
    print(f"📊 Accuracy: {accuracy * 100:.2f}%")
    print("="*50 + "\n✅ Evaluation Complete.\n" + "="*50)
    return accuracy

# --- Main Execution for MCQ Task ---
def main():
    """
    Run the MCQ pipeline: load the questions, obtain predictions, score them.

    Predictions are read from OUTPUT_CSV when that file exists; otherwise every
    question is sent to generate_answer() and the results are written to
    OUTPUT_CSV (one prediction per line, no header) before evaluation.
    """
    try:
        df = pd.read_csv(INPUT_CSV)
    except FileNotFoundError:
        print(f"Error: The file '{INPUT_CSV}' was not found. Please upload it first.")
        return

    if QUESTION_COLUMN not in df.columns or ANSWER_COLUMN not in df.columns:
        print(f"Error: Required columns ('{QUESTION_COLUMN}', '{ANSWER_COLUMN}') not in the CSV file.")
        return

    if os.path.exists(OUTPUT_CSV):
        print(f"✅ Found existing prediction file: '{OUTPUT_CSV}'.")
        print("Skipping generation and loading predictions from file for evaluation.")
        try:
            # dtype=str + keep_default_na=False keep an empty prediction as ""
            # instead of turning it into a float NaN.
            predictions_df = pd.read_csv(OUTPUT_CSV, header=None, dtype=str, keep_default_na=False)
            predictions = predictions_df[0].tolist()
        except pd.errors.EmptyDataError:
            # A zero-byte cache file means no predictions were stored.
            predictions = []
    else:
        print(f"'{OUTPUT_CSV}' not found. Starting prediction generation process...")
        predictions = []
        total_questions = len(df)
        # Count by position so the progress line and the pacing check are
        # right for any index labels.
        for position, (_, row) in enumerate(df.iterrows(), start=1):
            question = row[QUESTION_COLUMN]
            print(f"Processing question {position}/{total_questions} with DeepSeek (MCQ Mode)...")
            answer_letter = generate_answer(question)
            predictions.append(answer_letter)
            print(f"  -> Generated Answer: {answer_letter}")

            # Pause between requests for the Groq API rate limit (not after the last one).
            if position < total_questions:
                time.sleep(3)

        predictions_df = pd.DataFrame(predictions)
        predictions_df.to_csv(OUTPUT_CSV, header=False, index=False)
        print(f"\nSuccessfully generated predictions and saved them to '{OUTPUT_CSV}'.")
        try:
            from google.colab import files
            files.download(OUTPUT_CSV)
        except ImportError:
            print(f"To download '{OUTPUT_CSV}', see the file browser on the left.")

    # The gold letter is taken as the first character of the answer text.
    # A missing or blank answer gets a marker that can never equal a
    # prediction, instead of raising IndexError on [0].
    missing_gold = "<missing>"
    ground_truths = []
    for ans in df[ANSWER_COLUMN].tolist():
        text = "" if pd.isna(ans) else str(ans).strip()
        ground_truths.append(text[0] if text else missing_gold)

    evaluate_mcq_accuracy(predictions, ground_truths)


if __name__ == "__main__":
    main()

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.8/130.8 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.19.0 requires torch==2.4.0, but you have torch 2.7.1 which is incompatible.
vllm 0.6.1.post1 requires torch==2.4.0, but you have torch 2.7.1 which is incompatible.
xformers 0.0.27.post2 requires torch==2.4.0, but you have torch 2.7.1 which is incompatible.
vllm-flash-attn 2.6.1 requires torch==2.4.0, but you have torch 2.7.1 which is incompatible.[0m[31m
[0m'/content/drive/MyDrive/AraHealthQA/predictions_mcq_deepseek.csv' not found. Starting prediction generation process...
Processing question 1/100 with DeepSeek (MCQ Mode)...
  -> Generated Answer: د
Processing question 2/100 with DeepSeek (MCQ Mode)...
  -> Generated Answer: ب
Processing question 3/100 with DeepSeek (

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


🚀 Starting Evaluation...
Correct Predictions: 61 / 100
📊 Accuracy: 61.00%
✅ Evaluation Complete.





# Track 2: MedArabiQ 2025 (General Health)


# Sub-Task 1: Multiple Choice QA



# colosseum_355b_instruct_16k -> 50%

In [None]:
# Step 1: Install all necessary libraries
!pip install openai nltk rouge-score bert-score transformers sentencepiece -q

import os
import re
import pandas as pd
from openai import OpenAI # Using the OpenAI library for the NVIDIA API
import time
from getpass import getpass
import numpy as np
from sklearn.metrics import accuracy_score

# --- NVIDIA API Configuration ---
# Prefer the key stored in Colab's secrets; fall back to an interactive prompt.
try:
    from google.colab import userdata
    NVIDIA_API_KEY = userdata.get('NVIDIA_API_KEY')
except Exception:
    # Broad on purpose: besides ImportError (not running in Colab), userdata.get
    # reports a missing or inaccessible secret with its own exception types,
    # which an (ImportError, KeyError) clause does not cover.
    NVIDIA_API_KEY = None

if not NVIDIA_API_KEY:
    print("Secret 'NVIDIA_API_KEY' not found. Please add it to the Colab secrets manager.")
    NVIDIA_API_KEY = getpass('Please enter your NVIDIA API key: ')

# Initialize the OpenAI client to point to the NVIDIA API endpoint
client = OpenAI(
    base_url="https://integrate.api.nvidia.com/v1",
    api_key=NVIDIA_API_KEY,
)

# --- File paths and column names for the MCQ task ---
INPUT_CSV = '/content/drive/MyDrive/AraHealthQA/multiple-choice-questions.csv'
OUTPUT_CSV = '/content/drive/MyDrive/AraHealthQA/predictions_mcq_colosseum.csv' # per-model file so runs do not overwrite each other
QUESTION_COLUMN = 'Question'
ANSWER_COLUMN = 'Answer'


# --- Function to Generate Answers for MCQ Task with Colosseum ---
def generate_answer(question):
    """
    Ask Colosseum (through the module-level OpenAI-compatible ``client`` aimed
    at the NVIDIA endpoint) to answer one multiple-choice question and return
    the chosen option letter.

    Args:
        question: Full question text, including its lettered options.

    Returns:
        A single Arabic letter when one can be extracted from the reply, ""
        when the reply contains no Arabic letter, or a human-readable error
        string when every attempt raised an exception.
    """
    system_prompt = """You are a medical exam answering machine. Your only task is to answer the following multiple-choice medical question. Read the question and the provided options (أ, ب, ج, د, ه). Your response must be ONLY the single Arabic letter corresponding to the correct answer. Do not provide any explanation, reasoning, or any other text. For example, if option 'ب' is correct, your entire response must be 'ب'."""

    # Any Arabic letter except tatweel (U+0640), so "هـ" still reads as a lone "ه".
    letter = r'[\u0621-\u063A\u0641-\u064A]'
    # A letter that is not glued to another letter, i.e. an option label rather
    # than the first character of a word such as "الإجابة".
    standalone_letter = re.compile(rf'(?<!{letter}){letter}(?!{letter})')
    # Fallback: the very first Arabic character, whatever surrounds it.
    first_letter = re.compile(r'[\u0621-\u064A]')

    max_retries = 3
    retry_delay = 5
    for attempt in range(max_retries):
        try:
            completion = client.chat.completions.create(
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": question},
                ],
                model="igenius/colosseum_355b_instruct_16k",
                temperature=0.0,
                max_tokens=5,
            )
            # The content field may be None; treat that as an empty reply
            # instead of failing on .strip() and burning the remaining retries.
            response_text = (completion.choices[0].message.content or "").strip()

            # Prefer a free-standing option letter; otherwise fall back to the
            # first Arabic character found.
            match = standalone_letter.search(response_text) or first_letter.search(response_text)
            return match.group(0) if match else ""

        except Exception as e:
            if attempt < max_retries - 1:
                print(f"  -> An error occurred: {e}. Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
            else:
                return f"API Error after multiple retries: {e}"
    # Only reachable when max_retries is 0.
    return f"Failed to get a response after {max_retries} attempts."


# --- Function to Evaluate MCQ Accuracy ---
def evaluate_mcq_accuracy(predictions, ground_truths):
    """
    Print the exact-match accuracy of the MCQ predictions and return it.

    Args:
        predictions: Predicted option letters, one per question.
        ground_truths: Gold option letters, aligned with ``predictions``.

    Returns:
        The fraction of positions where prediction == gold (0.0-1.0), or
        None when there is nothing to score.
    """
    print("\n" + "="*50)
    print("🚀 Starting Evaluation...")
    print("="*50)

    # On a length mismatch, compare only the overlapping prefix (and say so).
    if len(predictions) != len(ground_truths):
        print("Warning: Prediction and ground truth lists have different lengths. Evaluation might be inaccurate.")
        min_len = min(len(predictions), len(ground_truths))
        predictions = predictions[:min_len]
        ground_truths = ground_truths[:min_len]

    total_predictions = len(ground_truths)
    if total_predictions == 0:
        print("No predictions to evaluate.")
        return None

    # Derive the accuracy from the counted matches so the two printed figures
    # always agree with each other.
    correct_predictions = sum(p == g for p, g in zip(predictions, ground_truths))
    accuracy = correct_predictions / total_predictions

    print(f"Correct Predictions: {correct_predictions} / {total_predictions}")
    print(f"📊 Accuracy: {accuracy * 100:.2f}%")
    print("="*50 + "\n✅ Evaluation Complete.\n" + "="*50)
    return accuracy

# --- Main Execution for MCQ Task ---
def main():
    """
    Run the MCQ pipeline: load the questions, obtain predictions, score them.

    Predictions are read from OUTPUT_CSV when that file exists; otherwise every
    question is sent to generate_answer() and the results are written to
    OUTPUT_CSV (one prediction per line, no header) before evaluation.
    """
    try:
        df = pd.read_csv(INPUT_CSV)
    except FileNotFoundError:
        print(f"Error: The file '{INPUT_CSV}' was not found. Please upload it first.")
        return

    if QUESTION_COLUMN not in df.columns or ANSWER_COLUMN not in df.columns:
        print(f"Error: Required columns ('{QUESTION_COLUMN}', '{ANSWER_COLUMN}') not in the CSV file.")
        return

    if os.path.exists(OUTPUT_CSV):
        print(f"✅ Found existing prediction file: '{OUTPUT_CSV}'.")
        print("Skipping generation and loading predictions from file for evaluation.")
        try:
            # dtype=str + keep_default_na=False keep an empty prediction as ""
            # instead of turning it into a float NaN.
            predictions_df = pd.read_csv(OUTPUT_CSV, header=None, dtype=str, keep_default_na=False)
            predictions = predictions_df[0].tolist()
        except pd.errors.EmptyDataError:
            # A zero-byte cache file means no predictions were stored.
            predictions = []
    else:
        print(f"'{OUTPUT_CSV}' not found. Starting prediction generation process...")
        predictions = []
        total_questions = len(df)
        # Count by position so the progress line and the pacing check are
        # right for any index labels.
        for position, (_, row) in enumerate(df.iterrows(), start=1):
            question = row[QUESTION_COLUMN]
            print(f"Processing question {position}/{total_questions} with Colosseum (MCQ Mode)...")
            answer_letter = generate_answer(question)
            predictions.append(answer_letter)
            print(f"  -> Generated Answer: {answer_letter}")

            # Conservative pause between NVIDIA API requests (not after the last one).
            if position < total_questions:
                time.sleep(1)

        predictions_df = pd.DataFrame(predictions)
        predictions_df.to_csv(OUTPUT_CSV, header=False, index=False)
        print(f"\nSuccessfully generated predictions and saved them to '{OUTPUT_CSV}'.")
        try:
            from google.colab import files
            files.download(OUTPUT_CSV)
        except ImportError:
            print(f"To download '{OUTPUT_CSV}', see the file browser on the left.")

    # The gold letter is taken as the first character of the answer text.
    # A missing or blank answer gets a marker that can never equal a
    # prediction, instead of raising IndexError on [0].
    missing_gold = "<missing>"
    ground_truths = []
    for ans in df[ANSWER_COLUMN].tolist():
        text = "" if pd.isna(ans) else str(ans).strip()
        ground_truths.append(text[0] if text else missing_gold)

    evaluate_mcq_accuracy(predictions, ground_truths)


if __name__ == "__main__":
    main()

[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.19.0 requires torch==2.4.0, but you have torch 2.7.1 which is incompatible.
vllm 0.6.1.post1 requires torch==2.4.0, but you have torch 2.7.1 which is incompatible.
xformers 0.0.27.post2 requires torch==2.4.0, but you have torch 2.7.1 which is incompatible.
vllm-flash-attn 2.6.1 requires torch==2.4.0, but you have torch 2.7.1 which is incompatible.[0m[31m
[0m'/content/drive/MyDrive/AraHealthQA/predictions_mcq_colosseum.csv' not found. Starting prediction generation process...
Processing question 1/100 with Colosseum (MCQ Mode)...
  -> Generated Answer: ه
Processing question 2/100 with Colosseum (MCQ Mode)...
  -> Generated Answer: ب
Processing question 3/100 with Colosseum (MCQ Mode)...
  -> Generated Answer: ج
Processing question 4/100 with Colosseum (MCQ Mode)...
  -> Generated Answer: ه

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


🚀 Starting Evaluation...
Correct Predictions: 50 / 100
📊 Accuracy: 50.00%
✅ Evaluation Complete.




# Track 2: MedArabiQ 2025 (General Health)


# Sub-Task 1: Multiple Choice QA




#  llama-3.3-70b-versatile -> 57%

In [None]:
# Step 1: Install all necessary libraries
!pip install groq openai nltk rouge-score bert-score transformers sentencepiece -q

import os
import re
import pandas as pd
import groq # Import the Groq library
import time
from getpass import getpass
import numpy as np
from sklearn.metrics import accuracy_score

# --- Groq API Configuration ---
# Prefer the key stored in Colab's secrets; fall back to an interactive prompt.
try:
    from google.colab import userdata
    GROQ_API_KEY = userdata.get('GROQ_API_KEY')
except Exception:
    # Broad on purpose: besides ImportError (not running in Colab), userdata.get
    # reports a missing or inaccessible secret with its own exception types,
    # which an (ImportError, KeyError) clause does not cover.
    GROQ_API_KEY = None

if not GROQ_API_KEY:
    print("Secret 'GROQ_API_KEY' not found. Please add it to the Colab secrets manager.")
    GROQ_API_KEY = getpass('Please enter your Groq API key: ')

# Initialize the Groq client
client = groq.Client(api_key=GROQ_API_KEY)

# --- File paths and column names for the MCQ task ---
INPUT_CSV = '/content/drive/MyDrive/AraHealthQA/multiple-choice-questions.csv'
OUTPUT_CSV = '/content/drive/MyDrive/AraHealthQA/predictions_mcq_llama3.3-70b.csv' # per-model file so runs do not overwrite each other
QUESTION_COLUMN = 'Question'
ANSWER_COLUMN = 'Answer'


# --- Function to Generate Answers for MCQ Task with Llama 3.3 ---
def generate_answer(question):
    """
    Ask Llama 3.3 70B (through the module-level Groq ``client``) to answer one
    multiple-choice question and return the chosen option letter.

    Args:
        question: Full question text, including its lettered options.

    Returns:
        A single Arabic letter when one can be extracted from the reply, ""
        when the reply contains no Arabic letter, or a human-readable error
        string when every attempt raised an exception.
    """
    system_prompt = """You are a medical exam answering machine. Your only task is to answer the following multiple-choice medical question. Read the question and the provided options (أ, ب, ج, د, ه). Your response must be ONLY the single Arabic letter corresponding to the correct answer. Do not provide any explanation, reasoning, or any other text. For example, if option 'ب' is correct, your entire response must be 'ب'."""

    # Any Arabic letter except tatweel (U+0640), so "هـ" still reads as a lone "ه".
    letter = r'[\u0621-\u063A\u0641-\u064A]'
    # A letter that is not glued to another letter, i.e. an option label rather
    # than the first character of a word such as "الإجابة".
    standalone_letter = re.compile(rf'(?<!{letter}){letter}(?!{letter})')
    # Fallback: the very first Arabic character, whatever surrounds it.
    first_letter = re.compile(r'[\u0621-\u064A]')

    max_retries = 3
    retry_delay = 5
    for attempt in range(max_retries):
        try:
            chat_completion = client.chat.completions.create(
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": question},
                ],
                model="llama-3.3-70b-versatile",
                temperature=0.0,
                max_tokens=5,
            )
            # The content field may be None; treat that as an empty reply
            # instead of failing on .strip() and burning the remaining retries.
            response_text = (chat_completion.choices[0].message.content or "").strip()

            # Prefer a free-standing option letter; otherwise fall back to the
            # first Arabic character found.
            match = standalone_letter.search(response_text) or first_letter.search(response_text)
            return match.group(0) if match else ""

        except Exception as e:
            if attempt < max_retries - 1:
                print(f"  -> An error occurred: {e}. Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
            else:
                return f"API Error after multiple retries: {e}"
    # Only reachable when max_retries is 0.
    return f"Failed to get a response after {max_retries} attempts."


# --- Function to Evaluate MCQ Accuracy ---
def evaluate_mcq_accuracy(predictions, ground_truths):
    """
    Print the exact-match accuracy of the MCQ predictions and return it.

    Args:
        predictions: Predicted option letters, one per question.
        ground_truths: Gold option letters, aligned with ``predictions``.

    Returns:
        The fraction of positions where prediction == gold (0.0-1.0), or
        None when there is nothing to score.
    """
    print("\n" + "="*50)
    print("🚀 Starting Evaluation...")
    print("="*50)

    # On a length mismatch, compare only the overlapping prefix (and say so).
    if len(predictions) != len(ground_truths):
        print("Warning: Prediction and ground truth lists have different lengths. Evaluation might be inaccurate.")
        min_len = min(len(predictions), len(ground_truths))
        predictions = predictions[:min_len]
        ground_truths = ground_truths[:min_len]

    total_predictions = len(ground_truths)
    if total_predictions == 0:
        print("No predictions to evaluate.")
        return None

    # Derive the accuracy from the counted matches so the two printed figures
    # always agree with each other.
    correct_predictions = sum(p == g for p, g in zip(predictions, ground_truths))
    accuracy = correct_predictions / total_predictions

    print(f"Correct Predictions: {correct_predictions} / {total_predictions}")
    print(f"📊 Accuracy: {accuracy * 100:.2f}%")
    print("="*50 + "\n✅ Evaluation Complete.\n" + "="*50)
    return accuracy

# --- Main Execution for MCQ Task ---
def main():
    """
    Run the MCQ pipeline: load the questions, obtain predictions, score them.

    Predictions are read from OUTPUT_CSV when that file exists; otherwise every
    question is sent to generate_answer() and the results are written to
    OUTPUT_CSV (one prediction per line, no header) before evaluation.
    """
    try:
        df = pd.read_csv(INPUT_CSV)
    except FileNotFoundError:
        print(f"Error: The file '{INPUT_CSV}' was not found. Please upload it first.")
        return

    if QUESTION_COLUMN not in df.columns or ANSWER_COLUMN not in df.columns:
        print(f"Error: Required columns ('{QUESTION_COLUMN}', '{ANSWER_COLUMN}') not in the CSV file.")
        return

    if os.path.exists(OUTPUT_CSV):
        print(f"✅ Found existing prediction file: '{OUTPUT_CSV}'.")
        print("Skipping generation and loading predictions from file for evaluation.")
        try:
            # dtype=str + keep_default_na=False keep an empty prediction as ""
            # instead of turning it into a float NaN.
            predictions_df = pd.read_csv(OUTPUT_CSV, header=None, dtype=str, keep_default_na=False)
            predictions = predictions_df[0].tolist()
        except pd.errors.EmptyDataError:
            # A zero-byte cache file means no predictions were stored.
            predictions = []
    else:
        print(f"'{OUTPUT_CSV}' not found. Starting prediction generation process...")
        predictions = []
        total_questions = len(df)
        # Count by position so the progress line and the pacing check are
        # right for any index labels.
        for position, (_, row) in enumerate(df.iterrows(), start=1):
            question = row[QUESTION_COLUMN]
            print(f"Processing question {position}/{total_questions} with Llama 3.3 70B (MCQ Mode)...")
            answer_letter = generate_answer(question)
            predictions.append(answer_letter)
            print(f"  -> Generated Answer: {answer_letter}")

            # Pause between requests for the Groq API rate limit (not after the last one).
            if position < total_questions:
                time.sleep(3)

        predictions_df = pd.DataFrame(predictions)
        predictions_df.to_csv(OUTPUT_CSV, header=False, index=False)
        print(f"\nSuccessfully generated predictions and saved them to '{OUTPUT_CSV}'.")
        try:
            from google.colab import files
            files.download(OUTPUT_CSV)
        except ImportError:
            print(f"To download '{OUTPUT_CSV}', see the file browser on the left.")

    # The gold letter is taken as the first character of the answer text.
    # A missing or blank answer gets a marker that can never equal a
    # prediction, instead of raising IndexError on [0].
    missing_gold = "<missing>"
    ground_truths = []
    for ans in df[ANSWER_COLUMN].tolist():
        text = "" if pd.isna(ans) else str(ans).strip()
        ground_truths.append(text[0] if text else missing_gold)

    evaluate_mcq_accuracy(predictions, ground_truths)


if __name__ == "__main__":
    main()

[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.19.0 requires torch==2.4.0, but you have torch 2.7.1 which is incompatible.
vllm 0.6.1.post1 requires torch==2.4.0, but you have torch 2.7.1 which is incompatible.
xformers 0.0.27.post2 requires torch==2.4.0, but you have torch 2.7.1 which is incompatible.
vllm-flash-attn 2.6.1 requires torch==2.4.0, but you have torch 2.7.1 which is incompatible.[0m[31m
[0m'/content/drive/MyDrive/AraHealthQA/predictions_mcq_llama3.3-70b.csv' not found. Starting prediction generation process...
Processing question 1/100 with Llama 3.3 70B (MCQ Mode)...
  -> Generated Answer: ب
Processing question 2/100 with Llama 3.3 70B (MCQ Mode)...
  -> Generated Answer: ب
Processing question 3/100 with Llama 3.3 70B (MCQ Mode)...
  -> Generated Answer: د
Processing question 4/100 with Llama 3.3 70B (MCQ Mode)...
  -> 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


🚀 Starting Evaluation...
Correct Predictions: 57 / 100
📊 Accuracy: 57.00%
✅ Evaluation Complete.




# Track 2: MedArabiQ 2025 (General Health)


# Sub-Task 1: Fill-in-the-blank with Choices




# llama-3.3-70b-versatile -> 69%

In [None]:
import os
import re
import pandas as pd
import groq
import time
from getpass import getpass
import numpy as np
from sklearn.metrics import accuracy_score

# --- Groq API Configuration ---
# Prefer the key stored in Colab's secrets; fall back to an interactive prompt.
try:
    from google.colab import userdata
    GROQ_API_KEY = userdata.get('GROQ_API_KEY')
except Exception:
    # Broad on purpose: besides ImportError (not running in Colab), userdata.get
    # reports a missing or inaccessible secret with its own exception types,
    # which an (ImportError, KeyError) clause does not cover.
    GROQ_API_KEY = None

if not GROQ_API_KEY:
    print("Secret 'GROQ_API_KEY' not found. Please add it to the Colab secrets manager.")
    GROQ_API_KEY = getpass('Please enter your Groq API key: ')

# Initialize the Groq client
client = groq.Client(api_key=GROQ_API_KEY)


# --- File paths and column names for the fill-in-the-blank (with choices) task ---
# NOTE: The input CSV name suggests a "fill-in-the-blank" task, but the code logic
# below is the same single-letter multiple-choice flow. Ensure this is the correct file.
INPUT_CSV = '/content/drive/MyDrive/AraHealthQA/fill-in-the-blank-choices.csv'
OUTPUT_CSV = '/content/drive/MyDrive/AraHealthQA/predictions_fitb_choices.csv'

# Column names used by this dataset file
QUESTION_COLUMN = 'Question - Arabic'
ANSWER_COLUMN = 'Answer - Arabic'


# --- Function to Generate Answers for MCQ Task ---
def generate_answer(question):
    """
    Send one multiple-choice question to the Groq API (module-level ``client``)
    and return the chosen option letter.

    Args:
        question: Full question text, including its lettered options.

    Returns:
        A single Arabic letter when one can be extracted from the reply, ""
        when the reply contains no Arabic letter, "API_ERROR" when the final
        attempt raised an exception, or "FAILED_ATTEMPTS" when the retry loop
        ends without returning.
    """
    system_prompt = """You are a medical exam answering machine. Your only task is to answer the following multiple-choice medical question. Read the question and the provided options (أ, ب, ج, د, ه). Your response must be ONLY the single Arabic letter corresponding to the correct answer. Do not provide any explanation, reasoning, or any other text. For example, if option 'ب' is correct, your entire response must be 'ب'."""

    # Any Arabic letter except tatweel (U+0640), so "هـ" still reads as a lone "ه".
    letter = r'[\u0621-\u063A\u0641-\u064A]'
    # A letter that is not glued to another letter, i.e. an option label rather
    # than the first character of a word such as "الإجابة".
    standalone_letter = re.compile(rf'(?<!{letter}){letter}(?!{letter})')
    # Fallback: the very first Arabic character, whatever surrounds it.
    first_letter = re.compile(r'[\u0621-\u064A]')

    max_retries = 3
    retry_delay = 5
    for attempt in range(max_retries):
        try:
            chat_completion = client.chat.completions.create(
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": question},
                ],
                # NOTE(review): the notebook heading for this cell reports results
                # for llama-3.3-70b-versatile, but this is the model requested —
                # confirm which model the reported score belongs to.
                model="llama3-70b-8192",
                temperature=0.0,
                max_tokens=5,
            )
            # The content field may be None; treat that as an empty reply
            # instead of failing on .strip() and burning the remaining retries.
            response_text = (chat_completion.choices[0].message.content or "").strip()

            # Prefer a free-standing option letter; otherwise fall back to the
            # first Arabic character found.
            match = standalone_letter.search(response_text) or first_letter.search(response_text)
            if match:
                return match.group(0)
            print(f"  -> Warning: Model returned an unexpected response: '{response_text}'. Recording as empty.")
            return ""

        except Exception as e:
            if attempt < max_retries - 1:
                print(f"  -> An error occurred: {e}. Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
            else:
                print(f"  -> API Error after multiple retries: {e}")
                return "API_ERROR"
    # Only reachable when max_retries is 0.
    return "FAILED_ATTEMPTS"


# --- Function to Evaluate MCQ Accuracy ---
def evaluate_mcq_accuracy(predictions, ground_truths):
    """
    Print an accuracy report for MCQ predictions; nothing is returned.

    Args:
        predictions: Sequence of predicted labels. Entries equal to
            "API_ERROR" or "FAILED_ATTEMPTS" are treated as failed requests
            and are left out of the accuracy figure.
        ground_truths: Sequence of reference labels, aligned by position
            with `predictions`.

    When the two sequences differ in length, a warning is printed and both
    are cut to the shorter length before anything is counted.
    """
    rule = "=" * 50
    print("\n" + rule)
    print("🚀 Starting Evaluation...")
    print(rule)

    n_predictions, n_truths = len(predictions), len(ground_truths)
    if n_predictions != n_truths:
        print("Warning: Prediction and ground truth lists have different lengths. Evaluation might be inaccurate.")
        shortest = min(n_predictions, n_truths)
        predictions, ground_truths = predictions[:shortest], ground_truths[:shortest]

    # Keep only the (prediction, truth) pairs whose request actually succeeded.
    failure_markers = ("API_ERROR", "FAILED_ATTEMPTS")
    scored = [(p, g) for p, g in zip(predictions, ground_truths) if p not in failure_markers]
    if not scored:
        print("No valid predictions to evaluate. Check for widespread API errors.")
        return

    valid_predictions = [p for p, _ in scored]
    valid_ground_truths = [g for _, g in scored]

    accuracy = accuracy_score(valid_ground_truths, valid_predictions)
    correct_predictions = sum(p == g for p, g in scored)
    total_questions = len(ground_truths)
    total_valid_predictions = len(scored)
    api_errors = total_questions - total_valid_predictions

    print(f"Total Questions Attempted: {total_questions}")
    print(f"API Errors/Failed Attempts: {api_errors}")
    print(f"Valid Predictions to Evaluate: {total_valid_predictions}")
    print("-" * 20)
    print(f"Correct Predictions: {correct_predictions} / {total_valid_predictions}")
    print(f"📊 Accuracy (on valid responses): {accuracy * 100:.2f}%")
    print(f"{rule}\n✅ Evaluation Complete.\n{rule}")


# --- Main Execution for MCQ Task ---
def main():
    """
    Run the answer-letter pipeline configured by INPUT_CSV / OUTPUT_CSV.

    Steps:
      1. Load INPUT_CSV and check that QUESTION_COLUMN and ANSWER_COLUMN exist.
      2. Drop rows whose question or answer is missing, and rows whose answer
         is blank after stripping whitespace (such rows have no answer letter).
      3. If OUTPUT_CSV already exists, reuse the predictions stored in it.
         Otherwise call generate_answer() for every question, pausing between
         requests, and write the predictions to OUTPUT_CSV (one per line, no
         header).
      4. Score the predictions against the first character of each answer
         with evaluate_mcq_accuracy().

    Returns:
        None. Problems with the input file are reported with a printed
        message and end the run early.
    """
    try:
        # Use encoding='utf-8' to handle Arabic characters properly
        df = pd.read_csv(INPUT_CSV, encoding='utf-8')
    except FileNotFoundError:
        print(f"Error: The file '{INPUT_CSV}' was not found. Please check the path.")
        return
    except Exception as e:
        print(f"An error occurred while reading the CSV: {e}")
        return

    # Check that the configured column names exist in the DataFrame
    if QUESTION_COLUMN not in df.columns or ANSWER_COLUMN not in df.columns:
        print(f"Error: Required columns ('{QUESTION_COLUMN}', '{ANSWER_COLUMN}') not found in the CSV.")
        print(f"Available columns are: {df.columns.tolist()}")
        return

    # Drop rows where the question or answer is missing
    df = df.dropna(subset=[QUESTION_COLUMN, ANSWER_COLUMN])

    # A whitespace-only answer survives dropna() but has no first character,
    # so taking [0] of it would raise IndexError. Skip such rows explicitly.
    answer_text = df[ANSWER_COLUMN].astype(str).str.strip()
    has_answer = answer_text != ""
    skipped = int((~has_answer).sum())
    if skipped:
        print(f"Warning: Skipping {skipped} row(s) whose '{ANSWER_COLUMN}' value is blank.")
    df = df[has_answer].reset_index(drop=True)

    # The ground truth is the first character of the Answer column. It is
    # computed once here and reused for both progress logging and evaluation.
    ground_truths = answer_text[has_answer].str[0].tolist()

    if os.path.exists(OUTPUT_CSV):
        print(f"✅ Found existing prediction file: '{OUTPUT_CSV}'.")
        print("Skipping generation and loading predictions from file for evaluation.")
        predictions_df = pd.read_csv(OUTPUT_CSV, header=None, encoding='utf-8')
        predictions = predictions_df[0].astype(str).tolist()  # Ensure predictions are read as strings
    else:
        print(f"'{OUTPUT_CSV}' not found. Starting prediction generation process...")
        predictions = []
        total_questions = len(df)
        for position, (question, truth) in enumerate(zip(df[QUESTION_COLUMN], ground_truths)):
            print(f"Processing question {position + 1}/{total_questions}...")
            answer_letter = generate_answer(question)
            predictions.append(answer_letter)
            print(f"  -> Ground Truth: {truth} | Model's Prediction: {answer_letter}")

            # Delay for Groq API rate limit (30 RPM limit); not needed after the last question
            if position < total_questions - 1:
                time.sleep(2.1)  # Sleep for slightly over 2 seconds

        predictions_df = pd.DataFrame(predictions)
        predictions_df.to_csv(OUTPUT_CSV, header=False, index=False, encoding='utf-8')
        print(f"\nSuccessfully generated predictions and saved them to '{OUTPUT_CSV}'.")
        try:
            # Only available inside Google Colab; triggers a browser download.
            from google.colab import files
            files.download(OUTPUT_CSV)
        except ImportError:
            print(f"To download '{OUTPUT_CSV}', see the file browser on the left.")

    evaluate_mcq_accuracy(predictions, ground_truths)


# Run the pipeline only when this code is executed directly (not when imported).
if __name__ == "__main__":
    main()

'/content/drive/MyDrive/AraHealthQA/predictions_fitb_choices.csv' not found. Starting prediction generation process...
Processing question 1/100...
  -> Ground Truth: أ | Model's Prediction: ج
Processing question 2/100...
  -> Ground Truth: ج | Model's Prediction: ج
Processing question 3/100...
  -> Ground Truth: أ | Model's Prediction: أ
Processing question 4/100...
  -> Ground Truth: ج | Model's Prediction: ج
Processing question 5/100...
  -> Ground Truth: د | Model's Prediction: د
Processing question 6/100...
  -> Ground Truth: أ | Model's Prediction: ب
Processing question 7/100...
  -> Ground Truth: ب | Model's Prediction: ب
Processing question 8/100...
  -> Ground Truth: د | Model's Prediction: ج
Processing question 9/100...
  -> Ground Truth: ج | Model's Prediction: ج
Processing question 10/100...
  -> Ground Truth: أ | Model's Prediction: ب
Processing question 11/100...
  -> Ground Truth: ج | Model's Prediction: ج
Processing question 12/100...
  -> Ground Truth: ج | Model's Pred

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


🚀 Starting Evaluation...
Total Questions Attempted: 100
API Errors/Failed Attempts: 0
Valid Predictions to Evaluate: 100
--------------------
Correct Predictions: 69 / 100
📊 Accuracy (on valid responses): 69.00%
✅ Evaluation Complete.



# Track 2: MedArabiQ 2025 (General Health)


# Sub-Task 1: Fill-in-the-blank with Choices




# Fill-in-the-blank with choices: deepseek-r1-distill-llama-70b -> 17.00%

In [None]:
import os
import re
import pandas as pd
import groq
import time
from getpass import getpass
import numpy as np
from sklearn.metrics import accuracy_score

# --- Groq API Configuration ---
# Resolve the Groq API key: prefer the Colab secrets manager, and fall back to
# an interactive prompt when not running in Colab or when the secret cannot
# be read.
GROQ_API_KEY = None
try:
    from google.colab import userdata
except ImportError:
    print("google.colab is not available; the Colab secrets manager cannot be used.")
else:
    try:
        GROQ_API_KEY = userdata.get('GROQ_API_KEY')
    except Exception as e:
        # userdata.get() reports a missing secret or a secret not shared with
        # this notebook through its own exception classes, not KeyError, so a
        # KeyError-only handler would let them escape and skip the fallback.
        # Any failure to read the secret is handled by prompting below.
        print(f"Secret 'GROQ_API_KEY' not found. Please add it to the Colab secrets manager. ({type(e).__name__})")

if not GROQ_API_KEY:
    GROQ_API_KEY = getpass('Please enter your Groq API key: ')

# Initialize the Groq client
client = groq.Client(api_key=GROQ_API_KEY)


# --- File paths and column names for the fill-in-the-blank (with choices) task ---
# INPUT_CSV is the question file read by main(); OUTPUT_CSV is where main()
# writes the predictions and where it looks for previously saved ones.
INPUT_CSV = '/content/drive/MyDrive/AraHealthQA/fill-in-the-blank-choices.csv'
OUTPUT_CSV = '/content/drive/MyDrive/AraHealthQA/predictions_fitb_choices_deepseek.csv'

# Column names from the input CSV: the question text and the reference answer
# (main() uses the first character of the answer as the ground-truth letter).
QUESTION_COLUMN = 'Question - Arabic'
ANSWER_COLUMN = 'Answer - Arabic'


# --- Function to Generate Answers for MCQ Task using DeepSeek ---
def generate_answer(question, *, max_tokens=2048):
    """
    Ask the DeepSeek R1 distill model (through the module-level `client`) to
    answer one question with a single option letter.

    The model can emit its reasoning inside <think>...</think> before the
    final answer. The reasoning is discarded and the first Arabic letter of
    what remains is returned. When the reply is cut off before the reasoning
    ends there is no final answer, so the question is recorded as unanswered
    instead of returning a letter that merely appears inside the reasoning.

    Args:
        question: Question text (with its options), sent as the user message.
        max_tokens: Completion budget for the reply, reasoning included. The
            previous fixed value of 150 truncated the reasoning before any
            answer was produced; pass a different value to trade cost
            against truncation risk.

    Returns:
        The first Arabic letter of the answer; "" when no answer letter can
        be extracted (including truncated replies); "API_ERROR" when the
        final attempt raised. The trailing "FAILED_ATTEMPTS" is only reached
        if the retry loop ends without returning.
    """
    system_prompt = """You are a medical exam answering machine. Your only task is to answer the following multiple-choice medical question. Read the question and the provided options (أ, ب, ج, د, ه). Your response must be ONLY the single Arabic letter corresponding to the correct answer. Do not provide any explanation, reasoning, or any other text. Do not use <think> tags. For example, if option 'ب' is correct, your entire response must be 'ب'."""

    max_retries = 3
    retry_delay = 5
    for attempt in range(max_retries):
        try:
            chat_completion = client.chat.completions.create(
              messages=[
                  {"role":"system", "content": system_prompt},
                  {"role":"user","content":question}
              ],
              model="deepseek-r1-distill-llama-70b",
              temperature=0.0,
              max_tokens=max_tokens,
            )
            choice = chat_completion.choices[0]
            raw_response_text = choice.message.content.strip()
            reasoning_closed = '</think>' in raw_response_text

            # A reply that stopped at the token limit without ever closing its
            # reasoning contains no final answer, whatever letters it mentions.
            if getattr(choice, 'finish_reason', None) == 'length' and not reasoning_closed:
                print(f"  -> Warning: Response hit the {max_tokens}-token limit before the reasoning finished. Recording as empty.")
                return ""

            # Remove completed reasoning blocks.
            cleaned_text = re.sub(r'<think>.*?</think>', '', raw_response_text, flags=re.DOTALL)
            # Defensive: a closing tag left over without its opening tag means
            # everything before it was reasoning; keep only what follows it.
            if '</think>' in cleaned_text:
                cleaned_text = cleaned_text.rsplit('</think>', 1)[1]
            # An opening tag that is still present was never closed, so the
            # text after it is unfinished reasoning rather than an answer.
            cleaned_text = cleaned_text.split('<think>', 1)[0].strip()

            arabic_letters = re.findall(r'[\u0621-\u064A]', cleaned_text)
            if arabic_letters:
                return arabic_letters[0]
            else:
                print(f"  -> Warning: No Arabic letter found after cleaning. Cleaned response: '{cleaned_text}'. Recording as empty.")
                return ""

        except Exception as e:
            if attempt < max_retries - 1:
                print(f"  -> An error occurred: {e}. Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
            else:
                print(f"  -> API Error after multiple retries: {e}")
                return "API_ERROR"
    return "FAILED_ATTEMPTS"


# --- Function to Evaluate MCQ Accuracy (Unchanged) ---
def evaluate_mcq_accuracy(predictions, ground_truths):
    """
    Print how many predictions match their reference letter; returns nothing.

    Predictions equal to "API_ERROR" or "FAILED_ATTEMPTS" mark failed
    requests: they are counted and reported separately and are excluded from
    the accuracy figure. If the two inputs differ in length, a warning is
    printed and both are truncated to the shorter one first.

    Args:
        predictions: Sequence of predicted labels.
        ground_truths: Sequence of reference labels, aligned by position.
    """
    separator = "=" * 50
    print("\n" + separator)
    print("🚀 Starting Evaluation...")
    print(separator)

    if len(predictions) != len(ground_truths):
        print("Warning: Prediction and ground truth lists have different lengths. Evaluation might be inaccurate.")
        common_len = min(len(predictions), len(ground_truths))
        predictions = predictions[:common_len]
        ground_truths = ground_truths[:common_len]

    # Walk both lists in lockstep and keep the pairs with a usable prediction.
    valid_predictions = []
    valid_ground_truths = []
    for guess, truth in zip(predictions, ground_truths):
        if guess in ("API_ERROR", "FAILED_ATTEMPTS"):
            continue
        valid_predictions.append(guess)
        valid_ground_truths.append(truth)

    if len(valid_predictions) == 0:
        print("No valid predictions to evaluate. Check for widespread API errors.")
        return

    total_questions = len(ground_truths)
    total_valid_predictions = len(valid_predictions)
    api_errors = total_questions - total_valid_predictions
    correct_predictions = sum(guess == truth for guess, truth in zip(valid_predictions, valid_ground_truths))
    accuracy = accuracy_score(valid_ground_truths, valid_predictions)

    report_lines = [
        f"Total Questions Attempted: {total_questions}",
        f"API Errors/Failed Attempts: {api_errors}",
        f"Valid Predictions to Evaluate: {total_valid_predictions}",
        "-" * 20,
        f"Correct Predictions: {correct_predictions} / {total_valid_predictions}",
        f"📊 Accuracy (on valid responses): {accuracy * 100:.2f}%",
        separator + "\n✅ Evaluation Complete.\n" + separator,
    ]
    for line in report_lines:
        print(line)


# --- Main Execution for MCQ Task (Unchanged) ---
def main():
    """
    Run the fill-in-the-blank (with choices) pipeline with the DeepSeek model.

    Steps:
      1. Load INPUT_CSV and check that QUESTION_COLUMN and ANSWER_COLUMN exist.
      2. Drop rows whose question or answer is missing, and rows whose answer
         is blank after stripping whitespace (such rows have no answer letter).
      3. If OUTPUT_CSV already exists, reuse the predictions stored in it.
         Otherwise call generate_answer() for every question, pausing between
         requests, and write the predictions to OUTPUT_CSV (one per line, no
         header).
      4. Score the predictions against the first character of each answer
         with evaluate_mcq_accuracy().

    Returns:
        None. Problems with the input file are reported with a printed
        message and end the run early.
    """
    try:
        df = pd.read_csv(INPUT_CSV, encoding='utf-8')
    except FileNotFoundError:
        print(f"Error: The file '{INPUT_CSV}' was not found. Please check the path.")
        return
    except Exception as e:
        print(f"An error occurred while reading the CSV: {e}")
        return

    if QUESTION_COLUMN not in df.columns or ANSWER_COLUMN not in df.columns:
        print(f"Error: Required columns ('{QUESTION_COLUMN}', '{ANSWER_COLUMN}') not found in the CSV.")
        print(f"Available columns are: {df.columns.tolist()}")
        return

    df = df.dropna(subset=[QUESTION_COLUMN, ANSWER_COLUMN])

    # A whitespace-only answer survives dropna() but has no first character,
    # so taking [0] of it would raise IndexError. Skip such rows explicitly.
    answer_text = df[ANSWER_COLUMN].astype(str).str.strip()
    has_answer = answer_text != ""
    skipped = int((~has_answer).sum())
    if skipped:
        print(f"Warning: Skipping {skipped} row(s) whose '{ANSWER_COLUMN}' value is blank.")
    df = df[has_answer].reset_index(drop=True)

    # Ground truth = first character of the answer; computed once and reused
    # for both progress logging and evaluation.
    ground_truths = answer_text[has_answer].str[0].tolist()

    if os.path.exists(OUTPUT_CSV):
        print(f"✅ Found existing prediction file: '{OUTPUT_CSV}'.")
        print("Skipping generation and loading predictions from file for evaluation.")
        predictions_df = pd.read_csv(OUTPUT_CSV, header=None, encoding='utf-8')
        predictions = predictions_df[0].astype(str).tolist()
    else:
        print(f"'{OUTPUT_CSV}' not found. Starting prediction generation process with DeepSeek...")
        predictions = []
        total_questions = len(df)
        for position, (question, truth) in enumerate(zip(df[QUESTION_COLUMN], ground_truths)):
            print(f"Processing question {position + 1}/{total_questions} with DeepSeek (MCQ Mode)...")
            answer_letter = generate_answer(question)
            predictions.append(answer_letter)
            print(f"  -> Ground Truth: {truth} | Model's Prediction: {answer_letter}")

            # Pause between requests; not needed after the last question.
            if position < total_questions - 1:
                time.sleep(2.1)

        predictions_df = pd.DataFrame(predictions)
        predictions_df.to_csv(OUTPUT_CSV, header=False, index=False, encoding='utf-8')
        print(f"\nSuccessfully generated predictions and saved them to '{OUTPUT_CSV}'.")
        try:
            # Only available inside Google Colab; triggers a browser download.
            from google.colab import files
            files.download(OUTPUT_CSV)
        except ImportError:
            print(f"To download '{OUTPUT_CSV}', see the file browser on the left.")

    evaluate_mcq_accuracy(predictions, ground_truths)


# Run the pipeline only when this code is executed directly (not when imported).
if __name__ == "__main__":
    main()

'/content/drive/MyDrive/AraHealthQA/predictions_fitb_choices_deepseek.csv' not found. Starting prediction generation process with DeepSeek...
Processing question 1/100 with DeepSeek (MCQ Mode)...
  -> Ground Truth: أ | Model's Prediction: أ
Processing question 2/100 with DeepSeek (MCQ Mode)...
  -> Ground Truth: ج | Model's Prediction: ا
Processing question 3/100 with DeepSeek (MCQ Mode)...
Okay, so I've got this medical question here about pleural effusion. I'm a bit rusty on my respiratory system, but let me try to think this through.

The question is about filling in the blanks in a sentence. It says that in cases of pleural effusion, the decrease or absence of tactile fremitus on the affected side indicates something, which is caused by something else.

First, I need to remember what pleural effusion is. From what I recall, it's when there's an abnormal accumulation of fluid in the pleural space, which is the area between the lungs and the chest wall. This fluid buildup can be due 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


🚀 Starting Evaluation...
Total Questions Attempted: 100
API Errors/Failed Attempts: 0
Valid Predictions to Evaluate: 100
--------------------
Correct Predictions: 17 / 100
📊 Accuracy (on valid responses): 17.00%
✅ Evaluation Complete.
