In [4]:
! pip install anthropic
! pip install openai

Collecting anthropic
  Downloading anthropic-0.75.0-py3-none-any.whl.metadata (28 kB)
Downloading anthropic-0.75.0-py3-none-any.whl (388 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m388.2/388.2 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: anthropic
Successfully installed anthropic-0.75.0


In [6]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,confusion_matrix, classification_report)
from datetime import datetime
import time
from openai import OpenAI
from anthropic import Anthropic

In [7]:
# Mount Google Drive under /content/drive; the dataset path and the output
# folder used later in this notebook both live below /content/drive/MyDrive.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
# Path of the labelled dataset CSV on the mounted Drive (loaded with pd.read_csv).
CSV_PATH = "/content/drive/MyDrive/Memo_Dataset.csv"

In [9]:
# Read the OPENAI_API_KEY and ANTHROPIC_API_KEY values through Colab's
# `userdata` helper so that no key is hard-coded in the notebook.
from google.colab import userdata
OPENAI_API_KEY = userdata.get("OPENAI_API_KEY")
ANTHROPIC_API_KEY = userdata.get("ANTHROPIC_API_KEY")

In [10]:
# Initialize one client per provider. These module-level clients are the ones
# used by call_gpt4o_mini and call_claude_haiku.
openai_client = OpenAI(api_key=OPENAI_API_KEY)
anthropic_client = Anthropic(api_key=ANTHROPIC_API_KEY)

# Display name -> exact API model identifier for each evaluated model.
# The Claude entry matches the identifier that call_claude_haiku sends
# ("claude-haiku-4-5-20251001"), so this table describes what was actually run.
MODELS = {
    "GPT-4o-mini": "gpt-4o-mini-2024-07-18",
    "Claude-Haiku-4.5": "claude-haiku-4-5-20251001"
}


# System prompt sent unchanged to both providers: it frames the task and
# restricts the reply to one of the two labels.
# NOTE(review): the text asks for "EXACTLY one word" although "Not Triggered"
# is two words. Left as-is because changing the prompt would change the
# evaluated behaviour; consider rewording before the next run.
SYSTEM_PROMPT = """You are a clinical reviewer specializing in post-discharge patient assessments. Your role is to classify patient statements as "Triggered" or "Not Triggered" based on whether they indicate a need for urgent medical follow-up.
You must respond with EXACTLY one word: either "Triggered" or "Not Triggered". Do not include any explanation, punctuation, or additional text."""

# User prompt template. `{last_utterance}` is filled with the patient statement
# via str.format() inside the call_* helpers.
USER_PROMPT_TEMPLATE = """Here is a statement from a patient: {last_utterance}

## Task
Determine whether the patient's statement should be classified as a "trigger" response.

A statement is considered "Triggered" if it indicates the patient may require immediate or urgent medical attention, such as follow-up contact or evaluation by a healthcare professional within a short timeframe (e.g., the same day or within 24 hours).

## Output Format
Respond with EXACTLY one of these two options (no punctuation, no explanation):
Triggered
Not Triggered"""


In [11]:
# Load the dataset and keep only the columns used in this evaluation:
# the original text, its English version, and the binary label.
df = pd.read_csv(CSV_PATH)

df = df[['Question', 'Question_eng', 'Trigger']]

print("\nDataset unbalanced shape:", df.shape)
print(df['Trigger'].value_counts())

# Balance the dataset by down-sampling every class to the size of the smallest one.
# Each class is sampled on its own (same seed, same sample size) and the pieces
# are concatenated in sorted label order. This draws the same per-class samples
# as the former groupby(...).apply(...) call, but avoids pandas' deprecation
# warning about apply operating on the grouping column.
min_count = df['Trigger'].value_counts().min()
df_balanced = pd.concat(
    [group.sample(min_count, random_state=42) for _, group in df.groupby('Trigger')]
).reset_index(drop=True)

print("\nBalanced dataset:", len(df_balanced))
print(df_balanced['Trigger'].value_counts())

# Stratified 80/20 split; only test_df is used for the LLM evaluation below.
train_df, test_df = train_test_split(df_balanced, test_size=0.2, random_state=42, stratify=df_balanced['Trigger'])

print(f'\nTest samples: {len(test_df)}')


Dataset unbalanced shape: (13142, 3)
Trigger
1    7687
0    5455
Name: count, dtype: int64

Balanced dataset: 10910
Trigger
0    5455
1    5455
Name: count, dtype: int64

Test samples: 2182


  df_balanced = (df.groupby('Trigger', group_keys=False).apply(lambda x: x.sample(min_count, random_state=42)).reset_index(drop=True))


In [25]:
def call_gpt4o_mini(text, max_retries=3, model="gpt-4o-mini-2024-07-18", base_delay=2):
    """Classify one patient statement with an OpenAI chat model.

    Sends SYSTEM_PROMPT plus USER_PROMPT_TEMPLATE (filled with ``text``) through
    the module-level ``openai_client`` and returns the reply text.

    Args:
        text: Patient statement inserted into the user prompt.
        max_retries: Maximum number of attempts before giving up.
        model: OpenAI model identifier to query (defaults to the pinned
            GPT-4o-mini snapshot used in this evaluation).
        base_delay: Seconds to wait after the first failed attempt. The wait
            doubles after each further failure (exponential backoff), which
            gives rate-limit errors time to clear.

    Returns:
        The reply with surrounding whitespace stripped, or None when every
        attempt raised an exception (or when ``max_retries`` is <= 0).
    """
    for attempt in range(max_retries):
        try:
            response = openai_client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": USER_PROMPT_TEMPLATE.format(last_utterance=text)}
                ],
                temperature=0,
                # Small cap: the expected reply is only one or two words.
                max_tokens=10
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            if attempt < max_retries - 1:
                print(f"  GPT-4o-mini error (attempt {attempt + 1}): {e}. Retrying...")
                # Exponential backoff: base_delay, 2*base_delay, 4*base_delay, ...
                time.sleep(base_delay * (2 ** attempt))
            else:
                print(f"  GPT-4o-mini error after {max_retries} attempts: {e}")
                return None
    # Only reached when the loop body never ran (max_retries <= 0).
    return None

def call_claude_haiku(text, max_retries=3, model="claude-haiku-4-5-20251001", base_delay=2):
    """Classify one patient statement with an Anthropic Claude model.

    Sends SYSTEM_PROMPT plus USER_PROMPT_TEMPLATE (filled with ``text``) through
    the module-level ``anthropic_client`` and returns the reply text.

    Args:
        text: Patient statement inserted into the user prompt.
        max_retries: Maximum number of attempts before giving up.
        model: Anthropic model identifier to query (defaults to the pinned
            Claude Haiku 4.5 snapshot used in this evaluation).
        base_delay: Seconds to wait after the first failed attempt. The wait
            doubles after each further failure (exponential backoff), which
            gives rate-limit errors time to clear.

    Returns:
        The text of the first content block with surrounding whitespace
        stripped, or None when every attempt raised an exception (or when
        ``max_retries`` is <= 0).
    """
    for attempt in range(max_retries):
        try:
            response = anthropic_client.messages.create(
                model=model,
                # Small cap: the expected reply is only one or two words.
                max_tokens=10,
                temperature=0,  # Deterministic output
                system=SYSTEM_PROMPT,
                messages=[
                    {"role": "user", "content": USER_PROMPT_TEMPLATE.format(last_utterance=text)}
                ]
            )
            return response.content[0].text.strip()
        except Exception as e:
            if attempt < max_retries - 1:
                print(f"  Claude Haiku error (attempt {attempt + 1}): {e}. Retrying...")
                # Exponential backoff: base_delay, 2*base_delay, 4*base_delay, ...
                time.sleep(base_delay * (2 ** attempt))
            else:
                print(f"  Claude Haiku error after {max_retries} attempts: {e}")
                return None
    # Only reached when the loop body never ran (max_retries <= 0).
    return None

def parse_response(response_text):
    """
    Parse a raw model reply into a binary trigger label.

    The reply is lower-cased, punctuation and separator characters are turned
    into spaces, and whitespace runs are collapsed before matching. As a result
    variants such as "Not-Triggered", "not_triggered", "Not  Triggered" or
    "not\\ntriggered" are recognised as the negative label instead of falling
    through to the bare "triggered" substring check.

    Returns (prediction, raw_response)
    Prediction values:
        0 = Not Triggered
        1 = Triggered
        3 = Parsing failed (debugging value)
    """
    if response_text is None:
        return 3, None  # API call failed

    lowered = response_text.strip().lower()

    # Form used for label matching: separators become spaces, whitespace collapsed.
    label_form = lowered
    for char in '.,!?;:"\'`*-_()[]':
        label_form = label_form.replace(char, ' ')
    label_form = ' '.join(label_form.split())

    # Direct matching
    if label_form == "triggered":
        return 1, response_text
    if label_form == "not triggered":
        return 0, response_text

    # The negated phrase must be checked first because it contains "triggered".
    if "not triggered" in label_form:
        return 0, response_text
    if "triggered" in label_form:
        return 1, response_text

    # Numeric replies: only sentence punctuation is dropped, so that e.g. "-1"
    # is not mistaken for "1".
    numeric_form = lowered
    for char in '.,!?':
        numeric_form = numeric_form.replace(char, '')
    if numeric_form == "1":
        return 1, response_text
    if numeric_form == "0":
        return 0, response_text

    # Unable to parse
    print(f" Warning: Unable to parse response: '{response_text}' - marking as 3 (parse failure)")
    return 3, response_text  # Sentinel value for parse failures

def predict_with_model(texts, model_name, api_call_func):
    """Run one model over a list of texts and collect parsed predictions.

    Each text is passed to ``api_call_func``; the reply is converted to a label
    by ``parse_response``. Progress is printed every 10 samples and a short
    pause follows every call to limit the request rate.

    Args:
        texts: Sequence of statements to classify.
        model_name: Name used only in the progress message.
        api_call_func: Callable taking one text and returning the raw reply
            (or None on failure).

    Returns:
        (predictions, raw_responses): a NumPy array of labels (0, 1, or 3 for
        parse failures) and the list of raw replies in the same order.
    """
    print(f"\nEvaluating with {model_name}...")

    labels, replies = [], []
    for position, statement in enumerate(texts, start=1):
        label, reply = parse_response(api_call_func(statement))
        labels.append(label)
        replies.append(reply)

        if not position % 10:
            print(f"  Processed {position}/{len(texts)} samples...")

        # limit api call rate
        time.sleep(0.1)

    # Gather (index, raw reply) for every sample that could not be parsed (label 3).
    failed = [
        (index, reply)
        for index, (label, reply) in enumerate(zip(labels, replies))
        if label == 3
    ]
    if failed:
        print(f"\n WARNING: {len(failed)} responses couldn't be parsed (marked as 3)")
        print(f"  These samples will be EXCLUDED from metrics calculation")
        print(f"  First few examples of failed parses:")
        for index, reply in failed[:5]:
            print(f"    Sample {index}: '{reply}'")

    return np.array(labels), replies

def calculate_metrics(y_true, y_pred):
    """
    Calculate all evaluation metrics. Excludes parse failures where y_pred == 3

    Args:
        y_true: Ground-truth labels (0/1); any array-like is accepted.
        y_pred: Predicted labels (0/1, or 3 for parse failures); any
            array-like of the same length is accepted.

    Returns:
        dict with 'accuracy', 'precision', 'recall', 'f1', 'num_samples'
        (samples used), 'num_excluded' (parse failures dropped) and
        'confusion_matrix' (2x2 nested list, rows = actual 0/1,
        columns = predicted 0/1).
    """
    # Coerce to arrays so the element-wise comparison and boolean masking
    # below also work when plain lists are passed in.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    valid_mask = y_pred != 3
    y_true_filtered = y_true[valid_mask]
    y_pred_filtered = y_pred[valid_mask]

    num_excluded = int(np.sum(~valid_mask))
    if num_excluded > 0:
        print(f"  Note: Excluded {num_excluded} samples with parse failures from metrics")

    metrics = {
        'accuracy': accuracy_score(y_true_filtered, y_pred_filtered),
        'precision': precision_score(y_true_filtered, y_pred_filtered, zero_division=0),
        'recall': recall_score(y_true_filtered, y_pred_filtered, zero_division=0),
        'f1': f1_score(y_true_filtered, y_pred_filtered, zero_division=0),
        'num_samples': len(y_true_filtered),
        'num_excluded': num_excluded
    }

    # confusion matrix; labels are fixed so the result is always 2x2, even when
    # only one class appears, because print_evaluation_results indexes
    # cm[0][0]..cm[1][1].
    cm = confusion_matrix(y_true_filtered, y_pred_filtered, labels=[0, 1])
    metrics['confusion_matrix'] = cm.tolist()

    return metrics

def save_predictions(df, predictions, raw_responses, model_name, language, output_dir):
    """Saves detailed predictions to CSV

    A copy of ``df`` is extended with the columns 'prediction', 'raw_response'
    and 'correct' (True where 'Trigger' equals the prediction) and written to
    ``<output_dir>/<model_name>_<language>_<timestamp>.csv``. Spaces in
    ``model_name`` are replaced by underscores in the file name.

    Args:
        df: DataFrame with one row per sample; must contain a 'Trigger' column.
        predictions: Predicted labels, one per row of ``df``.
        raw_responses: Raw model replies, one per row of ``df``.
        model_name: Model name used in the file name.
        language: Language tag used in the file name.
        output_dir: Directory for the CSV; created if it does not exist.

    Returns:
        The extended copy of ``df``; the input frame is not modified.
    """
    results_df = df.copy()
    results_df['prediction'] = predictions
    results_df['raw_response'] = raw_responses
    results_df['correct'] = results_df['Trigger'] == results_df['prediction']

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{model_name.replace(' ', '_')}_{language}_{timestamp}.csv"
    # Make sure the target folder exists so to_csv does not fail on a first run.
    os.makedirs(output_dir, exist_ok=True)
    filepath = os.path.join(output_dir, filename)
    results_df.to_csv(filepath, index=False)
    # Report the full path so the file can be located without knowing output_dir.
    print(f"  Saved predictions to: {filepath}")

    return results_df

def print_evaluation_results(model_name, language, metrics):
    """Pretty-print the metrics of one model/language evaluation.

    Args:
        model_name: Model name shown in the header.
        language: Language tag shown in the header.
        metrics: Dict with the keys 'accuracy', 'precision', 'recall', 'f1',
            'num_samples', 'num_excluded' and 'confusion_matrix' (2x2 nested
            list, rows = actual class, columns = predicted class).
    """
    heavy_rule = "=" * 70

    print("\n" + heavy_rule)
    print(f"Model: {model_name} | Language: {language}")
    print(heavy_rule)

    # Sample counts are only reported when some replies were dropped.
    excluded = metrics['num_excluded']
    if excluded > 0:
        print(f"Evaluated on: {metrics['num_samples']} samples")
        print(f"Excluded: {excluded} samples (parse failures)")
        print("-" * 70)

    score_rows = (
        ("Accuracy:  ", 'accuracy'),
        ("Precision: ", 'precision'),
        ("Recall:    ", 'recall'),
        ("F1 Score:  ", 'f1'),
    )
    for label, key in score_rows:
        print(f"{label}{metrics[key]:.4f}")

    print("\nConfusion Matrix:")
    matrix = metrics['confusion_matrix']
    print(f"                Predicted 0  Predicted 1")
    for actual in (0, 1):
        row = matrix[actual]
        print(f"Actual {actual}        {row[0]:>11}  {row[1]:>11}")

In [None]:
# Main Evaluation Loop
# Runs GPT-4o-mini over both text columns of test_df ('Question' and
# 'Question_eng'), saves per-sample predictions, prints the metrics and records
# one summary row per language in all_results.

# Folder on the mounted Drive that receives the prediction CSVs and summary files.
output_dir = "/content/drive/MyDrive/gpt and claude evaluations"

# Store all results (one dict per model/language pair).
# NOTE(review): re-running this cell resets the list, which also discards any
# rows that later cells appended to it.
all_results = []

# Evaluate GPT-4o-mini
print("\n" + "="*70)
print("EVALUATING: GPT-4o-mini-2024-07-18")
print("="*70)

# Arabic: texts come from the 'Question' column.
print("\nEvaluating on Arabic text...")
arabic_texts = test_df['Question'].tolist()
gpt_arabic_preds, gpt_arabic_raw = predict_with_model(
    arabic_texts, "GPT-4o-mini", call_gpt4o_mini
)
# Parse failures (prediction == 3) are excluded inside calculate_metrics.
gpt_arabic_metrics = calculate_metrics(test_df['Trigger'].values, gpt_arabic_preds)

save_predictions(
    test_df, gpt_arabic_preds, gpt_arabic_raw, "GPT-4o-mini", "Arabic", output_dir
)
print_evaluation_results("GPT-4o-mini", "Arabic", gpt_arabic_metrics)

all_results.append({
    'Model': 'GPT-4o-mini',
    'Language': 'Arabic',
    'Accuracy': gpt_arabic_metrics['accuracy'],
    'Precision': gpt_arabic_metrics['precision'],
    'Recall': gpt_arabic_metrics['recall'],
    'F1': gpt_arabic_metrics['f1']
})

# English: texts come from the 'Question_eng' column; same steps as above.
print("\nEvaluating on English text...")
english_texts = test_df['Question_eng'].tolist()
gpt_english_preds, gpt_english_raw = predict_with_model(
    english_texts, "GPT-4o-mini", call_gpt4o_mini
)
gpt_english_metrics = calculate_metrics(test_df['Trigger'].values, gpt_english_preds)

save_predictions(
    test_df, gpt_english_preds, gpt_english_raw, "GPT-4o-mini", "English", output_dir
)
print_evaluation_results("GPT-4o-mini", "English", gpt_english_metrics)

all_results.append({
    'Model': 'GPT-4o-mini',
    'Language': 'English',
    'Accuracy': gpt_english_metrics['accuracy'],
    'Precision': gpt_english_metrics['precision'],
    'Recall': gpt_english_metrics['recall'],
    'F1': gpt_english_metrics['f1']
})



In [23]:
# Quick inspection of the summary rows collected in all_results so far.
print(all_results)

[{'Model': 'GPT-4o-mini', 'Language': 'Arabic', 'Accuracy': 0.5568285976168652, 'Precision': 0.5396419437340153, 'Recall': 0.773602199816682, 'F1': 0.6357815442561205}, {'Model': 'GPT-4o-mini', 'Language': 'English', 'Accuracy': 0.5600366636113657, 'Precision': 0.5507358636715725, 'Recall': 0.6516956920256646, 'F1': 0.5969773299748111}]


In [None]:
# Evaluate Claude Haiku 4.5
# Relies on arabic_texts, english_texts, output_dir and all_results created in
# the GPT-4o-mini evaluation cell, so that cell must have been run first.
# NOTE(review): running this part more than once appends duplicate
# (Model, Language) rows to all_results.
print("\n" + "="*70)
print("EVALUATING: Claude Haiku 4.5")
print("="*70)

# Arabic: reuses the texts taken from the 'Question' column.
print("\nEvaluating on Arabic text...")
claude_arabic_preds, claude_arabic_raw = predict_with_model(
    arabic_texts, "Claude-Haiku-4.5", call_claude_haiku
)
# Parse failures (prediction == 3) are excluded inside calculate_metrics.
claude_arabic_metrics = calculate_metrics(test_df['Trigger'].values, claude_arabic_preds)

save_predictions(
    test_df, claude_arabic_preds, claude_arabic_raw, "Claude-Haiku-4.5", "Arabic", output_dir
)
print_evaluation_results("Claude-Haiku-4.5", "Arabic", claude_arabic_metrics)

all_results.append({
    'Model': 'Claude-Haiku-4.5',
    'Language': 'Arabic',
    'Accuracy': claude_arabic_metrics['accuracy'],
    'Precision': claude_arabic_metrics['precision'],
    'Recall': claude_arabic_metrics['recall'],
    'F1': claude_arabic_metrics['f1']
})

# English: reuses the texts taken from the 'Question_eng' column; same steps as above.
print("\nEvaluating on English text...")
claude_english_preds, claude_english_raw = predict_with_model(
    english_texts, "Claude-Haiku-4.5", call_claude_haiku
)
claude_english_metrics = calculate_metrics(test_df['Trigger'].values, claude_english_preds)

save_predictions(
    test_df, claude_english_preds, claude_english_raw, "Claude-Haiku-4.5", "English", output_dir
)
print_evaluation_results("Claude-Haiku-4.5", "English", claude_english_metrics)

all_results.append({
    'Model': 'Claude-Haiku-4.5',
    'Language': 'English',
    'Accuracy': claude_english_metrics['accuracy'],
    'Precision': claude_english_metrics['precision'],
    'Recall': claude_english_metrics['recall'],
    'F1': claude_english_metrics['f1']
})

# ========================================
# Create Summary Table
# ========================================
print("\n" + "="*70)
print("CREATING SUMMARY TABLE")
print("="*70)

# all_results is appended to by the evaluation code, so re-running an
# evaluation adds a second row for the same (Model, Language) pair. Keep only
# the most recent row per pair; otherwise DataFrame.pivot below raises on
# duplicate entries.
results_df = (
    pd.DataFrame(all_results)
    .drop_duplicates(subset=['Model', 'Language'], keep='last')
    .reset_index(drop=True)
)

# Make sure the output folder exists before writing the summary files.
os.makedirs(output_dir, exist_ok=True)

# Save as CSV
summary_csv_path = os.path.join(output_dir, "llm_evaluation_summary.csv")
results_df.to_csv(summary_csv_path, index=False)
print(f"\nSummary table saved to: {summary_csv_path}")

# Display table
print("\n" + "="*70)
print("LLM EVALUATION SUMMARY")
print("="*70)
print(results_df.to_string(index=False))

# ========================================
# Generate LaTeX Table
# ========================================
# One row per model, one column per (metric, language) combination.
pivot_df = results_df.pivot(
    index='Model',
    columns='Language',
    values=['Accuracy', 'Precision', 'Recall', 'F1']
)

latex_str = pivot_df.to_latex(
    float_format="%.4f",
    caption="GPT-4o-mini and Claude Haiku 4.5 Evaluation Results",
    label="tab:llm_comparison"
)

latex_path = os.path.join(output_dir, "llm_evaluation_summary.tex")
with open(latex_path, 'w', encoding='utf-8') as f:
    f.write(latex_str)

print(f"\nLaTeX table saved to: {latex_path}")
print("\n" + "="*70)
print("LATEX TABLE PREVIEW")
print("="*70)
print(latex_str)

print("\n" + "="*70)
print("EVALUATION COMPLETE!")
print("="*70)
print(f"All results saved to: {output_dir}")