In [15]:

import pandas as pd
from sklearn.metrics import precision_score, recall_score
from tqdm import tqdm
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed
import boto3
import time

In [16]:

# AWS region passed to the Bedrock runtime client created below.
AWS_REGION = "us-east-1"
# Cap on generated tokens per response; sent as `maxTokens` by call_bedrock.
MAX_TOKEN_COUNT = 256
# Sampling temperature; sent as `temperature` by call_bedrock.
# NOTE(review): 0.0 is presumably chosen to keep answers as repeatable as possible — confirm.
TEMPERATURE = 0.0
# Nucleus-sampling cutoff; sent as `topP` by call_bedrock.
TOP_P = 1.0
# Shared Bedrock runtime client; call_bedrock reads this module-level name directly.
bedrock = boto3.client("bedrock-runtime", region_name=AWS_REGION)

In [17]:
def call_bedrock(model_id, prompt: str, max_retries: int = 3) -> str:
    """
    Call AWS Bedrock using the Converse API and return the model's text reply.

    The prompt is sent as a single user message using the module-level
    ``bedrock`` client and the ``MAX_TOKEN_COUNT`` / ``TEMPERATURE`` / ``TOP_P``
    inference settings.

    Args:
        model_id: Bedrock model or inference-profile identifier.
        prompt: Text of the user message.
        max_retries: Total number of attempts (must be >= 1). Failed attempts
            are followed by an exponential backoff of 1s, 2s, 4s, ...

    Returns:
        The first text content block of the response, whitespace-stripped.

    Raises:
        ValueError: If ``max_retries`` is less than 1, or (after the final
            attempt) if the response contains no text content block.
        Exception: The error from the last attempt is re-raised unchanged.
            Errors whose service error code marks the request itself as
            invalid are re-raised immediately without further attempts.
    """
    # With zero attempts the loop below would fall through and silently
    # return None, which callers would then store as a "prediction".
    if max_retries < 1:
        raise ValueError(f"max_retries must be at least 1, got {max_retries}")

    # Service error codes describing a bad request rather than a transient
    # condition; repeating the identical call cannot succeed.
    non_retryable_codes = {
        "ValidationException",
        "AccessDeniedException",
        "ResourceNotFoundException",
    }

    for attempt in range(max_retries):
        try:
            response = bedrock.converse(
                modelId=model_id,
                messages=[
                    {
                        "role": "user",
                        "content": [{"text": prompt}]
                    }
                ],
                inferenceConfig={
                    "maxTokens": MAX_TOKEN_COUNT,
                    "temperature": TEMPERATURE,
                    "topP": TOP_P,
                }
            )

            # The response content is a list of blocks and the first one is
            # not guaranteed to be text, so return the first block that is.
            for block in response["output"]["message"]["content"]:
                if "text" in block:
                    return block["text"].strip()

            # Raised inside the try on purpose: a malformed/empty reply is
            # retried like any other failed attempt.
            raise ValueError("Bedrock response contained no text content block")

        except Exception as e:
            # botocore client errors expose the service error code under
            # e.response["Error"]["Code"]; read it defensively so that other
            # exception types are simply treated as retryable.
            error_response = getattr(e, "response", None)
            error_code = (
                error_response.get("Error", {}).get("Code")
                if isinstance(error_response, dict)
                else None
            )

            if attempt == max_retries - 1 or error_code in non_retryable_codes:
                # Last attempt, or an error that retrying cannot fix: re-raise
                raise

            # Simple exponential backoff
            sleep_s = 2 ** attempt
            print(f"Error calling Bedrock ({e}), retrying in {sleep_s}s...")
            time.sleep(sleep_s)


def calculate_accuracy(true_labels, pred_labels):
    """
    Fraction of positions where the prediction matches the ground truth.

    Two values match when both are missing (NaN/None/NA), or when neither is
    missing and their normalized string forms are equal. A missing value
    paired with a non-missing value never matches.

    Args:
        true_labels: Sequence (list, array, or Series) of ground-truth values.
        pred_labels: Sequence of predicted values, aligned by position with
            ``true_labels``. Any pandas index is ignored.

    Returns:
        Accuracy as a float in [0, 1]; 0.0 when the inputs are empty.

    Raises:
        ValueError: If the two inputs differ in length.
    """

    def normalize_answer(val):
        """Stringify, trim whitespace and trailing periods/spaces, lowercase."""
        return str(val).strip().rstrip('. ').lower()

    # .values drops any index so that filtered DataFrame columns are compared
    # strictly by position.
    true_arr = pd.Series(true_labels).values
    pred_arr = pd.Series(pred_labels).values

    # Without this check a longer prediction list would be silently truncated
    # and a shorter one would fail with an opaque IndexError.
    if len(true_arr) != len(pred_arr):
        raise ValueError(
            f"true_labels and pred_labels must have the same length "
            f"(got {len(true_arr)} and {len(pred_arr)})"
        )

    total = len(true_arr)
    if total == 0:
        return 0.0

    correct = 0
    for true_val, pred_val in zip(true_arr, pred_arr):
        true_is_nan = pd.isna(true_val)
        pred_is_nan = pd.isna(pred_val)

        if true_is_nan and pred_is_nan:
            # Both missing counts as agreement.
            correct += 1
        elif not true_is_nan and not pred_is_nan:
            if normalize_answer(true_val) == normalize_answer(pred_val):
                correct += 1

    return correct / total

In [21]:
# Quick-run option: uncomment to limit an already-loaded frame to 230 rows.
# df = df.head(230)

# Bedrock model identifiers to evaluate. The evaluation loop runs every model
# listed here; other IDs that have been used with this notebook:
#   "anthropic.claude-3-sonnet-20240229-v1:0"
#   "amazon.titan-text-express-v1"
#   "meta.llama3-8b-instruct-v1:0"
model_id = ["us.anthropic.claude-3-5-haiku-20241022-v1:0"]

# Prompt CSVs to evaluate, one run per file. The `c_<n>_k_<m>` part of each
# file name is parsed back out by the evaluation loop to name its outputs.
# For a single-file smoke test use: ['fixed_RAG_prompt_save_c_50_k_15.csv']
CSV_PATH = [
    "./fixed_RAG_prompt_save_c_50_k_15.csv",
    "./fixed_RAG_prompt_save_c_100_k_8.csv",
    "./fixed_RAG_prompt_save_c_150_k_5.csv",
    "./fixed_RAG_prompt_save_c_200_k_4.csv",
    "./fixed_RAG_prompt_save_c_300_k_2.csv",
    "./fixed_RAG_prompt_save_c_400_k_2.csv",
    "./fixed_RAG_prompt_save_c_500_k_1.csv",
]

def is_numeric(val):
    """Return True when ``val`` parses as a float; NaN and "NA" count as non-numeric."""
    if pd.isna(val) or val == "NA":
        return False
    try:
        float(str(val))
        return True
    except (ValueError, TypeError):
        return False


def _report_subset_accuracy(subset, title, noun):
    """Print one '<title> Questions Accuracy' section; return the accuracy, or None if the subset is empty."""
    print(f"\n=== {title} Questions Accuracy ===")
    if len(subset) == 0:
        print(f"No {noun} questions found")
        return None
    subset_accuracy = calculate_accuracy(subset["true_label"], subset["pred_label"])
    print(f"Number of {noun} questions: {len(subset)}")
    print(f"Accuracy: {subset_accuracy:.4f}")
    return subset_accuracy


# For every prompt CSV and every model: query Bedrock once per row, save the
# raw predictions, then print accuracy overall, by question type, and save
# per-question accuracy.
for csv_path in CSV_PATH:
    print('*'*80)
    print(f"Reading {csv_path}")
    print('*'*80)
    prompts_raw = pd.read_csv(csv_path)

    # Fail before spending a full pass of Bedrock calls if a column that the
    # evaluation needs later is absent.
    missing_columns = {"prompt", "true_answer", "question"} - set(prompts_raw.columns)
    if missing_columns:
        raise KeyError(f"{csv_path} is missing required column(s): {sorted(missing_columns)}")

    # File names look like ..._c_<c>_k_<k>.csv; recover <c> and <k> so the
    # output files can be tagged with the same settings.
    parts = csv_path.split('_')
    c = parts[-3]
    k = parts[-1].split('.')[0]

    for model in model_id:
        # Start every model from the pristine prompts. Reusing one mutated
        # frame would leak the previous model's true_label/pred_label columns
        # into this model's saved predictions file.
        df = prompts_raw.copy()

        print(f"Sending {len(df)} prompts to Bedrock model '{model}'...")
        OUTPUT_WITH_PREDS = f'final_{model}_pred_w_gt_c_{c}_k_{k}.csv'
        METRICS_PER_QUESTION = f'final_{model}_acc_per_q_c_{c}_k_{k}.csv'

        model_answers = []
        for prompt in tqdm(df["prompt"], total=len(df)):
            model_answers.append(call_bedrock(model, prompt))
        df["model_answer"] = model_answers
        df.to_csv(OUTPUT_WITH_PREDS, index=False)
        print(f"Saved predictions to {OUTPUT_WITH_PREDS}")

        # Normalize labels: missing ground truth becomes the literal "NA" so it
        # can be string-matched against the model's answer.
        df["true_label"] = df["true_answer"].replace({np.nan: "NA", "nan": "NA"})
        df["pred_label"] = df["model_answer"]
        df_eval = df.copy()

        # Calculate global accuracy
        global_accuracy = calculate_accuracy(
            df_eval["true_label"],
            df_eval["pred_label"]
        )

        print("\n=== Global Accuracy ===")
        print(f"Accuracy: {global_accuracy:.4f}")
        print("( NaN == NaN is counted as correct)")

        # Flag rows whose ground truth parses as a number. The reports below
        # split on question_type instead; the column is kept on df_eval for
        # inspection.
        df_eval["is_numeric"] = df_eval["true_answer"].apply(is_numeric)

        if 'question_type' in df_eval.columns:
            # Numeric vs. non-numeric split. Rows with question_type 'yes' are
            # reported as non-numeric.
            # NOTE(review): presumably 'yes' marks yes/no questions — confirm.
            df_numeric = df_eval[df_eval["question_type"] == 'numeric']
            numeric_accuracy = _report_subset_accuracy(df_numeric, "Numeric", "numeric")

            df_non_numeric = df_eval[df_eval["question_type"] == 'yes']
            non_numeric_accuracy = _report_subset_accuracy(
                df_non_numeric, "Non-Numeric", "non-numeric"
            )

            # Calculate accuracy by question_type
            print("\n=== Accuracy by Question Type ===")
            for qtype in df_eval['question_type'].unique():
                df_qtype = df_eval[df_eval['question_type'] == qtype]
                if len(df_qtype) > 0:
                    qtype_accuracy = calculate_accuracy(
                        df_qtype["true_label"],
                        df_qtype["pred_label"]
                    )
                    print(f"{qtype} questions: {len(df_qtype)} examples, Accuracy: {qtype_accuracy:.4f}")
        else:
            print("\nNo 'question_type' column found; skipping question-type breakdowns")

        # Per-question metrics
        metrics_rows = []
        for question, g in df_eval.groupby("question"):
            if len(g) == 0:
                continue

            acc = calculate_accuracy(
                g["true_label"],
                g["pred_label"]
            )

            metrics_rows.append({
                "question": question,
                "n_examples": len(g),
                "accuracy": acc
            })

        metrics_df = pd.DataFrame(metrics_rows)

        if len(metrics_df) > 0:
            metrics_df = metrics_df.sort_values("question")
            metrics_df.to_csv(METRICS_PER_QUESTION, index=False)

            print(f"\nSaved per-question metrics to {METRICS_PER_QUESTION}")
            print("\nPer-question metrics:")
            print(metrics_df)
        else:
            print("\n No metrics to save!")

********************************************************************************
Reading ./fixed_RAG_prompt_save_c_50_k_15.csv
********************************************************************************
Sending 2300 prompts to Bedrock model 'us.anthropic.claude-3-5-haiku-20241022-v1:0'...


100%|██████████| 2300/2300 [37:52<00:00,  1.01it/s]  


Saved predictions to final_us.anthropic.claude-3-5-haiku-20241022-v1:0_pred_w_gt_c_50_k_15.csv

=== Global Accuracy ===
Accuracy: 0.8457
( NaN == NaN is counted as correct)

=== Numeric Questions Accuracy ===
Number of numeric questions: 800
Accuracy: 0.7950

=== Non-Numeric Questions Accuracy ===
Number of non-numeric questions: 1500
Accuracy: 0.8727

=== Accuracy by Question Type ===
yes questions: 1500 examples, Accuracy: 0.8727
numeric questions: 800 examples, Accuracy: 0.7950

Saved per-question metrics to final_us.anthropic.claude-3-5-haiku-20241022-v1:0_acc_per_q_c_50_k_15.csv

Per-question metrics:
                                             question  n_examples  accuracy
0   Does the note describe the patient as being un...         100      0.86
1   Does the note describe the patient as ever bei...         100      1.00
2   Does the note describe the patient as ever bei...         100      0.87
3   Does the note describe the patient as ever bei...         100      1.00
4   Do

100%|██████████| 2300/2300 [33:45<00:00,  1.14it/s] 


Saved predictions to final_us.anthropic.claude-3-5-haiku-20241022-v1:0_pred_w_gt_c_100_k_8.csv

=== Global Accuracy ===
Accuracy: 0.8543
( NaN == NaN is counted as correct)

=== Numeric Questions Accuracy ===
Number of numeric questions: 800
Accuracy: 0.8475

=== Non-Numeric Questions Accuracy ===
Number of non-numeric questions: 1500
Accuracy: 0.8580

=== Accuracy by Question Type ===
yes questions: 1500 examples, Accuracy: 0.8580
numeric questions: 800 examples, Accuracy: 0.8475

Saved per-question metrics to final_us.anthropic.claude-3-5-haiku-20241022-v1:0_acc_per_q_c_100_k_8.csv

Per-question metrics:
                                             question  n_examples  accuracy
0   Does the note describe the patient as being un...         100      0.86
1   Does the note describe the patient as ever bei...         100      1.00
2   Does the note describe the patient as ever bei...         100      0.86
3   Does the note describe the patient as ever bei...         100      1.00
4   Do

100%|██████████| 2300/2300 [33:38<00:00,  1.14it/s]


Saved predictions to final_us.anthropic.claude-3-5-haiku-20241022-v1:0_pred_w_gt_c_150_k_5.csv

=== Global Accuracy ===
Accuracy: 0.8417
( NaN == NaN is counted as correct)

=== Numeric Questions Accuracy ===
Number of numeric questions: 800
Accuracy: 0.8187

=== Non-Numeric Questions Accuracy ===
Number of non-numeric questions: 1500
Accuracy: 0.8540

=== Accuracy by Question Type ===
yes questions: 1500 examples, Accuracy: 0.8540
numeric questions: 800 examples, Accuracy: 0.8187

Saved per-question metrics to final_us.anthropic.claude-3-5-haiku-20241022-v1:0_acc_per_q_c_150_k_5.csv

Per-question metrics:
                                             question  n_examples  accuracy
0   Does the note describe the patient as being un...         100      0.86
1   Does the note describe the patient as ever bei...         100      1.00
2   Does the note describe the patient as ever bei...         100      0.87
3   Does the note describe the patient as ever bei...         100      1.00
4   Do

100%|██████████| 2300/2300 [34:10<00:00,  1.12it/s] 


Saved predictions to final_us.anthropic.claude-3-5-haiku-20241022-v1:0_pred_w_gt_c_200_k_4.csv

=== Global Accuracy ===
Accuracy: 0.8400
( NaN == NaN is counted as correct)

=== Numeric Questions Accuracy ===
Number of numeric questions: 800
Accuracy: 0.7987

=== Non-Numeric Questions Accuracy ===
Number of non-numeric questions: 1500
Accuracy: 0.8620

=== Accuracy by Question Type ===
yes questions: 1500 examples, Accuracy: 0.8620
numeric questions: 800 examples, Accuracy: 0.7987

Saved per-question metrics to final_us.anthropic.claude-3-5-haiku-20241022-v1:0_acc_per_q_c_200_k_4.csv

Per-question metrics:
                                             question  n_examples  accuracy
0   Does the note describe the patient as being un...         100      0.86
1   Does the note describe the patient as ever bei...         100      1.00
2   Does the note describe the patient as ever bei...         100      0.88
3   Does the note describe the patient as ever bei...         100      1.00
4   Do

100%|██████████| 2300/2300 [32:48<00:00,  1.17it/s]  


Saved predictions to final_us.anthropic.claude-3-5-haiku-20241022-v1:0_pred_w_gt_c_300_k_2.csv

=== Global Accuracy ===
Accuracy: 0.8243
( NaN == NaN is counted as correct)

=== Numeric Questions Accuracy ===
Number of numeric questions: 800
Accuracy: 0.7500

=== Non-Numeric Questions Accuracy ===
Number of non-numeric questions: 1500
Accuracy: 0.8640

=== Accuracy by Question Type ===
yes questions: 1500 examples, Accuracy: 0.8640
numeric questions: 800 examples, Accuracy: 0.7500

Saved per-question metrics to final_us.anthropic.claude-3-5-haiku-20241022-v1:0_acc_per_q_c_300_k_2.csv

Per-question metrics:
                                             question  n_examples  accuracy
0   Does the note describe the patient as being un...         100      0.87
1   Does the note describe the patient as ever bei...         100      1.00
2   Does the note describe the patient as ever bei...         100      0.86
3   Does the note describe the patient as ever bei...         100      1.00
4   Do

100%|██████████| 2300/2300 [32:06<00:00,  1.19it/s] 


Saved predictions to final_us.anthropic.claude-3-5-haiku-20241022-v1:0_pred_w_gt_c_400_k_2.csv

=== Global Accuracy ===
Accuracy: 0.8296
( NaN == NaN is counted as correct)

=== Numeric Questions Accuracy ===
Number of numeric questions: 800
Accuracy: 0.7688

=== Non-Numeric Questions Accuracy ===
Number of non-numeric questions: 1500
Accuracy: 0.8620

=== Accuracy by Question Type ===
yes questions: 1500 examples, Accuracy: 0.8620
numeric questions: 800 examples, Accuracy: 0.7688

Saved per-question metrics to final_us.anthropic.claude-3-5-haiku-20241022-v1:0_acc_per_q_c_400_k_2.csv

Per-question metrics:
                                             question  n_examples  accuracy
0   Does the note describe the patient as being un...         100      0.86
1   Does the note describe the patient as ever bei...         100      1.00
2   Does the note describe the patient as ever bei...         100      0.84
3   Does the note describe the patient as ever bei...         100      1.00
4   Do

100%|██████████| 2300/2300 [31:41<00:00,  1.21it/s]

Saved predictions to final_us.anthropic.claude-3-5-haiku-20241022-v1:0_pred_w_gt_c_500_k_1.csv

=== Global Accuracy ===
Accuracy: 0.7652
( NaN == NaN is counted as correct)

=== Numeric Questions Accuracy ===
Number of numeric questions: 800
Accuracy: 0.6312

=== Non-Numeric Questions Accuracy ===
Number of non-numeric questions: 1500
Accuracy: 0.8367

=== Accuracy by Question Type ===
yes questions: 1500 examples, Accuracy: 0.8367
numeric questions: 800 examples, Accuracy: 0.6312

Saved per-question metrics to final_us.anthropic.claude-3-5-haiku-20241022-v1:0_acc_per_q_c_500_k_1.csv

Per-question metrics:
                                             question  n_examples  accuracy
0   Does the note describe the patient as being un...         100      0.87
1   Does the note describe the patient as ever bei...         100      1.00
2   Does the note describe the patient as ever bei...         100      0.86
3   Does the note describe the patient as ever bei...         100      1.00
4   Do


