### Import Libraries

In [5]:
import httpx
from openai import OpenAI
import json
import random
import glob as glob

import base64
from typing import List, Dict, Any
from openai import OpenAI
import anthropic, openai, math

In [6]:
from HelperFunctions import *

In [7]:
# Create the OpenAI and Anthropic API clients used by the cells below.
# `os` is imported explicitly here rather than relying on the star import above
# to happen to expose it.
import os

# Credentials are read from environment variables; `os.environ.get` yields None
# when a variable is not set.
OPENAI_KEY = os.environ.get('OPENAI_KEY')
OPENAI_ORG_KEY = os.environ.get('OPENAI_ORG_KEY')

client_openai = OpenAI(
    organization=OPENAI_ORG_KEY,
    api_key=OPENAI_KEY,
)

ANTHROPIC_KEY = os.environ.get('ANTHROPIC_KEY')

client_anthropic = anthropic.Anthropic(api_key=ANTHROPIC_KEY)

### Few-Shot Approach

### PneumoniaMNIST

In [29]:
# Select the dataset and build the few-shot examples and evaluation batches.
dataset = 'PneumoniaMNIST'
dataset_classes = list(class_ground_truth_mapping[dataset].values())

number_of_shot = 1

# Demonstration images come from the training split; their labels are derived
# from the file paths.
few_shot_examples = get_samples_from_each_folder(f'./Datasets/{dataset}/train', number_of_shot)
ground_truths_for_few_shot_examples = get_ground_truth_from_path(few_shot_examples, dataset_classes)

# Pair every demonstration image path with its label.
few_shot_examples = [
    {'class': label, 'image_path': path}
    for label, path in zip(ground_truths_for_few_shot_examples, few_shot_examples)
]

# Evaluation images come from the test split.
samples_to_use = get_samples_from_each_folder(f'./Datasets/{dataset}/test', 50)
ground_truths_for_samples = get_ground_truth_from_path(samples_to_use, dataset_classes)

# The evaluation set is sent to the models in two batches of at most 50 images.
samples_to_use_v1 = samples_to_use[:50]
ground_truths_for_samples_v1 = ground_truths_for_samples[:50]

samples_to_use_v2 = samples_to_use[50:100]
ground_truths_for_samples_v2 = ground_truths_for_samples[50:100]

# Keep only the labels of the images that are actually evaluated, so the
# combined predictions of both batches line up with the ground truths
# position by position even if more than 100 paths were returned.
ground_truths_for_samples = ground_truths_for_samples_v1 + ground_truths_for_samples_v2

In [8]:
# Few-shot evaluation of the Anthropic model on the current dataset.
# Predictions are cached in a text file so the API is only queried when no
# cached result exists.
few_shot_pneumoniamnist_anthropic_preds_file = 'few_shot_pneumoniamnist_anthropic_preds.txt'

if os.path.exists(few_shot_pneumoniamnist_anthropic_preds_file):
    print(f"File {few_shot_pneumoniamnist_anthropic_preds_file} exists. Loading Results...")
    anthropic_preds = load_list_from_file(few_shot_pneumoniamnist_anthropic_preds_file)
else:
    print(f"File {few_shot_pneumoniamnist_anthropic_preds_file} does not exist. Creating...")

    # Few-shot prediction for the first batch. The first returned value
    # (presumably the accuracy of this batch alone) is kept under its own name
    # so it is not mistaken for the overall accuracy.
    accuracy_anthropic_v1, anthropic_preds_v1 = get_class_samples_few_shots(
        client_anthropic,
        samples_to_use_v1,
        dataset_classes,
        dataset,
        ground_truths_for_samples_v1,
        unified_few_shot_predict_class_only,  # Few-shot version of the prediction function
        model="Claude",  # Model identifier passed through to the helper
        few_shot_examples=few_shot_examples,  # Labelled demonstration images
    )

    # Few-shot prediction for the second batch.
    accuracy_anthropic_v2, anthropic_preds_v2 = get_class_samples_few_shots(
        client_anthropic,
        samples_to_use_v2,
        dataset_classes,
        dataset,
        ground_truths_for_samples_v2,
        unified_few_shot_predict_class_only,  # Few-shot version of the prediction function
        model="Claude",  # Model identifier passed through to the helper
        few_shot_examples=few_shot_examples,  # Labelled demonstration images
    )

    anthropic_preds = anthropic_preds_v1 + anthropic_preds_v2

    save_list_to_file(anthropic_preds, few_shot_pneumoniamnist_anthropic_preds_file)

# Overall accuracy over the complete evaluation set, computed the same way
# whether the predictions were loaded from the cache or freshly generated.
matching_count = count_matching_samples_positionwise(anthropic_preds, ground_truths_for_samples)
accuracy_anthropic = matching_count / len(ground_truths_for_samples)

# Compute statistical significance
accuracy, mean_acc, var_acc, z_score, p_value = compute_statistical_significance(anthropic_preds, ground_truths_for_samples)

print(f"Few-Shot Accuracy: {accuracy:.4f}")
print(f"Mean Bootstrap Accuracy: {mean_acc:.4f}")
print(f"Standard Deviation of Bootstrap Accuracies: {math.sqrt(var_acc):.6f}")
print(f"Z-Score: {z_score:.4f}")
print(f"P-Value: {p_value:.4f}")


File few_shot_pneumoniamnist_anthropic_preds.txt does not exist. Creating...
Model predictions: ['normal', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia']
Ground truths: ['pneumonia', 'pneumonia', 'pneumonia', 'normal', 'pneumonia', 'normal', 'pneumonia', 'normal', 'normal', 'pneumonia', 'normal', 'normal', 'normal', 'pneumonia', 'normal', 'pneumonia', 'pneumonia', 'pneumonia', 'normal', 'pneumonia', 'normal

In [30]:
# Compute result for OpenAI GPT-4o
# Predictions are cached in a text file so the API is only queried when no
# cached result exists.
few_shot_pneumoniamnist_openai_gpt4o_preds_file = 'few_shot_pneumoniamnist_openai_gpt4o_preds.txt'

if os.path.exists(few_shot_pneumoniamnist_openai_gpt4o_preds_file):
    print(f"File {few_shot_pneumoniamnist_openai_gpt4o_preds_file} exists. Loading Results...")
    openai_gpt4o_preds = load_list_from_file(few_shot_pneumoniamnist_openai_gpt4o_preds_file)
else:
    print(f"File {few_shot_pneumoniamnist_openai_gpt4o_preds_file} does not exist. Creating ...")

    # Few-shot prediction for the first batch. The first returned value
    # (presumably the accuracy of this batch alone) is kept under its own name
    # so it is not mistaken for the overall accuracy.
    accuracy_openai_gpt4o_v1, openai_gpt4o_preds_v1 = get_class_samples_few_shots(
        client_openai,
        samples_to_use_v1,
        dataset_classes,
        dataset,
        ground_truths_for_samples_v1,
        unified_few_shot_predict_class_only,
        model="gpt-4o",  # OpenAI GPT-4o model identifier
        few_shot_examples=few_shot_examples,  # Labelled demonstration images
    )

    # Few-shot prediction for the second batch.
    accuracy_openai_gpt4o_v2, openai_gpt4o_preds_v2 = get_class_samples_few_shots(
        client_openai,
        samples_to_use_v2,
        dataset_classes,
        dataset,
        ground_truths_for_samples_v2,
        unified_few_shot_predict_class_only,
        model="gpt-4o",  # OpenAI GPT-4o model identifier
        few_shot_examples=few_shot_examples,  # Labelled demonstration images
    )

    openai_gpt4o_preds = openai_gpt4o_preds_v1 + openai_gpt4o_preds_v2
    save_list_to_file(openai_gpt4o_preds, few_shot_pneumoniamnist_openai_gpt4o_preds_file)

# Overall accuracy over the complete evaluation set, computed the same way
# whether the predictions were loaded from the cache or freshly generated.
matching_count = count_matching_samples_positionwise(openai_gpt4o_preds, ground_truths_for_samples)
accuracy_openai_gpt4o = matching_count / len(ground_truths_for_samples)

accuracy, mean_acc, var_acc, z_score, p_value = compute_statistical_significance(openai_gpt4o_preds, ground_truths_for_samples)
print(f"Accuracy: {accuracy:.4f}")
print(f"Mean Bootstrap Accuracy: {mean_acc:.4f}")
print(f"Standard Deviation of Bootstrap Accuracies: {math.sqrt(var_acc):.6f}")

File few_shot_pneumoniamnist_openai_gpt4o_preds.txt does not exist. Creating ...
The response from the model is:  ```json
[
    {"Image": 0, "Class": "pneumonia"},
    {"Image": 1, "Class": "normal"},
    {"Image": 2, "Class": "pneumonia"},
    {"Image": 3, "Class": "normal"},
    {"Image": 4, "Class": "pneumonia"},
    {"Image": 5, "Class": "pneumonia"},
    {"Image": 6, "Class": "pneumonia"},
    {"Image": 7, "Class": "normal"},
    {"Image": 8, "Class": "normal"},
    {"Image": 9, "Class": "pneumonia"},
    {"Image": 10, "Class": "normal"},
    {"Image": 11, "Class": "normal"},
    {"Image": 12, "Class": "normal"},
    {"Image": 13, "Class": "normal"},
    {"Image": 14, "Class": "normal"},
    {"Image": 15, "Class": "normal"},
    {"Image": 16, "Class": "pneumonia"},
    {"Image": 17, "Class": "normal"},
    {"Image": 18, "Class": "pneumonia"},
    {"Image": 19, "Class": "pneumonia"},
    {"Image": 20, "Class": "normal"},
    {"Image": 21, "Class": "normal"},
    {"Image": 22, "Clas

### DermaMNIST

In [8]:
# Select the dataset and build the few-shot examples and evaluation batches.
dataset = 'DermaMNIST'
dataset_classes = list(class_ground_truth_mapping[dataset].values())

number_of_shot = 1

# Demonstration images come from the training split; their labels are derived
# from the file paths.
few_shot_examples = get_samples_from_each_folder(f'./Datasets/{dataset}/train', number_of_shot)
ground_truths_for_few_shot_examples = get_ground_truth_from_path(few_shot_examples, dataset_classes)

# Pair every demonstration image path with its label.
few_shot_examples = [
    {'class': label, 'image_path': path}
    for label, path in zip(ground_truths_for_few_shot_examples, few_shot_examples)
]

# Evaluation images come from the test split.
samples_to_use = get_samples_from_each_folder(f'./Datasets/{dataset}/test', 15)
ground_truths_for_samples = get_ground_truth_from_path(samples_to_use, dataset_classes)

# The evaluation set is sent to the models in two batches of at most 50 images;
# any paths beyond the first 100 are not evaluated.
samples_to_use_v1 = samples_to_use[:50]
ground_truths_for_samples_v1 = ground_truths_for_samples[:50]

samples_to_use_v2 = samples_to_use[50:100]
ground_truths_for_samples_v2 = ground_truths_for_samples[50:100]

# Keep only the labels of the images that are actually evaluated, so the
# combined predictions of both batches line up with the ground truths
# position by position.
ground_truths_for_samples = ground_truths_for_samples_v1 + ground_truths_for_samples_v2

In [9]:
# Few-shot evaluation of the Anthropic model on the current dataset.
# Predictions are cached in a text file so the API is only queried when no
# cached result exists.
few_shot_dermamnist_anthropic_preds_file = 'few_shot_dermamnist_anthropic_preds.txt'

if os.path.exists(few_shot_dermamnist_anthropic_preds_file):
    print(f"File {few_shot_dermamnist_anthropic_preds_file} exists. Loading Results...")
    anthropic_preds = load_list_from_file(few_shot_dermamnist_anthropic_preds_file)
else:
    print(f"File {few_shot_dermamnist_anthropic_preds_file} does not exist. Creating...")

    # Few-shot prediction for the first batch. The first returned value
    # (presumably the accuracy of this batch alone) is kept under its own name
    # so it is not mistaken for the overall accuracy.
    accuracy_anthropic_v1, anthropic_preds_v1 = get_class_samples_few_shots(
        client_anthropic,
        samples_to_use_v1,
        dataset_classes,
        dataset,
        ground_truths_for_samples_v1,
        unified_few_shot_predict_class_only,  # Few-shot version of the prediction function
        model="Claude",  # Model identifier passed through to the helper
        few_shot_examples=few_shot_examples,  # Labelled demonstration images
    )

    # Few-shot prediction for the second batch.
    accuracy_anthropic_v2, anthropic_preds_v2 = get_class_samples_few_shots(
        client_anthropic,
        samples_to_use_v2,
        dataset_classes,
        dataset,
        ground_truths_for_samples_v2,
        unified_few_shot_predict_class_only,  # Few-shot version of the prediction function
        model="Claude",  # Model identifier passed through to the helper
        few_shot_examples=few_shot_examples,  # Labelled demonstration images
    )

    anthropic_preds = anthropic_preds_v1 + anthropic_preds_v2
    save_list_to_file(anthropic_preds, few_shot_dermamnist_anthropic_preds_file)

# Overall accuracy over the complete evaluation set, computed the same way
# whether the predictions were loaded from the cache or freshly generated.
matching_count = count_matching_samples_positionwise(anthropic_preds, ground_truths_for_samples)
accuracy_anthropic = matching_count / len(ground_truths_for_samples)

# Compute statistical significance
accuracy, mean_acc, var_acc, z_score, p_value = compute_statistical_significance(anthropic_preds, ground_truths_for_samples)

print(f"Few-Shot Accuracy: {accuracy:.4f}")
print(f"Mean Bootstrap Accuracy: {mean_acc:.4f}")
print(f"Standard Deviation of Bootstrap Accuracies: {math.sqrt(var_acc):.6f}")
print(f"Z-Score: {z_score:.4f}")
print(f"P-Value: {p_value:.4f}")


File few_shot_dermamnist_anthropic_preds.txt does not exist. Creating...
Model predictions: ['melanocytic nevi', 'basal cell carcinoma', 'benign keratosis-like lesions', 'vascular lesions', 'melanoma', 'actinic keratoses and intraepithelial carcinoma', 'dermatofibroma', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocy

In [10]:
# Compute result for OpenAI GPT-4o
# Predictions are cached in a text file so the API is only queried when no
# cached result exists.
few_shot_dermamnist_openai_gpt4o_preds_file = 'few_shot_dermamnist_openai_gpt4o_preds.txt'

if os.path.exists(few_shot_dermamnist_openai_gpt4o_preds_file):
    print(f"File {few_shot_dermamnist_openai_gpt4o_preds_file} exists. Loading Results...")
    openai_gpt4o_preds = load_list_from_file(few_shot_dermamnist_openai_gpt4o_preds_file)
else:
    print(f"File {few_shot_dermamnist_openai_gpt4o_preds_file} does not exist. Creating ...")

    # Few-shot prediction for the first batch. The first returned value
    # (presumably the accuracy of this batch alone) is kept under its own name
    # so it is not mistaken for the overall accuracy.
    accuracy_openai_gpt4o_v1, openai_gpt4o_preds_v1 = get_class_samples_few_shots(
        client_openai,
        samples_to_use_v1,
        dataset_classes,
        dataset,
        ground_truths_for_samples_v1,
        unified_few_shot_predict_class_only,
        model="gpt-4o",  # OpenAI GPT-4o model identifier
        few_shot_examples=few_shot_examples,  # Labelled demonstration images
    )

    # Few-shot prediction for the second batch.
    accuracy_openai_gpt4o_v2, openai_gpt4o_preds_v2 = get_class_samples_few_shots(
        client_openai,
        samples_to_use_v2,
        dataset_classes,
        dataset,
        ground_truths_for_samples_v2,
        unified_few_shot_predict_class_only,
        model="gpt-4o",  # OpenAI GPT-4o model identifier
        few_shot_examples=few_shot_examples,  # Labelled demonstration images
    )

    openai_gpt4o_preds = openai_gpt4o_preds_v1 + openai_gpt4o_preds_v2
    save_list_to_file(openai_gpt4o_preds, few_shot_dermamnist_openai_gpt4o_preds_file)

# Overall accuracy over the complete evaluation set, computed the same way
# whether the predictions were loaded from the cache or freshly generated.
matching_count = count_matching_samples_positionwise(openai_gpt4o_preds, ground_truths_for_samples)
accuracy_openai_gpt4o = matching_count / len(ground_truths_for_samples)

accuracy, mean_acc, var_acc, z_score, p_value = compute_statistical_significance(openai_gpt4o_preds, ground_truths_for_samples)
print(f"Accuracy: {accuracy:.4f}")
print(f"Mean Bootstrap Accuracy: {mean_acc:.4f}")
print(f"Standard Deviation of Bootstrap Accuracies: {math.sqrt(var_acc):.6f}")

File few_shot_dermamnist_openai_gpt4o_preds.txt does not exist. Creating ...
Model predictions: ['benign keratosis-like lesions', 'vascular lesions', 'actinic keratoses and intraepithelial carcinoma', 'melanoma', 'melanocytic nevi', 'benign keratosis-like lesions', 'actinic keratoses and intraepithelial carcinoma', 'benign keratosis-like lesions', 'vascular lesions', 'benign keratosis-like lesions', 'melanoma', 'melanocytic nevi', 'benign keratosis-like lesions', 'melanocytic nevi', 'actinic keratoses and intraepithelial carcinoma', 'actinic keratoses and intraepithelial carcinoma', 'vascular lesions', 'vascular lesions', 'vascular lesions', 'vascular lesions', 'vascular lesions', 'benign keratosis-like lesions', 'vascular lesions', 'actinic keratoses and intraepithelial carcinoma', 'vascular lesions', 'melanocytic nevi', 'actinic keratoses and intraepithelial carcinoma', 'benign keratosis-like lesions', 'actinic keratoses and intraepithelial carcinoma', 'vascular lesions', 'basal cell