### Import Libraries

In [1]:
import base64
import glob as glob
import json
import math
import os
import random
from typing import List, Dict, Any

import anthropic
import httpx
import openai
from openai import OpenAI

In [2]:
from HelperFunctions import *

In [3]:
# Read API credentials from the environment so no secret is stored in the notebook.
OPENAI_KEY = os.environ.get('OPENAI_KEY')
OPENAI_ORG_KEY = os.environ.get('OPENAI_ORG_KEY')
ANTHROPIC_KEY = os.environ.get('ANTHROPIC_KEY')

# Report unset keys up front; without this, None is passed to the SDK clients silently.
# The organization id is deliberately not checked here.
missing_api_keys = [
    env_name
    for env_name, env_value in (('OPENAI_KEY', OPENAI_KEY), ('ANTHROPIC_KEY', ANTHROPIC_KEY))
    if not env_value
]
if missing_api_keys:
    print(f"Warning: environment variable(s) not set: {', '.join(missing_api_keys)}")

# Clients shared by every evaluation cell below.
client_openai = OpenAI(
    organization=OPENAI_ORG_KEY,
    api_key=OPENAI_KEY,
)

client_anthropic = anthropic.Anthropic(api_key=ANTHROPIC_KEY)

### Zero Shot Approach

### RQ1: How well do LLMs classify images belonging to Natural Classes?

In [4]:
# Select the CIFAR10 evaluation subset from the test split.
# The second argument (10) is presumably the number of samples taken per class folder — confirm in HelperFunctions.
dataset = 'CIFAR10'
dataset_classes = [*class_ground_truth_mapping[dataset].values()]

samples_to_use = get_samples_from_each_folder(
    f'./Datasets/{dataset}/test',
    10,
)
# Ground-truth labels are derived from the sample paths and the class list.
ground_truths_for_samples = get_ground_truth_from_path(samples_to_use, dataset_classes)

In [5]:
# Zero-shot evaluation with the Anthropic client (model="Claude"), cached under a CIFAR10-specific file name.
# It scores whatever `samples_to_use` / `ground_truths_for_samples` the preceding setup cell selected.
zero_shot_cifar10_anthropic_preds_file = 'zero_shot_cifar10_anthropic_preds.txt'

if not os.path.exists(zero_shot_cifar10_anthropic_preds_file):
    # Cache miss: obtain predictions through the client, then persist them for later runs.
    print(f"File {zero_shot_cifar10_anthropic_preds_file} does not exist. Creating ...")
    accuracy_anthropic, anthropic_preds = get_class_samples(
        client_anthropic,
        samples_to_use,
        dataset_classes,
        dataset,
        ground_truths_for_samples,
        unified_zero_shot_predict_class_only,
        model="Claude",
    )
    save_list_to_file(anthropic_preds, zero_shot_cifar10_anthropic_preds_file)
else:
    # Cache hit: reuse the stored predictions and score them position by position.
    # NOTE(review): presumably requires the cached list to be in the same order as the
    # current samples_to_use — confirm the sample selection is deterministic.
    anthropic_preds = load_list_from_file(zero_shot_cifar10_anthropic_preds_file)
    matching_count = count_matching_samples_positionwise(anthropic_preds, ground_truths_for_samples)
    accuracy_anthropic = matching_count / len(ground_truths_for_samples)

# z_score and p_value are returned as well but are not reported by this cell.
accuracy, mean_acc, var_acc, z_score, p_value = compute_statistical_significance(
    anthropic_preds, ground_truths_for_samples
)

print(
    f"Accuracy: {accuracy:.4f}",
    f"Mean Bootstrap Accuracy: {mean_acc:.4f}",
    f"Standard Deviation of Bootstrap Accuracies: {math.sqrt(var_acc):.6f}",
    sep="\n",
)

Accuracy: 0.1000
Mean Bootstrap Accuracy: 0.1002
Standard Deviation of Bootstrap Accuracies: 0.029921


In [6]:
# Zero-shot evaluation with the OpenAI client (model="gpt-4o"), cached under a CIFAR10-specific file name.
# It scores whatever `samples_to_use` / `ground_truths_for_samples` the preceding setup cell selected.
zero_shot_cifar10_openai_gpt4o_preds_file = 'zero_shot_cifar10_openai_gpt4o_preds.txt'

if not os.path.exists(zero_shot_cifar10_openai_gpt4o_preds_file):
    # Cache miss: obtain predictions through the client, then persist them for later runs.
    print(f"File {zero_shot_cifar10_openai_gpt4o_preds_file} does not exist. Creating ...")
    accuracy_openai_gpt4o, openai_gpt4o_preds = get_class_samples(
        client_openai,
        samples_to_use,
        dataset_classes,
        dataset,
        ground_truths_for_samples,
        unified_zero_shot_predict_class_only,
        model="gpt-4o",
    )
    save_list_to_file(openai_gpt4o_preds, zero_shot_cifar10_openai_gpt4o_preds_file)
else:
    # Cache hit: reuse the stored predictions and score them position by position.
    # NOTE(review): presumably requires the cached list to be in the same order as the
    # current samples_to_use — confirm the sample selection is deterministic.
    openai_gpt4o_preds = load_list_from_file(zero_shot_cifar10_openai_gpt4o_preds_file)
    matching_count = count_matching_samples_positionwise(openai_gpt4o_preds, ground_truths_for_samples)
    accuracy_openai_gpt4o = matching_count / len(ground_truths_for_samples)

# z_score and p_value are returned as well but are not reported by this cell.
accuracy, mean_acc, var_acc, z_score, p_value = compute_statistical_significance(
    openai_gpt4o_preds, ground_truths_for_samples
)

print(
    f"Accuracy: {accuracy:.4f}",
    f"Mean Bootstrap Accuracy: {mean_acc:.4f}",
    f"Standard Deviation of Bootstrap Accuracies: {math.sqrt(var_acc):.6f}",
    sep="\n",
)

Accuracy: 0.9800
Mean Bootstrap Accuracy: 0.9794
Standard Deviation of Bootstrap Accuracies: 0.014499


In [7]:
# Zero-shot evaluation with the OpenAI client (model="gpt-4o-mini"), cached under a CIFAR10-specific file name.
# It scores whatever `samples_to_use` / `ground_truths_for_samples` the preceding setup cell selected.
zero_shot_cifar10_openai_gpt4o_mini_preds_file = 'zero_shot_cifar10_openai_gpt4o_mini_preds.txt'

if not os.path.exists(zero_shot_cifar10_openai_gpt4o_mini_preds_file):
    # Cache miss: obtain predictions through the client, then persist them for later runs.
    print(f"File {zero_shot_cifar10_openai_gpt4o_mini_preds_file} does not exist. Creating ...")
    accuracy_openai_gpt4o_mini, openai_gpt4o_mini_preds = get_class_samples(
        client_openai,
        samples_to_use,
        dataset_classes,
        dataset,
        ground_truths_for_samples,
        unified_zero_shot_predict_class_only,
        model="gpt-4o-mini",
    )
    save_list_to_file(openai_gpt4o_mini_preds, zero_shot_cifar10_openai_gpt4o_mini_preds_file)
else:
    # Cache hit: reuse the stored predictions and score them position by position.
    # NOTE(review): presumably requires the cached list to be in the same order as the
    # current samples_to_use — confirm the sample selection is deterministic.
    openai_gpt4o_mini_preds = load_list_from_file(zero_shot_cifar10_openai_gpt4o_mini_preds_file)
    matching_count = count_matching_samples_positionwise(openai_gpt4o_mini_preds, ground_truths_for_samples)
    accuracy_openai_gpt4o_mini = matching_count / len(ground_truths_for_samples)

# z_score and p_value are returned as well but are not reported by this cell.
accuracy, mean_acc, var_acc, z_score, p_value = compute_statistical_significance(
    openai_gpt4o_mini_preds, ground_truths_for_samples
)

print(
    f"Accuracy: {accuracy:.4f}",
    f"Mean Bootstrap Accuracy: {mean_acc:.4f}",
    f"Standard Deviation of Bootstrap Accuracies: {math.sqrt(var_acc):.6f}",
    sep="\n",
)

Accuracy: 0.9500
Mean Bootstrap Accuracy: 0.9503
Standard Deviation of Bootstrap Accuracies: 0.022227


In [8]:
# Select the STL10 evaluation subset from the test split.
# This rebinds dataset, dataset_classes, samples_to_use and ground_truths_for_samples for the cells that follow.
# The second argument (10) is presumably the number of samples taken per class folder — confirm in HelperFunctions.
dataset = 'STL10'
dataset_classes = [*class_ground_truth_mapping[dataset].values()]

samples_to_use = get_samples_from_each_folder(
    f'./Datasets/{dataset}/test',
    10,
)
# Ground-truth labels are derived from the sample paths and the class list.
ground_truths_for_samples = get_ground_truth_from_path(samples_to_use, dataset_classes)

### STL10

In [9]:
# Zero-shot evaluation with the Anthropic client (model="Claude"), cached under an STL10-specific file name.
# It scores whatever `samples_to_use` / `ground_truths_for_samples` the preceding setup cell selected.
zero_shot_stl10_anthropic_preds_file = 'zero_shot_stl10_anthropic_preds.txt'

if not os.path.exists(zero_shot_stl10_anthropic_preds_file):
    # Cache miss: obtain predictions through the client, then persist them for later runs.
    print(f"File {zero_shot_stl10_anthropic_preds_file} does not exist. Creating ...")
    accuracy_anthropic, anthropic_preds = get_class_samples(
        client_anthropic,
        samples_to_use,
        dataset_classes,
        dataset,
        ground_truths_for_samples,
        unified_zero_shot_predict_class_only,
        model="Claude",
    )
    save_list_to_file(anthropic_preds, zero_shot_stl10_anthropic_preds_file)
else:
    # Cache hit: reuse the stored predictions and score them position by position.
    # NOTE(review): presumably requires the cached list to be in the same order as the
    # current samples_to_use — confirm the sample selection is deterministic.
    anthropic_preds = load_list_from_file(zero_shot_stl10_anthropic_preds_file)
    matching_count = count_matching_samples_positionwise(anthropic_preds, ground_truths_for_samples)
    accuracy_anthropic = matching_count / len(ground_truths_for_samples)

# z_score and p_value are returned as well but are not reported by this cell.
accuracy, mean_acc, var_acc, z_score, p_value = compute_statistical_significance(
    anthropic_preds, ground_truths_for_samples
)

print(
    f"Accuracy: {accuracy:.4f}",
    f"Mean Bootstrap Accuracy: {mean_acc:.4f}",
    f"Standard Deviation of Bootstrap Accuracies: {math.sqrt(var_acc):.6f}",
    sep="\n",
)

Accuracy: 0.2200
Mean Bootstrap Accuracy: 0.2215
Standard Deviation of Bootstrap Accuracies: 0.042158


In [10]:
# Zero-shot evaluation with the OpenAI client (model="gpt-4o"), cached under an STL10-specific file name.
# It scores whatever `samples_to_use` / `ground_truths_for_samples` the preceding setup cell selected.
zero_shot_stl10_openai_gpt4o_preds_file = 'zero_shot_stl10_openai_gpt4o_preds.txt'

if not os.path.exists(zero_shot_stl10_openai_gpt4o_preds_file):
    # Cache miss: obtain predictions through the client, then persist them for later runs.
    print(f"File {zero_shot_stl10_openai_gpt4o_preds_file} does not exist. Creating ...")
    accuracy_openai_gpt4o, openai_gpt4o_preds = get_class_samples(
        client_openai,
        samples_to_use,
        dataset_classes,
        dataset,
        ground_truths_for_samples,
        unified_zero_shot_predict_class_only,
        model="gpt-4o",
    )
    save_list_to_file(openai_gpt4o_preds, zero_shot_stl10_openai_gpt4o_preds_file)
else:
    # Cache hit: reuse the stored predictions and score them position by position.
    # NOTE(review): presumably requires the cached list to be in the same order as the
    # current samples_to_use — confirm the sample selection is deterministic.
    openai_gpt4o_preds = load_list_from_file(zero_shot_stl10_openai_gpt4o_preds_file)
    matching_count = count_matching_samples_positionwise(openai_gpt4o_preds, ground_truths_for_samples)
    accuracy_openai_gpt4o = matching_count / len(ground_truths_for_samples)

# z_score and p_value are returned as well but are not reported by this cell.
accuracy, mean_acc, var_acc, z_score, p_value = compute_statistical_significance(
    openai_gpt4o_preds, ground_truths_for_samples
)

print(
    f"Accuracy: {accuracy:.4f}",
    f"Mean Bootstrap Accuracy: {mean_acc:.4f}",
    f"Standard Deviation of Bootstrap Accuracies: {math.sqrt(var_acc):.6f}",
    sep="\n",
)

Accuracy: 0.9700
Mean Bootstrap Accuracy: 0.9708
Standard Deviation of Bootstrap Accuracies: 0.016859


In [11]:
# Zero-shot evaluation with the OpenAI client (model="gpt-4o-mini"), cached under an STL10-specific file name.
# It scores whatever `samples_to_use` / `ground_truths_for_samples` the preceding setup cell selected.
zero_shot_stl10_openai_gpt4o_mini_preds_file = 'zero_shot_stl10_openai_gpt4o_mini_preds.txt'

if not os.path.exists(zero_shot_stl10_openai_gpt4o_mini_preds_file):
    # Cache miss: obtain predictions through the client, then persist them for later runs.
    print(f"File {zero_shot_stl10_openai_gpt4o_mini_preds_file} does not exist. Creating ...")
    accuracy_openai_gpt4o_mini, openai_gpt4o_mini_preds = get_class_samples(
        client_openai,
        samples_to_use,
        dataset_classes,
        dataset,
        ground_truths_for_samples,
        unified_zero_shot_predict_class_only,
        model="gpt-4o-mini",
    )
    save_list_to_file(openai_gpt4o_mini_preds, zero_shot_stl10_openai_gpt4o_mini_preds_file)
else:
    # Cache hit: reuse the stored predictions and score them position by position.
    # NOTE(review): presumably requires the cached list to be in the same order as the
    # current samples_to_use — confirm the sample selection is deterministic.
    openai_gpt4o_mini_preds = load_list_from_file(zero_shot_stl10_openai_gpt4o_mini_preds_file)
    matching_count = count_matching_samples_positionwise(openai_gpt4o_mini_preds, ground_truths_for_samples)
    accuracy_openai_gpt4o_mini = matching_count / len(ground_truths_for_samples)

# z_score and p_value are returned as well but are not reported by this cell.
accuracy, mean_acc, var_acc, z_score, p_value = compute_statistical_significance(
    openai_gpt4o_mini_preds, ground_truths_for_samples
)

print(
    f"Accuracy: {accuracy:.4f}",
    f"Mean Bootstrap Accuracy: {mean_acc:.4f}",
    f"Standard Deviation of Bootstrap Accuracies: {math.sqrt(var_acc):.6f}",
    sep="\n",
)

Accuracy: 0.9800
Mean Bootstrap Accuracy: 0.9803
Standard Deviation of Bootstrap Accuracies: 0.013838


### PneumoniaMNIST

In [11]:
# Select the PneumoniaMNIST evaluation subset from the test split.
# This rebinds dataset, dataset_classes, samples_to_use and ground_truths_for_samples for the cells that follow.
# The second argument (50) is presumably the number of samples taken per class folder — confirm in HelperFunctions.
dataset = 'PneumoniaMNIST'
dataset_classes = [*class_ground_truth_mapping[dataset].values()]

samples_to_use = get_samples_from_each_folder(
    f'./Datasets/{dataset}/test',
    50,
)
# Ground-truth labels are derived from the sample paths and the class list.
ground_truths_for_samples = get_ground_truth_from_path(samples_to_use, dataset_classes)

In [12]:
# Zero-shot evaluation with the Anthropic client (model="Claude"), cached under a PneumoniaMNIST-specific file name.
# It scores whatever `samples_to_use` / `ground_truths_for_samples` the preceding setup cell selected.
zero_shot_pneumoniamnist_anthropic_preds_file = 'zero_shot_pneumoniamnist_anthropic_preds.txt'

if not os.path.exists(zero_shot_pneumoniamnist_anthropic_preds_file):
    # Cache miss: obtain predictions through the client, then persist them for later runs.
    print(f"File {zero_shot_pneumoniamnist_anthropic_preds_file} does not exist. Creating ...")
    accuracy_anthropic, anthropic_preds = get_class_samples(
        client_anthropic,
        samples_to_use,
        dataset_classes,
        dataset,
        ground_truths_for_samples,
        unified_zero_shot_predict_class_only,
        model="Claude",
    )
    save_list_to_file(anthropic_preds, zero_shot_pneumoniamnist_anthropic_preds_file)
else:
    # Cache hit: reuse the stored predictions and score them position by position.
    # NOTE(review): presumably requires the cached list to be in the same order as the
    # current samples_to_use — confirm the sample selection is deterministic.
    anthropic_preds = load_list_from_file(zero_shot_pneumoniamnist_anthropic_preds_file)
    matching_count = count_matching_samples_positionwise(anthropic_preds, ground_truths_for_samples)
    accuracy_anthropic = matching_count / len(ground_truths_for_samples)

# z_score and p_value are returned as well but are not reported by this cell.
accuracy, mean_acc, var_acc, z_score, p_value = compute_statistical_significance(
    anthropic_preds, ground_truths_for_samples
)

print(
    f"Accuracy: {accuracy:.4f}",
    f"Mean Bootstrap Accuracy: {mean_acc:.4f}",
    f"Standard Deviation of Bootstrap Accuracies: {math.sqrt(var_acc):.6f}",
    sep="\n",
)

File zero_shot_pneumoniamnist_anthropic_preds.txt does not exist. Creating ...
Model predictions: ['normal', 'normal', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia', 'pneumonia

In [13]:
# Zero-shot evaluation with the OpenAI client (model="gpt-4o"), cached under a PneumoniaMNIST-specific file name.
# It scores whatever `samples_to_use` / `ground_truths_for_samples` the preceding setup cell selected.
zero_shot_pneumoniamnist_openai_gpt4o_preds_file = 'zero_shot_pneumoniamnist_openai_gpt4o_preds.txt'

if not os.path.exists(zero_shot_pneumoniamnist_openai_gpt4o_preds_file):
    # Cache miss: obtain predictions through the client, then persist them for later runs.
    print(f"File {zero_shot_pneumoniamnist_openai_gpt4o_preds_file} does not exist. Creating ...")
    accuracy_openai_gpt4o, openai_gpt4o_preds = get_class_samples(
        client_openai,
        samples_to_use,
        dataset_classes,
        dataset,
        ground_truths_for_samples,
        unified_zero_shot_predict_class_only,
        model="gpt-4o",
    )
    save_list_to_file(openai_gpt4o_preds, zero_shot_pneumoniamnist_openai_gpt4o_preds_file)
else:
    # Cache hit: reuse the stored predictions and score them position by position.
    # NOTE(review): presumably requires the cached list to be in the same order as the
    # current samples_to_use — confirm the sample selection is deterministic.
    openai_gpt4o_preds = load_list_from_file(zero_shot_pneumoniamnist_openai_gpt4o_preds_file)
    matching_count = count_matching_samples_positionwise(openai_gpt4o_preds, ground_truths_for_samples)
    accuracy_openai_gpt4o = matching_count / len(ground_truths_for_samples)

# z_score and p_value are returned as well but are not reported by this cell.
accuracy, mean_acc, var_acc, z_score, p_value = compute_statistical_significance(
    openai_gpt4o_preds, ground_truths_for_samples
)

print(
    f"Accuracy: {accuracy:.4f}",
    f"Mean Bootstrap Accuracy: {mean_acc:.4f}",
    f"Standard Deviation of Bootstrap Accuracies: {math.sqrt(var_acc):.6f}",
    sep="\n",
)

File zero_shot_pneumoniamnist_openai_gpt4o_preds.txt does not exist. Creating ...
The response from the model is:  ```json
[
    {"Image": 0, "Class": "pneumonia"},
    {"Image": 1, "Class": "normal"},
    {"Image": 2, "Class": "pneumonia"},
    {"Image": 3, "Class": "normal"},
    {"Image": 4, "Class": "pneumonia"},
    {"Image": 5, "Class": "pneumonia"},
    {"Image": 6, "Class": "pneumonia"},
    {"Image": 7, "Class": "normal"},
    {"Image": 8, "Class": "normal"},
    {"Image": 9, "Class": "pneumonia"},
    {"Image": 10, "Class": "pneumonia"},
    {"Image": 11, "Class": "normal"},
    {"Image": 12, "Class": "normal"},
    {"Image": 13, "Class": "pneumonia"},
    {"Image": 14, "Class": "normal"},
    {"Image": 15, "Class": "pneumonia"},
    {"Image": 16, "Class": "pneumonia"},
    {"Image": 17, "Class": "normal"},
    {"Image": 18, "Class": "pneumonia"},
    {"Image": 19, "Class": "pneumonia"},
    {"Image": 20, "Class": "normal"},
    {"Image": 21, "Class": "normal"},
    {"Image":

### DermaMNIST

In [7]:
# Select the DermaMNIST evaluation subset from the test split.
# This rebinds dataset, dataset_classes, samples_to_use and ground_truths_for_samples for the cells that follow.
# The second argument (15) is presumably the number of samples taken per class folder — confirm in HelperFunctions.
dataset = 'DermaMNIST'
dataset_classes = [*class_ground_truth_mapping[dataset].values()]

samples_to_use = get_samples_from_each_folder(
    f'./Datasets/{dataset}/test',
    15,
)
# Cap the evaluation set at the first 100 samples.
# NOTE(review): if the helper returns samples grouped by folder, this cap only trims the
# last folder(s) and leaves the classes unevenly represented — confirm this is intended.
samples_to_use = samples_to_use[:100]
# Ground-truth labels are derived from the sample paths and the class list.
ground_truths_for_samples = get_ground_truth_from_path(samples_to_use, dataset_classes)

In [9]:
# Zero-shot evaluation with the Anthropic client (model="Claude"), cached under a DermaMNIST-specific file name.
# It scores whatever `samples_to_use` / `ground_truths_for_samples` the preceding setup cell selected.
zero_shot_dermamnist_anthropic_preds_file = 'zero_shot_dermamnist_anthropic_preds.txt'

if not os.path.exists(zero_shot_dermamnist_anthropic_preds_file):
    # Cache miss: obtain predictions through the client, then persist them for later runs.
    print(f"File {zero_shot_dermamnist_anthropic_preds_file} does not exist. Creating ...")
    accuracy_anthropic, anthropic_preds = get_class_samples(
        client_anthropic,
        samples_to_use,
        dataset_classes,
        dataset,
        ground_truths_for_samples,
        unified_zero_shot_predict_class_only,
        model="Claude",
    )
    save_list_to_file(anthropic_preds, zero_shot_dermamnist_anthropic_preds_file)
else:
    # Cache hit: reuse the stored predictions and score them position by position.
    # NOTE(review): presumably requires the cached list to be in the same order as the
    # current samples_to_use — confirm the sample selection is deterministic.
    anthropic_preds = load_list_from_file(zero_shot_dermamnist_anthropic_preds_file)
    matching_count = count_matching_samples_positionwise(anthropic_preds, ground_truths_for_samples)
    accuracy_anthropic = matching_count / len(ground_truths_for_samples)

# z_score and p_value are returned as well but are not reported by this cell.
accuracy, mean_acc, var_acc, z_score, p_value = compute_statistical_significance(
    anthropic_preds, ground_truths_for_samples
)

print(
    f"Accuracy: {accuracy:.4f}",
    f"Mean Bootstrap Accuracy: {mean_acc:.4f}",
    f"Standard Deviation of Bootstrap Accuracies: {math.sqrt(var_acc):.6f}",
    sep="\n",
)

File zero_shot_dermamnist_anthropic_preds.txt does not exist. Creating ...
Model predictions: ['melanocytic nevi', 'melanocytic nevi', 'melanoma', 'melanoma', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanoma', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanoma', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanoma', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanoma', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanoma', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', 'melanocytic nevi', '

In [10]:
# Zero-shot evaluation with the OpenAI client (model="gpt-4o"), cached under a DermaMNIST-specific file name.
# It scores whatever `samples_to_use` / `ground_truths_for_samples` the preceding setup cell selected.
zero_shot_dermamnist_openai_gpt4o_preds_file = 'zero_shot_dermamnist_openai_gpt4o_preds.txt'

if not os.path.exists(zero_shot_dermamnist_openai_gpt4o_preds_file):
    # Cache miss: obtain predictions through the client, then persist them for later runs.
    print(f"File {zero_shot_dermamnist_openai_gpt4o_preds_file} does not exist. Creating ...")
    accuracy_openai_gpt4o, openai_gpt4o_preds = get_class_samples(
        client_openai,
        samples_to_use,
        dataset_classes,
        dataset,
        ground_truths_for_samples,
        unified_zero_shot_predict_class_only,
        model="gpt-4o",
    )
    save_list_to_file(openai_gpt4o_preds, zero_shot_dermamnist_openai_gpt4o_preds_file)
else:
    # Cache hit: reuse the stored predictions and score them position by position.
    # NOTE(review): presumably requires the cached list to be in the same order as the
    # current samples_to_use — confirm the sample selection is deterministic.
    openai_gpt4o_preds = load_list_from_file(zero_shot_dermamnist_openai_gpt4o_preds_file)
    matching_count = count_matching_samples_positionwise(openai_gpt4o_preds, ground_truths_for_samples)
    accuracy_openai_gpt4o = matching_count / len(ground_truths_for_samples)

# z_score and p_value are returned as well but are not reported by this cell.
accuracy, mean_acc, var_acc, z_score, p_value = compute_statistical_significance(
    openai_gpt4o_preds, ground_truths_for_samples
)

print(
    f"Accuracy: {accuracy:.4f}",
    f"Mean Bootstrap Accuracy: {mean_acc:.4f}",
    f"Standard Deviation of Bootstrap Accuracies: {math.sqrt(var_acc):.6f}",
    sep="\n",
)

File zero_shot_dermamnist_openai_gpt4o_preds.txt does not exist. Creating ...
The response from the model is:  ```json
[
    {"Image": 0, "Class": "melanocytic nevi"},
    {"Image": 1, "Class": "melanocytic nevi"},
    {"Image": 2, "Class": "melanocytic nevi"},
    {"Image": 3, "Class": "melanocytic nevi"},
    {"Image": 4, "Class": "melanocytic nevi"},
    {"Image": 5, "Class": "melanocytic nevi"},
    {"Image": 6, "Class": "melanocytic nevi"},
    {"Image": 7, "Class": "melanocytic nevi"},
    {"Image": 8, "Class": "melanocytic nevi"},
    {"Image": 9, "Class": "melanocytic nevi"},
    {"Image": 10, "Class": "melanocytic nevi"},
    {"Image": 11, "Class": "melanocytic nevi"},
    {"Image": 12, "Class": "melanocytic nevi"},
    {"Image": 13, "Class": "melanocytic nevi"},
    {"Image": 14, "Class": "melanocytic nevi"},
    {"Image": 15, "Class": "melanocytic nevi"},
    {"Image": 16, "Class": "melanocytic nevi"},
    {"Image": 17, "Class": "melanocytic nevi"},
    {"Image": 18, "Class"