<a href="https://colab.research.google.com/github/alexisjkim/conformal_prediction_limitations/blob/main/Conformal_Prediction_Limitations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# imports

import numpy as np
import matplotlib.pyplot as plt
import time
import math


import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, ConcatDataset, random_split
import torchvision

In [None]:
# loading data and splitting into train, calibration, test sets

# Seed torch's global RNG so that the train/calibration split, the DataLoader
# shuffling and the weight initialisation that follows are repeatable on
# "Restart & Run All".
seed = 0
torch.manual_seed(seed)

batch_size = 128
data_root = './datasets'
to_tensor = torchvision.transforms.ToTensor()

# the official MNIST training split; it is divided below into the training and
# calibration sets (kept under its own name so the split subsets do not shadow it)
mnist_full_train_set = torchvision.datasets.MNIST(root=data_root,
                                                  train=True,
                                                  transform=to_tensor,
                                                  download=True)

# the official MNIST test split; download=True lets this line succeed on its own
mnist_test_dataset = torchvision.datasets.MNIST(root=data_root,
                                                train=False,
                                                transform=to_tensor,
                                                download=True)

# this fraction of the training split stays in the train set; the rest becomes the calibration set
train_percentage = 0.8

train_size = int(train_percentage * len(mnist_full_train_set))
calibration_size = len(mnist_full_train_set) - train_size

mnist_train_set, mnist_cal_set = random_split(mnist_full_train_set, [train_size, calibration_size])

# Data loaders
# drop_last=True discards the final partial batch, so every batch yielded by the
# train and calibration loaders holds exactly batch_size samples. As a consequence
# slightly fewer samples are iterated than len(loader.dataset) reports below.
mnist_train_loader = DataLoader(dataset=mnist_train_set,
                                batch_size=batch_size,
                                shuffle=True, drop_last=True)

mnist_cal_loader = DataLoader(dataset=mnist_cal_set,
                              batch_size=batch_size,
                              shuffle=True, drop_last=True)

mnist_test_loader = DataLoader(dataset=mnist_test_dataset,
                               batch_size=batch_size,
                               shuffle=False)

print("SIZES OF DATASETS: ")
print("training set: ", len(mnist_train_loader.dataset))
print("calibration set: ", len(mnist_cal_loader.dataset))
print("testing set: ", len(mnist_test_loader.dataset))

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to ./datasets/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:00<00:00, 16170977.00it/s]


Extracting ./datasets/MNIST/raw/train-images-idx3-ubyte.gz to ./datasets/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to ./datasets/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 496956.76it/s]


Extracting ./datasets/MNIST/raw/train-labels-idx1-ubyte.gz to ./datasets/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to ./datasets/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:00<00:00, 3868631.93it/s]


Extracting ./datasets/MNIST/raw/t10k-images-idx3-ubyte.gz to ./datasets/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to ./datasets/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 2185696.28it/s]

Extracting ./datasets/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./datasets/MNIST/raw

SIZES OF DATASETS: 
training set:  48000
calibration set:  12000
testing set:  10000





In [None]:
# class for our neural network

class TwoLayerNetPiped(torch.nn.Module):
    """Fully connected network (Linear -> ReLU -> Linear) stored as a single pipeline."""

    def __init__(self, D_in, H, D_out):
        """
        Build the two linear layers and chain them in `self.pipe`.
        Parameters:
            D_in - dimensions of inputs
            H - number of hidden units
            D_out - dimensions of outputs
        """
        # the parent constructor must run before any sub-module is registered
        super().__init__()
        hidden_layer = torch.nn.Linear(D_in, H)
        output_layer = torch.nn.Linear(H, D_out)
        # the pipeline applies the layers in the order given
        self.pipe = torch.nn.Sequential(hidden_layer, torch.nn.ReLU(), output_layer)

    def forward(self, x):
        """
        Run the input through the pipeline and return the raw output scores.
        Parameters:
            x - tensor of inputs (shape: [BATCH_SIZE, D_in])
        """
        return self.pipe(x)

In [None]:
# Model, loss and optimizer configuration

# hyper-parameters
num_epochs = 1
learning_rate = 1e-3

# Run on the first CUDA device when one is available, otherwise on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Network taking flattened 28x28 inputs, with 256 hidden units and 10 outputs,
# moved to the selected device
model = TwoLayerNetPiped(D_in=28*28, H=256, D_out=10)
model = model.to(device)

# Cross-entropy loss on the network outputs, optimised with Adam
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# Train the model

model.train()  # training mode
total_step = len(mnist_train_loader)
start_time = time.time()
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(mnist_train_loader):
        # Flatten every image to one row. The batch dimension is read from the
        # tensor itself (not from the global batch_size) so a smaller final
        # batch would not break the reshape.
        images = images.to(device).view(images.size(0), -1)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward and optimize: clear old gradients, back-propagate, then update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # progress report every 100 batches
        if (i + 1) % 100 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Time: {:.4f} secs'
                   .format(epoch + 1, num_epochs, i + 1, total_step, loss.item(), time.time() - start_time))

Epoch [1/1], Step [100/375], Loss: 0.3426, Time: 2.5092 secs
Epoch [1/1], Step [200/375], Loss: 0.2782, Time: 4.7107 secs
Epoch [1/1], Step [300/375], Loss: 0.2621, Time: 8.0662 secs


In [None]:
# Test the model

def test_model(model, test_loader):

  model.eval()  # eval mode (batchnorm uses moving mean/variance instead of mini-batch mean/variance), or use:
  with torch.no_grad(): # "don't keep track of the gradients" ,can also use .detach()
      correct = 0
      total = 0
      for images, labels in test_loader:
          images = images.to(device).view(images.size(0), -1) #image.size(0) returns batch size
          labels = labels.to(device)
          outputs = model(images)
          _, predicted = torch.max(outputs.data, 1)
          total += labels.size(0)
          correct += (predicted == labels).sum().item()

      print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total))


# Print the accuracy of the trained model over the batches of mnist_test_loader.
test_model(model, mnist_test_loader)

Test Accuracy of the model on the 10000 test images: 93.72 %


Conformal prediction starts here

In [None]:
# Calibration: run the model over the calibration loader

cal_probs, cal_pred = [], []
with torch.no_grad():
  for images, labels in mnist_cal_loader:
    flat_batch = images.to(device).view(images.size(0), -1)
    # raw, unnormalised scores (logits): one row per image, one column per class
    outputs = model(flat_batch)
    # softmax along dim 1 turns each row of logits into class probabilities
    probabilities = torch.nn.functional.softmax(outputs, dim = 1)
    # index of the largest logit per row, i.e. the class the model predicts;
    # note this is the model's guess, not the ground-truth label from the loader
    predictions = torch.max(outputs.data, 1).indices

    # accumulate per-image results on the CPU as numpy values
    cal_pred.extend(predictions.cpu().numpy())
    cal_probs.extend(probabilities.cpu().numpy())

print(cal_probs[:5])

[array([4.9151044e-05, 1.7761309e-05, 1.0242654e-03, 2.2454487e-04,
       2.8897154e-03, 1.2629440e-03, 7.0550470e-03, 2.6623131e-06,
       9.5993304e-01, 2.7540868e-02], dtype=float32), array([1.9031499e-05, 2.1645799e-04, 6.4659311e-05, 2.7961194e-04,
       1.1784593e-03, 3.0271948e-04, 1.8629456e-06, 9.7328293e-01,
       5.4794981e-04, 2.4106339e-02], dtype=float32), array([3.2352060e-03, 5.3867490e-07, 5.9868897e-05, 1.9845819e-01,
       1.0001037e-04, 2.2523753e-01, 4.8853335e-06, 2.6538512e-01,
       4.0974751e-02, 2.6654395e-01], dtype=float32), array([1.15080955e-04, 4.93528560e-08, 5.29570534e-05, 3.33868340e-02,
       2.45377964e-06, 9.64293301e-01, 1.86600751e-04, 1.27749360e-07,
       1.95637438e-03, 6.32836600e-06], dtype=float32), array([3.7699076e-04, 8.8945382e-07, 1.6375295e-03, 1.2919343e-02,
       1.3287524e-06, 9.5303333e-04, 6.0599483e-08, 9.7795630e-01,
       9.3731076e-05, 6.0608303e-03], dtype=float32)]


In [None]:
# Nonconformity score of a calibration example: s_i = 1 - (probability the model
# assigns to the example's TRUE label).
# The score has to be evaluated at the ground-truth label; scoring at the model's
# own arg-max prediction only measures the model's confidence and gives a
# threshold with no coverage meaning. Because mnist_cal_loader shuffles, the
# probabilities and the labels are gathered in the same pass so they stay aligned.
cal_scores = []
model.eval()
with torch.no_grad():
  for images, labels in mnist_cal_loader:
    images = images.to(device).view(images.size(0), -1)
    labels = labels.to(device)
    probabilities = torch.nn.functional.softmax(model(images), dim = 1)
    # pick, for every row, the probability in the column of its true label
    true_class_prob = probabilities.gather(1, labels.unsqueeze(1)).squeeze(1)
    cal_scores.extend((1 - true_class_prob).cpu().numpy()) #s_i score

cal_scores = np.array(cal_scores)
sorted_scores = np.sort(cal_scores) #scores in ascending order

def get_quantile(scores, alpha):
    """
    Conformal threshold: the finite-sample-corrected (1 - alpha) quantile of the scores.

    With n calibration scores the returned value is the
    ceil((n + 1) * (1 - alpha))-th smallest score, i.e. the empirical quantile at
    level ceil((n + 1) * (1 - alpha)) / n taken without interpolation.

    Parameters:
        scores - 1-D collection of calibration scores (any order)
        alpha - miscoverage level, strictly between 0 and 1
    Returns:
        the threshold as a Python float
    Raises:
        ValueError - if scores is empty or alpha is outside (0, 1)
    """
    ordered = np.sort(np.asarray(scores, dtype=float).ravel())
    # n comes from the scores themselves, so the calibration loader is not re-iterated
    n = ordered.size
    if n == 0:
        raise ValueError("scores must contain at least one calibration score")
    if not 0 < alpha < 1:
        raise ValueError("alpha must lie strictly between 0 and 1")

    rank = math.ceil((1 - alpha) * (n + 1))
    # for very small n the corrected level exceeds 1; fall back to the largest score
    q_level = min(rank / n, 1.0)

    print(f"Adjusted quantile level: {q_level}")
    return float(ordered[min(rank, n) - 1])

# alpha is the level handed to get_quantile, which works with (1 - alpha);
# the resulting threshold is reused by the evaluation cells further down.
alpha = 0.05
threshold = get_quantile(sorted_scores, alpha) #for the calibration set
print(threshold)



Adjusted quantile level: 0.9501008064516129
0.4504952162504196


In [None]:
import pandas as pd
#we know the conformal prediction model takes in probabilities until it reaches
#the threshold q hat
#the threshold is q hat (that quantile value)

def conformal_prediction(probabilities, threshold):
    """
    Build one prediction set (list of class indices) per row of class probabilities.

    Classes are visited from most to least probable and added to the set; the
    walk stops right after the running probability mass becomes strictly
    greater than 1 - threshold. If that never happens, every class is included.

    NOTE(review): this is a cumulative-mass rule. If the threshold was calibrated
    on "1 - probability of one label" scores, the matching rule would keep every
    class whose probability is at least 1 - threshold -- confirm which is intended.
    """
    cutoff = 1 - threshold
    prediction_sets = []
    for class_probs in probabilities:
        # argsort is ascending, so reverse it to rank classes in descending probability
        ranked_classes = np.argsort(class_probs)[::-1]
        chosen = []
        mass = 0.0
        for cls in ranked_classes:
            chosen.append(cls)
            mass += class_probs[cls]
            if mass > cutoff:
                break
        prediction_sets.append(chosen)
    return prediction_sets





In [None]:
#evaluation
def evaluate_and_print(observed_labels, conformal_prediction, predicted, start_row, end_row):
  """
  Print how often the observed label falls inside its prediction set, followed by
  rows start_row:end_row of a table of labels, prediction sets and predictions.

  Parameters:
      observed_labels - sequence of observed labels
      conformal_prediction - sequence of prediction sets, one per label
                             (inside this function the name refers to this
                             argument, not to the function of the same name)
      predicted - sequence of single predictions, one per label
      start_row, end_row - slice bounds of the table rows to print
  """
  # build the table first so mismatched input lengths fail before anything is printed
  results_table = pd.DataFrame({
    'observed labels': observed_labels,
    'confromal prediction set': conformal_prediction,
    'prediction': predicted
  })

  # evaluation metric: number of positions whose observed label is in its set
  n_samples = len(observed_labels)
  n_covered = sum(
    1 for idx in range(n_samples)
    if observed_labels[idx] in conformal_prediction[idx]
  )

  print("the prediction was in the set ", n_covered/n_samples *100, " percent of the time")

  print(results_table[start_row:end_row])


In [None]:
# Conformal evaluation on the (unshuffled) test loader

observed_labels, test_probs, test_pred = [], [], []
# a single pass gathers the labels together with the model outputs
with torch.no_grad():
    for images, labels in mnist_test_loader:
        observed_labels.extend(labels.cpu().numpy())

        images = images.to(device).view(images.size(0), -1)
        outputs = model(images)
        # softmax along dim 1 gives one probability per class for every image
        probabilities = torch.nn.functional.softmax(outputs, dim = 1)
        # index of the largest output per row = predicted class
        predictions = torch.max(outputs.data, 1).indices

        test_pred.extend(predictions.cpu().numpy())
        test_probs.extend(probabilities.cpu().numpy())

conformal_predictions = conformal_prediction(test_probs, threshold)
evaluate_and_print(observed_labels, conformal_predictions, test_pred, 300, 450)



the prediction was in the set  95.12  percent of the time
     observed labels confromal prediction set  prediction
300                4                      [6]           6
301                7                      [7]           7
302                1                      [1]           1
303                2                      [2]           2
304                4                      [4]           4
..               ...                      ...         ...
445                6                      [0]           0
446                6                      [6]           6
447                4                      [4]           4
448                9                      [8]           8
449                3                      [3]           3

[150 rows x 3 columns]


Repeating the experiment with test data that doesn't match the calibration set.

In [None]:
# Evaluate the calibrated model on blurred test images.
# The blur applied is cv2.blur with a (30, 30) kernel (not a 5x5 Gaussian).

import cv2


blur_loader = torch.utils.data.DataLoader(dataset=mnist_test_dataset,
                                          batch_size=batch_size,
                                          shuffle=False)

def blur_images(image):
  """Return a blurred copy (cv2.blur, 30x30 kernel) of one image tensor."""
  # Blur the argument itself; reading the notebook globals `images[i]` here made
  # the function depend on whichever loop happened to be running.
  # NOTE(review): the array is handed to cv2 without squeezing; for a (1, 28, 28)
  # input cv2 may treat the last axis as channels -- confirm the blur is as intended.
  return torch.tensor(cv2.blur(image.numpy(), (30, 30)))


# One pass over the unshuffled test set keeps labels and blurred images aligned
observed_labels = []
blurred_images = []
blurred_labels = []
for images, labels in blur_loader:
    observed_labels.extend(labels.cpu().numpy())
    for image, label in zip(images, labels):
        blurred_images.append(blur_images(image))
        blurred_labels.append(label)

blurred_images = torch.stack(blurred_images)
blurred_labels = torch.tensor(blurred_labels)


# Flatten images for model input
blurred_images = blurred_images.view(len(blurred_images), -1).to(device)

#find probabilities and predictions for the whole blurred test set at once
model.eval()
with torch.no_grad():
  outputs = model(blurred_images)
  probabilities = torch.nn.functional.softmax(outputs, dim = 1)
  _, blur_pred = torch.max(outputs.data, 1)


blur_preds = blur_pred.cpu().numpy()
blur_probs = probabilities.cpu().numpy()


# prediction sets use the threshold calibrated earlier on unblurred data
conformal_predictions = conformal_prediction(blur_probs, threshold)
print("blur evaluation:")
evaluate_and_print(observed_labels, conformal_predictions, blur_preds, 10, 200)



blur evaluation:
the prediction was in the set  27.800000000000004  percent of the time
     observed labels confromal prediction set  prediction
10                 0                [6, 5, 2]           6
11                 6                [5, 2, 3]           5
12                 9                   [3, 2]           3
13                 0                [2, 6, 5]           2
14                 1                      [1]           1
..               ...                      ...         ...
195                3                [2, 3, 8]           2
196                1                      [1]           1
197                6                      [3]           3
198                4                   [3, 2]           3
199                2                      [3]           3

[190 rows x 3 columns]


Training on blurred images

In [None]:
# Train the model -- additional training on blurred images (no recalibration happens in this cell)
from torch.utils.data import DataLoader, TensorDataset


blur_train_loader = torch.utils.data.DataLoader(dataset=mnist_train_set,
                                          batch_size=batch_size,
                                          shuffle=False)

def blur_images(image):
  """Return a blurred copy (cv2.blur, 30x30 kernel) of one image tensor."""
  # Blur the argument itself rather than the notebook globals `images[i]`.
  return torch.tensor(cv2.blur(image.numpy(), (30, 30)))


# blur every image in the training set, keeping each label next to its image
blurred_images = []
blurred_labels = []
for images, labels in blur_train_loader:
    for image, label in zip(images, labels):
        blurred_images.append(blur_images(image))
        blurred_labels.append(label)

blurred_images = torch.stack(blurred_images)
blurred_labels = torch.tensor(blurred_labels)


# Create a TensorDataset and DataLoader for the blurred images
# (same batch size as the other loaders instead of a hard-coded number)
blurred_dataset = TensorDataset(blurred_images, blurred_labels)
blurred_train_loader = DataLoader(blurred_dataset, batch_size=batch_size, shuffle=False)


# continue training the existing model and optimizer on the blurred data
model.train()  # training mode
total_step = len(blurred_train_loader)
start_time = time.time()
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(blurred_train_loader):
        # flatten every image to a row of 28*28 values
        images = images.to(device).view(-1, 28*28)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward and optimize: clear old gradients, back-propagate, then update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # progress report every 100 batches
        if (i + 1) % 100 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Time: {:.4f} secs'
                   .format(epoch + 1, num_epochs, i + 1, total_step, loss.item(), time.time() - start_time))

Epoch [1/1], Step [100/375], Loss: 1.1225, Time: 0.6869 secs
Epoch [1/1], Step [200/375], Loss: 0.7359, Time: 1.2192 secs
Epoch [1/1], Step [300/375], Loss: 0.5970, Time: 1.7158 secs


In [None]:
#now use original calibration (the conformal prediction with the blurry images)

#creating blurry images again

# loader over the unblurred test set; it gets its own name so that
# blur_test_loader below always refers to the blurred data
raw_test_loader = torch.utils.data.DataLoader(dataset=mnist_test_dataset,
                                          batch_size=batch_size,
                                          shuffle=False)
def blur_images(image):
  """Return a blurred copy (cv2.blur, 30x30 kernel) of one image tensor."""
  # Blur the argument itself rather than the notebook globals `images[i]`.
  return torch.tensor(cv2.blur(image.numpy(), (30, 30)))


#blur the images
blurred_images = []
blurred_labels = []
for images, labels in raw_test_loader:
    for image, label in zip(images, labels):
        blurred_images.append(blur_images(image))
        blurred_labels.append(label)

blurred_images = torch.stack(blurred_images)
blurred_labels = torch.tensor(blurred_labels)


# Create a TensorDataset and DataLoader for the blurred images
blurred_dataset = torch.utils.data.TensorDataset(blurred_images, blurred_labels)
blur_test_loader = torch.utils.data.DataLoader(blurred_dataset, batch_size=batch_size, shuffle=False)

blur_test_preds = []
blur_test_probs = []
observed_labels = []

#testing on the blurred images with the original calibration threshold;
#labels are collected in the same pass as the model outputs
model.eval()
with torch.no_grad():
    for images, labels in blur_test_loader:
        observed_labels.extend(labels.cpu().numpy())
        images = images.to(device).view(images.size(0), -1)
        outputs = model(images)
        probabilities = torch.nn.functional.softmax(outputs, dim = 1) #softmax over the class dimension
        #index of the highest output = predicted class (the max value itself is discarded)
        _, predictions = torch.max(outputs.data, 1)
        # Append to lists
        blur_test_preds.extend(predictions.cpu().numpy())
        blur_test_probs.extend(probabilities.cpu().numpy())


conformal_predictions = conformal_prediction(blur_test_probs, threshold)
print("After putting additional training with no recalibration:")
evaluate_and_print(observed_labels, conformal_predictions, blur_test_preds, 100, 150)


the prediction was in the set  91.02  percent of the time
     observed labels confromal prediction set  prediction
100                6                      [6]           6
101                0                      [0]           0
102                5                      [5]           5
103                4                      [4]           4
104                9                   [9, 5]           9
105                9                      [9]           9
106                2                      [0]           0
107                1                      [1]           1
108                9                      [9]           9
109                4                   [9, 4]           9
110                8                   [8, 3]           8
111                7                      [1]           1
112                3                [3, 1, 8]           3
113                9                      [9]           9
114                7                   [9, 7]           9
115           

In [None]:
#Now let's compare: recalibrate on calibration data in which every other image is blurred

mnist_cal_loader = torch.utils.data.DataLoader(dataset=mnist_cal_set,
                                           batch_size=batch_size,
                                           shuffle=True, drop_last=True)
def blur_images(image):
  """Return a blurred copy (cv2.blur, 30x30 kernel) of one image tensor."""
  # Blur the argument itself rather than the notebook globals `images[i]`.
  return torch.tensor(cv2.blur(image.numpy(), (30, 30)))


#blur every other image (even positions within each batch), keep the rest unchanged
new_cal_images = []
new_cal_labels = []
for images, labels in mnist_cal_loader:
    for i, (image, label) in enumerate(zip(images, labels)):
        new_cal_images.append(blur_images(image) if i % 2 == 0 else image)
        new_cal_labels.append(label)


new_cal_images = torch.stack(new_cal_images)
new_cal_labels = torch.tensor(new_cal_labels)


# Create a TensorDataset and DataLoader for the mixed calibration images
new_cal_dataset = torch.utils.data.TensorDataset(new_cal_images, new_cal_labels)
new_cal_loader = torch.utils.data.DataLoader(new_cal_dataset, batch_size=batch_size, shuffle=False)


#now re-calibrate

new_cal_probs = []
new_cal_preds = []
new_cal_true = []
model.eval()
with torch.no_grad():
  for images, labels in new_cal_loader:
    images = images.to(device).view(images.size(0), -1)
    outputs = model(images)
    #outputs holds each image's logits (raw unnormalized scores);
    #softmax turns them into one probability per class
    probabilities = torch.nn.functional.softmax(outputs, dim = 1)
    #index of the highest output = predicted class
    _, predictions = torch.max(outputs.data, 1)

    # Append to lists; the true labels are kept alongside for scoring
    new_cal_preds.extend(predictions.cpu().numpy())
    new_cal_probs.extend(probabilities.cpu().numpy())
    new_cal_true.extend(labels.cpu().numpy())

# s_i = 1 - probability assigned to the TRUE label (not to the predicted class)
new_cal_scores = np.array([1 - prob[true_label]
                           for prob, true_label in zip(new_cal_probs, new_cal_true)])
new_sorted_scores = np.sort(new_cal_scores)

new_cal_threshold = get_quantile(new_sorted_scores, 0.05)
print(new_cal_threshold)

#now retest with new calibration on blurred images

observed_labels = []
for images, labels in blur_test_loader:
    observed_labels.extend(labels.cpu().numpy())

# NOTE(review): when the notebook is run top to bottom, blur_test_probs comes from
# the model after the extra pass of training on blurred images, so the wording
# "no additional training" in the message below may be inaccurate -- confirm.
conformal_predictions = conformal_prediction(blur_test_probs, new_cal_threshold)
print("After no additional training, but recalibrating with 50:50 normal to blur images:")
evaluate_and_print(observed_labels, conformal_predictions, blur_test_preds, 100, 150)

Adjusted quantile level: 0.9501008064516129
0.6082548245787621
the prediction was in the set  86.78  percent of the time
     observed labels confromal prediction set  prediction
100                6                      [6]           6
101                0                      [0]           0
102                5                      [5]           5
103                4                      [4]           4
104                9                      [9]           9
105                9                      [9]           9
106                2                      [0]           0
107                1                      [1]           1
108                9                      [9]           9
109                4                      [9]           9
110                8                      [8]           8
111                7                      [1]           1
112                3                   [3, 1]           3
113                9                      [9]           9
114      

In [None]:
import cv2


# Evaluate the calibrated model on rotated test images

rotate_loader = torch.utils.data.DataLoader(dataset=mnist_test_dataset,
                                          batch_size=batch_size,
                                          shuffle=False)


def rotate_image(image):
    """Rotate one image tensor about its centre (angle argument -120, scale 1)."""
    # squeeze drops size-1 axes (presumably the channel axis of a (1, H, W) tensor),
    # leaving the 2-D array that the shape unpacking below requires
    image = image.numpy().squeeze()
    rows, cols = image.shape
    rotation = cv2.getRotationMatrix2D((cols / 2, rows / 2), -120, 1)
    # the output size is stated explicitly as (width, height) of the input image
    rotated_image = cv2.warpAffine(image, rotation, (cols, rows))
    # put a leading size-1 axis back so the result can be stacked like the inputs
    return torch.tensor(rotated_image, dtype=torch.float32).unsqueeze(0)

# One pass over the unshuffled test set keeps labels and rotated images aligned
observed_labels = []
rotated_images = []
rotated_labels = []
for images, labels in rotate_loader:
    observed_labels.extend(labels.cpu().numpy())
    for image, label in zip(images, labels):
        rotated_images.append(rotate_image(image))
        rotated_labels.append(label)


rotated_images = torch.stack(rotated_images)
rotated_labels = torch.tensor(rotated_labels)

# Flatten images for model input
rotated_images = rotated_images.view(len(rotated_images), -1).to(device)


# Evaluate the model on the entire rotated test set at once
model.eval()
with torch.no_grad():
    outputs = model(rotated_images)
    probabilities = torch.nn.functional.softmax(outputs, dim=1)
    _, rotate_preds = torch.max(outputs.data, 1)

rotate_preds = rotate_preds.cpu().numpy()
rotate_probs = probabilities.cpu().numpy()

# prediction sets use the threshold calibrated earlier on unrotated data;
# the printed label says "flip" although the transformation applied is a rotation
conformal_predictions = conformal_prediction(rotate_probs, threshold)
print("flip evaluation:")
evaluate_and_print(observed_labels, conformal_predictions, rotate_preds, 10, 200)




flip evaluation:
the prediction was in the set  23.65  percent of the time
     observed labels confromal prediction set  prediction
10                 0                      [0]           0
11                 6                   [0, 8]           0
12                 9                   [4, 3]           4
13                 0                      [0]           0
14                 1                      [4]           4
..               ...                      ...         ...
195                3                   [4, 6]           4
196                1                      [4]           4
197                6                      [9]           9
198                4                      [4]           4
199                2                      [6]           6

[190 rows x 3 columns]
