# Import Packages

In [20]:
import numpy as np
import pandas as pd
import os
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from torchvision.models import resnet50 
import preprocess.load_split_data as pr
import torchvision.transforms as transforms
from torch.utils.data import DataLoader,Subset,ConcatDataset,Dataset
sns.set_theme(style="darkgrid")

# Reproducibility

In [21]:
torch.cuda.is_available()

True

In [22]:
device ="cuda" if torch.cuda.is_available() else "cpu"
#seed 
SEED=42
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)

# load data

In [23]:
# Load MNIST dataset
transform_mnist=transforms.Compose([
    transforms.ToTensor(),
    transforms.ColorJitter(),
    # transforms.Normalize((0.1307,), (0.3081,))
])
class_mnist=pr.LoadDataset("MNIST",transform=transform_mnist)
mnist=class_mnist.load_dataset()

In [24]:
train_mnist,test_mnist=mnist

### Labeled data

In [25]:
# 1% of labeled data
tau=0.01
init_sample=int(tau*len(train_mnist))
labeled_indices=torch.randperm(len(train_mnist))[:init_sample]
initial_subset = Subset(train_mnist, labeled_indices)

In [26]:
labeled_indices

tensor([37542, 43087,  2992, 25810, 39634,  6425, 55768, 11034, 44310, 44384,
        23068, 47752, 37118, 26420, 46483, 45982, 57971, 42205,  1203, 10258,
        59229, 30716, 19083, 30280, 24611, 13538, 24923, 46281, 36903, 36959,
        10452, 53824,  7787, 55993, 49623,  7309, 23543, 53500, 12350, 16020,
        21306, 44185, 21283, 51268, 52539, 27188, 19459, 29066, 53997,  9785,
        57670, 27213, 36282, 54045, 29662, 59084, 18247, 57692, 16060, 31370,
        17306, 52258, 13180, 53326, 32838, 34376, 40257, 49627,  3747,  9922,
        55627, 52451, 18088, 18138, 16622, 42584, 25675, 26759, 21380, 23516,
        30054, 17416, 39226,  2161,  4726, 31870,  7385, 51486, 24131, 20818,
        26257,   561, 24881,  5415, 20119, 39000, 45769, 22627,  2851, 11918,
        26470, 45448, 39399,  1128, 43593, 32601, 36013, 19871, 31561, 54333,
        32015,  1443, 23255,  1277, 42374, 21964, 46024, 48175, 56523, 13693,
        21719, 21520, 29825,  1745, 11928, 58204,  8457, 13853, 

### Unlabeled data

In [27]:
pool_indices = torch.randperm(len(train_mnist))[init_sample:len(train_mnist)]
pool_subset = Subset(train_mnist, pool_indices)

In [28]:
pool_indices

tensor([15658, 49260, 33067,  ..., 11047, 43759, 56020])

In [29]:
len(pool_subset)

59400

### Dataloader for labeled and unlabeled data

In [30]:
# Setup the batch size hyperparameter
BATCH_SIZE = 64

# Turn datasets into iterables (batches)
train_labeled_dataloader = DataLoader(initial_subset, # dataset to turn into iterable
    batch_size=BATCH_SIZE,# how many samples per batch?
    shuffle=True,
)
train_unlabeled_dataloader=DataLoader(pool_subset, # dataset to turn into iterable
    batch_size=BATCH_SIZE,# how many samples per batch?
    shuffle=True,
)
test_dataloader = DataLoader(test_mnist,
    batch_size=BATCH_SIZE,
    shuffle=False
)



# Load Model

In [31]:
for i , y in train_unlabeled_dataloader: 
    print(i.shape)
    break

torch.Size([64, 1, 28, 28])


In [32]:
# Load the pre-trained ResNet50 model without pre-trained weights
model = resnet50(pretrained=False)



In [33]:
model.conv1 = torch.nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)  # Conversion en 3 canaux
model.fc = torch.nn.Linear(2048, 10)  # Modifier la couche de classification pour 10 classes

In [34]:
model.to(device)

ResNet(
  (conv1): Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [35]:
# Define the loss function
loss_function = torch.nn.CrossEntropyLoss()
# Define the optimizer
optimizer = torch.optim.Adam(model.parameters())

In [36]:
accuracy_dict={}

In [37]:
def train_evaluate(model, train_loader, test_loader ,optimizer, criterion, epochs):
    acc_train=[]
    # Train the model
    for epoch in range(epochs):
        for inputs, labels in train_labeled_dataloader :
            # Get the inputs and labels
            inputs, labels = inputs.to(device), labels.to(device)

            # Forward pass
            outputs = model(inputs)

            # Calculate the loss
            loss = loss_function(outputs, labels)

            # Backpropagate
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Evaluate the model
        with torch.no_grad():
            correct = 0
            total = 0
            for inputs, labels in test_dataloader:
                inputs, labels = inputs.to(device), labels.to(device) 
                outputs = model(inputs)
                _, predicted = outputs.max(1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

            accuracy = correct / total
            acc_train.append(accuracy)

            print(f"Epoch {epoch + 1}: Accuracy {accuracy} for testing" )
    return acc_train


In [38]:
train=train_evaluate(model,train_labeled_dataloader,test_dataloader,optimizer,loss_function,5)
accuracy_dict['acc_1']=train

Epoch 1: Accuracy 0.3713 for testing
Epoch 2: Accuracy 0.6037 for testing
Epoch 3: Accuracy 0.7159 for testing
Epoch 4: Accuracy 0.7651 for testing
Epoch 5: Accuracy 0.7701 for testing


In [59]:
train_unlabeled_dataloader.dataset[0]

(tensor([[[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000, 

In [69]:
pred=model(torch.unsqueeze(train_unlabeled_dataloader.dataset[0][0],0).to(device))

In [71]:
pred

tensor([[-4.7371, -5.4271,  0.4352,  4.8412,  0.3612,  2.7566, -2.5223,  0.5200,
          1.7233,  8.5183]], device='cuda:0', grad_fn=<AddmmBackward0>)

-4.7371, -5.4271,  0.4352,  4.8412,  0.3612,  2.7566, -2.5223,  0.5200,
          1.7233,  8.5183

In [72]:
torch.argmax(pred)

tensor(9, device='cuda:0')

# Evaluate the stability 

![logo-test.png](figures/bootstrap.png)


source : youtube by josh starmer 

In [39]:
len(train_unlabeled_dataloader)

929

In [40]:
predictions = []

In [41]:
for _ in range(2):
    # Create a bootstrap sample with replacement
    bootstrap_sample = []
    for i, batch in enumerate(train_unlabeled_dataloader):
    
        indices = np.random.choice(len(batch[0]), size=len(batch[0]), replace=True)
        bootstrap_images = torch.stack([batch[0][j] for j in indices])
        bootstrap_labels = torch.stack([batch[1][j] for j in indices])
        bootstrap_sample.append((bootstrap_images,bootstrap_labels))
        # Concatenate the bootstrap sample to create a new dataloader
    bootstrap_dataloader = DataLoader(bootstrap_sample,
            batch_size=None,
            shuffle=True
    )
    # Make predictions using the ResNet model on the bootstrap sample
    with torch.no_grad():
        model.eval()  # Set the model to evaluation mode
        batch_predictions = []
        for batch in bootstrap_dataloader:
            images = batch[0].to(device)  # Unpack the tuple to get the input data (images)
            batch_predictions.append(model(images))

    # Concatenate the predictions to match the original data format
    batch_predictions = torch.cat(batch_predictions)
    predictions.append(batch_predictions.cpu().numpy())
  


In [42]:
# Reshape predictions to (num_bootstrap_samples, num_instances, num_classes)
predictions = np.array(predictions)
num_instances, num_classes = predictions.shape[1], predictions.shape[2]

In [43]:
num_instances, num_classes


(59400, 10)

In [44]:
predictions.shape

(2, 59400, 10)

In [45]:
len(predictions[0])

59400

In [46]:
disagreement_rates = []
for i in range(num_instances):
    instance_predictions = predictions[:, i, :]
    majority_vote = np.argmax(np.sum(instance_predictions, axis=0))  # Simple majority voting
    disagreement_rate = np.mean(np.argmax(instance_predictions, axis=1) != majority_vote)
    disagreement_rates.append(disagreement_rate)

In [47]:
predictions[:, 1, :]

array([[-5.4621369e-01, -2.0536773e+00,  2.5454197e+00,  4.9508195e-03,
        -1.0101005e+00,  3.2060733e+00,  1.7011166e+00, -3.3062258e+00,
        -4.9035452e-02, -1.7945881e+00],
       [ 1.2441807e+01, -3.6062658e+00,  2.5733311e+00, -5.3816104e+00,
        -2.2051461e+00, -1.3982928e+00,  2.0322719e+00, -8.6980319e-01,
        -2.1267722e+00, -2.8715246e+00]], dtype=float32)

In [48]:
np.argmax(np.sum(predictions[:, 1, :], axis=0))


0

In [49]:
np.mean(np.argmax(predictions[:, 1, :], axis=1)  !=np.argmax(np.sum(predictions[:, 1, :], axis=0)))

0.5

In [50]:
len(disagreement_rates)

59400

In [51]:
# Step 6: Select High Stable Instances
stability_threshold = 0.1  # You can adjust this threshold based on your needs
high_stability_indices = [i for i, rate in enumerate(disagreement_rates) if rate <= stability_threshold]
high_stability_instances = [sample[0] for idx, sample in enumerate(train_unlabeled_dataloader.dataset) if idx in high_stability_indices]

In [57]:
len(high_stability_instances)

6219

In [53]:
def bootstrap (num_bootstrap_samples = 5):
    
    return 0

In [55]:
bootstrap()

0