In [43]:
import os
import time
import torch
import numpy as np
import pandas as pd
from torch import nn, optim
from torch.utils.data import DataLoader, random_split
from torchvision import datasets, transforms

In [2]:
from safetensors.torch import save_file
from safetensors.torch import load_file

In [3]:
device = ("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")

In [4]:
# GLOBAL VARIABLES
ensemble_size = 3
batch_size = 64
num_epochs = 40
learning_rate = 0.001

LOAD_MODELS = False

# ARRAY TO DICTIONARY RESULTS
results = []

## Data Preparation

In [5]:
# Define transformations for training and testing data
transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32, padding=4),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

In [6]:
# Define data transformations
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

In [7]:
# Load the CIFAR-10 dataset: Training & Test Data
train = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
test = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)

Files already downloaded and verified
Files already downloaded and verified


In [8]:
# Calculate the base length for each subset
subset_length = len(train) // 3

# Calculate the remainder
remainder = len(train) % 3

# Distribute the remainder among the subsets
lengths = [subset_length + (1 if i < remainder else 0) for i in range(3)]

In [9]:
subset_length, remainder, lengths

(16666, 2, [16667, 16667, 16666])

In [10]:
# Split the training dataset into three subsets for the ensemble
train_subset_1, train_subset_2, train_subset_3 = random_split(train, lengths)

# Individual trainloaders for specific subsets
trainloader_1 = DataLoader(train_subset_1, batch_size=batch_size, shuffle=True, num_workers=2)
trainloader_2 = DataLoader(train_subset_2, batch_size=batch_size, shuffle=True, num_workers=2)
trainloader_3 = DataLoader(train_subset_3, batch_size=batch_size, shuffle=True, num_workers=2)

# test definition
testloader = DataLoader(test, batch_size=batch_size, shuffle=False, num_workers=2)

## Neural Network Architecture (Simple CNN)

In [11]:
class SimpleCNN(nn.Module):
    def __init__(self):
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)

        # Adding batch normalization layers after each convolutional layer
        self.bn1 = nn.BatchNorm2d(32)
        self.bn2 = nn.BatchNorm2d(64)
        self.bn3 = nn.BatchNorm2d(128)

        # Adding dropout with a probability of 0.5
        self.dropout = nn.Dropout(0.5)

        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(128 * 4 * 4, 256)
        self.fc2 = nn.Linear(256, 10)

    def forward(self, x):
        # Apply batch norm and dropout after ReLU activations
        x = self.pool(torch.relu(self.bn1(self.conv1(x))))
        x = self.pool(torch.relu(self.bn2(self.conv2(x))))
        x = self.pool(torch.relu(self.bn3(self.conv3(x))))

        x = x.view(-1, 128 * 4 * 4)

        x = torch.relu(self.fc1(x))
        x = self.dropout(x)  # Apply dropout only on fully connected layer
        x = self.fc2(x)

        return x

In [12]:
def train_model(model, trainloader, criterion, optimizer):
    start_time = time.time()
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0

        for inputs, labels in trainloader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        avg_loss = running_loss / len(trainloader)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"\nTraining Finisehd, time: {elapsed_time:.2f} seconds")

    # Clear the CUDA memory cache after training each model
    torch.cuda.empty_cache()

In [13]:
def evaluate_model(model, testloader, model_name=""):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
      for inputs, labels in testloader:
          inputs, labels = inputs.to(device), labels.to(device)  # Move data to GPU
          outputs = model(inputs)
          _, predicted = torch.max(outputs, 1)
          total += labels.size(0)
          correct += (predicted == labels).sum().item()

    print(f"Accuracy on test set: {100 * correct / total:.2f}%")
    return dict(name=model_name, accuracy=(100 * correct / total), epochs=num_epochs, learning_rate=learning_rate)

## Model 1: CNN with first subset of training *data*

`criterion = nn.CrossEntropyLoss()` initializes cross-entropy loss as the criterion, a commonly used loss function for multi-class classification tasks. Cross-entropy loss measures the difference between the model’s predicted class probabilities and the actual class labels, with a higher penalty for incorrect predictions. This guides the model toward learning accurate class probabilities by minimizing this loss during training.

`optimizer = optim.Adam(model.parameters(), lr=learning_rate)` sets up the Adam optimizer, which will adjust the model’s parameters at each training step. Here, `model.parameters()` specifies the parameters to be updated, and `lr=learning_rate` defines the learning rate, controlling the size of the updates applied to the model’s parameters. The optimizer and loss function work together to enable effective backpropagation and parameter adjustment, essential for improving the model's performance as training progresses.

In [14]:
model_name = "base_learner_1"
base_learner_1 = SimpleCNN().to(device) # Enable model to run on CUDA
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(base_learner_1.parameters(), lr=learning_rate)

In [15]:
loaded_state_dict = load_file('base_learner_1.safetensors')

In [16]:
if loaded_state_dict.keys() == base_learner_1.state_dict().keys() and LOAD_MODELS:
    print("Matching model architecture found. Loading weights...")
    base_learner_1.load_state_dict(loaded_state_dict)
else:
    print("Model architecture does not match. Retraining the model...")
    # Retrain your model (assuming you have a function `train_model`)
    train_model(base_learner_1, trainloader_1, criterion, optimizer)

Model architecture does not match. Retraining the model...
Epoch 1/40, Loss: 1.6402
Epoch 2/40, Loss: 1.2885
Epoch 3/40, Loss: 1.1304
Epoch 4/40, Loss: 1.0335
Epoch 5/40, Loss: 0.9468
Epoch 6/40, Loss: 0.8760
Epoch 7/40, Loss: 0.8253
Epoch 8/40, Loss: 0.7690
Epoch 9/40, Loss: 0.7239
Epoch 10/40, Loss: 0.6667
Epoch 11/40, Loss: 0.6199
Epoch 12/40, Loss: 0.5674
Epoch 13/40, Loss: 0.5355
Epoch 14/40, Loss: 0.4977
Epoch 15/40, Loss: 0.4557
Epoch 16/40, Loss: 0.4233
Epoch 17/40, Loss: 0.3954
Epoch 18/40, Loss: 0.3659
Epoch 19/40, Loss: 0.3261
Epoch 20/40, Loss: 0.3247
Epoch 21/40, Loss: 0.2877
Epoch 22/40, Loss: 0.2731
Epoch 23/40, Loss: 0.2549
Epoch 24/40, Loss: 0.2293
Epoch 25/40, Loss: 0.2179
Epoch 26/40, Loss: 0.2061
Epoch 27/40, Loss: 0.2030
Epoch 28/40, Loss: 0.1809
Epoch 29/40, Loss: 0.1929
Epoch 30/40, Loss: 0.1724
Epoch 31/40, Loss: 0.1696
Epoch 32/40, Loss: 0.1559
Epoch 33/40, Loss: 0.1475
Epoch 34/40, Loss: 0.1468
Epoch 35/40, Loss: 0.1337
Epoch 36/40, Loss: 0.1274
Epoch 37/40, L

In [17]:
save_file(base_learner_1.state_dict(), 'base_learner_1.safetensors')

### Model Evaluation for base learner (1)

In [18]:
result = evaluate_model(base_learner_1, testloader, model_name="base_learner_1")

Accuracy on test set: 73.73%


In [19]:
result

{'name': 'base_learner_1',
 'accuracy': 73.73,
 'epochs': 40,
 'learning_rate': 0.001}

In [20]:
results.append(result)

## Model 2: CNN with second subset of training *data*

`criterion = nn.CrossEntropyLoss()` initializes cross-entropy loss as the criterion, a commonly used loss function for multi-class classification tasks. Cross-entropy loss measures the difference between the model’s predicted class probabilities and the actual class labels, with a higher penalty for incorrect predictions. This guides the model toward learning accurate class probabilities by minimizing this loss during training.

`optimizer = optim.Adam(model.parameters(), lr=learning_rate)` sets up the Adam optimizer, which will adjust the model’s parameters at each training step. Here, `model.parameters()` specifies the parameters to be updated, and `lr=learning_rate` defines the learning rate, controlling the size of the updates applied to the model’s parameters. The optimizer and loss function work together to enable effective backpropagation and parameter adjustment, essential for improving the model's performance as training progresses.

In [21]:
base_learner_2 = SimpleCNN().to(device) # Enable model to run on CUDA
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(base_learner_2.parameters(), lr=learning_rate)

In [22]:
loaded_state_dict = load_file('base_learner_2.safetensors')

In [23]:
if  loaded_state_dict.keys() == base_learner_2.state_dict().keys() and LOAD_MODELS:
    print("Matching model architecture found. Loading weights...")
    base_learner_2.load_state_dict(loaded_state_dict)
else:
    print("Model architecture does not match. Retraining the model...")
    train_model(base_learner_2, trainloader_2, criterion, optimizer)

Model architecture does not match. Retraining the model...
Epoch 1/40, Loss: 1.6530
Epoch 2/40, Loss: 1.3240
Epoch 3/40, Loss: 1.1737
Epoch 4/40, Loss: 1.0651
Epoch 5/40, Loss: 0.9797
Epoch 6/40, Loss: 0.9087
Epoch 7/40, Loss: 0.8484
Epoch 8/40, Loss: 0.7847
Epoch 9/40, Loss: 0.7286
Epoch 10/40, Loss: 0.6879
Epoch 11/40, Loss: 0.6409
Epoch 12/40, Loss: 0.5964
Epoch 13/40, Loss: 0.5355
Epoch 14/40, Loss: 0.4970
Epoch 15/40, Loss: 0.4591
Epoch 16/40, Loss: 0.4282
Epoch 17/40, Loss: 0.3881
Epoch 18/40, Loss: 0.3476
Epoch 19/40, Loss: 0.3320
Epoch 20/40, Loss: 0.3114
Epoch 21/40, Loss: 0.2882
Epoch 22/40, Loss: 0.2682
Epoch 23/40, Loss: 0.2465
Epoch 24/40, Loss: 0.2315
Epoch 25/40, Loss: 0.2198
Epoch 26/40, Loss: 0.2088
Epoch 27/40, Loss: 0.1872
Epoch 28/40, Loss: 0.1664
Epoch 29/40, Loss: 0.1642
Epoch 30/40, Loss: 0.1646
Epoch 31/40, Loss: 0.1673
Epoch 32/40, Loss: 0.1661
Epoch 33/40, Loss: 0.1535
Epoch 34/40, Loss: 0.1427
Epoch 35/40, Loss: 0.1366
Epoch 36/40, Loss: 0.1247
Epoch 37/40, L

In [24]:
save_file(base_learner_2.state_dict(), 'base_learner_2.safetensors')

### Model Evaluation for base learner (2)


In [25]:
result = evaluate_model(base_learner_2, testloader, model_name="learner_2")

Accuracy on test set: 73.66%


In [26]:
result

{'name': 'learner_2', 'accuracy': 73.66, 'epochs': 40, 'learning_rate': 0.001}

In [27]:
results.append(result)

## Model 3: CNN with third subset of training *data*

`criterion = nn.CrossEntropyLoss()` initializes cross-entropy loss as the criterion, a commonly used loss function for multi-class classification tasks. Cross-entropy loss measures the difference between the model’s predicted class probabilities and the actual class labels, with a higher penalty for incorrect predictions. This guides the model toward learning accurate class probabilities by minimizing this loss during training.

`optimizer = optim.Adam(model.parameters(), lr=learning_rate)` sets up the Adam optimizer, which will adjust the model’s parameters at each training step. Here, `model.parameters()` specifies the parameters to be updated, and `lr=learning_rate` defines the learning rate, controlling the size of the updates applied to the model’s parameters. The optimizer and loss function work together to enable effective backpropagation and parameter adjustment, essential for improving the model's performance as training progresses.

In [28]:
base_learner_3 = SimpleCNN().to(device) # Enable model to run on CUDA
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(base_learner_3.parameters(), lr=learning_rate)

In [29]:
loaded_state_dict = load_file('base_learner_3.safetensors')

In [30]:
if loaded_state_dict.keys() == base_learner_3.state_dict().keys() and LOAD_MODELS:
    print("Matching model architecture found. Loading weights...")
    base_learner_3.load_state_dict(loaded_state_dict)
else:
    print("Model architecture does not match. Retraining the model...")
    # Retrain your model (assuming you have a function `train_model`)
    train_model(base_learner_3, trainloader_3, criterion, optimizer)

Model architecture does not match. Retraining the model...
Epoch 1/40, Loss: 1.6559
Epoch 2/40, Loss: 1.3236
Epoch 3/40, Loss: 1.1627
Epoch 4/40, Loss: 1.0519
Epoch 5/40, Loss: 0.9715
Epoch 6/40, Loss: 0.8871
Epoch 7/40, Loss: 0.8324
Epoch 8/40, Loss: 0.7647
Epoch 9/40, Loss: 0.7019
Epoch 10/40, Loss: 0.6585
Epoch 11/40, Loss: 0.5980
Epoch 12/40, Loss: 0.5583
Epoch 13/40, Loss: 0.5168
Epoch 14/40, Loss: 0.4738
Epoch 15/40, Loss: 0.4458
Epoch 16/40, Loss: 0.4034
Epoch 17/40, Loss: 0.3654
Epoch 18/40, Loss: 0.3356
Epoch 19/40, Loss: 0.3167
Epoch 20/40, Loss: 0.2939
Epoch 21/40, Loss: 0.2694
Epoch 22/40, Loss: 0.2384
Epoch 23/40, Loss: 0.2367
Epoch 24/40, Loss: 0.2122
Epoch 25/40, Loss: 0.2011
Epoch 26/40, Loss: 0.1905
Epoch 27/40, Loss: 0.1906
Epoch 28/40, Loss: 0.1602
Epoch 29/40, Loss: 0.1633
Epoch 30/40, Loss: 0.1574
Epoch 31/40, Loss: 0.1513
Epoch 32/40, Loss: 0.1348
Epoch 33/40, Loss: 0.1259
Epoch 34/40, Loss: 0.1416
Epoch 35/40, Loss: 0.1302
Epoch 36/40, Loss: 0.1207
Epoch 37/40, L

In [31]:
save_file(base_learner_3.state_dict(), 'base_learner_3.safetensors')

### Model Evaluation for base learner (3)

In [32]:
result = evaluate_model(base_learner_3, testloader, model_name="learner_3")

Accuracy on test set: 70.56%


In [33]:
result

{'name': 'learner_3', 'accuracy': 70.56, 'epochs': 40, 'learning_rate': 0.001}

In [34]:
results.append(result)

## Results before ensembling

In [35]:
results

[{'name': 'base_learner_1',
  'accuracy': 73.73,
  'epochs': 40,
  'learning_rate': 0.001},
 {'name': 'learner_2',
  'accuracy': 73.66,
  'epochs': 40,
  'learning_rate': 0.001},
 {'name': 'learner_3',
  'accuracy': 70.56,
  'epochs': 40,
  'learning_rate': 0.001}]

## Ensemble Predictions


In [36]:
def evaluate_ensemble(models, testloader, model_names=None):
    print("Evaluating ensemble...")

    # Set models to evaluation mode
    for model in models:
        model.eval()

    correct = 0
    total = 0
    all_preds = []

    with torch.no_grad():
        for inputs, labels in testloader:
            inputs, labels = inputs.to(device), labels.to(device)  # Move data to GPU

            # Get predictions from all models
            model_preds = []
            for model in models:
                outputs = model(inputs)
                _, predicted = torch.max(outputs, 1)
                model_preds.append(predicted.cpu().numpy())

            # Convert model_preds to a numpy array (models x samples)
            model_preds = np.array(model_preds)

            # Majority voting (take the class with the most votes for each sample)
            final_preds = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=0, arr=model_preds)

            all_preds.extend(final_preds)
            total += labels.size(0)
            correct += (final_preds == labels.cpu().numpy()).sum()

    accuracy = 100 * correct / total
    print(f"Accuracy on test set: {accuracy:.2f}%")
    return dict(name=f"ensemble:[{', '.join(model_names)}]", accuracy=accuracy, epochs=num_epochs, learning_rate=learning_rate)


In [37]:
models = [base_learner_1, base_learner_2, base_learner_3]
model_names = ["BL1", "BL2", "BL3"]

In [38]:
result = evaluate_ensemble(models, testloader, model_names)

Evaluating ensemble...
Accuracy on test set: 76.30%


In [39]:
results.append(result)

In [49]:
pd.DataFrame(results).sort_values(by="accuracy", ascending=False)

Unnamed: 0,name,accuracy,epochs,learning_rate
3,"ensemble:[BL1, BL2, BL3]",76.3,40,0.001
0,base_learner_1,73.73,40,0.001
1,learner_2,73.66,40,0.001
2,learner_3,70.56,40,0.001
