In [1]:
# Code cell intended for imports and global settings.

# Imports
import os
import time
import torch
import torch.nn as nn
import torch.optim as optim
from SupportClasses import ModelSupport as ms, EnvironmentSetup as env

# Define main data directory
DATA_DIR = 'tiny-imagenet-200' # Original images come in shapes of [3,64,64]

# Define training and validation data paths
TRAIN_DIR = os.path.join(DATA_DIR, 'train')
VALID_DIR = os.path.join(DATA_DIR, 'val')

# Global values
NUMBER_OF_NODES = 3     # We only support values between 2 and 4 at present.
# Enforce the supported range stated above up front, so a bad edit of this
# constant is reported here rather than somewhere deeper in the notebook.
if not 2 <= NUMBER_OF_NODES <= 4:
    raise ValueError(f"NUMBER_OF_NODES must be between 2 and 4, got {NUMBER_OF_NODES}")
NUMBER_OF_EPOCHS = 5
NUMBER_OF_CLASSES = NUMBER_OF_NODES     # Generally we will have the same number of classes as nodes.
PATH = 'baseline-model.pt'              # Checkpoint file for the baseline (non-federated) model.

# Reproducibility: seed PyTorch's global RNG so that Restart & Run All gives
# comparable baseline/federated results between runs.
# NOTE(review): helpers in SupportClasses may draw from other RNGs
# (e.g. random / NumPy); those are not seeded here -- confirm and seed as needed.
SEED = 42
torch.manual_seed(SEED)

# Model specific values
criterion = nn.CrossEntropyLoss()

In [2]:
# Create separate validation subfolders for the validation images based on
# their labels indicated in the val_annotations txt file
val_img_dir = os.path.join(VALID_DIR, 'images')

# Build a dictionary mapping each image filename (field 0) to its label
# (field 1) from the tab-separated annotations file. The `with` block closes
# the file even if parsing raises, and blank or incomplete lines are skipped
# instead of raising IndexError.
val_img_dict = {}
with open(os.path.join(VALID_DIR, 'val_annotations.txt'), 'r') as fp:
    for line in fp:
        words = line.rstrip('\n').split('\t')
        if len(words) < 2 or not words[0] or not words[1]:
            continue
        val_img_dict[words[0]] = words[1]

# Show the first 10 entries of val_img_dict as a sanity check. This is printed
# explicitly because a bare expression in the middle of a cell is discarded.
print({k: val_img_dict[k] for k in list(val_img_dict)[:10]})

# Create subfolders (if not present) for validation images based on label,
# and move images into the respective folders. Images that were already moved
# by an earlier run are no longer at the source path and are left alone, so
# the cell can be re-run safely.
for img, folder in val_img_dict.items():
    newpath = os.path.join(val_img_dir, folder)
    os.makedirs(newpath, exist_ok=True)
    src = os.path.join(val_img_dir, img)
    if os.path.exists(src):
        os.rename(src, os.path.join(newpath, img))

# Image pre-processing pipeline for models pre-trained on ImageNet.
# Normalization depends on the model:
#   - not pre-trained: use 0.5, 0.5, 0.5 for both mean and SD
#   - pre-trained on ImageNet: use the per-channel mean/std constants below
# NOTE(review): `T` is not imported in any visible cell -- presumably
# `torchvision.transforms`; confirm and add it to the imports cell.
# NOTE(review): this pipeline contains a random flip and is also the transform
# handed to the validation loader further down; confirm that is intended.
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

pretrain_steps = [
    T.Resize(256),              # Upscale to 256 (int size presumably applies to the shorter edge -- confirm)
    T.CenterCrop(224),          # Keep the central 224 crop
    T.RandomHorizontalFlip(),   # Random left-right flip
    T.ToTensor(),               # Convert the cropped image to a tensor
    T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
]
preprocess_transform_pretrain = T.Compose(pretrain_steps)

Files already downloaded and verified
Files already downloaded and verified
The classes in the training and testing set are ['airplane', 'automobile', 'bird']


TypeError: list indices must be integers or slices, not list

In [None]:
# Set up the data loaders and the per-node unbalanced training sets.

# Pick the compute device: the GPU through CUDA when PyTorch reports one,
# otherwise the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")

# One data distribution per node. This implicitly assumes the original
# dataset is well balanced.
data_distribution_list = env.data_distribution(NUMBER_OF_NODES)

# Batch size shared by the DataLoaders below
batch_size = 64

# DataLoaders for pre-trained models; both use the same transform, CUDA flag
# and batch size, so the shared keyword arguments are collected once.
pretrain_loader_options = dict(
    transform=preprocess_transform_pretrain,
    use_cuda=use_cuda,
    batch_size=batch_size,
)
train_loader_pretrain = env.generate_dataloader(TRAIN_DIR, "train", **pretrain_loader_options)
val_loader_pretrain = env.generate_dataloader(val_img_dir, "val", **pretrain_loader_options)

# Build one unbalanced training set per node distribution.
# NOTE(review): `train_set` and `classes` are not defined in any visible cell;
# on a fresh kernel this loop depends on state created elsewhere -- confirm.
unbalanced_training_sets = []
for data_dist in data_distribution_list:
    node_training_set = env.unbalance_training_set(
        train_set=train_set,
        classes=classes,
        data_distribution=data_dist,
    )
    unbalanced_training_sets.append(node_training_set)

print("Done loading data.")

In [None]:
# This code cell is likely where you will want to do the GAN work on the given datasets.

In [3]:
# Import the data and set up the models.
# NOTE(review): `train_set`, `validation_set` and `test_set` are not defined in
# any visible cell -- confirm where they come from.

# Single loader over the dataset behind `train_set`, used by the baseline model.
global_train_loader = env.create_single_loader(train_set.dataset)

# Per-node training loaders plus the shared validation and test loaders.
training_loaders, validation_loader, test_loader = env.create_data_loaders(
    training_sets=unbalanced_training_sets,
    validation_set=validation_set,
    test_set=test_set,
)

# One FederatedModel per training loader, keyed "Federated_Model_<n>" starting
# at 1. Each one wraps its own new ConvNet; the training cell overwrites those
# weights with the global model's weights at the start of every round.
fed_models = {
    f"Federated_Model_{i+1}": ms.FederatedModel(
        node_loader,
        validation_loader,
        ms.ConvNet(NUMBER_OF_CLASSES),
    )
    for i, node_loader in enumerate(training_loaders)
}

# The baseline (non-federated) model, then the global federated model.
baseline_model = ms.ConvNet(NUMBER_OF_CLASSES)
federated_model = ms.ConvNet(NUMBER_OF_CLASSES)

# Move both models to the first CUDA device when one is available, else the CPU.
# The last call is left as the cell's final expression so the model is displayed.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
baseline_model.to(device=device)
federated_model.to(device=device)

ConvNet(
  (conv1): Conv2d(1, 16, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(16, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
  (fc1): Linear(in_features=1568, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=3, bias=True)
)

In [4]:
# Baseline: one non-federated model trained on all of the data.
optimizer = optim.Adam(baseline_model.parameters())

# Training only runs when no checkpoint file exists on disk at PATH; otherwise
# the loop is skipped entirely and only the final message is printed.
baseline_checkpoint_missing = not os.path.exists(PATH)
if baseline_checkpoint_missing:
    for epoch in range(NUMBER_OF_EPOCHS):
        # Time one full train + validate pass.
        start_time = time.time()
        train_loss, train_acc = ms.train(baseline_model, global_train_loader, optimizer, criterion, device=device)
        valid_loss, valid_acc = ms.test(baseline_model, validation_loader, criterion, device=device)
        end_time = time.time()
        epoch_mins, epoch_secs = ms.epoch_time(start_time, end_time)

        # Three report lines, emitted in one call with a newline between them.
        print(
            f'Epoch: {epoch+1:02} | Model name: Baseline Model | Epoch time (Baseline Training): {epoch_mins}m {epoch_secs}s',
            f'\tTrain Loss: {train_loss:.5f} | Train Acc: {train_acc*100:.2f}%',
            f'\t Val. Loss: {valid_loss:.5f} |  Val. Acc: {valid_acc*100:.2f}%',
            sep='\n',
        )
    # Persist the weights reached after the last epoch.
    torch.save(baseline_model.state_dict(), PATH)
print("Baseline model training complete.\n\n")

Baseline model training complete.




In [5]:
# Here we train our federated model.
# Each epoch is one federated round: every node model is reset to the current
# global weights, trained and validated on its own loaders, and then the node
# models are averaged back into the global model.
# Tracks the lowest validation loss seen so far for the averaged model.
best_valid_loss = float('inf')

for epoch in range(NUMBER_OF_EPOCHS):
    # Perform the computation steps on the individual models
    start_time = time.time()
    for key, fed_model in fed_models.items():
        # Update each model with the global model, before training again.
        fed_model.model.load_state_dict(federated_model.state_dict())
        fed_model.model.to(device=device)

        # Begin training. A new Adam optimizer is created for every node in
        # every round, so optimizer state does not carry over between rounds.
        optimizer = optim.Adam(fed_model.model.parameters())
        train_loss, train_acc = ms.train(fed_model.model, fed_model.train_loader, optimizer, criterion, device=device)
        valid_loss, valid_acc = ms.test(fed_model.model, fed_model.validation_loader, criterion, device=device)
        print(f'Epoch: {epoch+1:02} | Model name: {key}')
        print(f'\tTrain Loss: {train_loss:.5f} | Train Acc: {train_acc*100:.2f}%')
        print(f'\t Val. Loss: {valid_loss:.5f} |  Val. Acc: {valid_acc*100:.2f}%')
    end_time = time.time()
    # Get the time to perform federated learning. The timed span covers the
    # per-node training/validation above, not the averaging step below.
    epoch_mins, epoch_secs = ms.epoch_time(start_time, end_time)

    # Average the federated models and combine their weights into the main model.
    federated_model.load_state_dict(ms.federated_averaging(fed_models))
    # Validate the averaged model on the shared validation loader. This
    # overwrites valid_loss / valid_acc left over from the last node above.
    valid_loss, valid_acc = ms.test(federated_model, validation_loader, criterion, device=device)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        # This will save our best model in case we encounter a drop off during training.
        torch.save(federated_model.state_dict(), 'best-model.pt')

    print(f'Epoch: {epoch+1:02} | Model name: Federated Average | Epoch time (Federated Training): {epoch_mins}m {epoch_secs}s')
    print(f'\t Val. Loss: {valid_loss:.5f} |  Val. Acc: {valid_acc*100:.2f}%')

print("Federated Model training complete.\n\n")

Epoch: 01 | Model name: Federated_Model_1
	Train Loss: 0.03947 | Train Acc: 64.00%
	 Val. Loss: 0.00791 |  Val. Acc: 79.66%
Epoch: 01 | Model name: Federated_Model_2
	Train Loss: 0.03828 | Train Acc: 50.00%
	 Val. Loss: 0.00828 |  Val. Acc: 49.41%
Epoch: 01 | Model name: Federated_Model_3
	Train Loss: 0.03981 | Train Acc: 42.00%
	 Val. Loss: 0.00908 |  Val. Acc: 34.01%
Epoch: 01 | Model name: Federated Average | Epoch time (Federated Training): 0m 9s
	 Val. Loss: 0.00844 |  Val. Acc: 77.30%
Epoch: 02 | Model name: Federated_Model_1
	Train Loss: 0.02441 | Train Acc: 82.00%
	 Val. Loss: 0.00324 |  Val. Acc: 94.99%
Epoch: 02 | Model name: Federated_Model_2
	Train Loss: 0.02649 | Train Acc: 78.00%
	 Val. Loss: 0.00370 |  Val. Acc: 95.49%
Epoch: 02 | Model name: Federated_Model_3
	Train Loss: 0.02370 | Train Acc: 83.00%
	 Val. Loss: 0.00359 |  Val. Acc: 94.41%
Epoch: 02 | Model name: Federated Average | Epoch time (Federated Training): 0m 8s
	 Val. Loss: 0.00333 |  Val. Acc: 96.49%
Epoch: 0

In [7]:
# The main testing loop
# Load the saved weights for both models. `map_location=device` remaps the
# stored tensors onto the device in use now, so a checkpoint written on a GPU
# machine still loads on a CPU-only one.
# NOTE: torch.load unpickles the file; only load checkpoints produced by this
# notebook or another trusted source.
baseline_model.load_state_dict(torch.load(PATH, map_location=device))
federated_model.load_state_dict(torch.load('best-model.pt', map_location=device))

# Evaluate both models on the same held-out test loader.
baseline_test_loss, baseline_test_acc = ms.test(baseline_model, test_loader, criterion, device=device)
fed_avg_test_loss, fed_avg_test_acc = ms.test(federated_model, test_loader, criterion, device=device)

# Losses are reported with five decimals, like the training cells; with three
# decimals both models printed 0.000 and could not be compared.
print(f'Model name: Baseline | Test Loss: {baseline_test_loss:.5f} | Test Acc: {baseline_test_acc*100:.2f}%')
print(f'Model name: Federated Average | Test Loss: {fed_avg_test_loss:.5f} | Test Acc: {fed_avg_test_acc*100:.2f}%')

Model name: Baseline | Test Loss: 0.000 | Test Acc: 99.90%
Model name: Federated Average | Test Loss: 0.000 | Test Acc: 99.94%
