<a href="https://colab.research.google.com/github/anurag-saraswat/-Competitive-Coding/blob/master/Attention/Attention_CIFAR10_iid.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Cosine similarity is used for finding out the attention matrix from embedding vectors from each client.

Embedding vector is the average activation value(of last epoch) of one of the hidden linear layers against each input image.



In [1]:
import torch
from torchvision.datasets import MNIST,CIFAR10
from torch.utils.data import random_split, DataLoader
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import torchvision.transforms as tt

import random
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [2]:
stats = ((0.5074,0.4867,0.4411),(0.2011,0.1987,0.2025))
train_transform = tt.Compose([
    tt.RandomHorizontalFlip(),
    tt.RandomCrop(32,padding=4,padding_mode="reflect"),
    tt.ToTensor(),
    tt.Normalize(*stats)
])

test_transform = tt.Compose([
    tt.ToTensor(),
    tt.Normalize(*stats)
])
train_dataset = CIFAR10('../data/', train=True, download=True, transform=train_transform)
test_dataset = CIFAR10('../data/', train=False, download=True, transform=test_transform)

train_dataset, val_dataset = random_split(train_dataset, [int(len(train_dataset) * 0.80), int(len(train_dataset) * 0.2)])

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ../data/cifar-10-python.tar.gz


HBox(children=(FloatProgress(value=0.0, max=170498071.0), HTML(value='')))


Extracting ../data/cifar-10-python.tar.gz to ../data/
Files already downloaded and verified


In [3]:
total_train_size = len(train_dataset)
total_test_size = len(test_dataset)
total_val_size = len(val_dataset)

classes = 10
input_dim = 1024

num_clients = 5
rounds = 10
batch_size = 128
epochs_per_client = 20 
learning_rate = 2e-2

In [4]:
total_train_size, total_test_size, total_val_size

(40000, 10000, 10000)

In [5]:
random_epochs = [[random.randint(1,epochs_per_client) for j in range(num_clients)] for i in range(rounds)]

In [6]:
def get_device():
    return torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

def to_device(data, device):
    if isinstance(data, (list, tuple)):
        return [to_device(x, device) for x in data]
    return data.to(device, non_blocking=True)

class DeviceDataLoader(DataLoader):
        def __init__(self, dl, device):
            self.dl = dl
            self.device = device

        def __iter__(self):
            for batch in self.dl:
                yield to_device(batch, self.device)

        def __len__(self):
            return len(self.dl)

device = get_device()

In [7]:
class FederatedNet(torch.nn.Module):    
    def __init__(self):
        super().__init__()
        self.conv1 = torch.nn.Conv2d(3, 6, 3)     # 32x32x3 ---> 30x30x6
        self.conv2 = torch.nn.Conv2d(6, 8, 3)     # 30x30x6 ---> 28X28X8
        self.maxpool = torch.nn.MaxPool2d(2, 2)   # 28x28x8 ---> 14x14x8
        self.flatten = torch.nn.Flatten()         # 14x14x8 ---> 1568
        self.linear1 = torch.nn.Linear(1568, 128) # 1568    ---> 128
        self.linear2 = torch.nn.Linear(128, 10)   # 128    ---> 10
        self.non_linearity = torch.nn.functional.relu
        self.track_layers = {'conv1': self.conv1, 'conv2': self.conv2, 'linear1': self.linear1 , 'linear2': self.linear2}
    
    def forward(self, x_batch):
        out = self.conv1(x_batch)
        out = self.non_linearity(out)
        out = self.conv2(out)
        out = self.non_linearity(out)
        out = self.maxpool(out)
        out = self.flatten(out)
        out = self.linear1(out)
        attention_layer = out
        out = self.linear2(out)
        return out , attention_layer
    
    def get_track_layers(self):
        return self.track_layers
    
    def apply_parameters(self, parameters_dict):
        with torch.no_grad():
            for layer_name in parameters_dict:
                self.track_layers[layer_name].weight.data *= 0
                self.track_layers[layer_name].bias.data *= 0
                self.track_layers[layer_name].weight.data += parameters_dict[layer_name]['weight']
                self.track_layers[layer_name].bias.data += parameters_dict[layer_name]['bias']
    
    def get_parameters(self):
        parameters_dict = dict()
        for layer_name in self.track_layers:
            parameters_dict[layer_name] = {
                'weight': self.track_layers[layer_name].weight.data, 
                'bias': self.track_layers[layer_name].bias.data
            }
        return parameters_dict
    
    def batch_accuracy(self, outputs, labels):
        with torch.no_grad():
            _, predictions = torch.max(outputs, dim=1)
            return torch.tensor(torch.sum(predictions == labels).item() / len(predictions))
    
    def _process_batch(self, batch):
        images, labels = batch
        outputs,attention_layer = self(images)
        loss = torch.nn.functional.cross_entropy(outputs, labels)
        accuracy = self.batch_accuracy(outputs, labels)
        return (loss, accuracy,attention_layer)
    
    def fit(self, dataset, epochs, lr, batch_size=128, opt=torch.optim.SGD):
        dataloader = DeviceDataLoader(DataLoader(dataset, batch_size, shuffle=True), device)
        optimizer = opt(self.parameters(), lr)
        history = []
        attention_weights = torch.zeros((128)).cuda()
        for epoch in range(epochs):
            losses = []
            accs = []
            count=0
            for batch in dataloader:
                loss, acc ,attention_layer= self._process_batch(batch)
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                loss.detach()
                losses.append(loss)
                accs.append(acc)
                if(epoch == epochs-1): 
                    count+=1
                    attention_weights = torch.add(attention_weights , (torch.mean(attention_layer,dim = 0)))
            avg_loss = torch.stack(losses).mean().item()
            avg_acc = torch.stack(accs).mean().item()
            history.append((avg_loss, avg_acc))
        attention_weights = attention_weights/count
        return history,attention_weights 
    
    def evaluate(self, dataset, batch_size=128):
        dataloader = DeviceDataLoader(DataLoader(dataset, batch_size), device)
        losses = []
        accs = []
        with torch.no_grad():
            for batch in dataloader:
                loss, acc,_= self._process_batch(batch)
                losses.append(loss)
                accs.append(acc)
        avg_loss = torch.stack(losses).mean().item()
        avg_acc = torch.stack(accs).mean().item()
        return (avg_loss, avg_acc)
    
    def findAttention(self,attention_weights_dictionary ):
        feature_matrix = np.array([attention_weights_dictionary[key].cpu().detach().numpy() for key in attention_weights_dictionary])
        similarities = cosine_similarity(feature_matrix)
        return similarities
        
        

In [8]:
class Client:
    def __init__(self, client_id, dataset):
        self.client_id = client_id
        self.dataset = dataset
    
    def get_dataset_size(self):
        return len(self.dataset)
    
    def get_client_id(self):
        return self.client_id
    
    def train(self, parameters_dict,epochs):
        net = to_device(FederatedNet(), device)
        net.apply_parameters(parameters_dict)
        train_history,attention_weights = net.fit(self.dataset, epochs, learning_rate, batch_size)
        print('{}: Loss = {}, Accuracy = {} , Epochs = {}'.format(self.client_id, round(train_history[-1][0], 4), round(train_history[-1][1], 4),epoch))
        return net.get_parameters(),attention_weights
    
    

In [9]:
examples_per_client = total_train_size // num_clients
client_datasets = random_split(train_dataset, [min(i + examples_per_client, 
           total_train_size) - i for i in range(0, total_train_size, examples_per_client)])
clients = [Client('client_' + str(i), client_datasets[i]) for i in range(num_clients)]

In [None]:
global_net = to_device(FederatedNet(), device)
history = []

for i in range(rounds):
    print('Start Round {} ...'.format(i + 1))
    curr_parameters = global_net.get_parameters()
    new_parameters = dict([(layer_name, {'weight': 0, 'bias': 0}) for layer_name in curr_parameters])
    attention_weights_dictionary = {}
    
    j = 0
    
    for client in clients:
        epoch = random_epochs[i][j]
        j+=1
        client_parameters,client_attention_weights = client.train(curr_parameters,epoch)
        attention_weights_dictionary[client.get_client_id()] = client_attention_weights
        
        fraction = client.get_dataset_size() / total_train_size
        for layer_name in client_parameters:
            new_parameters[layer_name]['weight'] += fraction * client_parameters[layer_name]['weight']
            new_parameters[layer_name]['bias'] += fraction * client_parameters[layer_name]['bias']

    
    global_net.apply_parameters(new_parameters)
    attention_matrix = global_net.findAttention(attention_weights_dictionary )
    
    
    train_loss, train_acc = global_net.evaluate(train_dataset)
    val_loss, val_acc = global_net.evaluate(val_dataset)
    print('After round {}, train_loss = {}, val_loss = {}, val_acc = {}\n'.format(i + 1, round(train_loss, 4), 
            round(val_loss, 4), round(val_acc, 4)))
    print("Attention Matrix :")
    print(attention_matrix)
    history.append((train_loss, val_loss))
    
    print("---------------------------------------------------------------------------")
    
test_loss, test_acc = global_net.evaluate(test_dataset)
print('End of Training. Test Loss = {} , Test Accuracy = {}\n'.format(round(test_loss, 4), round(test_acc, 4)))

Start Round 1 ...
client_0: Loss = 1.8769, Accuracy = 0.3276 , Epochs = 7
client_1: Loss = 1.9518, Accuracy = 0.311 , Epochs = 6
client_2: Loss = 1.8061, Accuracy = 0.3581 , Epochs = 10
client_3: Loss = 1.6572, Accuracy = 0.4035 , Epochs = 15
client_4: Loss = 1.8029, Accuracy = 0.3583 , Epochs = 10
After round 1, train_loss = 1.7983, val_loss = 1.8152, val_acc = 0.3572

Attention Matrix :
[[1.0000001  0.95781446 0.98666483 0.9178045  0.9819339 ]
 [0.95781446 1.         0.9452074  0.8325512  0.9328257 ]
 [0.98666483 0.9452074  0.99999994 0.9186138  0.9918625 ]
 [0.9178045  0.8325512  0.9186138  1.         0.9276874 ]
 [0.9819339  0.9328257  0.9918625  0.9276874  0.99999994]]
---------------------------------------------------------------------------
Start Round 2 ...
client_0: Loss = 1.5319, Accuracy = 0.4433 , Epochs = 14
client_1: Loss = 1.5386, Accuracy = 0.4478 , Epochs = 14
client_2: Loss = 1.6684, Accuracy = 0.3973 , Epochs = 5
client_3: Loss = 1.7853, Accuracy = 0.3712 , Epochs =