In [1]:
# only run once
#!conda create -n resnet_fl_2host python=3 anaconda -y

In [3]:
# activate conda environment to access pysyft
!source /usr/local/anaconda3/bin/activate resnet_fl_2host

In [18]:
!pip install natsort

Collecting natsort
  Downloading natsort-7.1.0-py3-none-any.whl (35 kB)
Installing collected packages: natsort
Successfully installed natsort-7.1.0


In [10]:
import sys
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import syft as sy
import numpy as np
from torch.optim import lr_scheduler
import torchvision
from torchvision import datasets, models, transforms
import time
import os
import copy
from PIL import Image
import glob 

In [5]:
hook = sy.TorchHook(torch)

In [32]:
class Arguments():
    def __init__(self):
        self.batch_size = 4
        self.test_batch_size = 100
        self.epochs = 5
        self.lr = 0.01
        self.momentum = 0.5
        self.no_cuda = False
        self.seed = 1
        self.log_interval = 30
        self.save_model = True

args = Arguments()

use_cuda = not args.no_cuda and torch.cuda.is_available()

torch.manual_seed(args.seed)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

torch.set_num_threads(4)

In [12]:
import pandas as pd

df = pd.read_excel ('../multilabels/LandUse_Multilabeled.xlsx')
df_label = np.array(df)

In [13]:
class_names =  np.array(["airplane","bare-soil","buildings","cars","chaparral","court","dock","field","grass","mobile-home","pavement","sand","sea","ship","tanks","trees","water"])

In [14]:
largestxor = 0 
largestij = (0,0)

for i in range(1,17):
    for j in range(i+1,18):
        #colnand = np.sum(np.logical_not(np.logical_and(df_label[:,i], df_label[:,j])))
        colxor = np.sum(np.logical_xor(df_label[:,i].astype(bool) , df_label[:,j].astype(bool) )) -  np.sum(np.logical_and(df_label[:,i], df_label[:,j]))
        #print(i,j, colxor, colnand)
        if colxor >= largestxor and np.sum(df_label[:,i]) >=700 and np.sum(df_label[:,j])>= 700 :
            largestxor = colxor
            largestij = (i,j)
print(largestxor,class_names[largestij[0]-1], class_names[largestij[1]-1], largestij)

674 bare-soil cars (2, 4)


In [15]:
def uncor_selecter(nr_label = 4,min_img = 300):
    """retrun a list with the least correlated labels """
    image_perlabel = np.sum(df_label[:,1:],axis= 0)
    biggest_label =np.where(np.any([image_perlabel > min_img],axis=0))[0]
    #print(biggest_label, image_perlabel[biggest_label])

    selected_list = [] 
    allcor_lost = np.array([0,0,0])
    for i in range(0,len(biggest_label)-1):
        it = biggest_label[i]
        for j in range(i+1,len(biggest_label)):
            jt = biggest_label[j]

            colxor = np.sum(np.logical_xor(df_label[:,it].astype(bool) , df_label[:,jt].astype(bool) )) -  np.sum(np.logical_and(df_label[:,it], df_label[:,jt]))
            allcor_lost = np.vstack((allcor_lost, np.array([colxor,it,jt]))) 
    sorted_list = allcor_lost[allcor_lost[:,0].argsort()]
    selected_list.append(sorted_list[-1,1])
    selected_list.append(sorted_list[-1,2])
    #print(sorted_list, selected_list)        

    while len(selected_list)<nr_label:
        biggest_label = np.setdiff1d(biggest_label,np.array(selected_list))
        largestxor = 0 
        largestind = 0
        for i in biggest_label:
            overall_xor = 0 
            for j in (selected_list):
                overall_xor += np.sum(np.logical_xor(df_label[:,i].astype(bool) , df_label[:,j].astype(bool) )) -  np.sum(np.logical_and(df_label[:,i], df_label[:,j]))

            if overall_xor >= largestxor:
                largestxor = overall_xor
                largestind = i

        selected_list.append(largestind)
    
    return selected_list

In [16]:
import random

def sampler_split_for_client(cdata, idxs, nr_client=4, minimum_skew_percentage = .4):
    selected_labels = uncor_selecter(nr_client,300)
    
    splitlists = []
    for sb in selected_labels:
        splitlists.append([])
        
    
    for i in idxs:
        nplabel = cdata.__getlabel__(i)
        #nplabel = label.numpy()
        
        if np.any(nplabel[selected_labels] == 1):
            if random.random() < minimum_skew_percentage:
                
                flip = np.random.randint(np.sum(nplabel[selected_labels] == 1)) 
                mask = np.where(nplabel[selected_labels] == 1)[0][flip]
                splitlists[mask].append(i)
            
            else:
                flip = np.random.randint(nr_client) 
                splitlists[flip].append(i)
                    
        else:
            flip = np.random.randint(nr_client) 
            splitlists[flip].append(i)

    
    for alist in splitlists:
        print(len(alist))
    return splitlists
    

In [19]:
from torch.utils.data import Dataset
from natsort import natsorted

class CustomDataSet(Dataset):
    def __init__(self, main_dir, transform, labelmat):
        self.main_dir = main_dir
        self.transforms = transform
        self.all_imgs = glob.glob(os.path.join(main_dir, '**/*.tif'), recursive=True)
        self.total_imgs = natsorted(self.all_imgs)
        self.xlabels = labelmat
        
    def __len__(self):
        return len(self.total_imgs)

    def __getitem__(self, idx):
        #print(idx,len(self.total_imgs))
        img_loc = self.total_imgs[idx]
        #print(img_loc)
        imagebaselabel = os.path.splitext(os.path.basename(img_loc))[0]
        label = self.xlabels[np.where(self.xlabels[:,0] == imagebaselabel),1:].reshape(17).astype(np.int64)
        #print(label,label.shape)
        tensor_label =  torch.from_numpy(label)
        image = Image.open(img_loc).convert("RGB")
        tensor_image = self.transforms(image)
        return tensor_image, tensor_label
    
    def __getlabel__(self, idx):
        
        img_loc = self.total_imgs[idx]
        #print(img_loc)
        imagebaselabel = os.path.splitext(os.path.basename(img_loc))[0]
        label = self.xlabels[np.where(self.xlabels[:,0] == imagebaselabel),1:].reshape(17).astype(np.int64)
        
        return label

In [38]:
data_dir = "../../datasets/UCMerced_LandUse/Images"

def load_split_train_test(datadir, labelmat, valid_size=.2, num_clients=4):
    train_transforms = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    test_transforms = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    train_data = CustomDataSet(datadir, transform=train_transforms, labelmat=labelmat)
    test_data = CustomDataSet(datadir, transform=train_transforms, labelmat=labelmat)

    indices = list(range(2100))
    split = int(np.floor(valid_size * 2100))
    np.random.shuffle(indices)
    from torch.utils.data.sampler import SubsetRandomSampler
    train_idx, test_idx = indices[split:], indices[:split]
    
    lists = sampler_split_for_client(train_data, train_idx, num_clients, .4)
    
    dataloaders = []
    for client_sampler in lists:
        train_loader = torch.utils.data.DataLoader(
            train_data,
            batch_size=args.batch_size,
            sampler=torch.utils.data.Sampler(client_sampler),
            **kwargs
        )
        dataloaders.append( {'data': train_loader, 'size': len(client_sampler)} )
    
    return dataloaders

In [58]:
clients_listo = load_split_train_test(data_dir, df_label, .2, 5)
print(clients_listo)

315
303
332
355
375
[{'data': <torch.utils.data.dataloader.DataLoader object at 0x7f96cb1e6ed0>, 'size': 315}, {'data': <torch.utils.data.dataloader.DataLoader object at 0x7f96cb1e6ad0>, 'size': 303}, {'data': <torch.utils.data.dataloader.DataLoader object at 0x7f96cb1e62d0>, 'size': 332}, {'data': <torch.utils.data.dataloader.DataLoader object at 0x7f96cb043250>, 'size': 355}, {'data': <torch.utils.data.dataloader.DataLoader object at 0x7f96cb043210>, 'size': 375}]


In [72]:
def train_client_model(args, model, device, client_dataloader, optimizer, criterion, scheduler, local_epochs):
    
    # train
    for epoch in range(local_epochs):
        
        running_loss_train, running_loss_val = 0, 0
        running_corrects_train, running_corrects_val = 0, 0
        
        # set model to training mode
        model.train()

        for data, target in client_dataloader['data']:
            
            data = data.to(device)
            target = target.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()
            
            # forward
            with torch.set_grad_enabled(True):
                outputs = model(inputs)
                _, preds = torch.max(outputs, 1)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
            
            running_loss_train += loss.item() * args.batch_size
            running_corrects_train += torch.sum(preds == labels.data)
            
        epoch_loss_train = running_loss_train / client_dataloader['size']
        epoch_acc_train = running_corrects_train.double() /  client_dataloader['size']
            
    return model

In [62]:
LOCAL_EPOCHS = 5
C_FRACTION = 0.7

#clients is array of dataloaders
def train_fedavg_model(args, model, device, clients, optimizer, criterion, scheduler, c_fraction, epochs=10):
    # iterate through epochs
    for i in range(epochs):
        # get random subset of clients
        fraction = int( c_fraction * float(len(clients_t)) )
        client_subset = random.sample(clients, fraction)
        
        # train each of the clients
        models = []
        print("Running epoch numero " + str(i))
        for client in client_subset:
            client_model = train_client_model(args, model, device, client, optimizer, criterion, scheduler, LOCAL_EPOCHS)
            models.append(client_model)
            print("Done with clientelo numero whateva")
            
        # average clients params
        # model = sum(k for 1 - num_clients): ( data_client / total_num_data ) * model_client_k
    
    return models[0]    

In [51]:
class LENET(nn.Module):
    def __init__(self, n_classes):
        super(LENET, self).__init__()
        from collections import OrderedDict
        self.conv1 = nn.Conv2d(3, 16, kernel_size=(5, 5))
        self.conv2 = nn.Conv2d(16, 32, kernel_size=(5, 5))
        self.conv3 = nn.Conv2d(32, 64, kernel_size=(5, 5))
        #self.conv4 = nn.Conv2d(64, 128, kernel_size=(5, 5))
        self.linear1 = nn.Linear(64 * 24 * 24, 120)
        self.linear2 = nn.Linear(120, 84)
        self.linear3 = nn.Linear(84, n_classes)                                
    def forward(self, x):
        """
        Args:
          x of shape (batch_size, 1, 28, 28): Input images.
        
        Returns:
          y of shape (batch_size, 10): Outputs of the network.
        """
        x = F.max_pool2d(F.relu(self.conv1(x)), kernel_size=2, stride=2)
        x = F.max_pool2d(F.relu(self.conv2(x)), kernel_size=2, stride=2)
        x = F.max_pool2d(F.relu(self.conv3(x)), kernel_size=2, stride=2)
        #x = F.max_pool2d(F.relu(self.conv4(x)), kernel_size=2, stride=2)
        x = x.view(-1, 64 * 24 * 24)
        x = F.relu(self.linear1(x))
        x = F.relu(self.linear2(x))
        x = self.linear3(x)
        return x

In [8]:
from torchvision import models
from torch.optim import lr_scheduler
import time
import os
import copy

In [53]:
model = LENET(len(class_names))
model = model.to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer_ft = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

In [73]:
model = train_fedavg_model(args, model, device, clients_listo, optimizer_ft, criterion, exp_lr_scheduler, C_FRACTION, 3)

Running epoch numero 0


NotImplementedError: 