In [3]:
!pip install syft




In [0]:
import torch
import numpy as np
from torchvision import datasets, transforms
import torchvision.datasets as datasets
from torch.utils.data import Subset
from torch import nn
import torch.nn.functional as F
from torch import optim
#from syft.frameworks.torch.differential_privacy import pate
import helper
from syft.frameworks.torch.dp import pate

# Switch between cpu and gpu depending on which is available for training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [1]:




%tensorflow_version 2.x

TensorFlow 2.x selected.


In [0]:
# Application of transforms to normalize the mnist data
transform = transforms.Compose([transforms.ToTensor(),
                                transforms.Normalize((0.5,), (0.5,))])
mnist_trainset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
mnist_testset = datasets.MNIST(root='./data', train=False, download=True, transform=transform)

In [0]:
# Method to divide the mnist trainingset among the number of teachers to simulate unique datasets 
def private_data_loaders(trainset, teachers):
  num_part = len(trainset) // teachers
  
  priv_loaders = []
  for i in range(teachers):
    indices = list(range(i * num_part, (i + 1)*num_part)) 
#     if (i == teachers - 1):
#       indices = list(range(i * num_part, len(trainset)))
    sub_pd = Subset(trainset, indices)
    temp_loader = torch.utils.data.DataLoader(sub_pd, batch_size=64, shuffle=True)
    priv_loaders.append(temp_loader)
  return priv_loaders
  

In [0]:
# Method for seperating the mnist test dataset into 2. The first one being the public database and the other one the private database
def test_database_seperator(testset):
  i1 = int(len(testset) * 0.9)
  i2 = int(len(testset) * 0.1)
  
  ind1 = list(range(0, i1))
  ind2 = list(range(i1, len(testset)))
  
  pdb = Subset(testset, ind1)
  
  db = Subset(testset, ind2)
  
  pdb_loader = torch.utils.data.DataLoader(pdb, batch_size=64, shuffle=False)
  
  db_loader = torch.utils.data.DataLoader(db, batch_size=64, shuffle=True)
  return pdb_loader, db_loader

In [0]:

# Method for creating and training a model
def create_train_model(classifier, loader, lr = 0.12, epoch = 100):
  print("Running on ", device)
  model = classifier()
  optimizer = optim.SGD(model.parameters(), lr)
  
  criterion = nn.NLLLoss()
  
  model.to(device)
  for i in range(epoch):
    cum_loss  = 0
    cum_perc = 0
    for imgs, labels in loader:
      imgs, labels = imgs.to(device), labels.to(device)
      optimizer.zero_grad()
      output = model.forward(imgs)
      loss = criterion(output, labels)
      loss.backward()
      cum_loss +=  loss.item()
      optimizer.step()
    for imgs, labels in loader:
      imgs, labels = imgs.to(device), labels.to(device)
      with torch.no_grad():
        ps =  torch.exp(model.forward(imgs))
      top_p, top_class = ps.topk(1, dim = 1)
      prob = top_class == labels.view(*top_class.shape)
      prob = prob.float()
      cum_perc += prob.mean().float()
    if (i == epoch -1):
      print("The loss for {0} epoch is {1}".format(i, cum_loss / len(loader)))
      print("The percentage for {0} epoch is {1}".format(i, cum_perc / len(loader)))  
  return model

In [0]:
# Method for running the unlabelled database through the teacher models in order to get their respective predictions for each items.

def evaluate(models, loader):
  m_labels = []
  for model in models:
    model_class = []
    for imgs, labels in loader:
      imgs = imgs.to(device)
      with torch.no_grad():
        ps =  torch.exp(model.forward(imgs))
      top_p, top_class = ps.topk(1, dim = 1)
      
      model_class.append(np.array(top_class.cpu()).T)
    m_label = np.hstack(model_class)
    m_labels.append(m_label)
  return m_labels

In [0]:
# Method for creating and training the teacher models
def train_teacher_models(loaders, lr = 0.12, epoch = 10):
  teacher_models = []
  for loader in loaders:
    model = create_train_model(classifier, loader, lr, epoch)
    teacher_models.append(model)
  return teacher_models

In [0]:
#Method for applying Global differential privacy to the labels predicted by the teacher models and to perform PATE analysis.
def return_new_indices(preds, epsilon):
  preds = preds.T
  ind = []
  beta = 1 / epsilon
  for pred in preds:
    label_count = np.bincount(pred, minlength = 10)
    for i in range(len(label_count)):
      label_count[i] += np.random.laplace(0, beta, 1)
    new_labels = np.argmax(label_count)
    ind.append(new_labels)

  ind = np.array(ind)
  return ind


def pate_analysis(pred, ind, epsilon):
  dde, die = pate.perform_analysis(teacher_preds = pred, indices = ind, noise_eps = epsilon, delta = 1e-5 )
  print("Data dependent epsilon ", dde)
  print("Data Independent epsilon ", die)

In [0]:
#Method to create a new dataloader with the new target labels and the public database
def join_label_image(dataloader, ind):
  img_list = []
  for img,label in dataloader:
    img_list.append(img)

  images = np.vstack(img_list)

  model_zip = list(zip(images, ind))
  modelloader = torch.utils.data.DataLoader(model_zip, shuffle=True, batch_size=64)
  return modelloader

In [0]:
#Method o analyze the private database with the trained model
def analyze_privatedata(model, loader):
  print("Running on ", device)
  model.to(device)
  cum_perc = 0
  for imgs, labels in loader:
    imgs, labels = imgs.to(device), labels.to(device)
    with torch.no_grad():
      ps =  torch.exp(model.forward(imgs))
    top_p, top_class = ps.topk(1, dim = 1)
    prob = top_class == labels.view(*top_class.shape)
    prob = prob.float()
    cum_perc += prob.mean().float()
  print("The accuracy of the differentially private model on the private dataset is {0}%".format((cum_perc / len(loader)) * 100))

In [0]:

#Classifier for creating the models
class classifier(nn.Module):
  def __init__(self):
    super().__init__() 
    self.fc1 = nn.Linear(784, 256)
    self.fc2 = nn.Linear(256, 128)
    self.fc3 = nn.Linear(128, 64)
    self.fc4 = nn.Linear(64, 32)
    self.fc5 = nn.Linear(32, 10)
    
    self.dropout = nn.Dropout(p = 0.2)
    
  def forward(self, x):
    x = x.view(x.shape[0], -1)
    x = self.dropout(F.relu(self.fc1(x)))
    x = self.dropout(F.relu(self.fc2(x)))
    x = self.dropout(F.relu(self.fc3(x)))
    x = self.dropout(F.relu(self.fc4(x)))
    x = F.log_softmax(self.fc5(x), dim = 1)   
    return x

In [0]:
teachers = 100
epsilon = 0.1


In [0]:
pdb, db = test_database_seperator(mnist_testset)
teachers_loaders = private_data_loaders(mnist_trainset, teachers)

In [17]:
teacher_models = train_teacher_models(teachers_loaders, lr = 0.12, epoch = 40)

Running on  cuda
The loss for 39 epoch is 0.4208499014377594
The percentage for 39 epoch is 0.8619791865348816
Running on  cuda
The loss for 39 epoch is 0.47125611305236814
The percentage for 39 epoch is 0.8421875238418579
Running on  cuda
The loss for 39 epoch is 0.4659587472677231
The percentage for 39 epoch is 0.8791667222976685
Running on  cuda
The loss for 39 epoch is 0.30157948434352877
The percentage for 39 epoch is 0.7807291746139526
Running on  cuda
The loss for 39 epoch is 0.731645143032074
The percentage for 39 epoch is 0.8536458015441895
Running on  cuda
The loss for 39 epoch is 0.3601694226264954
The percentage for 39 epoch is 0.7677083611488342
Running on  cuda
The loss for 39 epoch is 0.4831513434648514
The percentage for 39 epoch is 0.7786458730697632
Running on  cuda
The loss for 39 epoch is 0.668919250369072
The percentage for 39 epoch is 0.8062500357627869
Running on  cuda
The loss for 39 epoch is 0.5323271751403809
The percentage for 39 epoch is 0.8166667222976685
R

In [0]:
teachers_pred = np.array(evaluate(teacher_models, pdb))
teachers_pred = teachers_pred.reshape(teachers, -1)

In [19]:

indices = return_new_indices(teachers_pred, epsilon)
pate_analysis(teachers_pred, indices, epsilon)

Data dependent epsilon  176.48738214594042
Data Independent epsilon  371.5129254649703


In [20]:
labelledloader = join_label_image(pdb, indices)
main_model = create_train_model(classifier, labelledloader, lr = 0.06, epoch = 30)

Running on  cuda
The loss for 29 epoch is 0.31846795146558304
The percentage for 29 epoch is 0.9162677526473999


In [21]:
analyze_privatedata(main_model, db)

Running on  cuda
The accuracy of the differentially private model on the private dataset is 87.32421875%
