We are going to do some ML on encrypted data. We will train ML model using Pytorch on the TOX21 dataset from kaggle on plain data and doing the test on encrypted data

In [35]:
import tenseal as ts
import torch
import torch as th
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import numpy as np

In [8]:
x_train = th.load("data/train_X.pt")
x_test = th.load("data/test_X.pt")
y_train = th.load("data/train_y.pt")
y_test = th.load("data/test_y.pt")

In [73]:
training_dataset = TensorDataset(x_train,y_train)
train_dataloader = DataLoader(training_dataset,batch_size = 64)
testing_dataset = TensorDataset(x_test,y_test)
test_dataloader= DataLoader(testing_dataset,batch_size =1) 



# Training dataset
train_dataset = TensorDataset(x_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=64)
# Test dataset
test_dataset = TensorDataset(x_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=1)

for the activation fonction we are choosing the square fonction for simplicity

In [79]:
class Model(nn.Module): 
    def __init__(self):
        super(Model, self).__init__()
        self.ln1 = nn.Linear(1024, 128)
        self.ln2 = nn.Linear(128, 12)
        
    def forward(self, x):
        out = self.ln1(x)
        out = out * out
        out = self.ln2(out)
        return out

In [152]:
def loss_batch(model,loss_fn,x_batch,y_batch ,opt=None,metric=None): 
    preds = model(x_batch)
    loss  = loss_fn(preds,y_batch)
    
    if opt is not None: 
        loss.backward()
        opt.step()
        opt.zero_grad()
        
    metric_result = None 
    if metric is not None: 
        metric_result = metric(preds,y_batch)
    return loss.item(), len(x_batch), metric_result

In [158]:
def accuracy(outputs,lables):
    outputs = th.sigmoid(outputs)
    outputs = (outputs >= 0.5).int()
    return torch.sum(outputs==lables).item() / len(outputs)

In [159]:
def fit(epochs , model,device , loss_fn , opt , train_dl , metric = None): 
    for epoch in range(epochs):
        losses,metrics = [],[]
        for x_batch  , y_batch in train_dl:
            x_batch , y_batch = x_batch.to(device), y_batch.to(device) 
            #training the model 
            loss ,_ ,metric_result = loss_batch(model,loss_fn,x_batch,y_batch , opt,metric=accuracy)
            losses.append(loss)
            metrics.append(metric_result)
        val_metric = th.mean(th.tensor(metrics))
        if metric is None: 
            print(f"Epoch : {epoch+1}/{epochs} , Loss : {th.mean(th.tensor(losses))}")
        else  :
            print(f"Epoch : {epoch+1}/{epochs} , Loss : {th.mean(th.tensor(losses))} , {metric.__name__} : {val_metric} ")

In [168]:
model = Model()
lr = 0.01
loss_fn = nn.BCEWithLogitsLoss()
opt = torch.optim.SGD(params = model.parameters(),lr = lr)
device = th.device("cuda" if th.cuda.is_available() else "cpu")
fit(30,model,device,loss_fn,opt,train_dataloader)

Epoch : 1/30 , Loss : 0.6855852007865906
Epoch : 2/30 , Loss : 0.6699648499488831
Epoch : 3/30 , Loss : 0.6549543142318726
Epoch : 4/30 , Loss : 0.6405144333839417
Epoch : 5/30 , Loss : 0.6266041398048401
Epoch : 6/30 , Loss : 0.6131782531738281
Epoch : 7/30 , Loss : 0.6001833081245422
Epoch : 8/30 , Loss : 0.5875508785247803
Epoch : 9/30 , Loss : 0.5751843452453613
Epoch : 10/30 , Loss : 0.5629319548606873
Epoch : 11/30 , Loss : 0.5505279898643494
Epoch : 12/30 , Loss : 0.537453293800354
Epoch : 13/30 , Loss : 0.5225729942321777
Epoch : 14/30 , Loss : 0.503153920173645
Epoch : 15/30 , Loss : 0.4725356996059418
Epoch : 16/30 , Loss : 0.4198954701423645
Epoch : 17/30 , Loss : 0.35275372862815857
Epoch : 18/30 , Loss : 0.30621689558029175
Epoch : 19/30 , Loss : 0.28526604175567627
Epoch : 20/30 , Loss : 0.2755386531352997
Epoch : 21/30 , Loss : 0.2697027921676636
Epoch : 22/30 , Loss : 0.2654082477092743
Epoch : 23/30 , Loss : 0.26187074184417725
Epoch : 24/30 , Loss : 0.2587743997573852

In [170]:
def compute_labels(out):
    out = th.sigmoid(out)
    return (out >= 0.5).int()


# compute accuracy using hamming loss
def accuracy(output, target):
    # convert to labels
    out = compute_labels(output)
    # flatten and compute hamming loss
    flat_out = out.flatten()
    flat_target = target.flatten()
    incorrect = th.logical_xor(flat_out, flat_target).sum().item()
    hamming_loss = incorrect / len(flat_out)
    return 1 - hamming_loss


print("Accuracy on test set: {:.2f}".format(accuracy(model(x_test), y_test)))

Accuracy on test set: 0.93


Now we define a PyTorch-like model, but which uses TenSEAL operations. During initialization, we fetch and store weights from PyTorch layers. The forward method will then use the stored weights to perform linear layers.

In [193]:
class HEModel:
    def __init__(self, ln1, ln2):
        self.ln1_weight = ln1.weight.t().tolist()
        self.ln1_bias = ln1.bias.tolist()
        self.ln2_weight = ln2.weight.t().tolist()
        self.ln2_bias = ln2.bias.tolist()
        
    def forward(self, encrypted_vec):
         #as the linear layer is : y = A*x + B where A is the weight and B is the bias so we are going to do this with the encrypted vector
        encrypted_vec = encrypted_vec.mm(self.ln1_weight) + self.ln1_bias #the .mm() method is used for multuply matrix
        encrypted_vec *= encrypted_vec # here is the activation fonction x*x
        encrypted_vec = encrypted_vec.mm(self.ln2_weight) + self.ln2_bias
        return encrypted_vec
    
    def __call__(self, x):
        return self.forward(x)

Now we are going to choose the parameters and for that we saw previousely some intuition : 
    
    in our model we have 3 multiplication , 1 for the activation fonction and 2 for the matmul (matrix multiplication) 
         operation so we need 3 bit scale at the middle of our coeff_mod_bit_size
    the choice of the bit_scale can impact the precision of the franctionnal part, we must try diffrent values (so diffrent
        precisions) 
    The last coefficient modulus should be higher than the bits_scale and the difference between them (5 here) impacts the precision of the integer part, but since we are deeling weth small numbers so 5 is enough
    After choosing those parameters, we need to find the appropriate polynomial modulus degree to use, to guarantee 128-bits security. We can start with small power of two (4096) and go higher, till TenSEAL doesn't throw an error. It is also important to choose the polynomial modulus degree that allows us to put all our elements into a ciphertext.
    Here, we need to put 1024 values, and anything above 2048 should make it, but only 8192 (and above) meet the security requirement.
    

In [194]:
bit_scale = 30
coeff_mod_bit_sizes = [50, bit_scale,bit_scale,bit_scale,50]
poly_modulus_degree = 8192
# for 8192 we can create a cipher text contains a vector of 8192//2 values, since we have juste 1024 so it much enogh
context = ts.context(ts.SCHEME_TYPE.CKKS,poly_modulus_degree, coeff_mod_bit_sizes=coeff_mod_bit_sizes)
context.global_scale =  2 ** bit_scale

# Generate galois keys required for matmul in ckks_vector
context.generate_galois_keys()

he_model = HEModel(model.ln1, model.ln2)

Now we are doing the evaluation and for that we are going to encrypt the vector before passing it to the model to predict.
after that we decrypt the results and compare it to the plain results

if this evaluation is for 2 parties so the encrypted vector must be sent to the remote evalutation and the results are sent back for decryption

In [None]:
from time import time
matches= 0
he_outs= []
start = time()
for data , target in test_dataloader: 
    #flattened the data before processing
    flat_vec = data.flatten()
    #encryption of the vector
    enc_vec = ts.ckks_vector(context,flat_vec)
    #encryption eval 
    enc_out = he_model(enc_vec)
    #decryption 
    # if there is 2 parties for evaluation so we stop here and re-send the enc_outs to the data owner for decrypting it and seyying the results 
    he_out = th.tensor(enc_out.decrypt())
    he_outs.append(he_out.tolist())
    #evalutation on plain data
    plain_out = model(data)
    # counting the matched labels
    he_labels = compute_labels(he_out)
    plain_labels = compute_labels(plain_out)
    matches += (he_labels == plain_labels).sum().item()
    
end = time()

print(f'this operation took {end-start} seconds')
    
    

In [191]:
compute_labels(th.tensor([1.131231,-3.131414,0.05,0.5]))

tensor([1, 0, 1, 1], dtype=torch.int32)

In [201]:
print(f"Accuracy on test set (encryption evalutaiton  ) : {accuracy(th.tensor(he_outs),y_test)}")
print(f"encrypted evaluation matched {(matches / 12 * len(test_dataloader))*100}")

RuntimeError: The size of tensor a (216) must match the size of tensor b (9408) at non-singleton dimension 0

In [207]:
len(he_outs)

18