In [None]:
!pip install torch "pandas<2.0.0" numpy scikit-learn tenseal torchvision torch torchmetrics
!pip install --upgrade pip
#!pip install torch pandas numpy scikit-learn tenseal torchvision

In [3]:
import pickle
import json
import numpy as np
import pandas as pd
import torch
import random
import os
import tenseal as ts
from time import time
from sklearn.model_selection import train_test_split

from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor, Lambda
#import torchmetrics
from torchmetrics.classification import Accuracy, Precision, F1Score, MulticlassAUROC
from sklearn.metrics import f1_score, recall_score

In [4]:
class EmbeddingDataset(Dataset):

    def __init__(self, embedding_fp, transform=None, target_transform=None):

        with open(embedding_fp, "rb") as fIn:
            stored_data = pickle.load(fIn)
            stored_labels = stored_data['labels']
            stored_embeddings = stored_data['embeddings']
        self.X = stored_embeddings 
        self.X = torch.tensor(self.X).float().unsqueeze(0)
            
        self.y = stored_labels.to_numpy()
        self.y = torch.LongTensor(self.y).unsqueeze(-2) 

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        
        X = self.X[idx]

        y = self.y[idx] 

        return X, y


In [5]:
#train_data = EmbeddingDataset('embeddings_l6_train15.pkl')
train_data = EmbeddingDataset('train_15rfe.pkl')

In [6]:
#test_data = EmbeddingDataset('embeddings_l6_test15.pkl')
test_data = EmbeddingDataset('test_15rfe.pkl')

In [7]:
X_train, y_train = train_data[0]

In [11]:
X_train.shape

torch.Size([115102, 384])

In [12]:
y_train.shape

torch.Size([115102])

In [8]:
X_test, y_test = test_data[0]

In [13]:
X_test.shape

torch.Size([25267, 384])

In [38]:
y_test.shape

torch.Size([25267])

#### Plain Model

In [46]:
# base model 
class LR(torch.nn.Module):

    def __init__(self, n_features):
        super(LR, self).__init__()        
        self.lr = torch.nn.Linear(n_features, 16)
        
    def forward(self, x):
        print(x.shape)
        out = torch.softmax(self.lr(x),dim = 1,dtype=None)
        return out

In [48]:
# parameters
n_features = 384
n_classes = 16
epochs = 15

# a model instance
model = LR(n_features)

# use gradient descent with a learning_rate=0.01
optim = torch.optim.SGD(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()
device = 'cpu'

In [49]:
model

LR(
  (lr): Linear(in_features=384, out_features=16, bias=True)
)

In [11]:
# metrics
accuracy_fn = Accuracy(task="multiclass",  num_classes=16)
precision_fn = Precision(task="multiclass", average='macro', num_classes=36)
f1_fn = F1Score(task="multiclass", average='macro', num_classes=16)
auroc_fn = MulticlassAUROC(average="macro", num_classes=16)

##### Testing accuracy of the plain model

In [32]:
def multi_acc(model, x, y):
    out = model(x)
    _, y_pred_tags = torch.max(out, dim = 1)    
#    print(_,y_pred_tags)
    correct_pred = (y_pred_tags == y).float()
#    print(correct_pred)
    acc = correct_pred.sum() / len(correct_pred) 
#    print(correct_pred.sum(), len(correct_pred))
    return acc.item() # or acc for a tensor

In [85]:
# model training function
def train(model, optim, criterion, x, y, epochs=epochs):
    for e in range(1, epochs + 1):
        #
        optim.zero_grad()
        out = model(x)
        loss = criterion(out, y)
        loss.backward()
        optim.step()
 #       print(f"Loss at epoch {e}: {loss.data}")
        acc = accuracy_fn(out, y)
 #       print(out, out.shape)
  #      print(y, y.shape)
        acc_m = multi_acc(model, x, y)
        f1 = f1_fn(out, y) 
        auroc = auroc_fn(out, y)
        print(f"Epoch: {e} | Loss: {loss.data:.5f}, Acc_multi: {acc_m:.5f}% | Acc: {acc.data:.5f}% | F1-score: {f1.data:.5f}% | AUROC: {auroc.data:.5f}%", ) 
    return model

model = train(model, optim, criterion, X_train, y_train)

torch.Size([115102, 384])
torch.Size([115102, 384])
Epoch: 1 | Loss: 2.77232, Acc_multi: 0.06707% | Acc: 0.06694% | F1-score: 0.04603% | AUROC: 0.54752%
torch.Size([115102, 384])
torch.Size([115102, 384])
Epoch: 2 | Loss: 2.77231, Acc_multi: 0.06722% | Acc: 0.06707% | F1-score: 0.04605% | AUROC: 0.54755%
torch.Size([115102, 384])
torch.Size([115102, 384])
Epoch: 3 | Loss: 2.77231, Acc_multi: 0.06726% | Acc: 0.06722% | F1-score: 0.04613% | AUROC: 0.54757%
torch.Size([115102, 384])
torch.Size([115102, 384])
Epoch: 4 | Loss: 2.77231, Acc_multi: 0.06740% | Acc: 0.06726% | F1-score: 0.04614% | AUROC: 0.54760%
torch.Size([115102, 384])
torch.Size([115102, 384])
Epoch: 5 | Loss: 2.77231, Acc_multi: 0.06757% | Acc: 0.06740% | F1-score: 0.04619% | AUROC: 0.54763%
torch.Size([115102, 384])
torch.Size([115102, 384])
Epoch: 6 | Loss: 2.77231, Acc_multi: 0.06766% | Acc: 0.06757% | F1-score: 0.04627% | AUROC: 0.54765%
torch.Size([115102, 384])
torch.Size([115102, 384])
Epoch: 7 | Loss: 2.77230, Acc_

#### Encrypted Model

In [128]:
class EncryptedLR:

    GOLDSCHMIDT_CONST = 32
    GOLDSCHMIDT_ITER = 30
    
    def __init__(self, torch_lr):
        
        self.weight = torch_lr.lr.weight.T.data.tolist() #[0]
        self.bias = torch_lr.lr.bias.data.tolist()
        # accumulate gradients and count the number of iterations
        self._delta_w = 0
        self._delta_b = 0
        self._count = 0
        self.l2 = 0.05
        
        
    def forward(self, enc_x):
        enc_out = enc_x.dot(self.weight) + self.bias
       
        enc_out = EncryptedLR.softmax(enc_out)
        return enc_out
    
    def backward(self, enc_x, enc_out, enc_y):
        out_minus_y = (enc_out - enc_y)
        self._delta_w += enc_x * out_minus_y
        self._delta_b += out_minus_y
        self._count += 1
        
    def update_parameters(self):
        if self._count == 0:
            raise RuntimeError("You should at least run one forward iteration")
        # update weights
        # use a small regularization term to keep the output
        # of the linear layer in the range of the sigmoid approximation
        self.weight -= self._delta_w * (1 / self._count) + self.weight * self.l2
        self.bias -= self._delta_b * (1 / self._count)
        # reset gradient accumulators and iterations count
        self._delta_w = 0
        self._delta_b = 0
        self._count = 0
    
    @staticmethod
    def loss_precision(x, d = 30):
        return x * (1 + (np.random.rand(1)[0] - 0.5) * 2 * 2**(-d))
    
    @staticmethod
    def dist(x, y):
        return 1 - np.sum(x == y) / (len(x) * 2. - np.sum(x == y))
    
    @staticmethod
    def our_exp(x):
        y = (np.array(x) + 16) / 80
        for i in range(4):
            y = y * y
            y = EncryptedLR.loss_precision(y)
        return y
    
    @staticmethod
    def goldschmidt(x, M, n):
        base = 1 - np.sum(x) / M
        z = 1. / M
        for zeta in range(n):
            z *= 1 + base
            z = EncryptedLR.loss_precision(z)
            base *= base
            base = EncryptedLR.loss_precision(base)
        return z
    
    @staticmethod
    def softmax(self, enc_x):

        val_out = enc_x.T.dot(self.weight) + self.bias
        val_expo = np.array([EncryptedLR.our_exp(x) for x in val_out])
        val_pred = np.array([x * EncryptedLR.goldschmidt(x, M=1/EncryptedLR.GOLDSCHMIDT_CONST, n=EncryptedLR.GOLDSCHMIDT_ITER) for x in val_expo])

        return val_pred
    
    def plain_accuracy(self, X_test, y_test):
        # evaluate accuracy of the model on
        # the plain (x_test, y_test) dataset
        w = torch.tensor(self.weight)
        b = torch.tensor(self.bias)

        out = torch.softmax(X_test.matmul(w) + b, dim=1)
        #(X, self.W) + self.b)
#       print(out)
        _, y_pred_tags = torch.max(out, dim = 1) 
        correct_pred = (y_pred_tags == y_test).float()
        acc = correct_pred.sum() / len(correct_pred)
        return acc

   
    def encrypt(self, context):
        self.weight = ts.ckks_vector(context, self.weight)
        self.bias = ts.ckks_vector(context, self.bias)
        
    def decrypt(self):
        self.weight = self.weight.decrypt()
        self.bias = self.bias.decrypt()
        
    def __call__(self, *args, **kwargs):
        return self.forward(*args, **kwargs)



In [125]:
# an instance of the encrypted model
eelr = EncryptedLR(model)

In [90]:
eelr.plain_accuracy(X_test, y_test)


tensor(0.0693)

##### Encrypting the tensors into ckksvectors

In [95]:
# TenSEAL context - parameters 1
poly_mod_degree = 4096
coeff_mod_bit_sizes = [40, 20, 40]
# create TenSEALContext
ctx_eval = ts.context(ts.SCHEME_TYPE.CKKS, poly_mod_degree, -1, coeff_mod_bit_sizes)
# scale of ciphertext to use
ctx_eval.global_scale = 2 ** 20
# this key is needed for doing dot-product operations
ctx_eval.generate_galois_keys()

In [None]:
# parameters 2
poly_mod_degree = 8192
coeff_mod_bit_sizes = [40, 21, 21, 21, 21, 21, 21, 40]
# create TenSEALContext
ctx_train = ts.context(ts.SCHEME_TYPE.CKKS, poly_mod_degree, -1, coeff_mod_bit_sizes)
ctx_train.global_scale = 2 ** 21
ctx_train.generate_galois_keys()

In [97]:
y_test_ = y_test.float().unsqueeze(-1)
y_test_.shape

torch.Size([25267, 1])

In [98]:
y_train_ = y_train.float().unsqueeze(-1)
y_train_.shape

torch.Size([115102, 1])

In [99]:
# train data
t_start = time()
enc_x_train = [ts.ckks_vector(ctx_eval, x.tolist()) for x in X_train]
enc_y_train = [ts.ckks_vector(ctx_eval, y.tolist()) for y in y_train_]
t_end = time()
print(f"Encryption of the training_set took {int(t_end - t_start)} seconds")

IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out


Encryption of the training_set took 460 seconds


In [100]:
# test data
t_start = time()
enc_x_test = [ts.ckks_vector(ctx_eval, x.tolist()) for x in X_test]
enc_y_test = [ts.ckks_vector(ctx_eval, y.tolist()) for y in y_test_]
t_end = time()
print(f"Encryption of the test-set took {int(t_end - t_start)} seconds")

Encryption of the test-set took 773 seconds


#### Evaluation

In [34]:
# Defining some metric functions

def accuracy(model, x, y):
    out = model(x)
    correct = torch.abs(y - out) < 0.5
    return correct.float().mean()

#alternative accuracy
def multi_acc(model, x, y):
    out = model(x)
    _, y_pred_tags = torch.max(out, dim = 1)    
    
    correct_pred = (y_pred_tags == y).float()
    acc = correct_pred.sum() / len(correct_pred)
    
    return acc

def recall(model, x, y):
    out = model(x)
    predicted = (out > 0.5).float()
    recall = recall_score(y_true=y.numpy(), y_pred=predicted.detach().numpy())
    return recall

def compute_f1_score(model, x, y):
    out = model(x)
    predicted = (out > 0.5).float()
    f1 = f1_score(y_true=y.numpy(), y_pred=predicted.detach().numpy())
    return f1


##### Accuracy evaluation - plain data vs encrypted data

In [73]:
def encrypted_evaluation(model, enc_x_test, y_test):
    t_start = time()
    
    correct = 0
    for enc_x, y in zip(enc_x_test, y_test):
        # encrypted evaluation
        enc_out = model(enc_x)
        # plain comparison
        out = enc_out.decrypt()
        out = torch.tensor(out)
        out = torch.softmax(out)
        if correct_pred = (y_pred_tags == y_test).float():
            correct_pred += 1
    
    t_end = time()
    print(f"Evaluated test_set of {len(x_test)} entries in {int(t_end - t_start)} seconds")
    print(f"Accuracy: {correct}/{len(x_test)} = {correct / len(x_test)}")
    return correct / len(x_test)


##### With Dataloaders

In [77]:
dataloader_train_X = DataLoader(train_data, batch_size=8, shuffle=True)
dataloader_test_X = DataLoader(test_data, batch_size=8, shuffle=True)

In [None]:
times = []
epochs =10
device = 'cpu'

for epoch in range(epochs):

    for batch, (X, y) in enumerate(dataloader_train_X):
        X, y = X.to(device), y.to(device)
        

#        enc_x_train = EncryptedLR.encrypt(X_train, context =ctx_eval)
#        enc_y_train = EncryptedLR.encrypt(y_train, context =ctx_eval)

    
        
        eelr = EncryptedLR(model0)
        t_start = time()
        for enc_x, enc_y in zip(enc_x_train, enc_y_train):
            enc_out = eelr.forward(enc_x)
            eelr.backward(enc_x, enc_out, enc_y)
        eelr.update_parameters()
        t_end = time()
        times.append(t_end - t_start)
        
        eelr.decrypt()
#        accuracy = eelr.plain_accuracy(x_test, y_test)
#        print(f"Accuracy at epoch #{epoch + 1} is {accuracy}")

#print(f"\nAverage time per epoch: {int(sum(times) / len(times))} seconds")
#print(f"Final accuracy is {accuracy}"