In [12]:
!pip install tenseal

Collecting tenseal
[?25l  Downloading https://files.pythonhosted.org/packages/88/2c/af53768083d6395f92f831f10b43fae9e3818bb838478d909bb065c3b6c2/tenseal-0.3.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.8MB)
[K     |████████████████████████████████| 4.9MB 31.8MB/s 
[?25hInstalling collected packages: tenseal
Successfully installed tenseal-0.3.4


In [13]:
import torch
import tenseal as ts
import pandas as pd
import random
from time import time

# those are optional and are not necessary for training
import numpy as np
import matplotlib.pyplot as plt

In [None]:
import pandas as pd

data = pd.read_csv("framingham.csv")
# drop rows with missing values
data = data.dropna()
# drop some features
data = data.drop(columns=["education", "currentSmoker", "BPMeds", "diabetes", "diaBP", "BMI"])
# balance data
grouped = data.groupby('TenYearCHD')
data = grouped.apply(lambda x: x.sample(grouped.size().min(), random_state=73).reset_index(drop=True))
display(data)

# extract labels
y = data["TenYearCHD"].values
display(y)
print(y.shape)

data = data.drop("TenYearCHD", 'columns')
# standardize data
data = (data - data.mean()) / data.std()
x = data.values
print(x.shape)
print(type(y))
#return split_train_test(x, y)

In [14]:
torch.random.manual_seed(73)
random.seed(73)


def split_train_test(x, y, test_ratio=0.3):
    idxs = [i for i in range(len(x))]
    random.shuffle(idxs)
    # delimiter between test and train data
    delim = int(len(x) * test_ratio)
    test_idxs, train_idxs = idxs[:delim], idxs[delim:]
    return x[train_idxs], y[train_idxs], x[test_idxs], y[test_idxs]


def heart_disease_data():
    data = pd.read_csv("framingham.csv")
    # drop rows with missing values
    data = data.dropna()
    # drop some features
    data = data.drop(columns=["education", "currentSmoker", "BPMeds", "diabetes", "diaBP", "BMI"])
    # balance data
    grouped = data.groupby('TenYearCHD')
    data = grouped.apply(lambda x: x.sample(grouped.size().min(), random_state=73).reset_index(drop=True))
    # extract labels
    y = torch.tensor(data["TenYearCHD"].values).float().unsqueeze(1)
    data = data.drop("TenYearCHD", 'columns')
    # standardize data
    data = (data - data.mean()) / data.std()
    x = torch.tensor(data.values).float()
    return split_train_test(x, y)

def random_data(m=1024, n=2):
    # data separable by the line `y = x`
    x_train = torch.randn(m, n)
    x_test = torch.randn(m // 2, n)
    y_train = (x_train[:, 0] >= x_train[:, 1]).float().unsqueeze(0).t()
    y_test = (x_test[:, 0] >= x_test[:, 1]).float().unsqueeze(0).t()
    return x_train, y_train, x_test, y_test


In [15]:
# You can use whatever data you want without modification to the tutorial
# x_train, y_train, x_test, y_test = random_data()
x_train, y_train, x_test, y_test = heart_disease_data()
#x_train, y_train, x_test, y_test = credit_card_data()

print("############# Data summary #############")
print(f"x_train has shape: {x_train.shape}")
print(f"y_train has shape: {y_train.shape}")
print(f"x_test has shape: {x_test.shape}")
print(f"y_test has shape: {y_test.shape}")
print("#######################################")

############# Data summary #############
x_train has shape: torch.Size([780, 9])
y_train has shape: torch.Size([780, 1])
x_test has shape: torch.Size([334, 9])
y_test has shape: torch.Size([334, 1])
#######################################


In [16]:
#Check how many positive/negative classes in test set.

#print(y_test)
print(torch.count_nonzero(y_test))

tensor(166)


In [4]:
def credit_card_data():
  # load train and test set into numpy arrays
  df = pd.read_csv('creditcard.csv')
  #print(df.shape)
  feature_names = df.iloc[:, 1:30].columns
  target = df.iloc[:1, 30:].columns

  data_features = df[feature_names]
  data_target = df[target]

  #print(feature_names)
  #print(target)

  x = torch.tensor(data_features.values).float()
  print(x)
  y = torch.tensor(data_target.values).float()
  print(y)
  return split_train_test(x, y)


#Training a Logistic Regression Model

In [17]:
class LR(torch.nn.Module):

    def __init__(self, n_features):
        super(LR, self).__init__()
        self.lr = torch.nn.Linear(n_features, 1)
        
    def forward(self, x):
        out = torch.sigmoid(self.lr(x))
        return out

In [18]:
n_features = x_train.shape[1]
model = LR(n_features)
# use gradient descent with a learning_rate=1
optim = torch.optim.SGD(model.parameters(), lr=1)
# use Binary Cross Entropy Loss
criterion = torch.nn.BCELoss()

In [19]:
# define the number of epochs for both plain and encrypted training
EPOCHS = 5

def train(model, optim, criterion, x, y, epochs=EPOCHS):
    for e in range(1, epochs + 1):
        optim.zero_grad()
        out = model(x)
        loss = criterion(out, y)
        loss.backward()
        optim.step()
        print(f"Loss at epoch {e}: {loss.data}")
    return model

model = train(model, optim, criterion, x_train, y_train)

Loss at epoch 1: 0.8504331707954407
Loss at epoch 2: 0.6863385438919067
Loss at epoch 3: 0.6358115077018738
Loss at epoch 4: 0.6193529367446899
Loss at epoch 5: 0.6124349236488342


In [20]:
def accuracy(model, x, y):
    out = model(x)
    correct = torch.abs(y - out) < 0.5
    return correct.float().mean()

plain_accuracy = accuracy(model, x_test, y_test)
print(f"Accuracy on plain test_set: {plain_accuracy}")

Accuracy on plain test_set: 0.703592836856842


#Encrypted Evaluation

In [None]:
#building test dataset.

# load train and test set into numpy arrays
df = pd.read_csv('creditcard_test.csv')
print(df.shape)
feature_names = df.iloc[:, 1:30].columns
target = df.iloc[:1, 30:].columns

data_features = df[feature_names]
data_target = df[target]

print(feature_names)
print(target)

x_test = torch.tensor(data_features.values).float()
#print(x)
y_test = torch.tensor(data_target.values).float()
print(y_test)


In [21]:
class EncryptedLR:
    
    def __init__(self, torch_lr):
        # TenSEAL processes lists and not torch tensors
        # so we take out parameters from the PyTorch model
        self.weight = torch_lr.lr.weight.data.tolist()[0]
        self.bias = torch_lr.lr.bias.data.tolist()
        
    def forward(self, enc_x):
        # We don't need to perform sigmoid as this model
        # will only be used for evaluation, and the label
        # can be deduced without applying sigmoid
        enc_out = enc_x.dot(self.weight) + self.bias
        return enc_out
    
    def __call__(self, *args, **kwargs):
        return self.forward(*args, **kwargs)
        
    ################################################
    ## You can use the functions below to perform ##
    ## the evaluation with an encrypted model     ##
    ################################################
    
    def encrypt(self, context):
        self.weight = ts.ckks_vector(context, self.weight)
        self.bias = ts.ckks_vector(context, self.bias)
        
    def decrypt(self, context):
        self.weight = self.weight.decrypt()
        self.bias = self.bias.decrypt()
        

eelr = EncryptedLR(model)

In [22]:
# encryption parameters
poly_mod_degree = 4096
coeff_mod_bit_sizes = [40, 20, 40]
# create TenSEALContext
ctx_eval = ts.context(ts.SCHEME_TYPE.CKKS, poly_mod_degree, -1, coeff_mod_bit_sizes)
# scale of ciphertext to use
ctx_eval.global_scale = 2 ** 20
# this key is needed for doing dot-product operations
ctx_eval.generate_galois_keys()

In [23]:
t_start = time()
enc_x_test = [ts.ckks_vector(ctx_eval, x.tolist()) for x in x_test]
t_end = time()
print(f"Encryption of the test-set took {int(t_end - t_start)} seconds")

print(enc_x_test)

Encryption of the test-set took 0 seconds
[<tenseal.tensors.ckksvector.CKKSVector object at 0x7f3a18838f50>, <tenseal.tensors.ckksvector.CKKSVector object at 0x7f3a1887a990>, <tenseal.tensors.ckksvector.CKKSVector object at 0x7f3a1887a750>, <tenseal.tensors.ckksvector.CKKSVector object at 0x7f3a1887a710>, <tenseal.tensors.ckksvector.CKKSVector object at 0x7f3a1957f110>, <tenseal.tensors.ckksvector.CKKSVector object at 0x7f3a18881050>, <tenseal.tensors.ckksvector.CKKSVector object at 0x7f3a18881090>, <tenseal.tensors.ckksvector.CKKSVector object at 0x7f3a188813d0>, <tenseal.tensors.ckksvector.CKKSVector object at 0x7f3a18881690>, <tenseal.tensors.ckksvector.CKKSVector object at 0x7f3a18881cd0>, <tenseal.tensors.ckksvector.CKKSVector object at 0x7f3a18881590>, <tenseal.tensors.ckksvector.CKKSVector object at 0x7f3a188815d0>, <tenseal.tensors.ckksvector.CKKSVector object at 0x7f3a18881f50>, <tenseal.tensors.ckksvector.CKKSVector object at 0x7f3a18881210>, <tenseal.tensors.ckksvector.CKKSV

In [25]:
def encrypted_evaluation(model, enc_x_test, y_test):
    t_start = time()    
    correct = 0
    for enc_x, y in zip(enc_x_test, y_test):
        # encrypted evaluation
        enc_out = model(enc_x)
        # plain comparaison
        out = enc_out.decrypt()
        out = torch.tensor(out)
        out = torch.sigmoid(out)
        #print(out,y)
        if torch.abs(out - y) < 0.5:
            correct += 1
    
    t_end = time()
    print(f"Evaluated test_set of {len(x_test)} entries in {int(t_end - t_start)} seconds")
    print(f"Accuracy: {correct}/{len(x_test)} = {correct / len(x_test)}")
    return correct / len(x_test)


In [26]:
encrypted_accuracy = encrypted_evaluation(eelr, enc_x_test, y_test)

Evaluated test_set of 334 entries in 1 seconds
Accuracy: 210/334 = 0.6287425149700598


In [27]:
diff_accuracy = plain_accuracy - encrypted_accuracy
print(f"Difference between plain and encrypted accuracies: {diff_accuracy}")
if diff_accuracy < 0:
    print("Oh! We got a better accuracy on the encrypted test-set! The noise was on our side...")

Difference between plain and encrypted accuracies: 0.07485032081604004
