In [1]:
import time
import os
import sys
import logging
import multiprocessing

import numpy as np
import tenseal as ts
import pickle as pk

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, average_precision_score, f1_score, recall_score, precision_score, accuracy_score, classification_report
os.chdir("/home/apignet/homomorphic-encryption/ckks_titanic/")
from src.features import build_features
from models import encrypted_LR
from models import unencrypted_LR

In [2]:
%load_ext memory_profiler

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
1

1

# definition of parameters

### Log 

### Paths

In [4]:
#DATA_PATH = "/data/raw/"            # whole data set
DATA_PATH = "/data/quick_demo/"   # subset of the data set, with 15 train_samples and 5 test_samples
#DATA_PATH = "/home/apignet/homomorphic-encryption/ckks_titanic/data/quick_demo/"   # subset of the data set, with 400 train_samples and 50 test_samples
#DATA_PATH =   '/data/quick_demo/'
LOG_PATH = "reports/log"
LOG_FILENAME = "test_0716"

In [5]:
fileHandler = logging.FileHandler("{0}/{1}.log".format(LOG_PATH, LOG_FILENAME))
streamHandler = logging.StreamHandler(sys.stdout)
logging.basicConfig(format="%(asctime)s  [%(levelname)-8.8s]  %(message)s", datefmt='%m/%d/%Y %I:%M:%S %p', level = logging.INFO, handlers=[fileHandler, streamHandler])

In [6]:
os.getcwd()

'/home/apignet/homomorphic-encryption/ckks_titanic'

### Training variables

In [7]:
EPOCH = 40
LEARNING_RATE = 0.9
MOMENTUM_RATE = 0.6
REGULARIZATION_RATE = 0.5
VERBOSE = 2
SAVE_WEIGHT = 2
N_JOBS = 3

## Static functions

In [8]:
def crytp_array(X, local_context):
    """
    This function encrypt a list of vector
    
    :parameters 
    ------------
    
    :param X ; list of list, interpreted as list of vector to encrypt
    :param local_context ; TenSEAL context object used to encrypt
    
    :returns
    ------------
    
    list ; list of CKKS ciphertext  
    
    """
    res = []
    for i in range(len(X)):
        res.append(ts.ckks_vector(local_context, X[i]))
        if i == len(X) // 4:
            logging.info("25 % ...")
        elif i == len(X) // 2 :
            logging.info("50 % ...")
        elif i == 3* len(X)//4:
            logging.info("75% ...")
    return res

### Confidential functions

These functions involves security breachs (as use of unencrypted data, or decryption of weights) and cannot be coded by Alice.
However, the functions encapslulate the unsafe process, so can be performed by Alice if Bob provides them. 
Therefore, they are currently passed as parameters to Alice, which only calls them.

Currently there is a huge security breach, as confidential parameters (security key for instance), which are needed by those functions, are passed in a dictionnary to Alice. 
For a safe protocole, we have to change these functions, to set up a safe communication protocole between Bob and Alice.
Alice will therefore only send the crypted data to Bob (using these functions, in which can be set up the communication process) and Bob will locally perform the functions which are currently coded bellow. 

# Loading and processing the data

In [9]:
%%memit
logging.info(os.getcwd())
raw_train, raw_test = build_features.data_import(os.getcwd()+DATA_PATH)
train, submission_test = build_features.processing(raw_train, raw_test)
del submission_test

07/30/2020 04:28:42 PM  [INFO    ]  /home/apignet/homomorphic-encryption/ckks_titanic
07/30/2020 04:28:42 PM  [INFO    ]  loading the data into memory (pandas df)
07/30/2020 04:28:42 PM  [INFO    ]  Done
07/30/2020 04:28:42 PM  [INFO    ]  making final data set from raw data
07/30/2020 04:28:42 PM  [INFO    ]  Done
07/30/2020 04:28:42 PM  [INFO    ]  /home/apignet/homomorphic-encryption/ckks_titanic
07/30/2020 04:28:42 PM  [INFO    ]  loading the data into memory (pandas df)
07/30/2020 04:28:42 PM  [INFO    ]  Done
07/30/2020 04:28:42 PM  [INFO    ]  making final data set from raw data
07/30/2020 04:28:43 PM  [INFO    ]  Done
peak memory: 162.49 MiB, increment: 13.83 MiB


In [10]:
%%memit
train, test = train_test_split(train, test_size=0.15)
train_labels = train.Survived
test_labels = test.Survived
train_features = train.drop("Survived", axis=1)
test_features = test.drop("Survived", axis=1)

peak memory: 162.81 MiB, increment: 0.08 MiB


# Definition of safety parameters

In [11]:
%%memit
logging.info('Definition of safety parameters...')
timer = time.time()
# context = ts.context(ts.SCHEME_TYPE.CKKS, 32768,
# coeff_mod_bit_sizes=[60, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 60])
#context = ts.context(ts.SCHEME_TYPE.CKKS, 8192, coeff_mod_bit_sizes=[40, 21, 21, 21, 21, 21, 21, 40])

context = ts.context(ts.SCHEME_TYPE.CKKS, 4096, coeff_mod_bit_sizes=[40, 20, 40])
#context = ts.context(ts.SCHEME_TYPE.CKKS, 16384, coeff_mod_bit_sizes=[60, 40, 40, 40, 40, 40, 40,40, 60])
context.global_scale = pow(2, 20)
logging.info("Done. " + str(round(time.time() - timer, 2)) + " seconds")


logging.info('Generation of the secret key...')
timer = time.time()
secret_key = context.secret_key()
context.make_context_public() #drop the relin keys, the galois keys, and the secret keys. 
logging.info("Done. " + str(round(time.time() - timer, 2)) + " seconds")
logging.info('Generation of the Galois Key...')
timer = time.time()
context.generate_galois_keys(secret_key)
logging.info("Done. " + str(round(time.time() - timer, 2)) + " seconds")
logging.info('Generation of the Relin Key...')
timer = time.time()
context.generate_relin_keys(secret_key)
logging.info("Done. " + str(round(time.time() - timer, 2)) + " seconds")
if context.is_public():
    logging.info("The context is now public, the context do not hold the secret key anymore, and decrypt methods need the secret key to be provide,")



07/30/2020 04:28:44 PM  [INFO    ]  Definition of safety parameters...
07/30/2020 04:28:44 PM  [INFO    ]  Done. 0.03 seconds
07/30/2020 04:28:44 PM  [INFO    ]  Generation of the secret key...
07/30/2020 04:28:44 PM  [INFO    ]  Done. 0.0 seconds
07/30/2020 04:28:44 PM  [INFO    ]  Generation of the Galois Key...
07/30/2020 04:28:45 PM  [INFO    ]  Done. 0.11 seconds
07/30/2020 04:28:45 PM  [INFO    ]  Generation of the Relin Key...
07/30/2020 04:28:45 PM  [INFO    ]  Done. 0.01 seconds
07/30/2020 04:28:45 PM  [INFO    ]  The context is now public, the context do not hold the secret key anymore, and decrypt methods need the secret key to be provide,
07/30/2020 04:28:45 PM  [INFO    ]  Definition of safety parameters...
07/30/2020 04:28:45 PM  [INFO    ]  Done. 0.03 seconds
07/30/2020 04:28:45 PM  [INFO    ]  Generation of the secret key...
07/30/2020 04:28:45 PM  [INFO    ]  Done. 0.0 seconds
07/30/2020 04:28:45 PM  [INFO    ]  Generation of the Galois Key...
07/30/2020 04:28:45 PM  [

# Data encryption

In [12]:
%%memit
logging.info("Data encryption...")
timer = time.time()
encrypted_X = crytp_array(train_features.to_numpy().tolist(), context)
encrypted_Y = crytp_array(train_labels.to_numpy().reshape((-1, 1)).tolist(), context)
encrypted_test_X = crytp_array(test_features.to_numpy().tolist(), context)
encrypted_test_Y = crytp_array(test_labels.to_numpy().reshape((-1, 1)).tolist(), context)
logging.info("Done. " + str(round(time.time() - timer, 2)) + " seconds")

07/30/2020 04:28:45 PM  [INFO    ]  Data encryption...
07/30/2020 04:28:45 PM  [INFO    ]  25 % ...
07/30/2020 04:28:45 PM  [INFO    ]  50 % ...
07/30/2020 04:28:45 PM  [INFO    ]  75% ...
07/30/2020 04:28:45 PM  [INFO    ]  25 % ...
07/30/2020 04:28:45 PM  [INFO    ]  50 % ...
07/30/2020 04:28:45 PM  [INFO    ]  75% ...
07/30/2020 04:28:45 PM  [INFO    ]  25 % ...
07/30/2020 04:28:45 PM  [INFO    ]  50 % ...
07/30/2020 04:28:45 PM  [INFO    ]  25 % ...
07/30/2020 04:28:45 PM  [INFO    ]  50 % ...
07/30/2020 04:28:45 PM  [INFO    ]  Done. 0.11 seconds
07/30/2020 04:28:45 PM  [INFO    ]  Data encryption...
07/30/2020 04:28:45 PM  [INFO    ]  25 % ...
07/30/2020 04:28:45 PM  [INFO    ]  50 % ...
07/30/2020 04:28:45 PM  [INFO    ]  75% ...
07/30/2020 04:28:45 PM  [INFO    ]  25 % ...
07/30/2020 04:28:45 PM  [INFO    ]  50 % ...
07/30/2020 04:28:45 PM  [INFO    ]  75% ...
07/30/2020 04:28:45 PM  [INFO    ]  25 % ...
07/30/2020 04:28:45 PM  [INFO    ]  50 % ...
07/30/2020 04:28:45 PM  [INFO

# Initialize the weight

The weights have to be crypted 

In [13]:
%%memit
unencrypted_weight = np.random.normal(loc=0,
                                      scale=0.2, size =(train_features.to_numpy().shape[1]))
unencrypted_bias = np.random.random((1))

weight = ts.ckks_vector(context, unencrypted_weight.tolist())
bias = ts.ckks_vector(context, unencrypted_bias.tolist())


peak memory: 181.41 MiB, increment: 0.13 MiB


The confidential data as yet stored into a dictionnary, and will be used during the training only by functions which are passed as arguments to the fit methods. This encapsulation of sensitive data will allows us to ensure security during training later. 

In [None]:
%%memit
confidential_data = {'context':context,
                     'secret_key':secret_key, 
                     'unencrypted_X':train_features.to_numpy(),
                     'unencrypted_Y':train_labels.to_numpy().reshape((-1, 1)) 
                    }

# Testing serialization

In [None]:
%%memit
b_context = context.serialize()
b_X = [x.serialize() for x in encrypted_X]
b_weight = weight.serialize()

# Testing multiprocessing 

Splitting the dataset into batches, one for each processes

In [15]:
len(b_X)

8

In [16]:
batches = [b_X[:len(b_X)//2] ,b_X[len(b_X)//2:]]
for i in batches:
    print(len(i))

4
4


Definition of all the functions needed. 

In [17]:
def worker(input, output):
    """
    This functions turns on the process until a string 'STOP' is found in the input queue
    
    It takes every couple (function, arguments of the functions) from the input queue, and put the result into the output queue
    """
    for func, args in iter(input.get, 'STOP'):
        result = func(*args)
        output.put(result)

def initialization(b_context, batch):
    """
    :param: b_context : binary representation of the context. context.serialize()$
    :param: batch : list of binary representations of CKKS vector
    This function is the first one to be passed in the input queue of the process. 
    It first deserialaze the context, passing it global,
    in the memory sapce allocated to the process
    Then the batch is also deserialize, using the context, 
    to generate a list of CKKS vector which stand for the encrypted samples on which the proces will work
    """
    global context
    context = ts.context_from(b_context)
    global data
    data = [ts.ckks_vector_from(context, i ) for i in batch]
    return 'Initialization done for process %s. Len of data : %i' %(multiprocessing.current_process().name, len(data) )
      
def op(b_weight):
    """
    Aims to represent the forward_backward_prop in test. 
    """
    w=ts.ckks_vector_from(context, b_weight)
    res = w.dot(data[0])
    for vec in data[1:]:
        res += w.dot(vec)
    return res.serialize()

def data_test():
    return len(data)

def testpickle(sk):
    data[0].decrypt(sk)
    return "j'ai recu une secret key..."

In [18]:
%%memit
NUMBER_OF_PROCESSES = 2
INIT_TASKS = [ (initialization, (b_context, batch )) for batch in batches]
LIST_QUEUE_IN = []
LIST_PROCESSES = []
QUEUE_OUT = multiprocessing.Queue()
for init_task in INIT_TASKS:
    LIST_QUEUE_IN.append(multiprocessing.Queue())
    LIST_QUEUE_IN[-1].put(init_task)
    LIST_PROCESSES.append(multiprocessing.Process(target=worker, args=(LIST_QUEUE_IN[-1],QUEUE_OUT)).start())


peak memory: 219.91 MiB, increment: 20.97 MiB


In [19]:
log_out = []
for _ in range(NUMBER_OF_PROCESSES):
    log_out.append(QUEUE_OUT.get())
    print(log_out[-1])

Initialization done for process Process-18. Len of data : 4
Initialization done for process Process-19. Len of data : 4


In [20]:
#temoin


In [21]:
for i in range(10): #epoch
    b_weight = weight.serialize()
    print("epoch %i" %i)
    for q in LIST_QUEUE_IN:
        q.put((op ,(b_weight,)))
    temoin = 0
    for batch in batches:
        temp = weight.dot(ts.ckks_vector_from(context, batch[0]))
        for vec in batch[1:]: 
            temp += weight.dot(ts.ckks_vector_from(context, vec))
        temoin = temp + temoin
    res = 0
    for _ in range(NUMBER_OF_PROCESSES):
        log_out.append(QUEUE_OUT.get())
        res = ts.ckks_vector_from(context, log_out[-1]) +res
    
    res = ts.ckks_vector(context, res.decrypt(secret_key))
    weight+=res
    weight = ts.ckks_vector(context, weight.decrypt(secret_key))

epoch 0
epoch 1
epoch 2
epoch 3
epoch 4
epoch 5
epoch 6
epoch 7
epoch 8
epoch 9


In [23]:
len(log_out)

22

In [None]:
for q in LIST_QUEUE_IN:
    q.put((op ,(b_weight,)))

In [None]:
LIST_QUEUE_IN[1].put((testpickle, (secret_key,)))

In [None]:
assert False, 'piche'

In [None]:
for _ in range(NUMBER_OF_PROCESSES):
    QUEUE_OUT.get()

In [86]:
enc_predictions =  ( 1 for i in range(10))
Y = range(10)

In [87]:
res = 0
for i in range(len(enc_predictions)):
    res -= Y[i] * (enc_predictions[i])
    print(res)
    res -= (1 - Y[i]) *(1 - enc_predictions[i])

res

TypeError: object of type 'generator' has no len()

In [88]:
res = 0
for y,pred in zip(Y,enc_predictions):
    res -= y * (pred)
    print(res)
    res -= (1 - y) *(1 - pred)

res

0
-1
-3
-6
-10
-15
-21
-28
-36
-45


-45

In [90]:
res = 0
for x,y,z in zip([1,2],[1,2],[1,2]):
    print(x,y,z)

1 1 1
2 2 2
