In [41]:
import time
import os
import sys
import logging
import multiprocessing

import numpy as np
import tenseal as ts
import pickle as pk

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, average_precision_score, f1_score, recall_score, precision_score, accuracy_score, classification_report
os.chdir("/home/apignet/homomorphic-encryption/ckks_titanic/")
from src.features import build_features
from models import encrypted_LR
from models import unencrypted_LR

In [13]:
%load_ext memory_profiler

The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler


In [14]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
1

1

# definition of parameters

### Log 

### Paths

In [15]:
#DATA_PATH = "/data/raw/"            # whole data set
DATA_PATH = "/data/quick_demo/"   # subset of the data set, with 15 train_samples and 5 test_samples
#DATA_PATH = "/home/apignet/homomorphic-encryption/ckks_titanic/data/quick_demo/"   # subset of the data set, with 400 train_samples and 50 test_samples
#DATA_PATH =   '/data/quick_demo/'
LOG_PATH = "reports/log"
LOG_FILENAME = "test_0716"

In [16]:
fileHandler = logging.FileHandler("{0}/{1}.log".format(LOG_PATH, LOG_FILENAME))
streamHandler = logging.StreamHandler(sys.stdout)
logging.basicConfig(format="%(asctime)s  [%(levelname)-8.8s]  %(message)s", datefmt='%m/%d/%Y %I:%M:%S %p', level = logging.INFO, handlers=[fileHandler, streamHandler])

In [17]:
os.getcwd()

'/home/apignet/homomorphic-encryption/ckks_titanic'

### Training variables

In [18]:
EPOCH = 40
LEARNING_RATE = 0.9
MOMENTUM_RATE = 0.6
REGULARIZATION_RATE = 0.5
VERBOSE = 2
SAVE_WEIGHT = 2
N_JOBS = 3

## Static functions

In [19]:
def crytp_array(X, local_context):
    """
    This function encrypt a list of vector
    
    :parameters 
    ------------
    
    :param X ; list of list, interpreted as list of vector to encrypt
    :param local_context ; TenSEAL context object used to encrypt
    
    :returns
    ------------
    
    list ; list of CKKS ciphertext  
    
    """
    res = []
    for i in range(len(X)):
        res.append(ts.ckks_vector(local_context, X[i]))
        if i == len(X) // 4:
            logging.info("25 % ...")
        elif i == len(X) // 2 :
            logging.info("50 % ...")
        elif i == 3* len(X)//4:
            logging.info("75% ...")
    return res

### Confidential functions

These functions involves security breachs (as use of unencrypted data, or decryption of weights) and cannot be coded by Alice.
However, the functions encapslulate the unsafe process, so can be performed by Alice if Bob provides them. 
Therefore, they are currently passed as parameters to Alice, which only calls them.

Currently there is a huge security breach, as confidential parameters (security key for instance), which are needed by those functions, are passed in a dictionnary to Alice. 
For a safe protocole, we have to change these functions, to set up a safe communication protocole between Bob and Alice.
Alice will therefore only send the crypted data to Bob (using these functions, in which can be set up the communication process) and Bob will locally perform the functions which are currently coded bellow. 

# Loading and processing the data

In [20]:
%%memit
logging.info(os.getcwd())
raw_train, raw_test = build_features.data_import(os.getcwd()+DATA_PATH)
train, submission_test = build_features.processing(raw_train, raw_test)
del submission_test

07/29/2020 02:51:00 PM  [INFO    ]  /home/apignet/homomorphic-encryption/ckks_titanic
07/29/2020 02:51:00 PM  [INFO    ]  loading the data into memory (pandas df)
07/29/2020 02:51:00 PM  [INFO    ]  Done
07/29/2020 02:51:00 PM  [INFO    ]  making final data set from raw data
07/29/2020 02:51:02 PM  [INFO    ]  Done
peak memory: 181.81 MiB, increment: 13.24 MiB


In [21]:
%%memit
train, test = train_test_split(train, test_size=0.15)
train_labels = train.Survived
test_labels = test.Survived
train_features = train.drop("Survived", axis=1)
test_features = test.drop("Survived", axis=1)

peak memory: 181.88 MiB, increment: 0.04 MiB


# Definition of safety parameters

In [22]:
%%memit
logging.info('Definition of safety parameters...')
timer = time.time()
# context = ts.context(ts.SCHEME_TYPE.CKKS, 32768,
# coeff_mod_bit_sizes=[60, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 60])
#context = ts.context(ts.SCHEME_TYPE.CKKS, 8192, coeff_mod_bit_sizes=[40, 21, 21, 21, 21, 21, 21, 40])

context = ts.context(ts.SCHEME_TYPE.CKKS, 4096, coeff_mod_bit_sizes=[40, 20, 40])
#context = ts.context(ts.SCHEME_TYPE.CKKS, 16384, coeff_mod_bit_sizes=[60, 40, 40, 40, 40, 40, 40,40, 60])
context.global_scale = pow(2, 20)
logging.info("Done. " + str(round(time.time() - timer, 2)) + " seconds")


logging.info('Generation of the secret key...')
timer = time.time()
secret_key = context.secret_key()
context.make_context_public() #drop the relin keys, the galois keys, and the secret keys. 
logging.info("Done. " + str(round(time.time() - timer, 2)) + " seconds")
logging.info('Generation of the Galois Key...')
timer = time.time()
context.generate_galois_keys(secret_key)
logging.info("Done. " + str(round(time.time() - timer, 2)) + " seconds")
logging.info('Generation of the Relin Key...')
timer = time.time()
context.generate_relin_keys(secret_key)
logging.info("Done. " + str(round(time.time() - timer, 2)) + " seconds")
if context.is_public():
    logging.info("The context is now public, the context do not hold the secret key anymore, and decrypt methods need the secret key to be provide,")



07/29/2020 02:51:03 PM  [INFO    ]  Definition of safety parameters...
07/29/2020 02:51:03 PM  [INFO    ]  Done. 0.02 seconds
07/29/2020 02:51:03 PM  [INFO    ]  Generation of the secret key...
07/29/2020 02:51:03 PM  [INFO    ]  Done. 0.0 seconds
07/29/2020 02:51:03 PM  [INFO    ]  Generation of the Galois Key...
07/29/2020 02:51:03 PM  [INFO    ]  Done. 0.12 seconds
07/29/2020 02:51:03 PM  [INFO    ]  Generation of the Relin Key...
07/29/2020 02:51:03 PM  [INFO    ]  Done. 0.01 seconds
07/29/2020 02:51:03 PM  [INFO    ]  The context is now public, the context do not hold the secret key anymore, and decrypt methods need the secret key to be provide,
07/29/2020 02:51:03 PM  [INFO    ]  Definition of safety parameters...
07/29/2020 02:51:03 PM  [INFO    ]  Done. 0.03 seconds
07/29/2020 02:51:03 PM  [INFO    ]  Generation of the secret key...
07/29/2020 02:51:03 PM  [INFO    ]  Done. 0.0 seconds
07/29/2020 02:51:03 PM  [INFO    ]  Generation of the Galois Key...
07/29/2020 02:51:03 PM  [

# Data encryption

In [84]:
%%memit
logging.info("Data encryption...")
timer = time.time()
encrypted_X = crytp_array(train_features.to_numpy().tolist(), context)
encrypted_Y = crytp_array(train_labels.to_numpy().reshape((-1, 1)).tolist(), context)
encrypted_test_X = crytp_array(test_features.to_numpy().tolist(), context)
encrypted_test_Y = crytp_array(test_labels.to_numpy().reshape((-1, 1)).tolist(), context)
logging.info("Done. " + str(round(time.time() - timer, 2)) + " seconds")

07/29/2020 03:45:25 PM  [INFO    ]  Data encryption...
07/29/2020 03:45:25 PM  [INFO    ]  25 % ...
07/29/2020 03:45:25 PM  [INFO    ]  50 % ...
07/29/2020 03:45:25 PM  [INFO    ]  75% ...
07/29/2020 03:45:25 PM  [INFO    ]  25 % ...
07/29/2020 03:45:25 PM  [INFO    ]  50 % ...
07/29/2020 03:45:25 PM  [INFO    ]  75% ...
07/29/2020 03:45:25 PM  [INFO    ]  25 % ...
07/29/2020 03:45:25 PM  [INFO    ]  50 % ...
07/29/2020 03:45:25 PM  [INFO    ]  25 % ...
07/29/2020 03:45:25 PM  [INFO    ]  50 % ...
07/29/2020 03:45:25 PM  [INFO    ]  Done. 0.12 seconds
07/29/2020 03:45:25 PM  [INFO    ]  Data encryption...
07/29/2020 03:45:25 PM  [INFO    ]  25 % ...
07/29/2020 03:45:25 PM  [INFO    ]  50 % ...
07/29/2020 03:45:25 PM  [INFO    ]  75% ...
07/29/2020 03:45:25 PM  [INFO    ]  25 % ...
07/29/2020 03:45:25 PM  [INFO    ]  50 % ...
07/29/2020 03:45:25 PM  [INFO    ]  75% ...
07/29/2020 03:45:25 PM  [INFO    ]  25 % ...
07/29/2020 03:45:25 PM  [INFO    ]  50 % ...
07/29/2020 03:45:25 PM  [INFO

# Initialize the weight

The weights have to be crypted 

In [85]:
%%memit
unencrypted_weight = np.random.normal(loc=0,
                                      scale=0.2, size =(train_features.to_numpy().shape[1]))
unencrypted_bias = np.random.random((1))

weight = ts.ckks_vector(context, unencrypted_weight.tolist())
bias = ts.ckks_vector(context, unencrypted_bias.tolist())


peak memory: 287.11 MiB, increment: 0.00 MiB


The confidential data as yet stored into a dictionnary, and will be used during the training only by functions which are passed as arguments to the fit methods. This encapsulation of sensitive data will allows us to ensure security during training later. 

In [25]:
%%memit
confidential_data = {'context':context,
                     'secret_key':secret_key, 
                     'unencrypted_X':train_features.to_numpy(),
                     'unencrypted_Y':train_labels.to_numpy().reshape((-1, 1)) 
                    }

peak memory: 184.73 MiB, increment: 0.00 MiB


# Testing serialization

In [119]:
%%memit
b_context = context.serialize()
b_weight = weight.serialize()

peak memory: 365.98 MiB, increment: 0.00 MiB


In [35]:
des_context = ts.context_from(b_context)
des_weight = ts.ckks_vector_from(des_context, b_weight)

# Testing multiprocessing 

In [46]:
type(b_weight)

bytes

In [55]:
def sharedvalues_func(values, arrays, shared_values, shared_arrays):
    for i in range(len(values)):
        v = values[i][1]
        sv = shared_values[i].value
        assert v == sv

    for i in range(len(values)):
        a = arrays[i][1]
        sa = list(shared_arrays[i][:])
        assert a == sa

    print('Tests passed')

def test_sharedvalues():
    values = [
        ('i', 10),
        ('h', -2),
        (multiprocessing.sharedctypes.ctypes.c_byte, b_weight)
        ]
    arrays = [
        ('i', list(range(100))),
        ('d', [0.25 * i for i in range(100)]),
        ('H', list(range(1000)))
        ]

    shared_values = [multiprocessing.Value(id, v) for id, v in values]
    shared_arrays = [multiprocessing.Array(id, a) for id, a in arrays]

    p = multiprocessing.Process(
        target=sharedvalues_func,
        args=(values, arrays, shared_values, shared_arrays)
        )
    p.start()
    p.join()

    assert p.exitcode == 0

In [56]:
test_sharedvalues()

TypeError: an integer is required (got type bytes)

In [86]:
type(weight), type(context)

(_tenseal_cpp.CKKSVector, _tenseal_cpp.TenSEALContext)

In [87]:
import _tenseal_cpp

In [88]:
_tenseal_cpp.CKKSVector

_tenseal_cpp.CKKSVector

In [118]:
from multiprocessing.managers import BaseManager

class MyManager(BaseManager):
    pass

class binary_ckks_vector(object):
    def __init__(self,b_obj, context):
        self.val = b_obj
class binary_context(object):
    def __init__(self, b_context):
        self.val=ts.context_from(b_context)
    def __get__(self):
        return self.val
    
    
def tentative_addition(a, context):
    a=ts.ckks_vector_from(context, a.get())
    print(type(a))

In [124]:
MyManager.register('ckks_vector', binary_ckks_vector)
MyManager.register('context', binary_context)

if __name__ == '__main__':
    manager = MyManager()
    manager.start()
    m_context = manager.context(b_context).__f
    print()
             # prints 7
            # prints 56

AttributeError: 'AutoProxy[context]' object has no attribute 'galois_keys'

In [111]:
x=sd_context(b_context)

TypeError: __init__() should return None, not '_tenseal_cpp.TenSEALContext'

In [None]:
x

In [None]:
x.galois_keys

In [None]:
x.galois_keys()

In [105]:
d_weight=ts.ckks_vector_from(x, b_weight)


RuntimeError: Unable to cast from non-held to held instance (T& to Holder<T>) (compile in debug mode for type information)