In [4]:
%cd ..

/home/dimitrii/projts/ZeroEnter/zero-enter-antifraud


In [5]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import tqdm

In [6]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder


path_csv = "data/credit_card_transactions-ibm_v2.csv"
df = pd.read_csv(path_csv).sample(n=100000, random_state=42)


# The card_id is defined as one card by one user.
# A specific user can have multiple cards, which would correspond to multiple different card_ids for this graph.
# For this reason we will create a new column which is the concatenation of the column User and the Column Card
df["card_id"] = df["User"].astype(str) + "_" + df["Card"].astype(str)

# We need to strip the ‘$’ from the Amount to cast as a float
df["Amount"] = df["Amount"].str.replace("$", "").astype(float)

# time can't be casted to int so so opted to extract the hour and minute
df["Hour"] = df["Time"].str[0:2]
df["Minute"] = df["Time"].str[3:5]

# drop unnecessary columns
df = df.drop(["Time", "User", "Card"], axis=1)

# ERRORS:
# array([nan, 'Bad PIN', 'Insufficient Balance', 'Technical Glitch',
#        'Bad Card Number', 'Bad CVV', 'Bad Expiration', 'Bad Zipcode',
#        'Insufficient Balance,Technical Glitch', 'Bad Card Number,Bad CVV',
#        'Bad CVV,Insufficient Balance',
#        'Bad Card Number,Insufficient Balance'], dtype=object)

df["Errors?"] = df["Errors?"].fillna("No error")

# The two columns Zip and Merchant state contains missing values which can affect our graph.
# Moreover these information can be extracted from the column Merchant City so we will drop them.
df = df.drop(columns=["Merchant State", "Zip"], axis=1)

# change the is fraud column to binary
df["Is Fraud?"] = df["Is Fraud?"].apply(lambda x: 1 if x == "Yes" else 0)

df["Merchant City"] = LabelEncoder().fit_transform(df["Merchant City"])

# USE CHIP:
# array(['Chip Transaction', 'Online Transaction', 'Swipe Transaction'],
#       dtype=object)
df["Use Chip"] = LabelEncoder().fit_transform(df["Use Chip"])
df["Errors?"] = LabelEncoder().fit_transform(df["Errors?"])

  df["Amount"] = df["Amount"].str.replace("$", "").astype(float)


In [7]:
df

Unnamed: 0,Year,Month,Day,Amount,Use Chip,Merchant Name,Merchant City,MCC,Errors?,Is Fraud?,card_id,Hour,Minute
18199893,2019,7,10,59.18,0,-6853385250336487907,1890,5813,10,0,1470_0,00,11
9731325,2019,1,14,280.91,1,4241336128694185533,3294,4814,10,0,822_1,22,12
536687,2010,3,15,-144.00,2,190253443608377572,1930,3359,10,0,41_3,07,07
13223840,2015,9,20,6.76,0,-7837310524365334241,2544,5300,10,0,1084_0,14,58
17070521,2014,10,12,9.17,2,-5023497618971072366,1635,5812,10,0,1384_0,11,44
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3028639,2018,6,8,23.90,2,-3739862438923451178,4185,7832,10,0,254_3,20,37
11275290,2018,10,27,24.23,2,6661973303171003879,1602,5211,10,0,937_1,14,12
1327632,2008,4,17,32.20,2,7517466159504402752,808,4900,10,0,107_1,11,15
2929345,2017,3,30,2.54,0,-7573333216691584899,3547,5812,10,0,243_2,09,51


In [8]:
# Create an empty graph
from typing import List, Any

import networkx as nx
import pandas as pd
import torch


# Create an empty graph
G = nx.MultiGraph()

# Add nodes to the graph for each unique card_id, merchant_name
G.add_nodes_from(df["card_id"].unique(), type="card_id")
G.add_nodes_from(df["Merchant Name"].unique(), type="merchant_name")

# The code below adding edges and properties to the edges of a graph.
# The code iterates through each row of the dataframe, df,
# and creates a variable for each property then
# assign it to the edge between the card_id and merchant_name of that row.

# Add edges and properties to the edges
for _, row in df.iterrows():
    # Create a variable for each properties for each edge

    year = (row["Year"],)
    month = (row["Month"],)
    day = (row["Day"],)
    hour = (row["Hour"],)
    minute = (row["Minute"],)
    amount = (row["Amount"],)
    use_chip = (row["Use Chip"],)
    merchant_city = (row["Merchant City"],)
    errors = (row["Errors?"],)
    mcc = row["MCC"]

    G.add_edge(
        row["card_id"],
        row["Merchant Name"],
        year=year,
        month=month,
        day=day,
        hour=hour,
        minute=minute,
        amount=amount,
        use_chip=use_chip,
        merchant_city=merchant_city,
        errors=errors,
        mcc=mcc,
    )

# Get the number of nodes and edges in the graph
num_nodes = G.number_of_nodes()
num_edges = G.number_of_edges()

# Print the number of nodes and edges
print("Number of nodes:", num_nodes)
print("Number of edges:", num_edges)

# Convert the graph to an adjacency matrix
adj_matrix = nx.adjacency_matrix(G).todense()

print(f"adj_matrix.shape: {adj_matrix.shape}")

# We define the variable "edge_list" which is a list of edges and their associated data in a graph G.
# Then we create an empty list called "x" and iterates over each edge in the edge_list.
# For each edge, it extracts the values of the edge data, converts them to floats if needed, and append them to the list "x".
# Finally, we convert the list "x" to a PyTorch tensor with float datatype

# Prepare the data for input into the model
edge_list = list(G.edges(data=True))
features = []
for edge in edge_list:
    edge_values = list(edge[2].values())
    edge_values = [
        float(i[0])
        if type(i) == tuple and type(i[0]) == str
        else i[0]
        if type(i) == tuple
        else i
        for i in edge_values
    ]
    features.append(edge_values)

print(f"features.shape: {len(features)}")

target = df["Is Fraud?"].values.tolist()


Number of nodes: 16415
Number of edges: 100000
adj_matrix.shape: (16415, 16415)
features.shape: 100000


In [9]:
class Model(nn.Module):
    # define nn
    def __init__(self):
        super(Model, self).__init__()
        self.fc1 = nn.Linear(10, 20)
        self.fc2 = nn.Linear(20, 20)
        self.fc3 = nn.Linear(20, 2)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)
        x = self.softmax(x)

        return x



We will now need to split the dataset into a training set and testing set for ML. This is done fairly easily with the `train_test_split` helper function from sklearn.

In [10]:
train_X, test_X, train_y, test_y = train_test_split(
    features, # use columns 0-4 as X
    target, # use target as y
    test_size=0.2 # use 20% of data for testing
)

# Uncomment for sanity checks
# print("train_X: ", train_X)
# print("test_X: ", test_X)
print("train_y: ", train_y)
print("test_y: ", test_y)

train_y:  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

We can now define the parameters for training, we will use the [Cross Entropy Loss](https://machinelearningmastery.com/cross-entropy-for-machine-learning/) and [Stochastic Gradient Descent Optimizer](https://en.wikipedia.org/wiki/Stochastic_gradient_descent).

In [11]:
# our loss function
loss_fn = nn.CrossEntropyLoss()


model = Model()
# our optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)


# use 800 EPOCHS
EPOCHS = 800

# Convert training data to pytorch variables
tr_x = torch.Tensor(train_X).float()
tr_x = (tr_x - tr_x.mean(dim=0)) / tr_x.std(dim=0)
train_X = Variable(tr_x)

te_x = torch.Tensor(test_X).float()
te_x = (te_x - te_x.mean(dim=0)) / te_x.std(dim=0)
test_X = Variable(te_x)


train_y = Variable(torch.Tensor(train_y).long())
test_y = Variable(torch.Tensor(test_y).long())


loss_list     = np.zeros((EPOCHS,))
accuracy_list = np.zeros((EPOCHS,))


# we use tqdm for nice loading bars
for epoch in tqdm.trange(EPOCHS):

    # To train, we get a prediction from the current network
    predicted_y = model(train_X)

    # Compute the loss to see how bad or good we are doing
    loss = loss_fn(predicted_y, train_y)

    # Append the loss to keep track of our performance
    loss_list[epoch] = loss.item()

    # Afterwards, we will need to zero the gradients to reset
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Calculate the accuracy, call torch.no_grad() to prevent updating gradients
    # while calculating accuracy
    with torch.no_grad():
        y_pred = model(test_X)
        correct = (torch.argmax(y_pred, dim=1) == test_y).type(torch.FloatTensor)
        accuracy_list[epoch] = correct.mean()

100%|██████████| 800/800 [05:53<00:00,  2.26it/s]


# Step 2: ZK the Neural Network

Now that we have the Neural Network trained, we can use ezkl to easily ZK our model.

To proceed we will now need to install `ezkl`



In [29]:
import os
import json
import ezkl

Next, we will need to export the neural network to a `.onnx` file. ezkl reads this `.onnx` file and converts it into a circuit which then allows you to generate proofs as well as verify proofs

In [43]:
# Specify all the files we need
zkp_dir = "ezkl_inference/data_zkp"
os.makedirs(zkp_dir, exist_ok=True)


model_path = os.path.join(zkp_dir, "network.onnx")
compiled_model_path = os.path.join(zkp_dir, "network.compiled")
pk_path = os.path.join(zkp_dir, "test.pk")
vk_path = os.path.join(zkp_dir, "test.vk")
settings_path = os.path.join(zkp_dir, "settings.json")
srs_path = os.path.join(zkp_dir, "kzg.srs")
witness_path = os.path.join(zkp_dir, "witness.json")
data_path = os.path.join(zkp_dir, "input.json")
proof_path = os.path.join(zkp_dir, "test.pf")

In [44]:
test_X[0, None].shape

torch.Size([1, 10])

In [45]:
(test_X[0, None] - train_X.mean(dim=0)) / train_X.std(dim=0)

tensor([[ 1.3924, -1.0232,  0.5879,  1.1051, -0.7218,  0.7502, -1.6068, -0.6996,
          0.0998, -0.1622]])

In [46]:
a = 0.1*torch.rand(*[1, 10], requires_grad=True)
a.shape

torch.Size([1, 10])

In [47]:
0.1*torch.rand(*[1, 10], requires_grad=True)

tensor([[0.0667, 0.0148, 0.0192, 0.0560, 0.0160, 0.0150, 0.0119, 0.0612, 0.0196,
         0.0890]], grad_fn=<MulBackward0>)

In [48]:
# After training, export to onnx (network.onnx) and create a data file (input.json)

# create a random input
# x = 0.1*torch.rand(*[1, 10], requires_grad=True)
# x = test_X[0, None]
x = (test_X[0, None] - train_X.mean(dim=0)) / train_X.std(dim=0)

# Flips the neural net into inference mode
model.eval()

# Export the model
torch.onnx.export(model,                     # model being run
                  x,                         # model input (or a tuple for multiple inputs)
                  model_path,                # where to save the model (can be a file or file-like object)
                  export_params=True,        # store the trained parameter weights inside the model file
                  opset_version=10,          # the ONNX version to export the model to
                  do_constant_folding=True,  # whether to execute constant folding for optimization
                  input_names = ['input'],   # the model's input names
                  output_names = ['output'], # the model's output names
                  dynamic_axes={'input' : {0 : 'batch_size'},    # variable length axes
                                'output' : {0 : 'batch_size'}})

data_array = ((x).detach().numpy()).reshape([-1]).tolist()

data = dict(input_data = [data_array])

    # Serialize data into file:
json.dump(data, open(data_path, 'w'))

After which we can proceed to generate the settings file for `ezkl` and run calibrate settings to find the optimal settings for `ezkl`

In [49]:
!RUST_LOG=trace
# TODO: Dictionary outputs

run_args = ezkl.PyRunArgs()
# run_args.input_visibility = "encrypted"
# run_args.param_visibility = "encrypted"
run_args.input_visibility = "private"
run_args.param_visibility = "private"
run_args.output_visibility = "public"
res = ezkl.gen_settings(model_path, settings_path, py_run_args=run_args)
assert res == True

res = await ezkl.calibrate_settings(data_path, model_path, settings_path, "resources")  # Optimize for resources

Next, we will compile the model. The compilation step allow us to generate proofs faster.

In [50]:
res = ezkl.compile_model(model_path, compiled_model_path, settings_path)
assert res == True

Before we can setup the circuit params, we need a SRS (Structured Reference String). The SRS is used to generate the proofs.

In [51]:
res = ezkl.get_srs(srs_path, settings_path)

Now run setup, this will generate a proving key (pk) and verification key (vk). The proving key is used for proving while the verification key is used for verificaton.

In [52]:
res = ezkl.setup(
        compiled_model_path,
        vk_path,
        pk_path,
        srs_path,
        settings_path,
    )

assert res == True
assert os.path.isfile(vk_path)
assert os.path.isfile(pk_path)
assert os.path.isfile(settings_path)

Now, we can generate a proof and verify the proof as a sanity check. We will use the "evm" transcript. This will allow us to provide proofs to the EVM.

In [53]:
# Generate the Witness for the proof

# now generate the witness file


res = ezkl.gen_witness(data_path, compiled_model_path, witness_path, settings_path = settings_path)
assert os.path.isfile(witness_path)

In [54]:
# Generate the proof

# proof_path = os.path.join('proof.json')

proof = ezkl.prove(
        witness_path,
        compiled_model_path,
        pk_path,
        proof_path,
        srs_path,
        "evm",
        "single",
        settings_path,
    )

print(proof)
assert os.path.isfile(proof_path)

{'instances': [[[82, 0, 0, 0], [2, 0, 0, 0]]], 'proof': '04a3507b07303d1834d8bba10507cbebd0a622d175aa6a67298d7506ec90e1be2c27fabb83ba39b7b617e2290ea88959ec475be3fd80a97763ddfb615b82c6a2286cc09c5a198c76c16a7ebcadd068972ac6b657a961ac663ad53c04300caca011efcd4591dd796a59c1e7e93d6ae7babf21c0e505c2e816d94f4f18aa45541827166ca59142c84f06602f01fd4c13d639fa01bcc469bb9d56306809a8eb94001a2879fa1254e006316bc9d3445aaa811bd1793d4624a375b0829199f78ab97603f00729a56176ef1c08ab5e8eee094674b633b9b8c3dc5fbb641161b8a0850d167921e92c378310eb4d2d6171ad55170a069d28096e019249050bbde8b13d9e16295789346bd49ffe9da6482cf737bd5e23652e02f353b1badb3e4de0b8261208d684f11c737db76ff3371ee23f5a0ba1db757efa4edb74b7b32af2f0ed65960302a829ae0bb89503db894a636d7498c3954fdb31022caba9230c0702a3ca9a036134af0c89f9aac18474b1c4f926a27bed5d0703bc71565cfed142a240c9ea0860a314a364d63002d721066dd4776ab26a0f6f59125a9380e407df10717a5f18f4bf5885258cd5c1a611429b541fb1c24846f837c00e979ec74d2d0232ab5d18a857a26c6942e260d90bc88a4ab28dc2f01f96282f2a1

In [55]:
# verify our proof

res = ezkl.verify(
        proof_path,
        settings_path,
        vk_path,
        srs_path,
    )

assert res == True
print("verified")

verified
