# 2-class Classification

The dataset is public; references to its license can be found in the README.md in the /data subdirectory of the repo.

I took inspiration from https://github.com/yuxiangw/autodp/tree/master/example/private-deep-learning

In [1]:
#!pip install autodp
#!pip install pyspark
#!pip install mxnet

import numpy as np
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.linalg import Vectors

import mxnet as mx
from mxnet import nd, autograd, gluon
from mxnet.gluon import nn, Trainer
from mxnet.gluon.data import DataLoader, ArrayDataset
from mxnet.optimizer import Optimizer

# shipped with the repo
import dpdl_utils

# import packages for DP
from autodp import rdp_bank, rdp_acct

In [2]:
# Entry point of the Spark application: getOrCreate() hands back the session
# built with this application name.
spark = (
    SparkSession.builder
    .appName("2-class classification")
    .getOrCreate()
)

## Where to download the dataset

The dataset we use can be downloaded from https://raw.githubusercontent.com/alessio-proietti/dp-sgd-notebook/main/data/bank-additional-full-new-label.csv.

It's shipped with the repo itself though.

In [3]:
# The CSV is parsed by pandas and the resulting frame is handed over to Spark.
df = spark.createDataFrame(
    pd.read_csv("data/bank-additional-full-new-label.csv")
)

# Handy while exploring the data:
#df.printSchema()
#df.show(n=2, vertical=True)

In [4]:
# Here we set the stages for data processing.
# Nothing is fitted in this cell; the stages are only declared.

# The raw target ("y") and its indexed form ("label") must never end up among
# the model inputs. On a fresh run neither can slip in ("y" is a string column
# and "label" does not exist yet), but the label-indexing cell rebinds df with
# an extra non-string "label" column, so re-running this cell afterwards would
# otherwise leak the target into the feature vector.
targetCols = ("y", "label")

numericCols = [field for (field, dataType) in df.dtypes
               if dataType != "string" and field not in targetCols]
categoricalCols = [field for (field, dataType) in df.dtypes
                   if dataType == "string" and field not in targetCols]

# One index column and one one-hot column per categorical input column.
indexOutputCols = [x + "Index" for x in categoricalCols]
oheOutputCols = [x + "OHE" for x in categoricalCols]

# NOTE(review): handleInvalid="skip" is documented to drop rows whose category
# was not seen when the indexer was fitted, so transformed splits may end up
# with fewer rows than their inputs - confirm this is intended.
stringIndexer = StringIndexer(inputCols=categoricalCols, outputCols=indexOutputCols, handleInvalid="skip")
oheEncoder = OneHotEncoder(inputCols=indexOutputCols, outputCols=oheOutputCols)

# Final feature vector: one-hot encoded categoricals followed by the numeric columns.
assemblerInputs = oheOutputCols + numericCols
vecAssembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")

In [5]:
# Encode the string target "y" as a numeric "label" column.
# NOTE(review): StringIndexer's default ordering is by descending frequency, so
# the majority class presumably maps to 0.0 - confirm with labelModel.labels.
stringIndexerLabel = StringIndexer(
    inputCol="y",
    outputCol="label",
    handleInvalid="skip",
)
labelModel = stringIndexerLabel.fit(df)

# df is rebound to the frame that carries the extra "label" column.
df = labelModel.transform(df)

In [6]:
# 75/25 train/validation split with a fixed seed.
train, validation = df.randomSplit([.75, .25], seed=24)

pipeline = Pipeline(stages=[stringIndexer, oheEncoder, vecAssembler])

# Fit the preprocessing on the training split ONLY and reuse that fitted model
# for the validation split. Re-fitting on validation would learn a separate
# category->index mapping (and possibly a different one-hot width), so the
# validation features would no longer mean the same thing as the features the
# network was trained on; it would also let validation statistics shape the
# preprocessing.
pipelineModel = pipeline.fit(train)

train_df = pipelineModel.transform(train).select("label", "features")
val_df = pipelineModel.transform(validation).select("label", "features")

In [7]:
# Materialise each Spark DataFrame on the driver exactly once. Collecting the
# same frame separately for features and for labels runs the Spark job twice
# and builds the two arrays from two independent materialisations; taking both
# from a single list of rows keeps every label paired with its own features.
train_rows = train_df.collect()
val_rows = val_df.collect()

# Features become a 2-D NDArray (one row per sample), labels a 1-D NDArray.
train_array = nd.array([row.features.toArray() for row in train_rows])
train_label_array = nd.array([row.label for row in train_rows])

val_array = nd.array([row.features.toArray() for row in val_rows])
val_label_array = nd.array([row.label for row in val_rows])

print(train_array.shape, train_label_array.shape)
print(val_array.shape, val_label_array.shape)

(30831, 53) (30831,)
(10357, 53) (10357,)


In [8]:
# Seed the random number generators the notebook relies on.
SEED = 12345
mx.random.seed(SEED)
# NOTE(review): Gluon's shuffling DataLoader is believed to draw its
# permutation from NumPy's global RNG rather than from MXNet's, so NumPy is
# seeded as well to make the batch order repeatable - confirm against the
# installed MXNet version. Seeding NumPy is harmless either way.
np.random.seed(SEED)

# Compute context and mini-batch size.
ctx = mx.cpu()
batch_size = 10

In [9]:
# Training data: reshuffled on every pass so batches differ between epochs.
train_dataset = ArrayDataset(train_array, train_label_array)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Validation data: the metrics are sums/counts over the whole set, so the
# iteration order is irrelevant; shuffling would only add work and make the
# batch order non-repeatable.
val_dataset = ArrayDataset(val_array, val_label_array)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [10]:
# Feed-forward binary classifier: three ReLU layers of 10 units each, followed
# by a single output unit without activation.
net = nn.HybridSequential()

with net.name_scope():
    for hidden_idx in range(3):
        net.add(nn.Dense(units=10, activation='relu'))
    # Output layer: it must have exactly one neuron, because the loss and the
    # validation code treat the network output as one score per sample.
    net.add(nn.Dense(units=1))

net.initialize(mx.init.Xavier())

In [11]:
# Binary cross-entropy; the training and validation functions feed it the
# network's un-activated output.
loss = gluon.loss.SigmoidBinaryCrossEntropyLoss()

# Plain SGD over every parameter of the network.
trainer = Trainer(
    params=net.collect_params(),
    optimizer='sgd',
    optimizer_params={'learning_rate': 0.1},
)

# Metrics that the validation function updates batch by batch.
accuracy, f1 = mx.metric.Accuracy(), mx.metric.F1()

In [20]:
@Optimizer.register
class privateSDG(Optimizer):
    """Placeholder optimizer registered with MXNet's optimizer registry.

    The class adds nothing to the base ``Optimizer`` yet.
    NOTE(review): the name looks like a typo for "privateSGD"; it is kept
    because it is the name the optimizer is registered and looked up under.
    """

In [21]:
# Instantiate the custom optimizer through the registry, by its class name.
optim = Optimizer.create_optimizer('privateSDG')

In [12]:
def train_model():
    """Run one training epoch over ``train_dataloader``.

    Uses the notebook-level ``net``, ``loss`` and ``trainer`` objects and
    updates the network parameters in place.

    Returns:
        The sum of the loss values over every batch of the epoch (a total, not
        a mean); the caller divides it by the dataset size.
    """
    cumulative_train_loss = 0

    for data, label in train_dataloader:
        with autograd.record():
            # Forward pass and loss for this batch, recorded for autograd.
            output = net(data)
            loss_result = loss(output, label)

        # Calculate gradients
        loss_result.backward()

        # Normalise the update by the number of samples actually in this batch.
        # The last batch of an epoch can be smaller than the configured batch
        # size, and dividing its gradient by the full batch size would
        # under-weight those samples.
        trainer.step(data.shape[0])

        # Sum the losses of every batch.
        cumulative_train_loss += nd.sum(loss_result).asscalar()

    return cumulative_train_loss

In [13]:
def validate_model(threshold):
    """Evaluate the network on ``val_dataloader`` and update the metrics.

    Uses the notebook-level ``net`` and ``loss`` objects and updates the
    notebook-level ``accuracy`` and ``f1`` metrics as a side effect.

    Args:
        threshold: Cut-off applied to the sigmoid of the network output; a
            sample is assigned class 1 when its probability is strictly above
            the threshold, class 0 otherwise.

    Returns:
        The sum of the loss values over every validation batch (a total, not a
        mean); the caller divides it by the dataset size.
    """
    cumulative_val_loss = 0

    for val_data, val_ground_truth_class in val_dataloader:
        # One forward pass per batch; the same output feeds both the loss and
        # the predictions (the network is deterministic, so a second pass would
        # only repeat the work).
        output = net(val_data)

        # Similar to cumulative training loss, calculate cumulative validation loss
        cumulative_val_loss += nd.sum(loss(output, val_ground_truth_class)).asscalar()

        # Probability of class 1.
        prediction = output.sigmoid()

        # Converting probabilities to classes: ceil(p - threshold) is 1 for
        # p > threshold and 0 otherwise.
        predicted_classes = mx.nd.ceil(prediction - threshold).reshape(-1)

        # Update validation accuracy
        accuracy.update(val_ground_truth_class, predicted_classes)

        # The F1 metric expects one score per class and takes the arg-max.
        # Feeding it the thresholded decision (one-hot) keeps F1 consistent
        # with the accuracy above for any threshold; with raw probabilities it
        # would always behave as if the threshold were 0.5.
        class_scores = mx.nd.stack(1 - predicted_classes, predicted_classes, axis=1)

        f1.update(val_ground_truth_class, class_scores)

    return cumulative_val_loss

In [15]:
epochs = 10
threshold = 0.5

# Take the sizes from the datasets themselves so the averages stay correct
# whenever the split or the preprocessing changes the number of rows.
train_data_size = len(train_dataset)
val_data_size = len(val_dataset)

for e in range(epochs):
    # Both functions return loss totals; divide to get the per-sample average.
    avg_train_loss = train_model() / train_data_size
    avg_val_loss = validate_model(threshold) / val_data_size

    print("Epoch: %s, Training loss: %.2f, Validation loss: %.2f, Validation accuracy: %.2f, F1 score: %.2f" %
          (e, avg_train_loss, avg_val_loss, accuracy.get()[1], f1.get()[1]))

    # Reset BOTH metrics so the next epoch starts from a blank state; without
    # resetting f1 its value would keep accumulating over all previous epochs.
    accuracy.reset()
    f1.reset()

Epoch: 0, Training loss: 0.35, Validation loss: 0.35, Validation accuracy: 0.89, F1 score: 0.00
Epoch: 1, Training loss: 0.35, Validation loss: 0.35, Validation accuracy: 0.89, F1 score: 0.00
Epoch: 2, Training loss: 0.35, Validation loss: 0.35, Validation accuracy: 0.89, F1 score: 0.00
Epoch: 3, Training loss: 0.35, Validation loss: 0.35, Validation accuracy: 0.89, F1 score: 0.00
Epoch: 4, Training loss: 0.35, Validation loss: 0.35, Validation accuracy: 0.89, F1 score: 0.00
Epoch: 5, Training loss: 0.35, Validation loss: 0.35, Validation accuracy: 0.89, F1 score: 0.00
Epoch: 6, Training loss: 0.35, Validation loss: 0.35, Validation accuracy: 0.89, F1 score: 0.00
Epoch: 7, Training loss: 0.35, Validation loss: 0.35, Validation accuracy: 0.89, F1 score: 0.00
Epoch: 8, Training loss: 0.35, Validation loss: 0.35, Validation accuracy: 0.89, F1 score: 0.00
Epoch: 9, Training loss: 0.35, Validation loss: 0.35, Validation accuracy: 0.89, F1 score: 0.00


In [None]:
#spark.stop()