# 2-class Classification

We want to assess the performance of a neural network (NN) trained with a privacy engine based on the SGD algorithm.
The dataset is public; references to its license can be found in the README.md in the /data subdirectory of the repo.

In [16]:
# Let's call it the Ordinary stack
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="white", color_codes=True)
%matplotlib inline

# These are the new cowboys in the town
import petastorm
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, OneHotEncoder
import torch
import opacus

In [2]:
# Entry point of the Spark application: build the session for this notebook
# (getOrCreate returns the existing session if one is already running).
spark = (
    SparkSession.builder
    .appName("2-class classification")
    .getOrCreate()
)

In [46]:
# The CSV lives at a remote URL, so it is downloaded with pandas first and the
# resulting frame is then handed over to Spark.
url = "https://raw.githubusercontent.com/alessio-proietti/dp-sgd-notebook/main/data/bank-additional-full.csv"
pdf = pd.read_csv(url, sep=";")  # fields in this file are separated by ';'

# The same data exposed as a Spark DataFrame; later cells operate on `df`.
df = spark.createDataFrame(pdf)

In [47]:
# Categorical string columns to encode. Each one is first mapped to a numeric
# index column named "<column>Index", then one-hot encoded back under its
# original column name.
# NOTE(review): the preview of `df` further down still shows `day_of_week` and
# `y` as plain strings — confirm that leaving them un-encoded is intended.
categorical_cols = ["job", "marital", "education", "default", "housing", "loan", "contact", "month", "poutcome"]
index_cols = [f"{name}Index" for name in categorical_cols]

# Step 1: string category -> numeric index; the raw string columns are dropped.
stringIndexer = StringIndexer(inputCols=categorical_cols, outputCols=index_cols)
stringModel = stringIndexer.fit(df)
df = stringModel.transform(df).drop(*categorical_cols)

# Step 2: numeric index -> one-hot vector; the intermediate index columns are dropped.
# Note: this differs from scikit-learn's OneHotEncoder, which keeps all categories.
# The output vectors are sparse.
ohe = OneHotEncoder(inputCols=index_cols, outputCols=categorical_cols)
oheModel = ohe.fit(df)
df = oheModel.transform(df).drop(*index_cols)

In [48]:
# Preview the first five rows. The limit is applied on the Spark side so only
# those rows are converted to pandas, rather than collecting the entire dataset
# on the driver just to display its head.
df.limit(5).toPandas()

Unnamed: 0,age,day_of_week,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,...,y,housing,job,contact,loan,education,marital,default,poutcome,month
0,56,mon,261,1,999,0,1.1,93.994,-36.4,4.857,...,no,"(0.0, 1.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...",(0.0),"(1.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0)","(1.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)"
1,57,mon,149,1,999,0,1.1,93.994,-36.4,4.857,...,no,"(0.0, 1.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",(0.0),"(1.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)"
2,37,mon,226,1,999,0,1.1,93.994,-36.4,4.857,...,no,"(1.0, 0.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",(0.0),"(1.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)"
3,40,mon,151,1,999,0,1.1,93.994,-36.4,4.857,...,no,"(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",(0.0),"(1.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0)","(1.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)"
4,56,mon,307,1,999,0,1.1,93.994,-36.4,4.857,...,no,"(0.0, 1.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",(0.0),"(0.0, 1.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)"


In [5]:
# There are enough instances for a three-way partition of the dataset:
# train / validation / test with relative weights 3 : 1 : 1, using seed 24.
split_weights = [3.0, 1.0, 1.0]
train, validation, test = df.randomSplit(split_weights, seed=24)

# Report the number of rows that ended up in each partition.
split_sizes = [part.count() for part in (train, validation, test)]
print(*split_sizes)

24748 8130 8310


In [2]:
# Stop the Spark session that was created at the start of the notebook.
# NOTE(review): this cell's execution count (In [2]) is out of sequence with the
# cells above — confirm the notebook still runs top to bottom on a fresh kernel.
spark.stop()