# 2-class Classification

We want to assess the performance of a neural network (NN) trained with a privacy engine based on the SGD algorithm.
The dataset is public; references to its license can be found in the README.md in the /data subdirectory of the repo.

In [None]:
# Let's call it the Ordinary stack
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="white", color_codes=True)
%matplotlib inline

# These are the new cowboys in the town
from petastorm import make_batch_reader
from petastorm.pytorch import DataLoader
from pyspark.sql import SparkSession
import torch
import opacus

In [None]:
# Entry point of our Spark app: getOrCreate() hands back the session for this application name
spark = (
    SparkSession.builder
    .appName("2-class classification")
    .getOrCreate()
)
        

In [None]:
# Unfortunately there's no direct way to populate Spark's Dataframes from remote. So... here's a hack:
# download the CSV with pandas first (the file is semicolon-separated), then hand it over to Spark.
url = "https://raw.githubusercontent.com/alessio-proietti/dp-sgd-notebook/main/data/bank-additional-full.csv"
bank_pdf = pd.read_csv(url, delimiter=";")

# This is the real thing, our dataframe exposed as Spark's Dataframe.
# The pandas frame keeps its own name so `df` always refers to the Spark DataFrame.
df = spark.createDataFrame(bank_pdf)

# Preview only a few rows: converting the whole Spark DataFrame back to pandas
# would collect every row on the driver just to render a table.
df.limit(5).toPandas()

In [None]:
# !!!PLEASE DO NOT EXECUTE THIS CELL!!!

from pyspark.ml.feature import StringIndexer, OneHotEncoder

# Categorical (string) features, and the temporary columns that hold their numeric indices
categorical_cols = ["job", "marital", "education", "default", "housing", "loan", "contact", "month", "poutcome"]
index_cols = [name + "Index" for name in categorical_cols]

# Step 1: index each string category with a number, then drop the original string columns
stringIndexer = StringIndexer().setInputCols(categorical_cols).setOutputCols(index_cols)
stringModel = stringIndexer.fit(df)
df = stringModel.transform(df).drop(*categorical_cols)

# Step 2: one-hot encode the indices, writing the result back under the original feature names.
# Note: this is different from scikit-learn's OneHotEncoder, which keeps all categories.
# The output vectors are sparse.
ohe = OneHotEncoder().setInputCols(index_cols).setOutputCols(categorical_cols)
oheModel = ohe.fit(df)
df = oheModel.transform(df).drop(*index_cols)

# to be deleted IS ONLY AN EXPERIMENT
df = df.drop("emp.var.rate", "cons.price.idx", "cons.conf.idx", "nr.employed")
df

In [None]:
# Development-only cell: keep just the 'age' and 'y' columns so a toy model can be run in PyTorch
# !!! TO BE DELETED !!!
df = df.select('age', 'y')
df.show()

In [None]:
# The dataset is large enough to afford a three-way split: train / validation / test.
# The weights are relative (3:1:1) and the fixed seed makes the split repeatable.
train, validation, test = df.randomSplit(weights=[3.0, 1.0, 1.0], seed=24)

In [None]:
# Persist the results of the 'data wrangling' as an Apache Parquet dataset,
# replacing whatever a previous run left in the output directory.
# NOTE(review): only the `test` split is written here - confirm this is intended.
(
    test.write
    .mode('overwrite')
    .parquet('data/spark_processed_data')
)


In [None]:
# Open the Parquet dataset with petastorm. The reader is given a file:// URL, so the relative
# output directory is resolved against the current working directory instead of relying on a
# hand-edited absolute path.
# NOTE(review): assumes the notebook runs from the directory where Spark wrote
# 'data/spark_processed_data' (local filesystem) - adjust the path otherwise.
from pathlib import Path

dataset_url = Path('data/spark_processed_data').resolve().as_uri()
read = make_batch_reader(dataset_url)

# Keep a handle on the loader so that later cells can iterate over it and close it when done
loader = DataLoader(
        read,
        batch_size=16
    )
loader

In [None]:
# Stop the Spark session created at the top of the notebook
spark.stop()