# 2-class Classification

The dataset is public and references to its license could be found in the README.md in /data subdirectory of the repo.

In [53]:
#!pip install autodp
#!pip install pyspark
#!pip install mxnet

import numpy as np
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.linalg import Vectors
from pyspark.ml.evaluation import BinaryClassificationEvaluator

import mxnet as mx
from mxnet import nd, autograd, gluon
from mxnet.gluon import nn, Trainer
from mxnet.gluon.data import DataLoader, ArrayDataset

# shipped with the repo
import dpdl_utils

# import packages for DP
from autodp import rdp_bank, rdp_acct

In [2]:
# This is the entry point of our spark app
spark = SparkSession \
        .builder \
        .appName("2-class classification") \
        .getOrCreate()

## Where to download the dataset

The dataset we use can downloaded from https://raw.githubusercontent.com/alessio-proietti/dp-sgd-notebook/main/data/bank-additional-full-new-label.csv. 

It's shipped with the repo itself though.

In [3]:
df = spark.createDataFrame(pd.read_csv("data/bank-additional-full-new-label.csv"))
df.printSchema()
df.show(n=2, vertical=True)

root
 |-- age: long (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- month: string (nullable = true)
 |-- day_of_week: string (nullable = true)
 |-- duration: long (nullable = true)
 |-- campaign: long (nullable = true)
 |-- pdays: long (nullable = true)
 |-- previous: long (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- emp_var_rate: double (nullable = true)
 |-- cons_price_idx: double (nullable = true)
 |-- con_conf_idx: double (nullable = true)
 |-- euribor3m: double (nullable = true)
 |-- nr_employed: double (nullable = true)
 |-- y: string (nullable = true)

-RECORD 0---------------------
 age            | 56          
 job            | housemaid   
 marital        | married     
 education      | basic.4y    
 default        | 

In [4]:
numericCols = [field for (field, dataType) in df.dtypes if ( dataType != "string" )]
categoricalCols = [field for (field, dataType) in df.dtypes if (dataType == "string" and field != "y")]

indexOutputCols = [x + "Index" for x in categoricalCols]
oheOutputCols = [x + "OHE" for x in categoricalCols]

stringIndexer = StringIndexer(inputCols=categoricalCols, outputCols=indexOutputCols, handleInvalid="skip")
oheEncoder = OneHotEncoder(inputCols=indexOutputCols, outputCols=oheOutputCols)

assemblerInputs = oheOutputCols + numericCols
vecAssembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")

stringIndexerLabel = StringIndexer(inputCol="y", outputCol="label", handleInvalid="skip")
labelModel = stringIndexerLabel.fit(df)
df = labelModel.transform(df)

lr = LogisticRegression()

In [5]:
train, validation, test = df.randomSplit([3.0, 1.0, 1.0], 24)

pipeline = Pipeline(stages=[stringIndexer, oheEncoder, vecAssembler, lr])
pipelineModel = pipeline.fit(train)
predDF = pipelineModel.transform(train).select("label","features","prediction")
predDF.show(5)

+-----+--------------------+----------+
|label|            features|prediction|
+-----+--------------------+----------+
|  0.0|(53,[6,12,15,21,2...|       0.0|
|  0.0|(53,[6,12,15,21,2...|       0.0|
|  0.0|(53,[6,12,15,21,2...|       0.0|
|  0.0|(53,[6,12,15,21,2...|       0.0|
|  0.0|(53,[0,12,16,21,2...|       0.0|
+-----+--------------------+----------+
only showing top 5 rows



In [6]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction')
areaUnderROC = evaluator.evaluate(predDF)
areaUnderROC

0.6803682895002037

In [52]:
data_array = nd.array( [row.features.toArray() for row in predDF.collect()])
label_array = nd.array( [row.label for row in predDF.collect()])
print(data_array.shape, label_array.shape)

(24748, 53) (24748,)


In [57]:
x_data = torch.tensor([row.features.toArray() for row in predDF.collect()])
x_data.shape

torch.Size([24748, 53])

In [8]:
#spark.stop()