In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pyspark.sql import SparkSession
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import MultilayerPerceptronClassifier

In [2]:
spark = SparkSession.builder.appName('Multilayer Perceptron').getOrCreate()

In [3]:
spark

In [4]:
file_path = 'data/bank-full.csv'

In [5]:
df = spark.read.csv(file_path, inferSchema=True, header=True, sep=';')

In [6]:
df.toPandas().head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [10]:
def vector_assemble(df, features_list, target):
    assembler = VectorAssembler(inputCols=features_list, outputCol='features')
    string_indexer = StringIndexer(inputCol=target, outputCol=target+'_index')
    
    selected_cols = [target+'_index', 'features'] + features_list
    
    stages = [assembler, string_indexer]
    pipeline = Pipeline(stages=stages)
    
    pipeline_model = pipeline.fit(df)
    df = pipeline_model.transform(df).select(selected_cols)
    return df

In [11]:
target = 'y'
cols = df.select(['age', 'balance', 'day', 'duration',
                          'campaign', 'pdays', 'previous', 'y'])

features_list = cols.columns
features_list.remove(target)

In [12]:
# assembling the features:
df_final = vector_assemble(df, features_list, target)

In [13]:
df_final.printSchema()

root
 |-- y_index: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- age: integer (nullable = true)
 |-- balance: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)



In [14]:
# Creating the Neural Network:

clf = MultilayerPerceptronClassifier(featuresCol='features', labelCol='y_index', layers=[4, 4, 2])
# the output layer is 2 since we are solving a binary classification problem

In [15]:
clf_model = clf.fit(df_final)

In [18]:
clf_model.weights

DenseVector([-0.4212, 1.2427, 0.5959, -0.0086, -0.4802, 1.1242, 0.4269, -0.0, 0.7769, -0.152, 1.032, 0.0049, -0.9828, 1.1834, 0.7686, -0.0043, 0.9051, -0.6471, 0.7383, 0.7908, 0.1495, -1.3123, -1.6061, 1.1853, 1.322, 0.529, 3.7031, -3.463, 1.1045, -0.6502])