In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.classification import NaiveBayes

In [3]:
# Obtain a Spark session for this notebook (getOrCreate reuses an active one if present).
spark = (
    SparkSession.builder
    .appName('Naive Bayes')
    .getOrCreate()
)

In [4]:
# Echo the session object so the notebook renders it as this cell's output.
spark

In [5]:
# Relative path of the input CSV; it is resolved against the working directory
# the notebook is run from.
file_path = 'data/bank-full.csv'

In [6]:
# The file is ';'-delimited with a header row; column types are inferred by Spark.
df = spark.read.csv(
    file_path,
    sep=';',
    header=True,
    inferSchema=True,
)

In [7]:
# Print the column names and types Spark inferred while reading the CSV.
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- y: string (nullable = true)



In [8]:
def assemble_vector(df, features, target, features_col='features', label_col='label'):
    """Build a model-ready frame: one vector column of features plus an indexed label.

    A two-stage ``Pipeline`` is fitted on ``df`` and applied to it:
    a ``VectorAssembler`` packs the ``features`` columns into ``features_col``
    and a ``StringIndexer`` encodes ``target`` into ``label_col``.

    Parameters
    ----------
    df : DataFrame
        Input frame containing every column named in ``features`` and ``target``.
        The caller's variable is not rebound; a new frame is returned.
    features : str or iterable of str
        Name(s) of the columns to assemble. Any iterable (list, tuple, ...) is
        accepted; a single string is treated as one column name.
    target : str
        Name of the column to index into the label.
    features_col : str, optional
        Name of the assembled vector column (default ``'features'``).
    label_col : str, optional
        Name of the indexed label column (default ``'label'``).

    Returns
    -------
    DataFrame
        Columns ``label_col``, ``features_col`` and then the individual feature
        columns, in that order. All other input columns (including ``target``)
        are dropped by the final ``select``.
    """
    # Materialise the column names once: this lets callers pass tuples or other
    # iterables (the list concatenation below would otherwise raise TypeError)
    # and keeps a bare string from being split into characters.
    feature_cols = [features] if isinstance(features, str) else list(features)

    pipeline = Pipeline(stages=[
        VectorAssembler(inputCols=feature_cols, outputCol=features_col),
        StringIndexer(inputCol=target, outputCol=label_col),
    ])

    selected_cols = [label_col, features_col] + feature_cols
    return pipeline.fit(df).transform(df).select(selected_cols)

In [11]:
# Column to predict.
target = 'y'

# Subset of integer columns used as predictors, together with the target.
# NOTE(review): per the variable name these are presumably the columns with no
# negative values (balance and pdays are left out) — confirm against the data.
nonneg_df = df.select(['age', 'day', 'duration', 'campaign', 'previous', 'y'])

# Every selected column except the target is a feature.
features_list = [name for name in nonneg_df.columns if name != target]

In [12]:
# assemble_vector returns only 'label', 'features' and the feature columns, and
# the result is stored back into `df`. Running this cell a second time would
# therefore fail (the target column is gone and 'features' already exists), so
# skip the transformation when `df` has already been assembled.
if 'features' not in df.columns:
    df = assemble_vector(df, features_list, target)

In [13]:
# Print the schema of `df` as returned by assemble_vector in the preceding cell.
df.printSchema()

root
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- age: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- previous: integer (nullable = true)



In [14]:
# Configure the estimator to read the indexed label and the assembled vector.
clf = NaiveBayes(labelCol='label', featuresCol='features')

# Fit on the whole of `df`; no train/test split is visible in this part of the
# notebook.
clf_model = clf.fit(df)