In [1]:
import pandas as pd
import numpy as np

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.pipeline import Pipeline 
from pyspark.ml.classification import GBTClassifier

In [2]:
# Reuse an already-running Spark session if there is one, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName('Gradient Boosting')
    .getOrCreate()
)

In [3]:
# Echo the session object as the cell's last expression so the notebook displays it.
spark

In [4]:
# Relative path of the CSV file that is read into a Spark DataFrame in the next cell.
filename = 'data/bank-full.csv'

In [5]:
# The file is semicolon-delimited with a header row; column types are inferred
# from the data rather than declared up front.
df = spark.read.csv(
    filename,
    sep=';',
    header=True,
    inferSchema=True,
)

In [6]:
# Print the column names and inferred types to check how the CSV was parsed.
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- y: string (nullable = true)



In [7]:
def assemble_vector(df, features_list, target):
    """Build a model-ready DataFrame: a feature vector plus an indexed label.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Input frame containing every column in `features_list` and `target`.
    features_list : list of str
        Names of the columns combined into the single `features` vector column.
    target : str
        Name of the label column; its indexed form is written to
        `<target>_index`.

    Returns
    -------
    pyspark.sql.DataFrame
        Frame holding `<target>_index`, `features`, and then the columns
        named in `features_list`, in that order. All other input columns
        (including the raw `target`) are dropped by the final select.
    """
    label_col = target + '_index'

    # Same stage order as before: assemble the vector first, then index the label.
    pipeline = Pipeline(stages=[
        VectorAssembler(inputCols=features_list, outputCol='features'),
        StringIndexer(inputCol=target, outputCol=label_col),
    ])

    # The pipeline is fitted and applied on the same frame.
    fitted_pipeline = pipeline.fit(df)
    output_cols = [label_col, 'features'] + features_list

    return fitted_pipeline.transform(df).select(output_cols)

In [9]:
# features and target:
target_variable_name = 'y'

# Keep only the integer-typed columns from the schema plus the target.
gb_df = df.select(
    'age', 'balance', 'day', 'duration',
    'campaign', 'pdays', 'previous',
    target_variable_name,
)

# Every selected column except the target is a model feature.
features_list = [col for col in gb_df.columns if col != target_variable_name]

In [10]:
# Assemble from `gb_df` (which is never reassigned) rather than from `df`.
# The previous `df = assemble_vector(df, ...)` consumed and overwrote its own
# input: after one run `df` no longer had the `y` column and already had a
# `features` column, so re-running this cell raised an error. `gb_df` holds
# exactly the feature columns plus `y`, so the assembled result is the same.
# The output is still bound to `df` because the model-fitting cell reads `df`.
df = assemble_vector(gb_df, features_list, target_variable_name)

In [11]:
# Building the model:

# The seed is pinned explicitly instead of relying on the library default, so
# any randomised step in training uses the same random state on every run.
clf = GBTClassifier(
    featuresCol='features',
    labelCol='y_index',
    seed=42,
)

In [12]:
# Fit the gradient-boosted tree classifier on the assembled DataFrame
# (reads the `features` and `y_index` columns configured on `clf`).
clf_model = clf.fit(df)

In [13]:
# printing the feature importance and interpretation of the model:

# Importance indices are positions in the assembled `features` vector.
importances = clf_model.featureImportances
tree_dump = clf_model.toDebugString

print(f"Feature Importance: \n{importances}")
print(f"Debug String:\n{tree_dump}")

Feature Importance: 
(7,[0,1,2,3,4,5,6],[0.11142386184857173,0.0838803855010162,0.15309134664468318,0.45587843913590964,0.04089626763441027,0.14219816802556956,0.012631531209839512])
Debug String:
GBTClassificationModel: uid = GBTClassifier_5c330a76039d, numTrees=20, numClasses=2, numFeatures=7
  Tree 0 (weight 1.0):
    If (feature 3 <= 490.5)
     If (feature 5 <= 9.5)
      If (feature 0 <= 60.5)
       If (feature 3 <= 219.5)
        If (feature 0 <= 28.5)
         Predict: -0.8586772187676653
        Else (feature 0 > 28.5)
         Predict: -0.9634662286595989
       Else (feature 3 > 219.5)
        If (feature 0 <= 25.5)
         Predict: -0.5055555555555555
        Else (feature 0 > 25.5)
         Predict: -0.8381466860585913
      Else (feature 0 > 60.5)
       If (feature 3 <= 120.5)
        If (feature 3 <= 92.5)
         Predict: -0.9560439560439561
        Else (feature 3 > 92.5)
         Predict: -0.7575757575757576
       Else (feature 3 > 120.5)
        If (feature 3 <=