In [25]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.pipeline import Pipeline

import pandas as pd
import numpy as np

In [2]:
spark = SparkSession.builder.appName('Decision Trees').getOrCreate()

In [3]:
spark

In [7]:
filename = 'data/bank-full.csv'

In [9]:
# reading the data:
data = spark.read.csv(filename, header=True, inferSchema=True, sep=';')

In [10]:
data.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- y: string (nullable = true)



In [15]:
# assembling the features:
def assemble_vectors(df, features_list, target_variable):
    assembler = VectorAssembler(inputCols=features_list, outputCol='features')
    string_indexer = StringIndexer(inputCol=target_variable, outputCol=target_variable+'_index')
    
    stages = [assembler, string_indexer]
    pipeline = Pipeline(stages=stages)
    
    selected_cols = [target_variable+'_index', 'features'] + features_list
    
    assembleModel = pipeline.fit(df)
    df = assembleModel.transform(df).select(selected_cols)
    
    return df

In [16]:
# independent features:
target_variable_name = 'y'
dt_df = data.select(['age', 'balance', 'day', 'duration',
                          'campaign', 'pdays', 'previous', 'y'])

features_list = dt_df.columns
features_list.remove(target_variable_name)

In [17]:
print(target_variable_name)
print(features_list)

y
['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']


In [18]:
# assembling the features:

df = assemble_vectors(dt_df, features_list, target_variable_name)

In [19]:
df.printSchema()

root
 |-- y_index: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- age: integer (nullable = true)
 |-- balance: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)



In [20]:
# creating the models:

dt_clf1 = DecisionTreeClassifier(featuresCol='features', labelCol='y_index', impurity='gini')  # gini impurity 
dt_clf2 = DecisionTreeClassifier(featuresCol='features', labelCol='y_index', impurity='entropy')  # entropy impurity

In [21]:
# Fitting the models:

clf1_model = dt_clf1.fit(df)
clf2_model = dt_clf2.fit(df)

In [22]:
# looking at the feature importances:

print(f"Feature importance of GINI:\n {clf1_model.featureImportances}")
print(f"Feature importance of ENTROPY:\n {clf2_model.featureImportances}")

Feature importance of GINI:
 (7,[0,1,2,3,4,5],[0.0009151360672579515,0.0021920314153642787,0.022546846747201893,0.7303842591766411,0.0013091781991514313,0.2426525483943835])
Feature importance of ENTROPY:
 (7,[1,2,3,4,5],[0.0005994387014215233,0.008899987867155928,0.7160047618305356,0.0006113261278394541,0.2738844854730475])


In [23]:
# Looking at the rules:
clf1_model.toDebugString

'DecisionTreeClassificationModel: uid=DecisionTreeClassifier_cb75a67f1090, depth=5, numNodes=35, numClasses=2, numFeatures=7\n  If (feature 3 <= 547.5)\n   If (feature 5 <= 8.5)\n    Predict: 0.0\n   Else (feature 5 > 8.5)\n    If (feature 3 <= 166.5)\n     Predict: 0.0\n    Else (feature 3 > 166.5)\n     If (feature 5 <= 185.5)\n      If (feature 2 <= 15.5)\n       Predict: 1.0\n      Else (feature 2 > 15.5)\n       Predict: 0.0\n     Else (feature 5 > 185.5)\n      If (feature 5 <= 370.5)\n       Predict: 0.0\n      Else (feature 5 > 370.5)\n       Predict: 1.0\n  Else (feature 3 > 547.5)\n   If (feature 3 <= 836.5)\n    If (feature 5 <= 0.0)\n     If (feature 3 <= 664.5)\n      Predict: 0.0\n     Else (feature 3 > 664.5)\n      If (feature 2 <= 29.5)\n       Predict: 0.0\n      Else (feature 2 > 29.5)\n       Predict: 1.0\n    Else (feature 5 > 0.0)\n     If (feature 2 <= 21.5)\n      If (feature 2 <= 15.5)\n       Predict: 1.0\n      Else (feature 2 > 15.5)\n       Predict: 0.0\n  

In [24]:
clf2_model.toDebugString

'DecisionTreeClassificationModel: uid=DecisionTreeClassifier_df7275e510c1, depth=5, numNodes=31, numClasses=2, numFeatures=7\n  If (feature 3 <= 412.5)\n   If (feature 5 <= 18.0)\n    Predict: 0.0\n   Else (feature 5 > 18.0)\n    If (feature 3 <= 157.5)\n     Predict: 0.0\n    Else (feature 3 > 157.5)\n     If (feature 5 <= 185.5)\n      If (feature 2 <= 15.5)\n       Predict: 1.0\n      Else (feature 2 > 15.5)\n       Predict: 0.0\n     Else (feature 5 > 185.5)\n      If (feature 5 <= 370.5)\n       Predict: 0.0\n      Else (feature 5 > 370.5)\n       Predict: 1.0\n  Else (feature 3 > 412.5)\n   If (feature 3 <= 664.5)\n    If (feature 5 <= 8.5)\n     Predict: 0.0\n    Else (feature 5 > 8.5)\n     If (feature 5 <= 185.5)\n      Predict: 1.0\n     Else (feature 5 > 185.5)\n      If (feature 5 <= 370.5)\n       Predict: 0.0\n      Else (feature 5 > 370.5)\n       Predict: 1.0\n   Else (feature 3 > 664.5)\n    If (feature 3 <= 836.5)\n     If (feature 5 <= 0.0)\n      If (feature 2 <= 29