In [11]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.classification import RandomForestClassifier

import pandas as pd
import numpy as np

In [2]:
spark = SparkSession.builder.appName('RandomForest').getOrCreate()

In [3]:
spark

In [4]:
filename = 'data/bank-full.csv'

In [5]:
data = spark.read.csv(filename, header=True, inferSchema=True, sep=';')

In [6]:
data.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- y: string (nullable = true)



In [8]:
# assembling the feature:

def assemble_vector(df, features_list, target):
    
    assembler = VectorAssembler(inputCols=features_list, outputCol='features')
    string_indexer = StringIndexer(inputCol=target, outputCol=target+'_index')
    stages = [assembler, string_indexer]
    
    selected_cols = [target+'_index', 'features'] + features_list
    
    pipeline = Pipeline(stages=stages)
    assemble_model = pipeline.fit(df)
    df = assemble_model.transform(df).select(selected_cols)
    return df

In [10]:
# features and target:
target_variable_name = 'y'
rf_df = data.select(['age', 'balance', 'day', 'duration',
                          'campaign', 'pdays', 'previous', 'y'])

features_list = rf_df.columns
features_list.remove(target_variable_name)

In [15]:
# applying the function:
df = assemble_vector(data, features_list, target_variable_name)

In [16]:
# building the instance 
rf = RandomForestClassifier(featuresCol='features', labelCol='y_index')

In [18]:
# training the model on the data:
rf_model = rf.fit(df)

In [19]:
# looking at the feature importance:
print(f"Feature Importance:\n {rf_model.featureImportances}")

Feature Importance:
 (7,[0,1,2,3,4,5,6],[0.08584434644644709,0.011033366860813895,0.015559015434628657,0.6793873016945361,0.004066526221407603,0.14272552678180514,0.06138391656036157])


In [20]:
# looking at the tree decisions:
print(rf_model.toDebugString)

RandomForestClassificationModel: uid=RandomForestClassifier_278ff0c65c30, numTrees=20, numClasses=2, numFeatures=7
  Tree 0 (weight 1.0):
    If (feature 3 <= 435.5)
     If (feature 3 <= 207.5)
      Predict: 0.0
     Else (feature 3 > 207.5)
      If (feature 5 <= 9.0)
       If (feature 0 <= 60.5)
        Predict: 0.0
       Else (feature 0 > 60.5)
        If (feature 3 <= 252.5)
         Predict: 0.0
        Else (feature 3 > 252.5)
         Predict: 1.0
      Else (feature 5 > 9.0)
       If (feature 0 <= 58.5)
        Predict: 0.0
       Else (feature 0 > 58.5)
        If (feature 5 <= 192.5)
         Predict: 1.0
        Else (feature 5 > 192.5)
         Predict: 0.0
    Else (feature 3 > 435.5)
     If (feature 3 <= 692.5)
      If (feature 6 <= 0.5)
       If (feature 0 <= 60.5)
        Predict: 0.0
       Else (feature 0 > 60.5)
        Predict: 1.0
      Else (feature 6 > 0.5)
       If (feature 0 <= 48.5)
        If (feature 5 <= 98.5)
         Predict: 1.0
        Else (fe