# Gradient-boosted tree Classification Model

Set up spark context and SparkSession

In [27]:
try:
    sc.stop
except:
    pass

from pyspark import SparkContext , SparkConf
from pyspark.sql import SparkSession
sc = SparkContext.getOrCreate()
spark = SparkSession(sparkContext = sc)

Load dataset

In [28]:
df = spark.read.format('csv') \
            .options(header='true', inferschema='true') \
            .options(delimiter=';')\
            .load("./data/bank.csv",header=True);
df.drop('day','month','poutcome').show(5)


+---+-----------+-------+---------+-------+-------+-------+----+--------+--------+--------+-----+--------+---+
|age|        job|marital|education|default|balance|housing|loan| contact|duration|campaign|pdays|previous|  y|
+---+-----------+-------+---------+-------+-------+-------+----+--------+--------+--------+-----+--------+---+
| 30| unemployed|married|  primary|     no|   1787|     no|  no|cellular|      79|       1|   -1|       0| no|
| 33|   services|married|secondary|     no|   4789|    yes| yes|cellular|     220|       1|  339|       4| no|
| 35| management| single| tertiary|     no|   1350|    yes|  no|cellular|     185|       1|  330|       1| no|
| 30| management|married| tertiary|     no|   1476|    yes| yes| unknown|     199|       4|   -1|       0| no|
| 59|blue-collar|married|secondary|     no|      0|    yes|  no| unknown|     226|       1|   -1|       0| no|
+---+-----------+-------+---------+-------+-------+-------+----+--------+--------+--------+-----+--------+---+
o

In [29]:
df.cache()

df.printSchema()
df.select('y').show(5)

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- y: string (nullable = true)

+---+
|  y|
+---+
| no|
| no|
| no|
| no|
| no|
+---+
only showing top 5 rows



Deal with categorical data and Convert the data to dense vector

In [30]:
def get_dummy(df,categoricalCols,continuousCols,labelCol):

    from pyspark.ml import Pipeline
    from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
    from pyspark.sql.functions import col

    indexers = [ StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c))
                 for c in categoricalCols ]

    # default setting: dropLast=True
    encoders = [ OneHotEncoder(inputCol=indexer.getOutputCol(),
                 outputCol="{0}_encoded".format(indexer.getOutputCol()))
                 for indexer in indexers ]

    assembler = VectorAssembler(inputCols=[encoder.getOutputCol() for encoder in encoders]
                                + continuousCols, outputCol="features")

    pipeline = Pipeline(stages=indexers + encoders + [assembler])

    model=pipeline.fit(df)
    data = model.transform(df)

    data = data.withColumn('label',col(labelCol))

    return data.select('features','label')


In [31]:
catcols = ['job','marital','education','default',
           'housing','loan','contact','poutcome']

num_cols = ['balance', 'duration','campaign','pdays','previous',]
labelCol = 'y'

data = get_dummy(df,catcols,num_cols,labelCol)
data.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(29,[8,11,15,16,1...|   no|
|(29,[4,11,13,16,1...|   no|
|(29,[0,12,14,16,1...|   no|
|(29,[0,11,14,16,1...|   no|
|(29,[1,11,13,16,1...|   no|
+--------------------+-----+
only showing top 5 rows



Deal with Categorical Label and Variables

In [32]:
from pyspark.ml.feature import StringIndexer

df = spark.createDataFrame(
    [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
    ["id", "category"])

indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
indexed = indexer.fit(df).transform(df)
indexed=indexed.drop('category')
indexed = indexed.selectExpr("id as id", "categoryIndex as category")

indexed.show()

+---+--------+
| id|category|
+---+--------+
|  0|     0.0|
|  1|     2.0|
|  2|     1.0|
|  3|     0.0|
|  4|     0.0|
|  5|     1.0|
+---+--------+



In [33]:
from pyspark.ml.feature import StringIndexer
# Index labels, adding metadata to the label column
labelIndexer = StringIndexer(inputCol='label',
                             outputCol='indexedLabel').fit(data).transform(data)
labelIndexer=labelIndexer.drop('label')
# indexed=labelIndexer.drop('label').show()

data.show()

+--------------------+------------+
|            features|indexedLabel|
+--------------------+------------+
|(29,[8,11,15,16,1...|         0.0|
|(29,[4,11,13,16,1...|         0.0|
|(29,[0,12,14,16,1...|         0.0|
|(29,[0,11,14,16,1...|         0.0|
|(29,[1,11,13,16,1...|         0.0|
|(29,[0,12,14,16,1...|         0.0|
|(29,[6,11,14,16,1...|         0.0|
|(29,[2,11,13,16,1...|         0.0|
|(29,[7,11,14,16,1...|         0.0|
|(29,[4,11,15,16,1...|         0.0|
|(29,[4,11,13,16,1...|         0.0|
|(29,[3,11,13,16,1...|         0.0|
|(29,[2,11,14,16,1...|         0.0|
|(29,[10,12,13,16,...|         1.0|
|(29,[1,11,13,16,1...|         0.0|
|(29,[0,11,14,16,1...|         0.0|
|(29,[2,11,13,16,1...|         0.0|
|(29,[3,12,14,16,1...|         0.0|
|(29,[1,12,15,16,1...|         0.0|
|(29,[4,11,13,16,1...|         0.0|
+--------------------+------------+
only showing top 20 rows



In [36]:
data = data.selectExpr("features as features", "indexedLabel as label")
data.show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(29,[8,11,15,16,1...|  0.0|
|(29,[4,11,13,16,1...|  0.0|
|(29,[0,12,14,16,1...|  0.0|
|(29,[0,11,14,16,1...|  0.0|
|(29,[1,11,13,16,1...|  0.0|
|(29,[0,12,14,16,1...|  0.0|
|(29,[6,11,14,16,1...|  0.0|
|(29,[2,11,13,16,1...|  0.0|
|(29,[7,11,14,16,1...|  0.0|
|(29,[4,11,15,16,1...|  0.0|
|(29,[4,11,13,16,1...|  0.0|
|(29,[3,11,13,16,1...|  0.0|
|(29,[2,11,14,16,1...|  0.0|
|(29,[10,12,13,16,...|  1.0|
|(29,[1,11,13,16,1...|  0.0|
|(29,[0,11,14,16,1...|  0.0|
|(29,[2,11,13,16,1...|  0.0|
|(29,[3,12,14,16,1...|  0.0|
|(29,[1,12,15,16,1...|  0.0|
|(29,[4,11,13,16,1...|  0.0|
+--------------------+-----+
only showing top 20 rows



Split the data to training and test data sets

In [37]:
# Split the data into training and test sets (40% held out for testing)
(trainingData, testData) = data.randomSplit([0.6, 0.4])

trainingData.show(5,False)
testData.show(5,False)

+------------------------------------------------------------------------------------------------+-----+
|features                                                                                        |label|
+------------------------------------------------------------------------------------------------+-----+
|(29,[0,11,13,16,17,18,19,21,24,25,26,27],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-588.0,81.0,4.0,-1.0])|0.0  |
|(29,[0,11,13,16,17,18,19,21,24,25,26,27],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,117.0,635.0,1.0,-1.0])|0.0  |
|(29,[0,11,13,16,17,18,19,21,24,25,26,27],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,238.0,808.0,1.0,-1.0])|0.0  |
|(29,[0,11,13,16,17,18,19,21,24,25,26,27],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,407.0,145.0,2.0,-1.0])|0.0  |
|(29,[0,11,13,16,17,18,19,21,24,25,26,27],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,725.0,266.0,1.0,-1.0])|0.0  |
+------------------------------------------------------------------------------------------------+-----+
only showing top 5 rows

+-----------------------------

##  Binomial logistic regression

In [45]:
# Fit Logistic Regression Model
from pyspark.ml.feature import StringIndexer
from pyspark.ml.linalg import Vectors # !!!!caution: not from pyspark.mllib.linalg import Vectors
from pyspark.ml import Pipeline
from pyspark.ml.feature import IndexToString,StringIndexer, VectorIndexer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.classification import GBTClassifier

# Train a GBT model.
from pyspark.ml.classification import MultilayerPerceptronClassifier

layers = [4, 5, 4, 3]

# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)

Convert indexed labels back to original labels.


# Chain indexers and tree in a Pipeline

In [46]:
trainingData.show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(29,[0,11,13,16,1...|  0.0|
|(29,[0,11,13,16,1...|  0.0|
|(29,[0,11,13,16,1...|  0.0|
|(29,[0,11,13,16,1...|  0.0|
|(29,[0,11,13,16,1...|  0.0|
|(29,[0,11,13,16,1...|  0.0|
|(29,[0,11,13,16,1...|  0.0|
|(29,[0,11,13,16,1...|  0.0|
|(29,[0,11,13,16,1...|  0.0|
|(29,[0,11,13,16,1...|  0.0|
|(29,[0,11,13,16,1...|  1.0|
|(29,[0,11,13,16,1...|  0.0|
|(29,[0,11,13,16,1...|  0.0|
|(29,[0,11,13,16,1...|  0.0|
|(29,[0,11,13,16,1...|  0.0|
|(29,[0,11,13,16,1...|  0.0|
|(29,[0,11,13,16,1...|  0.0|
|(29,[0,11,13,16,1...|  0.0|
|(29,[0,11,13,16,1...|  0.0|
|(29,[0,11,13,16,1...|  0.0|
+--------------------+-----+
only showing top 20 rows



Train model.  This also runs the indexers.

In [47]:
model = trainer.fit(trainingData)

IllegalArgumentException: requirement failed: OneHotEncoderModel expected 3 categorical values for input column label, but the input column had metadata specifying 2 values.

Make predictions.

In [None]:
predictions = model.transform(testData)

Select example rows to display.

In [None]:
predictions.select("features","label","predictedLabel").show(5)

# Evaluation

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

In [None]:


# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

gbtModel = model.stages[2]
print(gbtModel)  # summary only

visualization

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import itertools

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
class_temp = predictions.select("label").groupBy("label")\
                        .count().sort('count', ascending=False).toPandas()
class_names = class_temp["label"].values.tolist()
class_names

In [None]:
from sklearn.metrics import confusion_matrix
y_true = predictions.select("label")
y_true = y_true.toPandas()

y_pred = predictions.select("predictedLabel")
y_pred = y_pred.toPandas()

cnf_matrix = confusion_matrix(y_true, y_pred,labels=class_names)
cnf_matrix

In [None]:
# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      title='Confusion matrix, without normalization')
plt.show()

In [None]:
# Plot normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                      title='Normalized confusion matrix')

plt.show()