# Spark
## Decision trees
### Documentation example

In [1]:
from tqdm import tqdm

import numpy as np
import pandas as pd

import findspark

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

from pyspark.ml import Pipeline

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StringIndexer

from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

from pyspark.mllib.evaluation import BinaryClassificationMetrics

from matplotlib import pyplot as plt
import seaborn as sns

In [2]:
findspark.init()
findspark.find()

%matplotlib inline
sns.set_theme(style='darkgrid')
sns.set_context("notebook", rc={"lines.linewidth": 2.5})

In [3]:
random_seed = 0

In [4]:
%%capture

spark = SparkSession.builder.appName('decision_trees_documentation_example').getOrCreate()

your 131072x1 screen size is bogus. expect trouble


23/11/07 09:43:30 WARN Utils: Your hostname, Diego-desktop resolves to a loopback address: 127.0.1.1; using 172.27.76.109 instead (on interface eth0)
23/11/07 09:43:30 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/07 09:43:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
df = spark.read.format('libsvm').load('../data/sample_libsvm_data.txt')
df.printSchema()

23/11/07 09:43:37 WARN LibSVMFileFormat: 'numFeatures' option not specified, determining the number of features by going though the input. If you know the number in advance, please specify it via 'numFeatures' option to avoid the extra scan.
[Stage 0:>                                                          (0 + 1) / 1]

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



                                                                                

In [6]:
df.show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
+-----+--------------------+
only showing top 5 rows



In [7]:
target = 'label'

df_zeros = df.filter(df[target] == 0)
df_ones = df.filter(df[target] == 1)

train_zeros, test_zeros = df_zeros.randomSplit([0.7, 0.3], seed=random_seed)
train_ones, test_ones = df_ones.randomSplit([0.7, 0.3], seed=random_seed)

train = train_zeros.union(train_ones)
test = test_zeros.union(test_ones)

In [8]:
DecisionTreeClassifier?

[0;31mInit signature:[0m
[0mDecisionTreeClassifier[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfeaturesCol[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;34m'features'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlabelCol[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;34m'label'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpredictionCol[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;34m'prediction'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mprobabilityCol[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;34m'probability'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrawPredictionCol[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;34m'rawPrediction'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmaxDepth[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m5[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmaxBins[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m32[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mminInstancesPerNode[0m[0;34m:[0m [0m

In [9]:
models_names = ['DecisionTreeClassifier', 'RandomForestClassifier', 'GBTClassifier']
models_list = [DecisionTreeClassifier, RandomForestClassifier, GBTClassifier]

models = {}

for i, model in enumerate(models_names):
    models[model] = {
        'model' : models_list[i](
            labelCol='label',
            featuresCol='features',
            seed=random_seed
        )
    }
    
    models[model] |= {'fit' : models[model]['model'].fit(train)}
    
    models[model] |= {'pred' : models[model]['fit'].transform(test)}

In [10]:
evaluatorMulti = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

In [11]:
for name in models_names:
    models[name] |= {
            'accuracy' : evaluatorMulti.evaluate(models[name]['pred'], {evaluatorMulti.metricName: 'accuracy'})
        }

In [12]:
print('Models accuracy:')
for name in models_names:
    print(f'\t{name} : {models[name]['accuracy']:.2f}')

Models accuracy:
	DecisionTreeClassifier : 0.97
	RandomForestClassifier : 1.00
	GBTClassifier : 0.97


In [13]:
models[models_names[1]]['fit'].featureImportances

SparseVector(692, {149: 0.0028, 206: 0.0208, 243: 0.0031, 262: 0.0418, 268: 0.0026, 271: 0.0026, 299: 0.005, 329: 0.0411, 346: 0.0028, 350: 0.008, 351: 0.1356, 378: 0.047, 379: 0.0419, 385: 0.0446, 399: 0.0399, 401: 0.0101, 406: 0.05, 412: 0.0114, 433: 0.0528, 434: 0.0925, 440: 0.0251, 443: 0.0089, 453: 0.0386, 455: 0.045, 483: 0.0409, 485: 0.0091, 490: 0.083, 495: 0.0292, 517: 0.0469, 518: 0.0015, 520: 0.0019, 544: 0.0028, 599: 0.0091, 628: 0.0016})