# Spark
## Decision trees
### Consulting project

In [2]:
from tqdm.notebook import tqdm

import numpy as np
import pandas as pd

import findspark

from pyspark import SparkConf
from pyspark import SparkContext

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

from pyspark.ml import Pipeline

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StringIndexer

from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

from pyspark.mllib.evaluation import BinaryClassificationMetrics

from matplotlib import pyplot as plt
import seaborn as sns

import optuna

In [3]:
# Presumably locates the local Spark installation for this kernel.
# NOTE(review): the pyspark imports in the cell above have already run by this
# point, so confirm whether findspark.init() is still needed here or should
# move before those imports.
findspark.init()
# The return value is neither stored nor displayed (it is not the cell's last expression).
findspark.find()

# Render matplotlib figures inline, then apply the seaborn theme and context.
%matplotlib inline
sns.set_theme(style='darkgrid')
sns.set_context("notebook", rc={"lines.linewidth": 2.5})

In [4]:
# Seed shared by the train/test random splits and the GBTClassifier models below.
random_seed = 0

In [5]:
# Spark configuration for a single local master; settings are applied in the
# order listed.
conf = (
    SparkConf()
    .setAppName("MyApp")
    .setMaster("local")
    .setAll([
        ("spark.executor.memory", "8g"),
        ("spark.driver.maxResultSize", "8g"),
        ("spark.memory.fraction", "0.6"),
        ("spark.memory.storageFraction", "0.5"),
        ("spark.sql.shuffle.partitions", "5"),
        ("spark.memory.offHeap.enabled", "false"),
        ("spark.reducer.maxSizeInFlight", "96m"),
        ("spark.shuffle.file.buffer", "256k"),
        ("spark.sql.debug.maxToStringFields", "100"),
        ("spark.sql.autoBroadcastJoinThreshold", "-1"),
    ])
)

In [6]:
%%capture

# Create (or reuse) the SparkSession built from `conf`.
# %%capture must remain the first line of the cell; it captures the cell's output.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

your 131072x1 screen size is bogus. expect trouble
23/11/07 22:16:24 WARN Utils: Your hostname, Diego-desktop resolves to a loopback address: 127.0.1.1; using 172.27.76.109 instead (on interface eth0)
23/11/07 22:16:24 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/07 22:16:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [7]:
# Read the CSV with a header row, letting Spark infer the column types.
df = spark.read.csv(
    "../data/dog_food.csv",
    header=True,
    inferSchema=True,
)

                                                                                

In [8]:
# Display how many partitions back the loaded DataFrame.
df.rdd.getNumPartitions()

1

In [9]:
# Preview the first 5 rows of the raw data.
df.show(5)

+---+---+----+---+-------+
|  A|  B|   C|  D|Spoiled|
+---+---+----+---+-------+
|  4|  2|12.0|  3|    1.0|
|  5|  6|12.0|  7|    1.0|
|  6|  2|13.0|  6|    1.0|
|  4|  2|12.0|  1|    1.0|
|  4|  2|12.0|  3|    1.0|
+---+---+----+---+-------+
only showing top 5 rows



In [10]:
# Input columns assembled into the feature vector.
features = ['A', 'B', 'C', 'D']

# Label column to predict.
target = 'Spoiled'

In [11]:
# Pack the feature columns into a single vector column named 'features',
# keeping only the label and that vector.
assembler = VectorAssembler(outputCol="features", inputCols=features)
data_table = assembler.transform(df).select(target, "features")
data_table.show(5)

+-------+------------------+
|Spoiled|          features|
+-------+------------------+
|    1.0|[4.0,2.0,12.0,3.0]|
|    1.0|[5.0,6.0,12.0,7.0]|
|    1.0|[6.0,2.0,13.0,6.0]|
|    1.0|[4.0,2.0,12.0,1.0]|
|    1.0|[4.0,2.0,12.0,3.0]|
+-------+------------------+
only showing top 5 rows



In [12]:
# Stratified 70/30 split: each class is split separately with the same seed,
# then the per-class pieces are recombined into train and test sets.
df_zeros = data_table.where(F.col(target) == 0)
df_ones = data_table.where(F.col(target) == 1)

train_zeros, test_zeros = df_zeros.randomSplit([0.7, 0.3], seed=random_seed)
train_ones, test_ones = df_ones.randomSplit([0.7, 0.3], seed=random_seed)

train, test = train_zeros.union(train_ones), test_zeros.union(test_ones)

In [13]:
# Evaluator comparing the 'prediction' column against the label column; the
# metric is chosen at evaluate() time.
evaluatorMulti = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol=target,
)

In [14]:
def objective(trial):
    """Optuna objective: train a GBTClassifier with sampled hyperparameters.

    Hyperparameters are drawn from ``trial`` in this order: maxDepth, maxBins,
    subsamplingRate, maxIter, stepSize. The estimator is fitted on the
    module-level ``train`` DataFrame and scored on ``test``.

    Note: because the score is computed on ``test``, the hyperparameters the
    study selects are tuned against that same split.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Accuracy of the fitted model's predictions on ``test``.
    """
    # Keyword arguments are evaluated left to right, so the suggestion order
    # matches the order listed in the docstring.
    estimator = GBTClassifier(
        featuresCol='features',
        labelCol=target,
        seed=random_seed,
        maxDepth=trial.suggest_int('maxDepth', 2, 10),
        maxBins=trial.suggest_int('maxBins', 2, 10),
        subsamplingRate=trial.suggest_float('subsamplingRate', 0.1, 1.0, step=0.1),
        maxIter=trial.suggest_int('maxIter', 20, 100, step=10),
        stepSize=trial.suggest_float('stepSize', 0.1, 1.0, step=0.1),
    )

    predictions = estimator.fit(train).transform(test)

    return evaluatorMulti.evaluate(
        predictions,
        {evaluatorMulti.metricName: 'accuracy'},
    )

In [15]:
def callback(study, trial, threshold=0.98):
    """Stop the study early once a trial scores above ``threshold``.

    Intended for the ``callbacks`` argument of ``study.optimize``, which calls
    it as ``callback(study, trial)`` after each trial.

    Args:
        study: Object exposing ``stop()``; called when the threshold is beaten.
        trial: Finished trial exposing ``value`` (may be ``None``).
        threshold: Score that must be strictly exceeded to stop the study.
            Defaults to 0.98, the value previously hard-coded here.
    """
    # A trial that did not complete (e.g. failed or pruned) has no value;
    # comparing None with a float would raise TypeError, so skip it.
    if trial.value is not None and trial.value > threshold:
        study.stop()

In [16]:
# Seed the sampler so the hyperparameter search is reproducible on
# Restart & Run All; without it each run explores a different sequence of
# trials even though the models and splits are seeded with `random_seed`.
study = optuna.create_study(
    study_name='study_GBTClassifier',
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=random_seed),
)
# Run up to 30 trials; `callback` may stop the study earlier.
study.optimize(objective, callbacks=[callback], n_trials=30, show_progress_bar=False)

[I 2023-11-07 22:16:43,274] A new study created in memory with name: study_GBTClassifier
[I 2023-11-07 22:17:10,524] Trial 0 finished with value: 0.9642857142857143 and parameters: {'maxDepth': 7, 'maxBins': 10, 'subsamplingRate': 0.4, 'maxIter': 40, 'stepSize': 0.5}. Best is trial 0 with value: 0.9642857142857143.
[I 2023-11-07 22:17:28,016] Trial 1 finished with value: 0.9714285714285714 and parameters: {'maxDepth': 10, 'maxBins': 4, 'subsamplingRate': 0.1, 'maxIter': 40, 'stepSize': 0.1}. Best is trial 1 with value: 0.9714285714285714.
[I 2023-11-07 22:17:46,245] Trial 2 finished with value: 0.9642857142857143 and parameters: {'maxDepth': 7, 'maxBins': 9, 'subsamplingRate': 0.5, 'maxIter': 40, 'stepSize': 0.30000000000000004}. Best is trial 1 with value: 0.9714285714285714.
[I 2023-11-07 22:18:00,059] Trial 3 finished with value: 0.9714285714285714 and parameters: {'maxDepth': 2, 'maxBins': 4, 'subsamplingRate': 1.0, 'maxIter': 80, 'stepSize': 0.30000000000000004}. Best is trial 1 w

In [17]:
# Report how many trials the study recorded.
print('Number of finished trials:', len(study.trials))

Number of finished trials: 30


In [18]:
# Show the hyperparameters of the best-scoring trial.
print('Best trial:', study.best_trial.params)

Best trial: {'maxDepth': 10, 'maxBins': 4, 'subsamplingRate': 0.1, 'maxIter': 40, 'stepSize': 0.1}


In [19]:
# Refit a GBT classifier with the best hyperparameters found by the study,
# this time on the full assembled dataset (`data_table`).
model = GBTClassifier(
    **study.best_trial.params,
    seed=random_seed,
    labelCol=target,
    featuresCol="features",
).fit(data_table)

In [20]:
# Display the fitted model's per-feature importances (indices follow the order of `features`).
model.featureImportances

SparseVector(4, {0: 0.1752, 1: 0.2135, 2: 0.4408, 3: 0.1705})