# Spark
## Decision trees
### Code along

In [63]:
from tqdm import tqdm

import numpy as np
import pandas as pd

import findspark

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

from pyspark.ml import Pipeline

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StringIndexer

from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

from pyspark.mllib.evaluation import BinaryClassificationMetrics

from matplotlib import pyplot as plt
import seaborn as sns

In [64]:
# Initialise findspark before the SparkSession is created further down.
findspark.init()
# NOTE(review): the value returned by findspark.find() is discarded here
# (it is not the last expression of the cell, so nothing is displayed).
findspark.find()

# Render matplotlib figures inline and apply the seaborn look used by the notebook.
%matplotlib inline
sns.set_theme(style='darkgrid')
sns.set_context("notebook", rc={"lines.linewidth": 2.5})

In [65]:
# Seed passed to the randomSplit calls below so the train/test splits are repeatable.
random_seed = 0

In [66]:
%%capture

# Create (or reuse) the SparkSession for this notebook; the %%capture cell magic
# above hides whatever this cell prints.
spark = SparkSession.builder.appName('decision_trees_code_along').getOrCreate()

In [67]:
# Load the College data set; the first row holds the column names and the
# column types are inferred from the values.
college_csv_path = '../data/College.csv'
df = spark.read.csv(
    college_csv_path,
    header=True,
    inferSchema=True,
)
df.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



In [68]:
# Preview the first five rows of the raw data.
df.show(5)

+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+
|              School|Private|Apps|Accept|Enroll|Top10perc|Top25perc|F_Undergrad|P_Undergrad|Outstate|Room_Board|Books|Personal|PhD|Terminal|S_F_Ratio|perc_alumni|Expend|Grad_Rate|
+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+
|Abilene Christian...|    Yes|1660|  1232|   721|       23|       52|       2885|        537|    7440|      3300|  450|    2200| 70|      78|     18.1|         12|  7041|       60|
|  Adelphi University|    Yes|2186|  1924|   512|       16|       29|       2683|       1227|   12280|      6450|  750|    1500| 29|      30|     12.2|         16| 10527|       56|
|      Adrian College|    Yes|1428|  1097|   336|       22|       50|       1036|         99|  

In [69]:
# Inspect the first row as a {column name: value} dict (easier to read than the wide table).
df.head(1)[0].asDict()

{'School': 'Abilene Christian University',
 'Private': 'Yes',
 'Apps': 1660,
 'Accept': 1232,
 'Enroll': 721,
 'Top10perc': 23,
 'Top25perc': 52,
 'F_Undergrad': 2885,
 'P_Undergrad': 537,
 'Outstate': 7440,
 'Room_Board': 3300,
 'Books': 450,
 'Personal': 2200,
 'PhD': 70,
 'Terminal': 78,
 'S_F_Ratio': 18.1,
 'perc_alumni': 12,
 'Expend': 7041,
 'Grad_Rate': 60}

In [70]:
# List every column name of the raw frame.
df.columns

['School',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']

In [71]:
# Label column: the raw string flag (e.g. 'Yes') as read from the CSV.
target = 'Private'

# Predictor columns fed to the models. 'School' (the institution name) and
# 'Private' (the label itself) are deliberately left out.
features = [
    'Apps', 'Accept', 'Enroll',
    'Top10perc', 'Top25perc',
    'F_Undergrad', 'P_Undergrad',
    'Outstate', 'Room_Board', 'Books', 'Personal',
    'PhD', 'Terminal', 'S_F_Ratio',
    'perc_alumni', 'Expend', 'Grad_Rate',
]

In [72]:
# Combine the selected predictor columns into a single vector column named 'features'.
assembler = VectorAssembler(inputCols=features, outputCol='features')

In [73]:
# Append the assembled 'features' vector column to the raw frame.
df_new = assembler.transform(df)

In [74]:
# Preview the frame, now carrying the extra 'features' column.
df_new.show(5)

+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+--------------------+
|              School|Private|Apps|Accept|Enroll|Top10perc|Top25perc|F_Undergrad|P_Undergrad|Outstate|Room_Board|Books|Personal|PhD|Terminal|S_F_Ratio|perc_alumni|Expend|Grad_Rate|            features|
+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+--------------------+
|Abilene Christian...|    Yes|1660|  1232|   721|       23|       52|       2885|        537|    7440|      3300|  450|    2200| 70|      78|     18.1|         12|  7041|       60|[1660.0,1232.0,72...|
|  Adelphi University|    Yes|2186|  1924|   512|       16|       29|       2683|       1227|   12280|      6450|  750|    1500| 29|      30|     12.2|         16| 10527|       56|[2186.0,1924

In [75]:
# Encode the string label 'Private' as a numeric column 'Private_Index'.
indexer = StringIndexer(inputCol='Private', outputCol='Private_Index')

In [76]:
# Fit the StringIndexer on the label column and append the numeric label.
# An index column left behind by a previous run of this cell is dropped first:
# the indexer refuses to write to an output column that already exists, so
# without this the cell fails when executed a second time. drop() is a no-op
# when the column is absent, so the first run is unaffected.
label_index_col = indexer.getOutputCol()
df_unindexed = df_new.drop(label_index_col)
df_new = indexer.fit(df_unindexed).transform(df_unindexed)
df_new.show(5)

+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+--------------------+-------------+
|              School|Private|Apps|Accept|Enroll|Top10perc|Top25perc|F_Undergrad|P_Undergrad|Outstate|Room_Board|Books|Personal|PhD|Terminal|S_F_Ratio|perc_alumni|Expend|Grad_Rate|            features|Private_Index|
+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+--------------------+-------------+
|Abilene Christian...|    Yes|1660|  1232|   721|       23|       52|       2885|        537|    7440|      3300|  450|    2200| 70|      78|     18.1|         12|  7041|       60|[1660.0,1232.0,72...|          0.0|
|  Adelphi University|    Yes|2186|  1924|   512|       16|       29|       2683|       1227|   12280|      6450|  750|    1500| 29|    

In [78]:
# Keep only what the classifiers consume: the feature vector and the numeric label.
model_columns = ['features', 'Private_Index']
data_table = df_new.select(*model_columns)
data_table.show(5)

+--------------------+-------------+
|            features|Private_Index|
+--------------------+-------------+
|[1660.0,1232.0,72...|          0.0|
|[2186.0,1924.0,51...|          0.0|
|[1428.0,1097.0,33...|          0.0|
|[417.0,349.0,137....|          0.0|
|[193.0,146.0,55.0...|          0.0|
+--------------------+-------------+
only showing top 5 rows



In [115]:
# From here on the label is the numeric index column rather than the raw string.
target = 'Private_Index'

# Train/test proportions applied to each class separately.
split_weights = [0.7, 0.3]

# One sub-frame per class so each class is split with the same proportions.
df_zeros = data_table.where(data_table[target] == 0)
df_ones = data_table.where(data_table[target] == 1)

train_zeros, test_zeros = df_zeros.randomSplit(split_weights, seed=random_seed)
train_ones, test_ones = df_ones.randomSplit(split_weights, seed=random_seed)

# Recombine the per-class pieces into the final training and test sets.
train = train_zeros.union(train_ones)
test = test_zeros.union(test_ones)

In [137]:
# Plain 70/30 random split of the whole table (no per-class handling).
# NOTE(review): this reassigns `train` and `test`, replacing the per-class split
# built in the previous cell — confirm which of the two splits is intended.
train, test = data_table.randomSplit([0.7, 0.3], seed=random_seed)

In [138]:
# Display names and estimator classes, kept in matching order.
models_names = ['DecisionTreeClassifier', 'RandomForestClassifier', 'GBTClassifier']
models_list = [DecisionTreeClassifier, RandomForestClassifier, GBTClassifier]

# name -> {'model': unfitted estimator,
#          'fit':   model fitted on `train`,
#          'pred':  `test` with the model's prediction columns appended}
models = {}

for model_name, estimator_cls in zip(models_names, models_list):
    estimator = estimator_cls(
        labelCol=target,
        featuresCol='features',
        # Fix the seed so the randomised learners give the same model on
        # every Restart & Run All.
        seed=random_seed,
    )
    fitted = estimator.fit(train)

    models[model_name] = {
        'model': estimator,
        'fit': fitted,
        'pred': fitted.transform(test),
    }

In [139]:
# Evaluators that score model output against the `target` label column.
# The multiclass evaluator reads the 'prediction' column; the metric it reports
# is selected per call where it is used.
evaluatorMulti = MulticlassClassificationEvaluator(labelCol=target, predictionCol="prediction")
# No metric or score column is given here, so the evaluator's defaults apply
# (presumably area under the ROC curve on 'rawPrediction' — TODO confirm).
evaluatorBinary = BinaryClassificationEvaluator(labelCol=target)

In [140]:
# Score every classifier's test-set predictions and store the metrics next to
# the model under the 'accuracy' and 'AUC' keys.
for name in models_names:
    model_entry = models[name]
    model_pred = model_entry['pred']

    # The metric is overridden for this call only via the params dict.
    model_entry['accuracy'] = evaluatorMulti.evaluate(
        model_pred,
        {evaluatorMulti.metricName: 'accuracy'},
    )
    model_entry['AUC'] = evaluatorBinary.evaluate(model_pred)

In [141]:
# Display each classifier's estimator, fitted model, predictions frame and metrics.
models

{'DecisionTreeClassifier': {'model': DecisionTreeClassifier_b54a3641fb0a,
  'fit': DecisionTreeClassificationModel: uid=DecisionTreeClassifier_b54a3641fb0a, depth=5, numNodes=43, numClasses=2, numFeatures=17,
  'pred': DataFrame[features: vector, Private_Index: double, rawPrediction: vector, probability: vector, prediction: double],
  'accuracy': 0.8927038626609443,
  'AUC': 0.925989749027925},
 'RandomForestClassifier': {'model': RandomForestClassifier_662693befd2e,
  'fit': RandomForestClassificationModel: uid=RandomForestClassifier_662693befd2e, numTrees=20, numClasses=2, numFeatures=17,
  'pred': DataFrame[features: vector, Private_Index: double, rawPrediction: vector, probability: vector, prediction: double],
  'accuracy': 0.9399141630901288,
  'AUC': 0.9781724991162956},
 'GBTClassifier': {'model': GBTClassifier_94e8d0c129e8,
  'fit': GBTClassificationModel: uid = GBTClassifier_94e8d0c129e8, numTrees=20, numClasses=2, numFeatures=17,
  'pred': DataFrame[features: vector, Private_

In [130]:
# Show the metric name configured on the multiclass evaluator itself.
# NOTE(review): this cell's execution count (130) is lower than that of the cell
# defining evaluatorMulti (139), so the displayed value comes from an earlier
# run — re-run top to bottom to confirm it.
evaluatorMulti.getMetricName()

'f1'