In [1]:
import os

import findspark

# Locate the Spark installation. Honour SPARK_HOME when it is set so the
# notebook is not tied to a single machine; otherwise fall back to the
# install path this notebook was originally written against.
# findspark.init() has to run before `import pyspark` so that the pyspark
# package from that installation is importable.
findspark.init(os.environ.get('SPARK_HOME', '/home/osboxes/spark-2.4.3-bin-hadoop2.7'))

import pyspark

In [2]:
# Folder that holds the data files used by this notebook.
_path_parts = ('/home', 'osboxes', 'CourseMaterial', 'Spark_for_Machine_Learning', 'Tree_Methods')
myPath = os.path.join(*_path_parts)
print(myPath)

# Switch into it so later cells can open the data files by bare file name.
os.chdir(myPath)

/home/osboxes/CourseMaterial/Spark_for_Machine_Learning/Tree_Methods


In [3]:
# List the current working directory to confirm the data files are present.
os.listdir()

['Tree Methods Code Along.ipynb',
 '.ipynb_checkpoints',
 'sample_libsvm_data.txt',
 'Tree_Methods_Consulting_Project.ipynb',
 'Tree_Methods_Consulting_Project_SOLUTION.ipynb',
 'College.csv',
 'dog_food.csv',
 'Tree_Methods_Doc_Example.ipynb']

In [4]:
# Spark session plus the ML classes used in the rest of the notebook.

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Application name under which the session is registered.
name = 'tree'

# getOrCreate() hands back an existing session when one is already running.
spark = (
    SparkSession.builder
    .appName(name)
    .getOrCreate()
)

In [5]:
# Read the LIBSVM-formatted sample file from the working directory into a DataFrame.
data = spark.read.load("sample_libsvm_data.txt", format="libsvm")

In [6]:
# The libsvm reader already yields a `label` column and a `features` vector
# column, which is the form the classifiers below are fitted on directly.
# Print the schema to confirm.

data.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [7]:
# Preview the loaded rows (show() prints only the top 20).
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [8]:
# Hold out roughly 30% of the rows for testing. randomSplit() is random, so
# pin its seed: without it the train/test partition -- and every prediction
# and accuracy figure computed from it -- changes on each run of the notebook.
SPLIT_SEED = 42
train, test = data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

In [9]:
# Build the three (still unfitted) tree-based classifiers.
# The learners make random choices during training, so give each one an
# explicit seed to keep the fitted models repeatable between runs.
MODEL_SEED = 42
# Number of trees grown by the random forest.
NUM_TREES = 100

dtc = DecisionTreeClassifier(seed=MODEL_SEED)
rfc = RandomForestClassifier(numTrees=NUM_TREES, seed=MODEL_SEED)
gbt = GBTClassifier(seed=MODEL_SEED)

In [10]:
# Train every classifier on the training split:
# decision tree, then random forest, then gradient-boosted trees.

dtc_model = dtc.fit(dataset=train)
rfc_model = rfc.fit(dataset=train)
gbt_model = gbt.fit(dataset=train)

In [14]:
# Score the held-out split with each fitted model, in the order
# decision tree, random forest, gradient-boosted trees.
dtc_preds, rfc_preds, gbt_preds = (
    fitted.transform(test) for fitted in (dtc_model, rfc_model, gbt_model)
)


In [12]:
# Preview the decision-tree output: the test rows plus the rawPrediction,
# probability and prediction columns.
dtc_preds.show()

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[121,122,123...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[127,128,129...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[150,151,152...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[153,154,155...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[153,154,155...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[154,155,156...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  1.0|(692,[123,124,125...|   [0.0,38.0]|  [0.0,1.0]|       1.0|
|  1.0|(692,[123,124,125...|   [0.0,38.0]|  [0.0,1.0]|       1.0|
|  1.0|(69

In [15]:
# Preview the gradient-boosted-trees output: the test rows plus the
# rawPrediction, probability and prediction columns.
gbt_preds.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[121,122,123...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[126,127,128...|[1.46423667347340...|[0.94923616086619...|       0.0|
|  0.0|(692,[126,127,128...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[126,127,128...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[126,127,128...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[127,128,129...|[1.50604268547122...|[0.95311712695076...|       0.0|
|  0.0|(692,[150,151,152...|[1.23673776344,-1...|[0.92226131008836...|       0.0|
|  0.0|(692,[153,154,155...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[153,154,155...|[1.23673776343999...|[0.92226131008836...|       0.0|
|  0.0|(692,[154

In [16]:
# Evaluator that scores a predictions DataFrame by accuracy.

acc_eval = MulticlassClassificationEvaluator().setMetricName('accuracy')

In [17]:
# Accuracy of the decision tree on the held-out test split; the trailing
# expression is what the notebook displays as the cell result.
print('DTC Accuracy:')
acc_eval.evaluate(dtc_preds)

DTC Accuracy:


1.0

In [18]:
# Accuracy of the random forest on the held-out test split; the trailing
# expression is what the notebook displays as the cell result.
print('RFC Accuracy:')
acc_eval.evaluate(rfc_preds)

RFC Accuracy:


1.0

In [19]:
# Accuracy of the gradient-boosted trees on the held-out test split; the
# trailing expression is what the notebook displays as the cell result.
print('GBT Accuracy:')
acc_eval.evaluate(gbt_preds)

GBT Accuracy:


1.0

In [20]:
# Variable importance: per-feature importance scores from the fitted random
# forest, displayed as a sparse vector over the 692 input features.

rfc_model.featureImportances

SparseVector(692, {99: 0.0005, 100: 0.0009, 129: 0.0006, 147: 0.0007, 148: 0.002, 158: 0.0009, 177: 0.0006, 187: 0.0003, 207: 0.0021, 208: 0.0019, 209: 0.0005, 210: 0.0008, 212: 0.0005, 214: 0.0004, 215: 0.0012, 216: 0.0276, 217: 0.0005, 234: 0.0057, 240: 0.0004, 242: 0.0029, 243: 0.0111, 244: 0.015, 245: 0.0006, 261: 0.0021, 262: 0.009, 263: 0.0001, 272: 0.0404, 273: 0.0084, 274: 0.0006, 290: 0.0075, 291: 0.0011, 292: 0.0005, 295: 0.0056, 300: 0.0317, 301: 0.0288, 302: 0.0006, 315: 0.0006, 318: 0.0007, 320: 0.0005, 322: 0.0109, 323: 0.0007, 326: 0.0006, 327: 0.0059, 328: 0.0079, 330: 0.007, 343: 0.0011, 345: 0.0003, 346: 0.0024, 349: 0.0005, 350: 0.0116, 351: 0.0118, 354: 0.0004, 355: 0.0006, 358: 0.0142, 369: 0.0005, 370: 0.0021, 372: 0.008, 373: 0.001, 375: 0.0017, 376: 0.0004, 377: 0.0059, 378: 0.0026, 379: 0.0336, 383: 0.001, 384: 0.0018, 385: 0.0333, 386: 0.0073, 400: 0.0072, 401: 0.0024, 403: 0.0011, 405: 0.0157, 406: 0.038, 407: 0.0121, 410: 0.0005, 411: 0.0006, 415: 0.0029, 41