In [1]:
from pyspark.sql import SparkSession
# Start (or attach to an already running) Spark session registered under the
# application name 'mytree'.
spark = (
    SparkSession.builder
    .appName('mytree')
    .getOrCreate()
)

In [2]:
from pyspark.ml import Pipeline

In [4]:
from pyspark.ml.classification import (RandomForestClassifier,
                                      GBTClassifier,
                                      DecisionTreeClassifier)

In [5]:
# Load the sample dataset with the LIBSVM reader. The path is relative, so the
# file is presumably expected next to the notebook's working directory.
data = (
    spark.read
    .format('libsvm')
    .load('sample_libsvm_data.txt')
)

In [6]:
# Preview the first 20 rows with long cell values truncated (these are the
# defaults of show(), written out here for the reader).
data.show(n=20, truncate=True)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [7]:
# Hold out roughly 30% of the rows for evaluation and train on the rest.
# The seed is fixed so that the split is repeatable across kernel restarts;
# without it every run trains and scores on a different partition and the
# reported metrics cannot be reproduced.
train_data, test_data = data.randomSplit([0.7, 0.3], seed=42)

In [8]:
# Single decision tree with otherwise default hyper-parameters. The column
# names are written out explicitly; they match the loaded frame's
# 'features' / 'label' columns and the estimator's own defaults.
dtc = DecisionTreeClassifier(featuresCol='features', labelCol='label')

In [10]:
# Ensemble estimators to compare against the single decision tree.
# Both take an explicit seed: tree ensembles use randomness while training
# (e.g. row/feature sampling), and without a pinned seed they fall back to a
# library-chosen default that is not guaranteed to be the same between
# interpreter sessions, which makes the results hard to reproduce.
rfc = RandomForestClassifier(numTrees=100, seed=42)  # forest of 100 trees
gbt = GBTClassifier(seed=42)                         # gradient-boosted trees, defaults otherwise

In [11]:
# Fit every estimator on the same training partition, in the order
# decision tree -> random forest -> gradient-boosted trees.
dtc_model, rfc_model, gbt_model = [
    estimator.fit(train_data) for estimator in (dtc, rfc, gbt)
]

In [12]:
# Score the held-out partition with each fitted model, keeping the same
# decision tree -> random forest -> gradient-boosted trees order.
dtc_preds, rfc_preds, gbt_preds = [
    fitted.transform(test_data)
    for fitted in (dtc_model, rfc_model, gbt_model)
]

In [13]:
# Inspect the decision tree's scored rows: the original label/features plus
# the rawPrediction, probability and prediction columns (first 20 rows,
# truncated cells -- the defaults of show(), written out explicitly).
dtc_preds.show(n=20, truncate=True)

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[124,125,126...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[125,126,127...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[127,128,129...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[129,130,131...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[152,153,154...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[154,155,156...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[155,156,180...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  1.0|(692,[119,120,121...|   [0.0,38.0]|  [0.0,1.0]|       1.0|
|  1.0|(692,[123,124,125...|   [0.0,38.0]|  [0.0,1.0]|       1.0|
|  1.0|(69

In [14]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [15]:
# Accuracy evaluator reused for each model's predictions. The column names are
# spelled out (they equal the library defaults) so it is clear which two
# columns are being compared.
acc_eval = MulticlassClassificationEvaluator(
    predictionCol='prediction',
    labelCol='label',
    metricName='accuracy',
)

In [16]:
# Print a header, then leave the score as the cell's final expression so the
# notebook echoes it as the cell result.
print('DTC ACCURACY')
acc_eval.evaluate(dataset=dtc_preds)

DTC ACCURACY


1.0

In [17]:
# Print a header, then leave the random forest's score as the cell's final
# expression so the notebook echoes it as the cell result.
print('RFC ACCURACY')
acc_eval.evaluate(dataset=rfc_preds)

RFC ACCURACY


1.0

In [18]:
# Print a header, then leave the gradient-boosted trees' score as the cell's
# final expression so the notebook echoes it as the cell result.
print('GBT ACCURACY')
acc_eval.evaluate(dataset=gbt_preds)

GBT ACCURACY


1.0

In [19]:
# Echo the fitted random forest's feature importances. The recorded output is
# a SparseVector of size 692 mapping feature index -> importance weight; only
# the indices listed there carry a non-zero weight.
rfc_model.featureImportances

SparseVector(692, {98: 0.0006, 99: 0.0011, 158: 0.0004, 182: 0.0036, 185: 0.0004, 186: 0.0004, 203: 0.0005, 205: 0.0014, 206: 0.0022, 207: 0.0087, 214: 0.0011, 215: 0.005, 230: 0.0006, 232: 0.0035, 234: 0.0018, 238: 0.0001, 239: 0.0007, 242: 0.0002, 243: 0.0061, 260: 0.0023, 262: 0.0322, 264: 0.0003, 267: 0.0005, 272: 0.0011, 273: 0.0005, 287: 0.0039, 289: 0.0145, 290: 0.0088, 294: 0.001, 295: 0.0021, 297: 0.0006, 298: 0.0011, 299: 0.0083, 300: 0.0015, 301: 0.0017, 302: 0.0095, 313: 0.0009, 317: 0.0003, 318: 0.0005, 322: 0.0002, 323: 0.0011, 327: 0.001, 328: 0.0044, 329: 0.0003, 343: 0.0006, 344: 0.0005, 346: 0.0091, 347: 0.0005, 350: 0.0338, 351: 0.0407, 352: 0.0039, 354: 0.0005, 356: 0.0017, 357: 0.0026, 358: 0.0154, 369: 0.0033, 370: 0.0006, 373: 0.0075, 375: 0.0005, 376: 0.0012, 377: 0.0304, 378: 0.0026, 379: 0.0262, 381: 0.0007, 385: 0.0068, 386: 0.0091, 397: 0.0006, 398: 0.0005, 400: 0.0121, 401: 0.0032, 405: 0.039, 406: 0.0575, 407: 0.0383, 415: 0.0009, 426: 0.0012, 427: 0.0005,