In [1]:
import shutil

from pyspark.sql import SparkSession
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils
# Build (or fetch the already-running) SparkSession on a local[2] master,
# registered under the application name "test".
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("test")
    .getOrCreate()
)

# SparkContext handle; the RDD-based MLlib calls below take it as an argument.
sc = spark.sparkContext

In [4]:
# Load and parse the data file into an RDD of LabeledPoint.
data = MLUtils.loadLibSVMFile(sc, '../data/sample_libsvm_data.txt')
# Split the data into training and test sets (30% held out for testing).
# A fixed seed keeps the partition -- and every metric computed from it --
# identical across kernel restarts.
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=42)



In [5]:
# Row counts of the training and test splits, displayed as a (train, test) tuple.
trainingData.count(), testData.count()

(71, 29)

In [7]:
# Show the Python class of the loaded dataset object.
type(data)

pyspark.rdd.PipelinedRDD

In [8]:
# Peek at the first five records of the loaded dataset.
data.take(5)

[LabeledPoint(0.0, (692,[127,128,129,130,131,154,155,156,157,158,159,181,182,183,184,185,186,187,188,189,207,208,209,210,211,212,213,214,215,216,217,235,236,237,238,239,240,241,242,243,244,245,262,263,264,265,266,267,268,269,270,271,272,273,289,290,291,292,293,294,295,296,297,300,301,302,316,317,318,319,320,321,328,329,330,343,344,345,346,347,348,349,356,357,358,371,372,373,374,384,385,386,399,400,401,412,413,414,426,427,428,429,440,441,442,454,455,456,457,466,467,468,469,470,482,483,484,493,494,495,496,497,510,511,512,520,521,522,523,538,539,540,547,548,549,550,566,567,568,569,570,571,572,573,574,575,576,577,578,594,595,596,597,598,599,600,601,602,603,604,622,623,624,625,626,627,628,629,630,651,652,653,654,655,656,657],[51.0,159.0,253.0,159.0,50.0,48.0,238.0,252.0,252.0,252.0,237.0,54.0,227.0,253.0,252.0,239.0,233.0,252.0,57.0,6.0,10.0,60.0,224.0,252.0,253.0,252.0,202.0,84.0,252.0,253.0,122.0,163.0,252.0,252.0,252.0,253.0,252.0,252.0,96.0,189.0,253.0,167.0,51.0,238.0,253.0,253.0,190.0

In [9]:
# Convert the RDD to a DataFrame and print its leading rows as a text table.
data.toDF().show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(692,[127,128,129...|  0.0|
|(692,[158,159,160...|  1.0|
|(692,[124,125,126...|  1.0|
|(692,[152,153,154...|  1.0|
|(692,[151,152,153...|  1.0|
|(692,[129,130,131...|  0.0|
|(692,[158,159,160...|  1.0|
|(692,[99,100,101,...|  1.0|
|(692,[154,155,156...|  0.0|
|(692,[127,128,129...|  0.0|
|(692,[154,155,156...|  1.0|
|(692,[153,154,155...|  0.0|
|(692,[151,152,153...|  0.0|
|(692,[129,130,131...|  1.0|
|(692,[154,155,156...|  0.0|
|(692,[150,151,152...|  1.0|
|(692,[124,125,126...|  0.0|
|(692,[152,153,154...|  0.0|
|(692,[97,98,99,12...|  1.0|
|(692,[124,125,126...|  1.0|
+--------------------+-----+
only showing top 20 rows



In [None]:
# Count the rows in each split again (same expression as the earlier count cell).
trainingData.count(), testData.count()

In [28]:

# Fit a decision-tree classifier on the training split.
# Passing an empty categoricalFeaturesInfo dict declares every feature continuous.
model = DecisionTree.trainClassifier(
    trainingData,
    numClasses=2,
    categoricalFeaturesInfo={},
    impurity='gini',
    maxDepth=5,
    maxBins=10,
)



In [29]:
# Score the held-out feature vectors, pair each true label with its prediction,
# and report the share of mismatching pairs as the test error.
predictions = model.predict(testData.map(lambda point: point.features))
labelsAndPredictions = testData.map(lambda point: point.label).zip(predictions)

# Pairs are (label, prediction); keep only the ones that disagree.
misclassified = labelsAndPredictions.filter(lambda pair: pair[0] != pair[1])
testErr = misclassified.count() / float(testData.count())

print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())


Test Error = 0.06896551724137931
Learned classification tree model:
DecisionTreeModel classifier of depth 1 with 3 nodes
  If (feature 406 <= 0.0)
   Predict: 0.0
  Else (feature 406 > 0.0)
   Predict: 1.0



In [31]:

# Save and load model.
# model.save() raises if the target path already exists, which made this cell
# fail on every re-run. Clear any copy left by a previous run first so the
# notebook survives Restart & Run All.
# NOTE(review): rmtree acts on the local filesystem; this assumes the relative
# path is resolved there by Spark as well -- confirm if a non-local default
# filesystem is configured.
MODEL_PATH = "target/tmp/myDecisionTreeClassificationModel_1"
shutil.rmtree(MODEL_PATH, ignore_errors=True)
model.save(sc, MODEL_PATH)
sameModel = DecisionTreeModel.load(sc, MODEL_PATH)

In [32]:
# Predict on the test-set feature vectors with the model reloaded from disk.
predictions_from_sameModel = sameModel.predict(testData.map(lambda x: x.features))

In [33]:
# Pair each true test label with the reloaded model's prediction: (label, prediction).
labelsAndPredictions_from_sameModel = testData.map(lambda lp: lp.label).zip(predictions_from_sameModel)

In [34]:
# Recompute the test error from the reloaded model's predictions and print the
# reloaded model's tree, so this cell checks `sameModel` rather than echoing
# the original in-memory `model`.
testErr = labelsAndPredictions_from_sameModel.filter(
    lambda lp: lp[0] != lp[1]).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(sameModel.toDebugString())

Test Error = 0.06896551724137931
Learned classification tree model:
DecisionTreeModel classifier of depth 1 with 3 nodes
  If (feature 406 <= 0.0)
   Predict: 0.0
  Else (feature 406 > 0.0)
   Predict: 1.0



In [35]:
# Print the first five predictions of the original model, then of the reloaded
# one, for a side-by-side look.
for prediction_rdd in (predictions, predictions_from_sameModel):
    print(prediction_rdd.take(5))

[1.0, 1.0, 0.0, 1.0, 0.0]
[1.0, 1.0, 0.0, 1.0, 0.0]


In [36]:
# Collect both prediction RDDs to the driver and compare the full lists element-wise.
predictions.collect() == predictions_from_sameModel.collect()

True

## Random Forest

In [37]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [38]:
# Load and parse the data file, converting it to a DataFrame.
# NOTE: this rebinds `data`, `trainingData` and `testData`, which held RDDs in
# the decision-tree cells above; re-run those cells from the top before
# reusing them.
data = spark.read.format("libsvm").load("../data/sample_libsvm_data.txt")

# Split the data into training and test sets (30% held out for testing).
# A fixed seed keeps the split reproducible across runs.
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=42)

In [39]:
# Display the training DataFrame's repr (its column names and types).
trainingData

DataFrame[label: double, features: vector]

In [48]:
# Train a RandomForest model (3 trees) on the training split.
# An explicit seed makes training repeatable from run to run.
# Note that `rf` holds the fitted model returned by .fit(), not the estimator.
rf = RandomForestClassifier(
    labelCol="label", featuresCol="features", numTrees=3, seed=42
).fit(trainingData)

In [49]:
# Make predictions on the held-out rows with the fitted forest.
# This rebinds `predictions`, which held the decision-tree prediction RDD above.
predictions = rf.transform(testData)

In [50]:
# Display the first five rows of the prediction DataFrame.
predictions.show(5)

+-----+--------------------+-------------+--------------------+----------+
|label|            features|rawPrediction|         probability|prediction|
+-----+--------------------+-------------+--------------------+----------+
|  0.0|(692,[100,101,102...|    [1.0,2.0]|[0.33333333333333...|       1.0|
|  0.0|(692,[121,122,123...|    [2.0,1.0]|[0.66666666666666...|       0.0|
|  0.0|(692,[126,127,128...|    [2.0,1.0]|[0.66666666666666...|       0.0|
|  0.0|(692,[126,127,128...|    [3.0,0.0]|           [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|    [3.0,0.0]|           [1.0,0.0]|       0.0|
+-----+--------------------+-------------+--------------------+----------+
only showing top 5 rows



In [51]:
# Compare the "prediction" column against the "label" column using the
# accuracy metric, then report its complement (1 - accuracy) as the test error.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))


Test Error = 0.0555556
