In [1]:
import os

import pyspark

from pyspark.sql import SQLContext

from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
# Start a local Spark context using all available cores, or reuse the one that is
# already active. Calling the SparkContext constructor directly fails when this cell
# is re-run in the same kernel, because only one context may be active at a time.
# Note: if a context already exists, getOrCreate returns it and ignores this conf.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))
# SQL entry point used below to read the data into a DataFrame.
sql = SQLContext(sc)

In [3]:
# Load and parse the data file, converting it to a DataFrame.
# The sample file ships with the Spark distribution, so locate it via SPARK_HOME
# rather than depending solely on a machine- and version-specific absolute path.
# The original install location is kept as the fallback when SPARK_HOME is unset/empty.
sparkHome = os.environ.get("SPARK_HOME") or "/usr/local/spark-2.2.1-bin-hadoop2.7"
sampleDataPath = os.path.join(sparkHome, "data", "mllib", "sample_libsvm_data.txt")
data = sql.read.format("libsvm").load(sampleDataPath)

In [4]:
# Print the DataFrame schema to check the column names and types produced by the loader.
data.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [5]:
# Peek at the first row of the loaded data (same result as head() with no argument).
data.first()

Row(label=0.0, features=SparseVector(692, {127: 51.0, 128: 159.0, 129: 253.0, 130: 159.0, 131: 50.0, 154: 48.0, 155: 238.0, 156: 252.0, 157: 252.0, 158: 252.0, 159: 237.0, 181: 54.0, 182: 227.0, 183: 253.0, 184: 252.0, 185: 239.0, 186: 233.0, 187: 252.0, 188: 57.0, 189: 6.0, 207: 10.0, 208: 60.0, 209: 224.0, 210: 252.0, 211: 253.0, 212: 252.0, 213: 202.0, 214: 84.0, 215: 252.0, 216: 253.0, 217: 122.0, 235: 163.0, 236: 252.0, 237: 252.0, 238: 252.0, 239: 253.0, 240: 252.0, 241: 252.0, 242: 96.0, 243: 189.0, 244: 253.0, 245: 167.0, 262: 51.0, 263: 238.0, 264: 253.0, 265: 253.0, 266: 190.0, 267: 114.0, 268: 253.0, 269: 228.0, 270: 47.0, 271: 79.0, 272: 255.0, 273: 168.0, 289: 48.0, 290: 238.0, 291: 252.0, 292: 252.0, 293: 179.0, 294: 12.0, 295: 75.0, 296: 121.0, 297: 21.0, 300: 253.0, 301: 243.0, 302: 50.0, 316: 38.0, 317: 165.0, 318: 253.0, 319: 233.0, 320: 208.0, 321: 84.0, 328: 253.0, 329: 252.0, 330: 165.0, 343: 7.0, 344: 178.0, 345: 252.0, 346: 240.0, 347: 71.0, 348: 19.0, 349: 28.0,

In [6]:
# Map each raw label to a numeric index, attaching metadata to the label column.
# The indexer is fitted on the full dataset (not only the training split) so that
# every label value is included in the index.
labelIndexer = (
    StringIndexer(inputCol="label", outputCol="indexedLabel")
    .fit(data)
)

In [11]:
# Detect categorical features automatically and index them.
# With maxCategories=4, features with more than 4 distinct values are treated as continuous.
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4,
).fit(data)

In [12]:
# Split the data into training and test sets (30% held out for testing).
# A fixed seed makes the split, and every metric computed from it, reproducible
# across kernel restarts; without it each run evaluates on a different test set.
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=42)

In [13]:
# Configure the RandomForest estimator (10 trees) on the indexed label/feature columns.
# Nothing is trained in this cell: the forest is fitted later by pipeline.fit().
# The explicit seed keeps the trained trees reproducible between runs.
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures",
                            numTrees=10, seed=42)

In [14]:
# Translate predicted label indices back into the original label values,
# using the index-to-label mapping learned by labelIndexer.
labelConverter = IndexToString(
    inputCol="prediction",
    outputCol="predictedLabel",
    labels=labelIndexer.labels,
)

In [15]:
# Assemble the pipeline: both indexers, then the forest, then the label converter.
pipeline = Pipeline(
    stages=[
        labelIndexer,    # label -> indexedLabel
        featureIndexer,  # features -> indexedFeatures
        rf,              # random forest classifier
        labelConverter,  # prediction -> predictedLabel
    ]
)

In [16]:
# Train model.  This also runs the indexers.
# Both indexers were already fitted on the full dataset in earlier cells, so this
# step applies them to the training split and fits the random forest on the result.
model = pipeline.fit(trainingData)

In [17]:
# Make predictions.
# Runs the fitted pipeline over the held-out test split; the result includes the
# "prediction" and "predictedLabel" columns used by the cells below.
predictions = model.transform(testData)

In [18]:
# Inspect the first prediction row (same result as head() with no argument).
predictions.first()

Row(label=0.0, features=SparseVector(692, {95: 56.0, 96: 247.0, 97: 121.0, 123: 24.0, 124: 242.0, 125: 245.0, 126: 122.0, 152: 231.0, 153: 253.0, 154: 253.0, 155: 104.0, 156: 12.0, 180: 90.0, 181: 253.0, 182: 253.0, 183: 254.0, 184: 221.0, 185: 120.0, 186: 120.0, 187: 85.0, 205: 67.0, 206: 75.0, 207: 36.0, 208: 11.0, 209: 56.0, 210: 222.0, 211: 254.0, 212: 253.0, 213: 253.0, 214: 253.0, 215: 245.0, 216: 207.0, 217: 36.0, 232: 86.0, 233: 245.0, 234: 249.0, 235: 105.0, 238: 44.0, 239: 224.0, 240: 230.0, 241: 253.0, 242: 253.0, 243: 253.0, 244: 253.0, 245: 214.0, 246: 10.0, 259: 8.0, 260: 191.0, 261: 253.0, 262: 143.0, 268: 29.0, 269: 119.0, 270: 119.0, 271: 158.0, 272: 253.0, 273: 253.0, 274: 94.0, 287: 15.0, 288: 253.0, 289: 226.0, 290: 48.0, 299: 4.0, 300: 183.0, 301: 253.0, 302: 248.0, 303: 56.0, 315: 42.0, 316: 253.0, 317: 178.0, 328: 179.0, 329: 253.0, 330: 184.0, 331: 14.0, 343: 164.0, 344: 253.0, 345: 178.0, 356: 179.0, 357: 253.0, 358: 163.0, 370: 61.0, 371: 254.0, 372: 254.0, 37

In [19]:
# Display a few example rows: predicted label next to the true label and the features.
(
    predictions
    .select("predictedLabel", "label", "features")
    .show(5)
)

+--------------+-----+--------------------+
|predictedLabel|label|            features|
+--------------+-----+--------------------+
|           0.0|  0.0|(692,[95,96,97,12...|
|           0.0|  0.0|(692,[98,99,100,1...|
|           0.0|  0.0|(692,[122,123,124...|
|           0.0|  0.0|(692,[122,123,148...|
|           0.0|  0.0|(692,[124,125,126...|
+--------------+-----+--------------------+
only showing top 5 rows



In [None]:
# Score the test predictions: compare "prediction" with "indexedLabel" using the
# accuracy metric, then report the test error as 1 - accuracy.
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

In [None]:
# Pull the fitted random forest out of the pipeline model. Index 2 matches the
# position of `rf` in the stage list [labelIndexer, featureIndexer, rf, labelConverter];
# update the index if the pipeline stages are reordered.
rfModel = model.stages[2]
print(rfModel)  # summary only