## Random Forest Classifier with PySpark

In [3]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import SparkSession

In [4]:
spark = SparkSession.builder.appName('rf').getOrCreate()

25/07/15 01:50:09 WARN Utils: Your hostname, aditya-HP-Laptop-15s-eq1xxx resolves to a loopback address: 127.0.1.1; using 10.200.82.42 instead (on interface wlo1)
25/07/15 01:50:09 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/15 01:50:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/07/15 01:50:11 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/07/15 01:50:11 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
25/07/15 01:50:26 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [6]:
df = spark.read.format("libsvm").load("sample_libsvm_data.txt")

25/07/15 01:52:50 WARN LibSVMFileFormat: 'numFeatures' option not specified, determining the number of features by going though the input. If you know the number in advance, please specify it via 'numFeatures' option to avoid the extra scan.
                                                                                

In [7]:
df.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



#### Train test split

In [8]:
(train, test) = df.randomSplit([0.7, 0.3], seed=42)

In [9]:
train.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[95,96,97,12...|
|  0.0|(692,[98,99,100,1...|
|  0.0|(692,[121,122,123...|
|  0.0|(692,[122,123,124...|
|  0.0|(692,[122,123,148...|
|  0.0|(692,[123,124,125...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[126,127,128...|
|  0.0|(692,[126,127,128...|
|  0.0|(692,[126,127,128...|
|  0.0|(692,[126,127,128...|
|  0.0|(692,[126,127,128...|
|  0.0|(692,[127,128,129...|
|  0.0|(692,[127,128,129...|
|  0.0|(692,[128,129,130...|
|  0.0|(692,[152,153,154...|
|  0.0|(692,[152,153,154...|
|  0.0|(692,[153,154,155...|
+-----+--------------------+
only showing top 20 rows



In [10]:
test.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[100,101,102...|
|  0.0|(692,[123,124,125...|
|  0.0|(692,[123,124,125...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[125,126,127...|
|  0.0|(692,[126,127,128...|
|  0.0|(692,[126,127,128...|
|  0.0|(692,[126,127,128...|
|  0.0|(692,[127,128,129...|
|  0.0|(692,[129,130,131...|
|  0.0|(692,[150,151,152...|
|  0.0|(692,[151,152,153...|
|  0.0|(692,[152,153,154...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[234,235,237...|
|  1.0|(692,[97,98,99,12...|
+-----+--------------------+
only showing top 20 rows



In [11]:
train.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



#### Train RF Model

In [13]:
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=20, seed=42)

In [14]:
model = rf.fit(train)

In [15]:
pred = model.transform(test)

In [16]:
pred.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [17]:
pred.select("prediction", "label", "features").show()

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(692,[100,101,102...|
|       0.0|  0.0|(692,[123,124,125...|
|       0.0|  0.0|(692,[123,124,125...|
|       0.0|  0.0|(692,[124,125,126...|
|       0.0|  0.0|(692,[124,125,126...|
|       0.0|  0.0|(692,[124,125,126...|
|       0.0|  0.0|(692,[125,126,127...|
|       0.0|  0.0|(692,[126,127,128...|
|       0.0|  0.0|(692,[126,127,128...|
|       0.0|  0.0|(692,[126,127,128...|
|       0.0|  0.0|(692,[127,128,129...|
|       0.0|  0.0|(692,[129,130,131...|
|       0.0|  0.0|(692,[150,151,152...|
|       0.0|  0.0|(692,[151,152,153...|
|       0.0|  0.0|(692,[152,153,154...|
|       0.0|  0.0|(692,[153,154,155...|
|       0.0|  0.0|(692,[153,154,155...|
|       0.0|  0.0|(692,[154,155,156...|
|       0.0|  0.0|(692,[234,235,237...|
|       1.0|  1.0|(692,[97,98,99,12...|
+----------+-----+--------------------+
only showing top 20 rows



In [18]:
evalu = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

In [19]:
acc = evalu.evaluate(pred)

In [20]:
print("Test Error = %g" % (1.0 - acc))

Test Error = 0


In [21]:
model.featureImportances

SparseVector(692, {183: 0.0041, 272: 0.0463, 299: 0.0091, 300: 0.0441, 327: 0.0083, 351: 0.05, 373: 0.0403, 397: 0.003, 399: 0.037, 400: 0.0338, 405: 0.0397, 406: 0.05, 407: 0.1575, 412: 0.0428, 413: 0.091, 426: 0.0072, 429: 0.0028, 430: 0.0069, 435: 0.0163, 455: 0.0548, 460: 0.0031, 468: 0.0061, 469: 0.0037, 483: 0.0472, 510: 0.0409, 511: 0.0912, 518: 0.005, 568: 0.0371, 603: 0.009, 606: 0.0027, 634: 0.0089})

### Gradient Boosted Trees

In [22]:
from pyspark.ml.classification import GBTClassifier

In [23]:
df.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [25]:
train.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[95,96,97,12...|
|  0.0|(692,[98,99,100,1...|
|  0.0|(692,[121,122,123...|
|  0.0|(692,[122,123,124...|
|  0.0|(692,[122,123,148...|
|  0.0|(692,[123,124,125...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[126,127,128...|
|  0.0|(692,[126,127,128...|
|  0.0|(692,[126,127,128...|
|  0.0|(692,[126,127,128...|
|  0.0|(692,[126,127,128...|
|  0.0|(692,[127,128,129...|
|  0.0|(692,[127,128,129...|
|  0.0|(692,[128,129,130...|
|  0.0|(692,[152,153,154...|
|  0.0|(692,[152,153,154...|
|  0.0|(692,[153,154,155...|
+-----+--------------------+
only showing top 20 rows



In [26]:
test.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[100,101,102...|
|  0.0|(692,[123,124,125...|
|  0.0|(692,[123,124,125...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[125,126,127...|
|  0.0|(692,[126,127,128...|
|  0.0|(692,[126,127,128...|
|  0.0|(692,[126,127,128...|
|  0.0|(692,[127,128,129...|
|  0.0|(692,[129,130,131...|
|  0.0|(692,[150,151,152...|
|  0.0|(692,[151,152,153...|
|  0.0|(692,[152,153,154...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[234,235,237...|
|  1.0|(692,[97,98,99,12...|
+-----+--------------------+
only showing top 20 rows



In [27]:
gbt = GBTClassifier(labelCol="label", featuresCol="features", maxIter=10, seed=42)

In [28]:
model = gbt.fit(train)

In [29]:
pred = model.transform(test)

In [30]:
pred.select("prediction", "label", "features").show(5)

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(692,[100,101,102...|
|       0.0|  0.0|(692,[123,124,125...|
|       0.0|  0.0|(692,[123,124,125...|
|       0.0|  0.0|(692,[124,125,126...|
|       0.0|  0.0|(692,[124,125,126...|
+----------+-----+--------------------+
only showing top 5 rows



25/07/15 02:07:16 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


In [32]:
evalu = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
acc = evalu.evaluate(pred)
print("Test Error: %g" % (1 - acc))

Test Error: 0.0571429
