## Assignment 4 - Random Forest Classification

### Import the required PySpark and MLlib libraries

In [1]:
from pyspark import SparkConf,SparkContext
from pyspark.sql import SQLContext,SparkSession
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

Initialize Spark Session

In [2]:
# Build the SparkSession used by every later cell (getOrCreate returns an
# existing session when one is already running).
spark = (
    SparkSession.builder
    .appName("RandomForest")
    .getOrCreate()
)


There are 490 rows and 5 columns in the data.


## Load  the Dataset

In [4]:
# Read dog_food.csv: the first line is used as the header and the column
# types are inferred from the data.
dog_df = (
    spark.read
    .option("header", "True")
    .option("inferSchema", "True")
    .csv("dog_food.csv")
)

In [5]:
# Show summary statistics (count, mean, stddev, min, max) for every column
dog_df.describe().show()

+-------+------------------+------------------+------------------+------------------+-------------------+
|summary|                 A|                 B|                 C|                 D|            Spoiled|
+-------+------------------+------------------+------------------+------------------+-------------------+
|  count|               490|               490|               490|               490|                490|
|   mean|  5.53469387755102| 5.504081632653061| 9.126530612244897| 5.579591836734694| 0.2857142857142857|
| stddev|2.9515204234399057|2.8537966089662063|2.0555451971054275|2.8548369309982857|0.45221563164613465|
|    min|                 1|                 1|               5.0|                 1|                0.0|
|    max|                10|                10|              14.0|                10|                1.0|
+-------+------------------+------------------+------------------+------------------+-------------------+



Print the Schema

In [6]:
# Print each column's name, inferred type and nullability
dog_df.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)



## Import the VectorAssembler

In [7]:
# Import VectorAssembler and Vectors
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [8]:
# Pack the four input columns A-D into a single vector column named
# "features"; `output` is dog_df with that extra column appended.
assembler = VectorAssembler(
    inputCols=["A", "B", "C", "D"],
    outputCol="features",
)
output = assembler.transform(dog_df)

## Splitting the Data into Training and Test Sets

In [17]:
# Split roughly 70% of the rows into the training set and 30% into the test set.
# A fixed seed keeps the split -- and therefore the metrics reported further
# down -- repeatable when the notebook is re-run on the same data.
train_df, test_df = output.randomSplit([0.7, 0.3], seed=42)


In [18]:
# Confirm the training split carries the assembled "features" column
train_df.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)
 |-- features: vector (nullable = true)



## Model Training

In [20]:
# Train a random forest of 20 trees that predicts the "Spoiled" label from the
# assembled "features" vector.
# The seed fixes the forest's internal randomness so that re-running this cell
# on the same training data yields the same model.
rf = RandomForestClassifier(
    labelCol="Spoiled",
    featuresCol="features",
    numTrees=20,
    seed=42,
)
model = rf.fit(train_df)

### Make predictions and print Schema

In [22]:
# Score the held-out test set with the trained model. The schema printed below
# shows the columns the model adds: rawPrediction, probability and prediction.
predictions = model.transform(test_df)
predictions.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [24]:
# Compare the predicted label with the actual "Spoiled" label for 5 test rows
predictions.select("prediction", "Spoiled", "features").show(5)

+----------+-------+------------------+
|prediction|Spoiled|          features|
+----------+-------+------------------+
|       1.0|    1.0|[1.0,1.0,12.0,2.0]|
|       0.0|    0.0| [1.0,2.0,9.0,1.0]|
|       0.0|    0.0| [1.0,3.0,8.0,3.0]|
|       0.0|    0.0| [1.0,4.0,8.0,7.0]|
|       0.0|    0.0| [1.0,8.0,8.0,8.0]|
+----------+-------+------------------+
only showing top 5 rows



In [25]:
# Evaluator that compares the "prediction" column with the "Spoiled" label
# and reports accuracy; it is applied to the predictions in the next cell.
evaluator = MulticlassClassificationEvaluator(
    labelCol="Spoiled",
    predictionCol="prediction",
    metricName="accuracy",
)

In [26]:
# Accuracy on the test predictions; the test error is its complement.
accuracy = evaluator.evaluate(predictions)
test_error = 1.0 - accuracy
print("Test Error = %g" % test_error)

Test Error = 0.0140845


In [27]:
# Per-feature importance scores of the trained forest.
# NOTE(review): indices presumably follow the assembler's inputCols order
# (0=A, 1=B, 2=C, 3=D) -- confirm. In the run captured below, index 2 carries
# almost all of the importance.
model.featureImportances

SparseVector(4, {0: 0.0362, 1: 0.0335, 2: 0.8982, 3: 0.0321})