## Random Forest Classifier
##### Random forests are a popular family of classification and regression methods.

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [3]:
# Start (or reuse) the Spark session used by the rest of the notebook.
spark = SparkSession.builder.appName("Random Forest Classificier").getOrCreate()

# Load the iris CSV: the first row is the header and column types are inferred.
data = spark.read.csv("Data/iris.csv", header=True, inferSchema=True)

# printSchema() only prints and returns None, so it is called for its side
# effect; assigning its result back to `data` would discard the DataFrame.
data.printSchema()


root
 |-- sepal_length: double (nullable = true)
 |-- sepal_width: double (nullable = true)
 |-- petal_length: double (nullable = true)
 |-- petal_width: double (nullable = true)
 |-- class: string (nullable = true)



In [None]:
from pyspark.ml.feature import VectorAssembler

# Combine the four numeric iris measurements (the double columns in the schema
# printed above) into the single vector column that VectorIndexer reads as
# "features".
featureassembler = VectorAssembler(
    inputCols=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
    outputCol='features',
)

# Materialise the "features" column on the dataset before it is split.
# Dropping any existing "features" column first is a no-op on the first run and
# keeps this cell re-runnable without an "output column already exists" error.
data = featureassembler.transform(data.drop('features'))


In [None]:
# Fit the label indexer on the whole dataset so that every value of "class"
# receives an index. Fitting returns a StringIndexerModel, which (unlike the
# unfitted StringIndexer) exposes the `.labels` list that is read later when
# predictions are mapped back to the original class names.
labelIndexer = StringIndexer(inputCol="class", outputCol="indexedLabel").fit(data)

# Left unfitted: the pipeline fits it on the training data. Features with at
# most 4 distinct values are treated as categorical; the rest as continuous.
featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4)

In [None]:
# 70% / 30% train/test split. The fixed seed makes the split (and therefore the
# reported test error) reproducible across "Restart & Run All".
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=42)

* Model Training using pipeline

In [None]:
# Random forest of 10 trees trained on the indexed label / indexed feature
# columns. The keyword is `featuresCol`; `features` is not a parameter of
# RandomForestClassifier and makes the constructor raise.
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=10)

# Convert indexed labels back to the original labels.
# NOTE(review): `.labels` exists only on a fitted StringIndexerModel, so
# `labelIndexer` must be fitted before this cell runs.
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel", labels=labelIndexer.labels)

In [None]:
# Chain the stages in execution order: index the label, index the features,
# train the forest, then map numeric predictions back to label strings.
pipeline = Pipeline(
    stages=[
        labelIndexer,
        featureIndexer,
        rf,
        labelConverter,
    ]
)

In [None]:
# Fit every pipeline stage on the training split only.
model = pipeline.fit(dataset=trainingData)

In [None]:
# Run the fitted pipeline over the held-out split to obtain predictions.
predictions = model.transform(dataset=testData)

In [None]:
# Show a few rows: the predicted class name (the IndexToString output column),
# the true class from the source data, and the feature vector.
predictions.select("predictedLabel", "class", "features").show(5)

* Evaluations

In [None]:
# Compare the indexed true label against the numeric prediction and compute
# plain accuracy on the test predictions.
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)

# The `%` formatting operator is required here; without it the string literal
# is called like a function and raises TypeError.
print("Test Error = %g " % (1.0 - accuracy))

In [None]:
# The pipeline stages are [labelIndexer, featureIndexer, rf, labelConverter],
# so the fitted random forest sits at index 2.
fittedStages = model.stages
rfModel = fittedStages[2]

# Print the fitted model's summary representation.
print(rfModel)