In [1]:
import findspark

# findspark must be initialised before pyspark is imported, which is why the
# pyspark import sits below this call instead of at the top of the cell.
# The path is a raw string: it contains backslash sequences that are invalid
# escapes in a normal string literal and raise a DeprecationWarning /
# SyntaxWarning on current Python versions. The runtime value is unchanged.
# NOTE(review): machine-specific absolute path; consider reading it from the
# SPARK_HOME environment variable so the notebook runs on other machines.
findspark.init(r'C:\Spark\spark-3.0.1-bin-hadoop2.7')
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session used by every later cell.
spark = SparkSession.builder.appName('mytree').getOrCreate()

In [2]:
from pyspark.ml import Pipeline

In [3]:
from pyspark.ml.classification import (RandomForestClassifier, GBTClassifier,
                                       DecisionTreeClassifier)

In [4]:
# Load the dog-food CSV: the first row supplies the column names and the
# column types are inferred from the values.
data = spark.read.csv('dog_food.csv', header=True, inferSchema=True)

In [5]:
# Show the column names and inferred types of the loaded DataFrame.
data.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)



In [6]:
# Peek at the first five rows of the raw data.
data.show(5)

+---+---+----+---+-------+
|  A|  B|   C|  D|Spoiled|
+---+---+----+---+-------+
|  4|  2|12.0|  3|    1.0|
|  5|  6|12.0|  7|    1.0|
|  6|  2|13.0|  6|    1.0|
|  4|  2|12.0|  1|    1.0|
|  4|  2|12.0|  3|    1.0|
+---+---+----+---+-------+
only showing top 5 rows



In [7]:
from pyspark.ml.feature import VectorAssembler

In [8]:
# List the column names; A-D become the model inputs and Spoiled the label
# in the cells that follow.
data.columns

['A', 'B', 'C', 'D', 'Spoiled']

In [9]:
# Combine the four input columns A-D into a single vector column called
# 'features', which is the column the classifier is configured to read.
assembler = VectorAssembler(outputCol='features', inputCols=['A', 'B', 'C', 'D'])

In [10]:
# Apply the assembler: `output` is `data` plus the new 'features' vector column.
output = assembler.transform(data)

In [11]:
# Random forest that predicts the 'Spoiled' label from the assembled
# 'features' vector. Random forests are stochastic, so the seed is fixed
# explicitly to keep the fitted model (and the feature importances read
# from it later) repeatable across kernel restarts.
rfc = RandomForestClassifier(
    labelCol='Spoiled',
    featuresCol='features',
    seed=42
)

In [12]:
# Confirm that the assembled 'features' vector column was appended.
output.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)
 |-- features: vector (nullable = true)



In [13]:
# Keep only what the model needs: the feature vector and the label.
final_data = output.select('features', 'Spoiled')

In [14]:
# Peek at the first five (features, label) rows that will be fed to the model.
final_data.show(5)

+------------------+-------+
|          features|Spoiled|
+------------------+-------+
|[4.0,2.0,12.0,3.0]|    1.0|
|[5.0,6.0,12.0,7.0]|    1.0|
|[6.0,2.0,13.0,6.0]|    1.0|
|[4.0,2.0,12.0,1.0]|    1.0|
|[4.0,2.0,12.0,3.0]|    1.0|
+------------------+-------+
only showing top 5 rows



In [15]:
# Fit on the complete dataset (no hold-out); this model is only used below to
# read feature importances. `rfc_model` is reassigned later by the fit on the
# training split.
rfc_model = rfc.fit(final_data)

In [16]:
# Inspect one row to see the order of the values inside the feature vector
# (same order as the assembler's input columns: A, B, C, D).
final_data.head(1)

[Row(features=DenseVector([4.0, 2.0, 12.0, 3.0]), Spoiled=1.0)]

In [17]:
# Importance of each feature by vector index (0=A, 1=B, 2=C, 3=D, following
# the assembler's input column order).
rfc_model.featureImportances

SparseVector(4, {0: 0.0201, 1: 0.0252, 2: 0.9291, 3: 0.0256})

In [18]:
# 70/30 train/test split. The seed pins the random assignment of rows so the
# evaluation metrics computed further down can be reproduced on a re-run
# instead of changing with every execution.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [19]:
# NOTE(review): this 150-tree configuration is never fitted. `rfc` is
# reassigned with default settings in the next cell before `fit` is called,
# so this assignment is dead. Keep whichever configuration is intended and
# drop the other.
rfc = RandomForestClassifier(numTrees=150, labelCol='Spoiled', featuresCol='features')

In [20]:
# Estimator used for the train/test evaluation: predicts 'Spoiled' from the
# 'features' vector with the library's default forest settings. The seed is
# fixed explicitly so that training on the same split yields the same model
# and therefore the same reported metrics.
rfc = RandomForestClassifier(
    labelCol='Spoiled',
    featuresCol='features',
    seed=42
)

In [21]:
# Fit on the training split only; this replaces the earlier model that was
# fitted on the full dataset.
rfc_model = rfc.fit(train_data)

In [22]:
# Score the held-out test split; the result is passed to the evaluators below.
rfc_preds = rfc_model.transform(test_data)

In [23]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [24]:
# Binary evaluator scored against the 'Spoiled' label. The metric and the
# prediction column are left at the library defaults.
# NOTE(review): the default metric is presumably area under ROC; confirm
# against the installed PySpark version.
my_binary_eval = BinaryClassificationEvaluator(
    labelCol='Spoiled'
)

In [25]:
# Print the model tag and its binary-evaluator score on separate lines.
print('RFC', my_binary_eval.evaluate(rfc_preds), sep='\n')

RFC
0.9770381836945304


In [26]:
# Evaluator that reports plain accuracy against the 'Spoiled' label.
acc_eval = MulticlassClassificationEvaluator(metricName='accuracy', labelCol='Spoiled')

In [27]:
# Accuracy of the random forest on the held-out test predictions; the bare
# name on the last line displays the value as the cell output.
rfc_acc = acc_eval.evaluate(rfc_preds)
rfc_acc

0.9696969696969697