In [1]:
import findspark
findspark.init('C:\spark-3.0.0-bin-hadoop2.7')
import pyspark

In [2]:
from pyspark.sql import SparkSession

In [3]:
spark=SparkSession.builder.appName('dog_food_example').getOrCreate()

In [4]:
data=spark.read.csv('dog_food.csv',inferSchema=True,header=True)

In [5]:
data.columns

['A', 'B', 'C', 'D', 'Spoiled']

In [6]:
from pyspark.ml.feature import VectorAssembler

In [7]:
assembler=VectorAssembler(inputCols=['A', 'B', 'C', 'D'],
                         outputCol='features')

In [8]:
final_data=assembler.transform(data)

In [9]:
final_data.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)
 |-- features: vector (nullable = true)



In [10]:
from pyspark.ml.classification import DecisionTreeClassifier,GBTClassifier,RandomForestClassifier

In [11]:
train_data,test_data=final_data.randomSplit([0.7,0.3])

In [13]:
dtc=DecisionTreeClassifier(labelCol='Spoiled')
gbc=GBTClassifier(labelCol='Spoiled')
rfc=RandomForestClassifier(labelCol='Spoiled')

In [14]:
dtc_model=dtc.fit(train_data)
gbc_model=gbc.fit(train_data)
rfc_model=rfc.fit(train_data)

In [34]:
# Higher teh feature Importance Value Issue more with the component
rfc_model.featureImportances

SparseVector(4, {0: 0.0187, 1: 0.0162, 2: 0.9444, 3: 0.0206})

In [15]:
dtc_predictions=dtc_model.transform(test_data)
gbc_predictions=gbc_model.transform(test_data)
rfc_predictions=rfc_model.transform(test_data)

In [16]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [23]:
evalutor=BinaryClassificationEvaluator(labelCol='Spoiled')
evalutor2=BinaryClassificationEvaluator(labelCol='Spoiled',rawPredictionCol='prediction')

In [24]:
dtc_eval=evalutor.evaluate(dtc_predictions)
gbc_eval=evalutor2.evaluate(gbc_predictions)
rfc_eval=evalutor.evaluate(rfc_predictions)

In [25]:
dtc_eval

0.9731396321070234

In [26]:
gbc_eval

0.979515050167224

In [27]:
rfc_eval

0.9926839464882943

In [28]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [29]:
mul_eval=MulticlassClassificationEvaluator(metricName="accuracy",labelCol='Spoiled')

In [30]:
dtc_auc=mul_eval.evaluate(dtc_predictions)
gbc_auc=mul_eval.evaluate(gbc_predictions)
rfc_auc=mul_eval.evaluate(rfc_predictions)

In [31]:
dtc_auc

0.98

In [32]:
gbc_auc

0.98

In [33]:
rfc_auc

0.9866666666666667