In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("DF").getOrCreate()

In [7]:
df=spark.read.csv('../data/dog_food.csv',inferSchema=True,header=True)

In [8]:
df.describe().toPandas()

Unnamed: 0,summary,A,B,C,D,Spoiled
0,count,490.0,490.0,490.0,490.0,490.0
1,mean,5.53469387755102,5.504081632653061,9.126530612244895,5.579591836734694,0.2857142857142857
2,stddev,2.9515204234399057,2.8537966089662063,2.0555451971054275,2.8548369309982857,0.4522156316461346
3,min,1.0,1.0,5.0,1.0,0.0
4,max,10.0,10.0,14.0,10.0,1.0


In [9]:
df.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)



In [10]:
df.show(5)

+---+---+----+---+-------+
|  A|  B|   C|  D|Spoiled|
+---+---+----+---+-------+
|  4|  2|12.0|  3|    1.0|
|  5|  6|12.0|  7|    1.0|
|  6|  2|13.0|  6|    1.0|
|  4|  2|12.0|  1|    1.0|
|  4|  2|12.0|  3|    1.0|
+---+---+----+---+-------+
only showing top 5 rows



In [17]:
from pyspark.ml.classification import (RandomForestClassifier, GBTClassifier, DecisionTreeClassifier)
dtc = DecisionTreeClassifier(labelCol='Spoiled')
rfc = RandomForestClassifier(numTrees = 100,labelCol='Spoiled')
gbt = GBTClassifier(labelCol='Spoiled')

In [14]:
from pyspark.ml.feature import VectorAssembler
assembler=VectorAssembler(inputCols=['A','B','C','D'],outputCol='features')
data=assembler.transform(df)
train,test=data.randomSplit([0.7,0.3])

In [18]:
dtc_model = dtc.fit(train)
rfc_model = rfc.fit(train)
gbt_model = gbt.fit(train)

In [19]:
dtc_preds = dtc_model.transform(test)
rfc_preds = rfc_model.transform(test)
gbt_preds = gbt_model.transform(test)

In [26]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator 
evalMC = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='Spoiled', metricName='accuracy')
evalBC = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='Spoiled',metricName="areaUnderROC")

In [28]:
print(f'DTC accuracy: {evalMC.evaluate(dtc_preds)}') 
print(f'DTC area Under ROC: {evalBC.evaluate(dtc_preds)}') 
print(f'RFC accuracy: {evalMC.evaluate(rfc_preds)}')
print(f'RFC area Under ROC: {evalBC.evaluate(rfc_preds)}')
print(f'GBT accuracy: {evalMC.evaluate(gbt_preds)}')
print(f'GBT area Under ROC: {evalBC.evaluate(gbt_preds)}')

DTC accuracy: 0.9709302325581395
DTC area Under ROC: 0.9803149606299212
RFC accuracy: 1.0
RFC area Under ROC: 1.0
GBT accuracy: 0.9709302325581395
GBT area Under ROC: 0.9803149606299212


### The best model to check for the feature importances is Random Forest.

In [40]:
print(f'Importance of A is: {rfc_model.featureImportances[0]}')
print(f'Importance of B is: {rfc_model.featureImportances[1]}')
print(f'Importance of C is: {rfc_model.featureImportances[2]}')
print(f'Importance of D is: {rfc_model.featureImportances[3]}')

Importance of A is: 0.030750311804022618
Importance of B is: 0.044804564084640834
Importance of C is: 0.8935639337298104
Importance of D is: 0.03088119038152623


### Chemical preservative C has the biggest effect on the dog food being spoiled.