In [64]:
from pyspark.sql import SparkSession

In [65]:
spark = SparkSession.builder.appName('dogfoodProj').getOrCreate()

In [66]:
data = spark.read.csv('dog_food.csv',inferSchema=True,header=True)

In [67]:
data.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)



In [68]:
data.show()

+---+---+----+---+-------+
|  A|  B|   C|  D|Spoiled|
+---+---+----+---+-------+
|  4|  2|12.0|  3|    1.0|
|  5|  6|12.0|  7|    1.0|
|  6|  2|13.0|  6|    1.0|
|  4|  2|12.0|  1|    1.0|
|  4|  2|12.0|  3|    1.0|
| 10|  3|13.0|  9|    1.0|
|  8|  5|14.0|  5|    1.0|
|  5|  8|12.0|  8|    1.0|
|  6|  5|12.0|  9|    1.0|
|  3|  3|12.0|  1|    1.0|
|  9|  8|11.0|  3|    1.0|
|  1| 10|12.0|  3|    1.0|
|  1|  5|13.0| 10|    1.0|
|  2| 10|12.0|  6|    1.0|
|  1| 10|11.0|  4|    1.0|
|  5|  3|12.0|  2|    1.0|
|  4|  9|11.0|  8|    1.0|
|  5|  1|11.0|  1|    1.0|
|  4|  9|12.0| 10|    1.0|
|  5|  8|10.0|  9|    1.0|
+---+---+----+---+-------+
only showing top 20 rows



In [69]:
data.describe().show()

+-------+------------------+------------------+------------------+------------------+-------------------+
|summary|                 A|                 B|                 C|                 D|            Spoiled|
+-------+------------------+------------------+------------------+------------------+-------------------+
|  count|               490|               490|               490|               490|                490|
|   mean|  5.53469387755102| 5.504081632653061| 9.126530612244897| 5.579591836734694| 0.2857142857142857|
| stddev|2.9515204234399057|2.8537966089662063|2.0555451971054275|2.8548369309982857|0.45221563164613465|
|    min|                 1|                 1|               5.0|                 1|                0.0|
|    max|                10|                10|              14.0|                10|                1.0|
+-------+------------------+------------------+------------------+------------------+-------------------+



In [93]:
data.count()
# number of rows

490

In [92]:
len(data.columns)
# number of columns

5

In [70]:
data.columns

['A', 'B', 'C', 'D', 'Spoiled']

In [71]:
from pyspark.ml.classification import DecisionTreeClassifier,RandomForestClassifier,GBTClassifier
from pyspark.ml.feature import StringIndexer,VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator,MulticlassClassificationEvaluator

In [72]:
assembler = VectorAssembler(inputCols=['A', 'B', 'C', 'D'],outputCol='features')

In [73]:
output = assembler.transform(dataset=data)

In [74]:
indexer = StringIndexer(inputCol='Spoiled',outputCol='SpoiledIndex')
output_fixed = indexer.fit(output).transform(output)

In [75]:
final_data = output_fixed.select('features','SpoiledIndex')

In [76]:
train_data,test_data = final_data.randomSplit([0.7,0.3],seed=3)

In [77]:
dtc = DecisionTreeClassifier(labelCol='SpoiledIndex')
rfc = RandomForestClassifier(labelCol='SpoiledIndex')
gbt = GBTClassifier(labelCol='SpoiledIndex')

In [78]:
dtc_model = dtc.fit(train_data)
rfc_model = rfc.fit(train_data)
gbt_model = gbt.fit(train_data)

In [79]:
dtc_preds = dtc_model.transform(test_data)
rfc_preds = rfc_model.transform(test_data)
gbt_preds = gbt_model.transform(test_data)

In [80]:
my_binary_eval =  BinaryClassificationEvaluator(labelCol='SpoiledIndex')

In [81]:
print('DTC')
print(my_binary_eval.evaluate(dtc_preds))

DTC
0.9873636835727886


In [82]:
print('RFC')
print(my_binary_eval.evaluate(rfc_preds))

RFC
0.9994806993249091


In [83]:
print('GBT')
print(my_binary_eval.evaluate(gbt_preds))

GBT
1.0


In [84]:
acc_eval = MulticlassClassificationEvaluator(labelCol='SpoiledIndex',metricName='accuracy')

In [85]:
print('DTC')
print(acc_eval.evaluate(dtc_preds))

DTC
0.9876543209876543


In [86]:
print('RFC')
print(acc_eval.evaluate(rfc_preds))

RFC
0.9938271604938271


In [87]:
print('GBT')
print(acc_eval.evaluate(gbt_preds))

GBT
0.9876543209876543


In [95]:
dtc_model.featureImportances

SparseVector(4, {0: 0.0052, 1: 0.0079, 2: 0.9527, 3: 0.0342})

In [94]:
rfc_model.featureImportances

SparseVector(4, {0: 0.0274, 1: 0.032, 2: 0.9164, 3: 0.0243})

In [96]:
gbt_model.featureImportances

SparseVector(4, {0: 0.0494, 1: 0.0403, 2: 0.8142, 3: 0.0961})

In [88]:
#!pip install PyArrow 