In [1]:
from pyspark.sql import SparkSession

In [2]:
# Start a Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('Tree_Project')
    .getOrCreate()
)

In [4]:
# Load the raw CSV; the first line is treated as the header and column types
# are inferred from the data rather than read as strings.
data = spark.read.csv(
    'dog_food.csv',
    inferSchema=True,
    header=True,
)

In [8]:
# Peek at the first four rows to sanity-check the load.
data.head(4)

[Row(A=4, B=2, C=12.0, D=3, Spoiled=1.0),
 Row(A=5, B=6, C=12.0, D=7, Spoiled=1.0),
 Row(A=6, B=2, C=13.0, D=6, Spoiled=1.0),
 Row(A=4, B=2, C=12.0, D=1, Spoiled=1.0)]

In [9]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorAssembler

In [10]:
# List the column names so the feature and label columns can be picked out.
data.columns

['A', 'B', 'C', 'D', 'Spoiled']

In [11]:
# Pack the four input columns into a single vector column named 'features'.
# The order given here is the order of the entries inside each vector.
assembler = VectorAssembler(
    inputCols=['A', 'B', 'C', 'D'],
    outputCol='features',
)

In [12]:
# Apply the assembler to the loaded data; the result keeps the original
# columns and adds the 'features' vector column.
output = assembler.transform(data)

In [19]:
# Random forest classifier trained on the assembled 'features' vector.
# labelCol is the lowercase 'spoiled' because that is the column name carried
# by the DataFrame that is later passed to fit() (the label is selected as
# 'spoiled' when final_data is built).
# A fixed seed pins the random sampling done while growing the forest, so the
# fitted model and its feature importances are reproducible across kernel
# restarts instead of varying from run to run.
rfc = RandomForestClassifier(
    labelCol='spoiled',
    featuresCol='features',
    seed=42,
)

In [20]:
# Confirm the assembled frame has the original columns plus the 'features' vector.
output.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)
 |-- features: vector (nullable = true)



In [21]:
# Keep only what the classifier needs: the feature vector and the label.
# The label is requested as lowercase 'spoiled', which is the column name the
# resulting frame carries and the name the classifier's labelCol refers to.
final_data = output.select(
    'features',
    'spoiled',
)

In [22]:
# Display the first rows of the training frame (features vector + label).
final_data.show()

+-------------------+-------+
|           features|spoiled|
+-------------------+-------+
| [4.0,2.0,12.0,3.0]|    1.0|
| [5.0,6.0,12.0,7.0]|    1.0|
| [6.0,2.0,13.0,6.0]|    1.0|
| [4.0,2.0,12.0,1.0]|    1.0|
| [4.0,2.0,12.0,3.0]|    1.0|
|[10.0,3.0,13.0,9.0]|    1.0|
| [8.0,5.0,14.0,5.0]|    1.0|
| [5.0,8.0,12.0,8.0]|    1.0|
| [6.0,5.0,12.0,9.0]|    1.0|
| [3.0,3.0,12.0,1.0]|    1.0|
| [9.0,8.0,11.0,3.0]|    1.0|
|[1.0,10.0,12.0,3.0]|    1.0|
|[1.0,5.0,13.0,10.0]|    1.0|
|[2.0,10.0,12.0,6.0]|    1.0|
|[1.0,10.0,11.0,4.0]|    1.0|
| [5.0,3.0,12.0,2.0]|    1.0|
| [4.0,9.0,11.0,8.0]|    1.0|
| [5.0,1.0,11.0,1.0]|    1.0|
|[4.0,9.0,12.0,10.0]|    1.0|
| [5.0,8.0,10.0,9.0]|    1.0|
+-------------------+-------+
only showing top 20 rows



In [23]:
# Fit the random forest on the full training frame; no train/test split is
# made here because the fitted model is only inspected for feature importances.
rfc_model = rfc.fit(final_data)

In [24]:
# Show one row to recall the feature order inside the vector (A, B, C, D).
final_data.head(1)

[Row(features=DenseVector([4.0, 2.0, 12.0, 3.0]), spoiled=1.0)]

In [25]:
# Per-feature importance from the fitted forest; index i corresponds to the
# i-th assembler input column (0=A, 1=B, 2=C, 3=D). In the recorded run,
# index 2 (column C) carries almost all of the importance.
rfc_model.featureImportances

SparseVector(4, {0: 0.0264, 1: 0.0246, 2: 0.9166, 3: 0.0323})