<a href="https://colab.research.google.com/github/amitadhainje/MY_NOTES/blob/master/TreeMethods_With_Spark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**LEARNING/GETTING HANDS-ON WITH THE TREE METHODS (DECISION TREEES, RANDOM FOREST AND GRADIENT BOOSTED CLASSIFIERS) USING SPARK**

In [1]:
%%time
#spark installation
!pip install pyspark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/9a/5a/271c416c1c2185b6cb0151b29a91fff6fcaed80173c8584ff6d20e46b465/pyspark-2.4.5.tar.gz (217.8MB)
[K     |████████████████████████████████| 217.8MB 59kB/s 
[?25hCollecting py4j==0.10.7
[?25l  Downloading https://files.pythonhosted.org/packages/e3/53/c737818eb9a7dc32a7cd4f1396e787bd94200c3997c72c1dbe028587bd76/py4j-0.10.7-py2.py3-none-any.whl (197kB)
[K     |████████████████████████████████| 204kB 45.4MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-2.4.5-py2.py3-none-any.whl size=218257927 sha256=22ba01d88b9f52853ff22413e7e9aad10cd2bd22e6678ce5d3d7fed925b3c926
  Stored in directory: /root/.cache/pip/wheels/bf/db/04/61d66a5939364e756eb1c1be4ec5bdce6e04047fc7929a3c3c
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.7 pyspark-2.4.5
CPU 

In [0]:

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"


import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("TreeMethods").getOrCreate()

In [0]:
from google.colab import files
files.upload()

Saving sample_libsvm_data.txt to sample_libsvm_data.txt


{'sample_libsvm_data.txt': b'0 128:51 129:159 130:253 131:159 132:50 155:48 156:238 157:252 158:252 159:252 160:237 182:54 183:227 184:253 185:252 186:239 187:233 188:252 189:57 190:6 208:10 209:60 210:224 211:252 212:253 213:252 214:202 215:84 216:252 217:253 218:122 236:163 237:252 238:252 239:252 240:253 241:252 242:252 243:96 244:189 245:253 246:167 263:51 264:238 265:253 266:253 267:190 268:114 269:253 270:228 271:47 272:79 273:255 274:168 290:48 291:238 292:252 293:252 294:179 295:12 296:75 297:121 298:21 301:253 302:243 303:50 317:38 318:165 319:253 320:233 321:208 322:84 329:253 330:252 331:165 344:7 345:178 346:252 347:240 348:71 349:19 350:28 357:253 358:252 359:195 372:57 373:252 374:252 375:63 385:253 386:252 387:195 400:198 401:253 402:190 413:255 414:253 415:196 427:76 428:246 429:252 430:112 441:253 442:252 443:148 455:85 456:252 457:230 458:25 467:7 468:135 469:253 470:186 471:12 483:85 484:252 485:223 494:7 495:131 496:252 497:225 498:71 511:85 512:252 513:145 521:48 5

In [0]:
df = spark.read.format('libsvm').load("sample_libsvm_data.txt")
df.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [0]:
df.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [0]:
df.describe().show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                100|
|   mean|               0.57|
| stddev|0.49756985195624287|
|    min|                0.0|
|    max|                1.0|
+-------+-------------------+



In [0]:
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

train_data, test_data = df.randomSplit([0.75,0.25])
print (train_data.describe().show())
print (test_data.describe().show())

rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=20)
model = rf.fit(train_data)
# Make predictions.
predictions = model.transform(test_data)

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))
print("Test Accuracy = %g" % (accuracy))

+-------+------------------+
|summary|             label|
+-------+------------------+
|  count|                75|
|   mean|0.5466666666666666|
| stddev|0.5011698027327376|
|    min|               0.0|
|    max|               1.0|
+-------+------------------+

None
+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                 25|
|   mean|               0.64|
| stddev|0.48989794855663554|
|    min|                0.0|
|    max|                1.0|
+-------+-------------------+

None
Test Error = 0.04
Test Accuracy = 0.96


In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

gbc = GBTClassifier(labelCol="label", featuresCol="features")
model = gbc.fit(train_data)
# Make predictions.
predictions = model.transform(test_data)

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))
print("Test Accuracy = %g" % (accuracy))

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

gbc = DecisionTreeClassifier(labelCol="label", featuresCol="features")
model = gbc.fit(train_data)
# Make predictions.
predictions = model.transform(test_data)

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))
print("Test Accuracy = %g" % (accuracy))

Test Error = 0.04
Test Accuracy = 0.96


In [0]:
from google.colab import files
files.upload()

Saving College.csv to College.csv


{'College.csv': b"School,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F_Undergrad,P_Undergrad,Outstate,Room_Board,Books,Personal,PhD,Terminal,S_F_Ratio,perc_alumni,Expend,Grad_Rate\r\nAbilene Christian University,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60\r\nAdelphi University,Yes,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56\r\nAdrian College,Yes,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54\r\nAgnes Scott College,Yes,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59\r\nAlaska Pacific University,Yes,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15\r\nAlbertson College,Yes,587,479,158,38,62,678,41,13500,3335,500,675,67,73,9.4,11,9727,55\r\nAlbertus Magnus College,Yes,353,340,103,17,45,416,230,13290,5720,500,1500,90,93,11.5,26,8861,63\r\nAlbion College,Yes,1899,1720,489,37,68,1594,32,13868,4826,450,850,89,100,13.7,37,11487,73\r\nAlbright College,Yes,1038,839,227,30,63,973

In [0]:
collegeDataset = spark.read.csv("College.csv", inferSchema=True, header=True)
collegeDataset.show(5)

+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+
|              School|Private|Apps|Accept|Enroll|Top10perc|Top25perc|F_Undergrad|P_Undergrad|Outstate|Room_Board|Books|Personal|PhD|Terminal|S_F_Ratio|perc_alumni|Expend|Grad_Rate|
+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+
|Abilene Christian...|    Yes|1660|  1232|   721|       23|       52|       2885|        537|    7440|      3300|  450|    2200| 70|      78|     18.1|         12|  7041|       60|
|  Adelphi University|    Yes|2186|  1924|   512|       16|       29|       2683|       1227|   12280|      6450|  750|    1500| 29|      30|     12.2|         16| 10527|       56|
|      Adrian College|    Yes|1428|  1097|   336|       22|       50|       1036|         99|  

In [0]:
collegeDataset.columns

['School',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']

In [0]:
collegeDataset.describe().show()

+-------+--------------------+-------+------------------+------------------+----------------+------------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+----------------+------------------+
|summary|              School|Private|              Apps|            Accept|          Enroll|         Top10perc|         Top25perc|      F_Undergrad|      P_Undergrad|          Outstate|        Room_Board|             Books|          Personal|               PhD|          Terminal|         S_F_Ratio|       perc_alumni|          Expend|         Grad_Rate|
+-------+--------------------+-------+------------------+------------------+----------------+------------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+-------

In [0]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols = ['Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate'], outputCol='features')

output = assembler.transform(collegeDataset)
output.show(5)

+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+--------------------+
|              School|Private|Apps|Accept|Enroll|Top10perc|Top25perc|F_Undergrad|P_Undergrad|Outstate|Room_Board|Books|Personal|PhD|Terminal|S_F_Ratio|perc_alumni|Expend|Grad_Rate|            features|
+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+--------------------+
|Abilene Christian...|    Yes|1660|  1232|   721|       23|       52|       2885|        537|    7440|      3300|  450|    2200| 70|      78|     18.1|         12|  7041|       60|[1660.0,1232.0,72...|
|  Adelphi University|    Yes|2186|  1924|   512|       16|       29|       2683|       1227|   12280|      6450|  750|    1500| 29|      30|     12.2|         16| 10527|       56|[2186.0,1924

In [0]:
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol="Private", outputCol="PrivateIndex")
output_fixed = indexer.fit(output).transform(output)
final_data = output_fixed.select("features",'PrivateIndex')
final_data.show()

+--------------------+------------+
|            features|PrivateIndex|
+--------------------+------------+
|[1660.0,1232.0,72...|         0.0|
|[2186.0,1924.0,51...|         0.0|
|[1428.0,1097.0,33...|         0.0|
|[417.0,349.0,137....|         0.0|
|[193.0,146.0,55.0...|         0.0|
|[587.0,479.0,158....|         0.0|
|[353.0,340.0,103....|         0.0|
|[1899.0,1720.0,48...|         0.0|
|[1038.0,839.0,227...|         0.0|
|[582.0,498.0,172....|         0.0|
|[1732.0,1425.0,47...|         0.0|
|[2652.0,1900.0,48...|         0.0|
|[1179.0,780.0,290...|         0.0|
|[1267.0,1080.0,38...|         0.0|
|[494.0,313.0,157....|         0.0|
|[1420.0,1093.0,22...|         0.0|
|[4302.0,992.0,418...|         0.0|
|[1216.0,908.0,423...|         0.0|
|[1130.0,704.0,322...|         0.0|
|[3540.0,2001.0,10...|         1.0|
+--------------------+------------+
only showing top 20 rows



In [0]:
train_data, test_data = final_data.randomSplit([0.75,0.25])
print (train_data.describe().show())
print (test_data.describe().show())


+-------+-------------------+
|summary|       PrivateIndex|
+-------+-------------------+
|  count|                567|
|   mean|0.26102292768959434|
| stddev| 0.4395802018862467|
|    min|                0.0|
|    max|                1.0|
+-------+-------------------+

None
+-------+-------------------+
|summary|       PrivateIndex|
+-------+-------------------+
|  count|                210|
|   mean| 0.3047619047619048|
| stddev|0.46140641093700746|
|    min|                0.0|
|    max|                1.0|
+-------+-------------------+

None


In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

print ("DECISION TREE CLASSIFIER PERFORMANCE ----")
dtc = DecisionTreeClassifier(labelCol="PrivateIndex", featuresCol="features")
model = dtc.fit(train_data)
# Make predictions.
predictions = model.transform(test_data)
print (predictions.printSchema())
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(labelCol="PrivateIndex", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))
print("Test Accuracy = %g" % (accuracy))

DECISION TREE CLASSIFIER PERFORMANCE ----
root
 |-- features: vector (nullable = true)
 |-- PrivateIndex: double (nullable = false)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)

None
Test Error = 0.0857143
Test Accuracy = 0.914286


In [0]:
print ("GRADIENT BOOSTED TREE CLASSIFIER PERFORMANCE ----")

gbc = GBTClassifier(labelCol="PrivateIndex", featuresCol="features")
model = gbc.fit(train_data)
# Make predictions.
predictions = model.transform(test_data)
print (predictions.printSchema())
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(labelCol="PrivateIndex", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))
print("Test Accuracy = %g" % (accuracy))

GRADIENT BOOSTED TREE CLASSIFIER PERFORMANCE ----
root
 |-- features: vector (nullable = true)
 |-- PrivateIndex: double (nullable = false)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)

None
Test Error = 0.0761905
Test Accuracy = 0.92381


In [0]:
print ("RANDOM FOREST CLASSIFIER PERFORMANCE ----")
rfc = RandomForestClassifier(labelCol="PrivateIndex", featuresCol="features", numTrees=20)
model = rfc.fit(train_data)
# Make predictions.
predictions = model.transform(test_data)
print (predictions.printSchema())
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(labelCol="PrivateIndex", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))
print("Test Accuracy = %g" % (accuracy))

RANDOM FOREST CLASSIFIER PERFORMANCE ----
root
 |-- features: vector (nullable = true)
 |-- PrivateIndex: double (nullable = false)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)

None
Test Error = 0.0619048
Test Accuracy = 0.938095


In [4]:
from google.colab import files
files.upload()

Saving dog_food.csv to dog_food.csv


{'dog_food.csv': b'A,B,C,D,Spoiled\n4,2,12.0,3,1.0\n5,6,12.0,7,1.0\n6,2,13.0,6,1.0\n4,2,12.0,1,1.0\n4,2,12.0,3,1.0\n10,3,13.0,9,1.0\n8,5,14.0,5,1.0\n5,8,12.0,8,1.0\n6,5,12.0,9,1.0\n3,3,12.0,1,1.0\n9,8,11.0,3,1.0\n1,10,12.0,3,1.0\n1,5,13.0,10,1.0\n2,10,12.0,6,1.0\n1,10,11.0,4,1.0\n5,3,12.0,2,1.0\n4,9,11.0,8,1.0\n5,1,11.0,1,1.0\n4,9,12.0,10,1.0\n5,8,10.0,9,1.0\n5,7,11.0,9,1.0\n4,10,13.0,8,1.0\n10,5,12.0,9,1.0\n2,4,13.0,4,1.0\n1,4,13.0,10,1.0\n1,8,12.0,1,1.0\n2,10,13.0,4,1.0\n6,2,12.0,4,1.0\n8,2,13.0,3,1.0\n6,4,12.0,2,1.0\n3,2,11.0,9,1.0\n10,6,12.0,10,1.0\n9,5,13.0,3,1.0\n9,2,12.0,5,1.0\n2,6,13.0,9,1.0\n4,2,12.0,10,1.0\n4,3,12.0,6,1.0\n7,1,12.0,1,1.0\n1,7,11.0,10,1.0\n9,2,11.0,10,1.0\n2,6,12.0,2,1.0\n9,4,11.0,5,1.0\n6,2,11.0,10,1.0\n3,10,11.0,4,1.0\n6,9,11.0,2,1.0\n10,6,11.0,9,1.0\n6,7,11.0,9,1.0\n7,2,13.0,8,1.0\n9,2,13.0,5,1.0\n8,7,12.0,6,1.0\n9,1,12.0,9,1.0\n3,5,14.0,3,1.0\n7,1,11.0,3,1.0\n5,9,12.0,7,1.0\n3,10,12.0,7,1.0\n9,8,13.0,9,1.0\n10,9,12.0,9,1.0\n10,7,11.0,2,1.0\n10,3,11.0,1,1.0

You've been hired by a dog food company to try to predict why some batches of their dog food are spoiling much quicker than intended! Unfortunately this Dog Food company hasn't upgraded to the latest machinery, meaning that the amounts of the five preservative chemicals they are using can vary a lot, but which is the chemical that has the strongest effect? The dog food company first mixes up a batch of preservative that contains 4 different preservative chemicals (A,B,C,D) and then is completed with a "filler" chemical. The food scientists beelive one of the A,B,C, or D preservatives is causing the problem, but need your help to figure out which one! Use Machine Learning with RF to find out which parameter had the most predicitive power, thus finding out which chemical causes the early spoiling! So create a model and then find out how you can decide which chemical is the problem!

*   Pres_A : Percentage of preservative A in the mix
*   Pres_B : Percentage of preservative B in the mix
*   Pres_C : Percentage of preservative C in the mix
*   Pres_D : Percentage of preservative D in the mix
*   Spoiled: Label indicating whether or not the dog food batch was spoiled.

Objective -  To find which preservative has the bigger effect in spoiling the dog food.

In [5]:
dogFoodDf = spark.read.csv("dog_food.csv", inferSchema=True, header=True)
dogFoodDf.show()

+---+---+----+---+-------+
|  A|  B|   C|  D|Spoiled|
+---+---+----+---+-------+
|  4|  2|12.0|  3|    1.0|
|  5|  6|12.0|  7|    1.0|
|  6|  2|13.0|  6|    1.0|
|  4|  2|12.0|  1|    1.0|
|  4|  2|12.0|  3|    1.0|
| 10|  3|13.0|  9|    1.0|
|  8|  5|14.0|  5|    1.0|
|  5|  8|12.0|  8|    1.0|
|  6|  5|12.0|  9|    1.0|
|  3|  3|12.0|  1|    1.0|
|  9|  8|11.0|  3|    1.0|
|  1| 10|12.0|  3|    1.0|
|  1|  5|13.0| 10|    1.0|
|  2| 10|12.0|  6|    1.0|
|  1| 10|11.0|  4|    1.0|
|  5|  3|12.0|  2|    1.0|
|  4|  9|11.0|  8|    1.0|
|  5|  1|11.0|  1|    1.0|
|  4|  9|12.0| 10|    1.0|
|  5|  8|10.0|  9|    1.0|
+---+---+----+---+-------+
only showing top 20 rows



In [7]:
dogFoodDf.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)



In [8]:
dogFoodDf.columns

['A', 'B', 'C', 'D', 'Spoiled']

In [10]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=['A', 'B', 'C', 'D'],outputCol="features")
output = assembler.transform(dogFoodDf)
output.show()

+---+---+----+---+-------+-------------------+
|  A|  B|   C|  D|Spoiled|           features|
+---+---+----+---+-------+-------------------+
|  4|  2|12.0|  3|    1.0| [4.0,2.0,12.0,3.0]|
|  5|  6|12.0|  7|    1.0| [5.0,6.0,12.0,7.0]|
|  6|  2|13.0|  6|    1.0| [6.0,2.0,13.0,6.0]|
|  4|  2|12.0|  1|    1.0| [4.0,2.0,12.0,1.0]|
|  4|  2|12.0|  3|    1.0| [4.0,2.0,12.0,3.0]|
| 10|  3|13.0|  9|    1.0|[10.0,3.0,13.0,9.0]|
|  8|  5|14.0|  5|    1.0| [8.0,5.0,14.0,5.0]|
|  5|  8|12.0|  8|    1.0| [5.0,8.0,12.0,8.0]|
|  6|  5|12.0|  9|    1.0| [6.0,5.0,12.0,9.0]|
|  3|  3|12.0|  1|    1.0| [3.0,3.0,12.0,1.0]|
|  9|  8|11.0|  3|    1.0| [9.0,8.0,11.0,3.0]|
|  1| 10|12.0|  3|    1.0|[1.0,10.0,12.0,3.0]|
|  1|  5|13.0| 10|    1.0|[1.0,5.0,13.0,10.0]|
|  2| 10|12.0|  6|    1.0|[2.0,10.0,12.0,6.0]|
|  1| 10|11.0|  4|    1.0|[1.0,10.0,11.0,4.0]|
|  5|  3|12.0|  2|    1.0| [5.0,3.0,12.0,2.0]|
|  4|  9|11.0|  8|    1.0| [4.0,9.0,11.0,8.0]|
|  5|  1|11.0|  1|    1.0| [5.0,1.0,11.0,1.0]|
|  4|  9|12.0

In [11]:
from pyspark.ml.classification import RandomForestClassifier,DecisionTreeClassifier

finalData = output.select('features', 'Spoiled')
dtc = DecisionTreeClassifier(labelCol='Spoiled',featuresCol='features')
dtc_model = dtc.fit(finalData)
dtc_model.featureImportances

SparseVector(4, {1: 0.0019, 2: 0.9832, 3: 0.0149})

In [12]:
rfc = RandomForestClassifier(labelCol='Spoiled',featuresCol='features', numTrees=20)
rfc_model = rfc.fit(finalData)
rfc_model.featureImportances

SparseVector(4, {0: 0.0248, 1: 0.0252, 2: 0.9271, 3: 0.023})