In [1]:
import os

import findspark

# findspark.init() makes the given Spark install's pyspark importable, so it
# has to run before `import pyspark` below.
# A non-empty SPARK_HOME in the environment takes precedence; otherwise fall
# back to the install location this notebook was originally written against.
findspark.init(
    os.environ.get('SPARK_HOME')
    or '/home/osboxes/SparkClass/spark-2.4.3-bin-hadoop2.7'
)
import pyspark

In [2]:
# Working directory for this lesson, assembled from its path components
myPath = os.path.join(
    '/home',
    'osboxes',
    'SparkClass',
    'Spark_for_Machine_Learning',
    'Tree_Methods',
)

# Echo the resolved path, then switch into it so that relative file names
# used afterwards resolve against this folder.
print(myPath)
os.chdir(myPath)

/home/osboxes/SparkClass/Spark_for_Machine_Learning/Tree_Methods


In [3]:
# Show which files are available in the current working directory
os.listdir('.')

['Tree Methods Code Along.ipynb',
 'Tree_Methods_Doc_Example.ipynb',
 '.ipynb_checkpoints',
 'dog_food.csv',
 'College.csv',
 'sample_libsvm_data.txt',
 'Tree_Methods_Consulting_Project.ipynb',
 'Tree_Methods_Consulting_Project_SOLUTION.ipynb']

In [4]:
# Session entry point
from pyspark.sql import SparkSession

# Pipeline wrapper and the tree-based classifiers
from pyspark.ml import Pipeline
from pyspark.ml.classification import (
    DecisionTreeClassifier,
    GBTClassifier,
    RandomForestClassifier,
)

# Evaluators: multiclass metrics and the binary-classification (AUC) evaluator
from pyspark.ml.evaluation import (
    BinaryClassificationEvaluator,
    MulticlassClassificationEvaluator,
)

# Feature tooling: StringIndexer for any string/binary columns,
# VectorAssembler and Vectors for building the feature vector
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.linalg import Vectors

# Get the active Spark session, or create one under this application name
name = 'tree'
spark = SparkSession.builder.appName(name).getOrCreate()




In [5]:
# Load the training data: the first CSV row supplies the column names and
# Spark infers each column's type from the values.
data = spark.read.csv(
    'dog_food.csv',
    header=True,
    inferSchema=True,
)

In [6]:
# Confirm the column names and the types Spark inferred from the CSV
data.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)



In [7]:
# Peek at the first record (returned as a one-element list of Row objects)
data.head(1)

[Row(A=4, B=2, C=12.0, D=3, Spoiled=1.0)]

In [8]:
# Summary statistics (count, mean, stddev, min, max) for every column
data.describe().show()

+-------+------------------+------------------+------------------+------------------+-------------------+
|summary|                 A|                 B|                 C|                 D|            Spoiled|
+-------+------------------+------------------+------------------+------------------+-------------------+
|  count|               490|               490|               490|               490|                490|
|   mean|  5.53469387755102| 5.504081632653061| 9.126530612244897| 5.579591836734694| 0.2857142857142857|
| stddev|2.9515204234399057|2.8537966089662063|2.0555451971054275|2.8548369309982857|0.45221563164613465|
|    min|                 1|                 1|               5.0|                 1|                0.0|
|    max|                10|                10|              14.0|                10|                1.0|
+-------+------------------+------------------+------------------+------------------+-------------------+



In [9]:
# Combine the four predictor columns A-D (every column except the 'Spoiled'
# label) into a single vector column called "features".
assembler = VectorAssembler(
    inputCols=['A', 'B', 'C', 'D'],
    outputCol="features",
)

In [10]:
# Run the assembler over the loaded data; the result keeps the original
# columns and gains the "features" vector column.

output = assembler.transform(data)

In [15]:
# Use a random forest classifier, with 'Spoiled' as the label column and the
# assembled 'features' vector as input.
# The seed is pinned so that the forest's random sampling, and therefore the
# fitted model and its feature importances, is repeatable from run to run.

rfc = RandomForestClassifier(labelCol='Spoiled', featuresCol='features', seed=42)

In [16]:
# Verify that the assembled frame now carries the "features" vector column

output.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)
 |-- features: vector (nullable = true)



In [18]:
# Keep only what the classifier consumes: the feature vector and the label
final_data = output.select('features', 'Spoiled')

# Preview the first rows of the modelling frame
final_data.show()

+-------------------+-------+
|           features|Spoiled|
+-------------------+-------+
| [4.0,2.0,12.0,3.0]|    1.0|
| [5.0,6.0,12.0,7.0]|    1.0|
| [6.0,2.0,13.0,6.0]|    1.0|
| [4.0,2.0,12.0,1.0]|    1.0|
| [4.0,2.0,12.0,3.0]|    1.0|
|[10.0,3.0,13.0,9.0]|    1.0|
| [8.0,5.0,14.0,5.0]|    1.0|
| [5.0,8.0,12.0,8.0]|    1.0|
| [6.0,5.0,12.0,9.0]|    1.0|
| [3.0,3.0,12.0,1.0]|    1.0|
| [9.0,8.0,11.0,3.0]|    1.0|
|[1.0,10.0,12.0,3.0]|    1.0|
|[1.0,5.0,13.0,10.0]|    1.0|
|[2.0,10.0,12.0,6.0]|    1.0|
|[1.0,10.0,11.0,4.0]|    1.0|
| [5.0,3.0,12.0,2.0]|    1.0|
| [4.0,9.0,11.0,8.0]|    1.0|
| [5.0,1.0,11.0,1.0]|    1.0|
|[4.0,9.0,12.0,10.0]|    1.0|
| [5.0,8.0,10.0,9.0]|    1.0|
+-------------------+-------+
only showing top 20 rows



In [19]:
# Train the random forest on the features/label frame
rfc_model = rfc.fit(final_data)

In [20]:
# Sanity check: the first row pairs the vector of the four inputs with its label
final_data.head(1)

[Row(features=DenseVector([4.0, 2.0, 12.0, 3.0]), Spoiled=1.0)]

In [22]:
# Find the main chemical contributing to spoilage via the forest's feature
# importances. Vector indices follow the assembler's inputCols order
# (0=A, 1=B, 2=C, 3=D), so the index with the largest weight names the column.

rfc_model.featureImportances

SparseVector(4, {0: 0.0182, 1: 0.0146, 2: 0.9428, 3: 0.0244})