In [4]:
from pyspark.sql import SparkSession

In [5]:
# Reuse the active Spark session if there is one; otherwise start a new one
# registered under the application name 'preservatives'.
spark = (
    SparkSession.builder
    .appName('preservatives')
    .getOrCreate()
)

In [6]:
# Load the dog-food CSV: the first row supplies the column names and the
# column types are inferred from the values instead of defaulting to strings.
data = spark.read.csv(
    'data/Tree_Methods/dog_food.csv',
    header=True,
    inferSchema=True,
)

### In this example we are interested in feature selection: finding which of the columns A-D best predicts `Spoiled`.

In [7]:
# Peek at the first record; head(1) returns a one-element list of Row objects.
data.head(1)

[Row(A=4, B=2, C=12.0, D=3, Spoiled=1.0)]

In [8]:
from pyspark.ml.feature import VectorAssembler

In [9]:
# Pack the four columns A-D, in that order, into a single vector column
# named 'features'.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=['A', 'B', 'C', 'D'],
)

In [10]:
# Apply the assembler: the result keeps the original columns and gains the
# 'features' vector column built from A, B, C and D.
output = assembler.transform(data)

In [11]:
# Print the leading rows to check that 'features' holds [A, B, C, D] per row.
output.show()

+---+---+----+---+-------+-------------------+
|  A|  B|   C|  D|Spoiled|           features|
+---+---+----+---+-------+-------------------+
|  4|  2|12.0|  3|    1.0| [4.0,2.0,12.0,3.0]|
|  5|  6|12.0|  7|    1.0| [5.0,6.0,12.0,7.0]|
|  6|  2|13.0|  6|    1.0| [6.0,2.0,13.0,6.0]|
|  4|  2|12.0|  1|    1.0| [4.0,2.0,12.0,1.0]|
|  4|  2|12.0|  3|    1.0| [4.0,2.0,12.0,3.0]|
| 10|  3|13.0|  9|    1.0|[10.0,3.0,13.0,9.0]|
|  8|  5|14.0|  5|    1.0| [8.0,5.0,14.0,5.0]|
|  5|  8|12.0|  8|    1.0| [5.0,8.0,12.0,8.0]|
|  6|  5|12.0|  9|    1.0| [6.0,5.0,12.0,9.0]|
|  3|  3|12.0|  1|    1.0| [3.0,3.0,12.0,1.0]|
|  9|  8|11.0|  3|    1.0| [9.0,8.0,11.0,3.0]|
|  1| 10|12.0|  3|    1.0|[1.0,10.0,12.0,3.0]|
|  1|  5|13.0| 10|    1.0|[1.0,5.0,13.0,10.0]|
|  2| 10|12.0|  6|    1.0|[2.0,10.0,12.0,6.0]|
|  1| 10|11.0|  4|    1.0|[1.0,10.0,11.0,4.0]|
|  5|  3|12.0|  2|    1.0| [5.0,3.0,12.0,2.0]|
|  4|  9|11.0|  8|    1.0| [4.0,9.0,11.0,8.0]|
|  5|  1|11.0|  1|    1.0| [5.0,1.0,11.0,1.0]|
|  4|  9|12.0

In [12]:
from pyspark.ml.classification import RandomForestClassifier

In [16]:
# Random-forest training is stochastic (bootstrap sampling and random feature
# subsets), so pin the seed: without it the feature importances reported below
# can change every time the kernel is restarted and the notebook is re-run.
# featuresCol is spelled out (it is the default) to make the link to the
# VectorAssembler's output column explicit.
model = RandomForestClassifier(
    labelCol='Spoiled',
    featuresCol='features',
    seed=42,
)

In [17]:
# Fit on the assembled DataFrame as-is (no train/test split is made here);
# the goal is to inspect feature importances, not to score predictions.
fitted_model = model.fit(output)

In [18]:
# Importance scores of the fitted forest, one entry per position in the
# 'features' vector.
feature_importance = fitted_model.featureImportances

In [19]:
# Printed in sparse-vector form: (size, [indices], [values]).
print(feature_importance)

(4,[0,1,2,3],[0.028816975228224684,0.02093729139700888,0.9227506619199861,0.027495071454780362])


In [20]:
# Check the concrete vector class so we know which attributes are available
# (the loop below relies on .indices and .values).
type(feature_importance)

pyspark.ml.linalg.SparseVector

In [25]:
# Report each stored entry of the sparse vector. An index is a position in the
# assembler's input column list, so 0=A, 1=B, 2=C, 3=D.
for index, value in zip(
    feature_importance.indices,
    feature_importance.values,
):
    print(f'Feature {index} - Importance {value}')

Feature 0 - Importance 0.028816975228224684
Feature 1 - Importance 0.02093729139700888
Feature 2 - Importance 0.9227506619199861
Feature 3 - Importance 0.027495071454780362
