In [1]:
from pyspark.sql import SparkSession

# Build (or reuse, if one already exists) a Spark session running on the
# "local" master, registered under this project's application name.
spark = (
    SparkSession.builder
    .master("local")
    .appName('tree-methods-project')
    .getOrCreate()
)

In [2]:
# Load the dog-food dataset: the first CSV row supplies the column names and
# the column types are inferred from the data.
# NOTE(review): this is an absolute, machine-specific path -- consider making
# it relative to a configurable data directory.
csv_path = 'D:/learn-ab/learning-PySpark/sample-data/dog-food.csv'
df = spark.read.csv(csv_path, header=True, inferSchema=True)

# Preview the first rows of the loaded frame.
df.show()

+---+---+----+---+-------+
|  A|  B|   C|  D|Spoiled|
+---+---+----+---+-------+
|  4|  2|12.0|  3|    1.0|
|  5|  6|12.0|  7|    1.0|
|  6|  2|13.0|  6|    1.0|
|  4|  2|12.0|  1|    1.0|
|  4|  2|12.0|  3|    1.0|
| 10|  3|13.0|  9|    1.0|
|  8|  5|14.0|  5|    1.0|
|  5|  8|12.0|  8|    1.0|
|  6|  5|12.0|  9|    1.0|
|  3|  3|12.0|  1|    1.0|
|  9|  8|11.0|  3|    1.0|
|  1| 10|12.0|  3|    1.0|
|  1|  5|13.0| 10|    1.0|
|  2| 10|12.0|  6|    1.0|
|  1| 10|11.0|  4|    1.0|
|  5|  3|12.0|  2|    1.0|
|  4|  9|11.0|  8|    1.0|
|  5|  1|11.0|  1|    1.0|
|  4|  9|12.0| 10|    1.0|
|  5|  8|10.0|  9|    1.0|
+---+---+----+---+-------+
only showing top 20 rows



In [3]:
# Print the inferred column types to confirm every column was parsed as numeric.
df.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)



In [4]:
from pyspark.ml.feature import VectorAssembler

In [5]:
# List the column names to identify the feature columns and the label column.
df.columns

['A', 'B', 'C', 'D', 'Spoiled']

In [6]:
# Combine the four ingredient columns into a single 'features' vector column,
# then keep only that vector and the 'Spoiled' label for model training.
feature_cols = ['A', 'B', 'C', 'D']
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

assembled = assembler.transform(df)
df_final = assembled.select('features', 'Spoiled')

# Preview the training frame.
df_final.show()

+-------------------+-------+
|           features|Spoiled|
+-------------------+-------+
| [4.0,2.0,12.0,3.0]|    1.0|
| [5.0,6.0,12.0,7.0]|    1.0|
| [6.0,2.0,13.0,6.0]|    1.0|
| [4.0,2.0,12.0,1.0]|    1.0|
| [4.0,2.0,12.0,3.0]|    1.0|
|[10.0,3.0,13.0,9.0]|    1.0|
| [8.0,5.0,14.0,5.0]|    1.0|
| [5.0,8.0,12.0,8.0]|    1.0|
| [6.0,5.0,12.0,9.0]|    1.0|
| [3.0,3.0,12.0,1.0]|    1.0|
| [9.0,8.0,11.0,3.0]|    1.0|
|[1.0,10.0,12.0,3.0]|    1.0|
|[1.0,5.0,13.0,10.0]|    1.0|
|[2.0,10.0,12.0,6.0]|    1.0|
|[1.0,10.0,11.0,4.0]|    1.0|
| [5.0,3.0,12.0,2.0]|    1.0|
| [4.0,9.0,11.0,8.0]|    1.0|
| [5.0,1.0,11.0,1.0]|    1.0|
|[4.0,9.0,12.0,10.0]|    1.0|
| [5.0,8.0,10.0,9.0]|    1.0|
+-------------------+-------+
only showing top 20 rows



In [7]:
from pyspark.ml.classification import RandomForestClassifier

In [8]:
# Train a random forest that predicts the 'Spoiled' label from the assembled
# 'features' vector.
# Random forests are stochastic (bootstrapped rows, random feature subsets), so
# the seed is pinned to make the fitted model -- and the feature importances
# derived from it -- reproducible under Restart & Run All.
rf_clf = RandomForestClassifier(
    featuresCol='features',
    labelCol='Spoiled',
    seed=42
)
rf_clf_model = rf_clf.fit(df_final)

In [9]:
# Show the fitted forest's importance score for each position of the 'features' vector.
rf_clf_model.featureImportances

SparseVector(4, {0: 0.0218, 1: 0.0263, 2: 0.9314, 3: 0.0205})

In [10]:
# Map every feature name to its importance score.
#
# The names are taken from the assembler itself so they always match the
# order in which the columns were packed into the 'features' vector, rather
# than relying on a positional slice of the DataFrame's columns.
feat_names = assembler.getInputCols()

# `featureImportances` is a SparseVector: its `.values` attribute stores only
# the non-zero entries, so zipping it against the names would silently shift
# scores onto the wrong columns (and drop a feature) whenever some feature has
# zero importance.  `toArray()` always yields one score per feature, in order.
feat_imp = rf_clf_model.featureImportances.toArray()

feat_imp_dict = dict(zip(feat_names, feat_imp))

In [11]:
# Pick the (name, importance) pair with the highest importance score and
# report it as a percentage rounded to two decimals.
most_imp_col_name, most_imp_col_val = max(
    feat_imp_dict.items(),
    key=lambda name_and_score: name_and_score[1],
)
message = f'The "{most_imp_col_name}" chemical has the maximum influence on spoiling the dog food and its influence is almost {round(most_imp_col_val *100, 2)}%'
print(message)

The "C" chemical has the maximum influence on spoiling the dog food and its influence is almost 93.14%
