In [2]:
from pyspark.sql import SparkSession

In [3]:
# Create (or reuse) the Spark session for this notebook, with the app name "Practice".
spark = (
    SparkSession.builder
    .appName("Practice")
    .getOrCreate()
)
# Bare expression so the notebook displays the session object.
spark

In [4]:
# Load the mushrooms CSV: the first row is used as the header and column
# types are inferred by Spark.
df_pyspark = spark.read.csv(
    "mushrooms.csv",
    header=True,
    inferSchema=True,
)

In [5]:
# Preview the first 5 rows of the raw DataFrame.
df_pyspark.show(n=5)

+-----+---------+-----------+---------+-------+----+---------------+------------+---------+----------+-----------+----------+------------------------+------------------------+----------------------+----------------------+---------+----------+-----------+---------+-----------------+----------+-------+
|class|cap_shape|cap_surface|cap_color|bruises|odor|gill_attachment|gill_spacing|gill_size|gill_color|stalk_shape|stalk_root|stalk_surface_above_ring|stalk_surface_below_ring|stalk_color_above_ring|stalk_color_below_ring|veil_type|veil_color|ring_number|ring_type|spore_print_color|population|habitat|
+-----+---------+-----------+---------+-------+----+---------------+------------+---------+----------+-----------+----------+------------------------+------------------------+----------------------+----------------------+---------+----------+-----------+---------+-----------------+----------+-------+
|    p|        x|          s|        n|      t|   p|              f|           c|        n|   

In [6]:
# Print the schema of the loaded DataFrame (column names, types and nullability).
df_pyspark.printSchema()

root
 |-- class: string (nullable = true)
 |-- cap_shape: string (nullable = true)
 |-- cap_surface: string (nullable = true)
 |-- cap_color: string (nullable = true)
 |-- bruises: string (nullable = true)
 |-- odor: string (nullable = true)
 |-- gill_attachment: string (nullable = true)
 |-- gill_spacing: string (nullable = true)
 |-- gill_size: string (nullable = true)
 |-- gill_color: string (nullable = true)
 |-- stalk_shape: string (nullable = true)
 |-- stalk_root: string (nullable = true)
 |-- stalk_surface_above_ring: string (nullable = true)
 |-- stalk_surface_below_ring: string (nullable = true)
 |-- stalk_color_above_ring: string (nullable = true)
 |-- stalk_color_below_ring: string (nullable = true)
 |-- veil_type: string (nullable = true)
 |-- veil_color: string (nullable = true)
 |-- ring_number: string (nullable = true)
 |-- ring_type: string (nullable = true)
 |-- spore_print_color: string (nullable = true)
 |-- population: string (nullable = true)
 |-- habitat: string 

As the schema shows, every column is a string. Let's encode them as integer indices using PySpark's StringIndexer.

In [7]:
from pyspark.ml.feature import StringIndexer


In [8]:
# Every column of the dataset is categorical. The first entry ("class") is the
# label; the rest are grouped below by the part of the mushroom they describe.
categoricalColumns = [
    "class",
    # cap
    "cap_shape",
    "cap_surface",
    "cap_color",
    "bruises",
    "odor",
    # gill
    "gill_attachment",
    "gill_spacing",
    "gill_size",
    "gill_color",
    # stalk
    "stalk_shape",
    "stalk_root",
    "stalk_surface_above_ring",
    "stalk_surface_below_ring",
    "stalk_color_above_ring",
    "stalk_color_below_ring",
    # veil and ring
    "veil_type",
    "veil_color",
    "ring_number",
    "ring_type",
    # spores and environment
    "spore_print_color",
    "population",
    "habitat",
]

In [10]:
# Apply a StringIndexer to each categorical column, adding a "<column>_encoded"
# column next to the original one.
for categoricalCol in categoricalColumns:
    encodedCol = categoricalCol + "_encoded"
    # df_pyspark is reassigned inside this loop, so re-running the cell on a live
    # kernel would try to create output columns that already exist and fail.
    # Skipping columns that are already encoded makes the cell safe to re-run.
    if encodedCol in df_pyspark.columns:
        continue
    stringIndexer = StringIndexer(inputCol=categoricalCol,
                                  outputCol=encodedCol).fit(df_pyspark)
    df_pyspark = stringIndexer.transform(df_pyspark)
    # Convert the encoded column to an integer type after the transformation.
    df_pyspark = df_pyspark.withColumn(encodedCol,
                                       df_pyspark[encodedCol].cast('int'))

In [11]:
# Keep only the encoded columns. The names are derived from categoricalColumns
# (the list the encoding loop iterates over) instead of being typed out a second
# time, so the selection cannot drift out of sync with the encoding step.
# Column order is unchanged: label first ("class_encoded"), then the features.
encoded_df = df_pyspark.select([name + "_encoded" for name in categoricalColumns])

In [12]:
# Preview the first 5 rows of the encoded-only DataFrame.
encoded_df.show(5)

+-------------+-----------------+-------------------+-----------------+---------------+------------+-----------------------+--------------------+-----------------+------------------+-------------------+------------------+--------------------------------+--------------------------------+------------------------------+------------------------------+-----------------+------------------+-------------------+-----------------+-------------------------+------------------+---------------+
|class_encoded|cap_shape_encoded|cap_surface_encoded|cap_color_encoded|bruises_encoded|odor_encoded|gill_attachment_encoded|gill_spacing_encoded|gill_size_encoded|gill_color_encoded|stalk_shape_encoded|stalk_root_encoded|stalk_surface_above_ring_encoded|stalk_surface_below_ring_encoded|stalk_color_above_ring_encoded|stalk_color_below_ring_encoded|veil_type_encoded|veil_color_encoded|ring_number_encoded|ring_type_encoded|spore_print_color_encoded|population_encoded|habitat_encoded|
+-------------+-------------

In [13]:
from pyspark.ml.feature import VectorAssembler

In [14]:
# Assemble every encoded column except the label ("class_encoded") into a single
# "features" vector column. The input names are derived from categoricalColumns
# so the feature list stays in sync with the encoding step; order is unchanged
# (cap_shape_encoded ... habitat_encoded, 22 columns).
featureAssembler = VectorAssembler(
    inputCols=[name + "_encoded" for name in categoricalColumns if name != "class"],
    outputCol="features",
)

In [15]:
# Run the assembler on the encoded DataFrame; the result carries the "features" column.
output = featureAssembler.transform(encoded_df)

In [16]:
# Show the assembled feature vector next to the label for the first 5 rows.
output.select(["features", "class_encoded"]).show(n=5)


+--------------------+-------------+
|            features|class_encoded|
+--------------------+-------------+
|(22,[1,3,4,7,8,9,...|            1|
|(22,[1,2,3,4,8,9,...|            0|
|(22,[0,1,2,3,4,8,...|            0|
|(22,[2,3,4,7,8,9,...|            1|
|(22,[1,2,6,8,10,1...|            0|
+--------------------+-------------+
only showing top 5 rows



In [17]:
# Split into train and test sets with weights 0.8 / 0.2, using a fixed seed (17).
train, test = output.randomSplit(weights=[0.8, 0.2], seed=17)

In [18]:
# Report the row count of each split (each count() runs a Spark job).
for label, subset in (
    ("Size of training data: ", train),
    ("Size of testing data: ", test),
):
    print(label, subset.count())

Size of training data:  6471
Size of testing data:  1653


In [19]:
from pyspark.ml.classification import LogisticRegression

In [20]:
# Logistic regression on the assembled features, predicting "class_encoded",
# limited to 10 iterations.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="class_encoded",
    maxIter=10,
)

In [21]:
# Fit the logistic regression on the training split.
lrModel = lr.fit(train)

In [22]:
# Apply the fitted logistic regression model to the test split.
# NOTE: `predictions` is reassigned later in the notebook for the other models.
predictions = lrModel.transform(test)

In [23]:
# Preview the first 5 rows of the predictions DataFrame.
predictions.show(5)

+-------------+-----------------+-------------------+-----------------+---------------+------------+-----------------------+--------------------+-----------------+------------------+-------------------+------------------+--------------------------------+--------------------------------+------------------------------+------------------------------+-----------------+------------------+-------------------+-----------------+-------------------------+------------------+---------------+--------------------+--------------------+--------------------+----------+
|class_encoded|cap_shape_encoded|cap_surface_encoded|cap_color_encoded|bruises_encoded|odor_encoded|gill_attachment_encoded|gill_spacing_encoded|gill_size_encoded|gill_color_encoded|stalk_shape_encoded|stalk_root_encoded|stalk_surface_above_ring_encoded|stalk_surface_below_ring_encoded|stalk_color_above_ring_encoded|stalk_color_below_ring_encoded|veil_type_encoded|veil_color_encoded|ring_number_encoded|ring_type_encoded|spore_print_colo

In [24]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [25]:
# Create the evaluator with default settings. No metricName is passed, so the
# evaluator's default metric is used (documented as F1 in PySpark; confirm for
# the installed version).
evaluator = MulticlassClassificationEvaluator()


In [26]:
# Tell the evaluator which column holds the true labels.
evaluator.setLabelCol("class_encoded")


MulticlassClassificationEvaluator_29024fde9395

In [27]:
# Tell the evaluator which column holds the model's predicted labels.
evaluator.setPredictionCol("prediction")


MulticlassClassificationEvaluator_29024fde9395

In [28]:
# Score the logistic regression predictions with the evaluator's configured metric.
evaluator.evaluate(predictions)


0.9915305505142167

Résultat avec LogisticRegression

In [29]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [31]:
# Train a decision tree with maximum depth 3 on the training split.
dt = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="class_encoded",
    maxDepth=3,
)
dtModel = dt.fit(train)

In [32]:
# Apply the fitted decision tree to the test split.
# NOTE: this overwrites the `predictions` produced by the previous model.

predictions = dtModel.transform(test)

In [35]:
# Build the evaluator in one chained expression: true labels are read from
# "class_encoded" and model output from "prediction".
evaluator = (
    MulticlassClassificationEvaluator()
    .setLabelCol("class_encoded")
    .setPredictionCol("prediction")
)
# Bare expression so the cell still displays the evaluator object.
evaluator

MulticlassClassificationEvaluator_e4d88b01ddde

In [36]:
# The evaluator is a MulticlassClassificationEvaluator created without a
# metricName, so the value it returns is not an area under the ROC curve.
# Label the number with the metric the evaluator actually reports.
print("Test %s: " % evaluator.getMetricName(), evaluator.evaluate(predictions))

Test Area Under ROC:  0.9872947089637973


Résultat avec Decision Tree

In [37]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [38]:
# Train a random forest (500 trees, maximum depth 10) on the training split.
rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="class_encoded",
    numTrees=500,
    maxDepth=10,
)
rfModel = rf.fit(train)

In [39]:
# Apply the fitted random forest to the test split.
# NOTE: this overwrites the `predictions` produced by the previous model.
predictions = rfModel.transform(test)

In [40]:
# Evaluate the random forest predictions: true labels are read from
# "class_encoded" and model output from "prediction".
evaluator = MulticlassClassificationEvaluator()
evaluator.setLabelCol("class_encoded")
evaluator.setPredictionCol("prediction")
# No metricName is set, and MulticlassClassificationEvaluator does not compute
# area under the ROC curve, so label the value with the metric actually used.
print("Test %s: " % evaluator.getMetricName(), evaluator.evaluate(predictions))

Test Area Under ROC:  1.0
