In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Practice").getOrCreate()
spark

In [2]:
df_pyspark = spark.read.csv("mushrooms.csv",inferSchema=True, header=True)

In [3]:
from pyspark.sql import SparkSession

# Initialisez la session Spark
spark = SparkSession.builder.appName("MushroomClassification").getOrCreate()

# Chargez le fichier CSV
data = spark.read.csv("mushrooms.csv", header=True, inferSchema=True)

# Affichez les premières lignes
data.show(5)

+-----+---------+-----------+---------+-------+----+---------------+------------+---------+----------+-----------+----------+------------------------+------------------------+----------------------+----------------------+---------+----------+-----------+---------+-----------------+----------+-------+
|class|cap-shape|cap-surface|cap-color|bruises|odor|gill-attachment|gill-spacing|gill-size|gill-color|stalk-shape|stalk-root|stalk-surface-above-ring|stalk-surface-below-ring|stalk-color-above-ring|stalk-color-below-ring|veil-type|veil-color|ring-number|ring-type|spore-print-color|population|habitat|
+-----+---------+-----------+---------+-------+----+---------------+------------+---------+----------+-----------+----------+------------------------+------------------------+----------------------+----------------------+---------+----------+-----------+---------+-----------------+----------+-------+
|    p|        x|          s|        n|      t|   p|              f|           c|        n|   

In [4]:
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline

# Indexez d'abord la colonne "class" pour en faire la colonne "label"
label_indexer = StringIndexer(inputCol="class", outputCol="label")

# Indexez toutes les autres colonnes catégorielles
indexers = [StringIndexer(inputCol=column, outputCol=column + "_indexed") for column in data.columns if column != "class"]

# Assembleur de caractéristiques
assembler = VectorAssembler(inputCols=[column + "_indexed" for column in data.columns if column != "class"], outputCol="features")

# Créez un pipeline intégrant toutes les étapes
pipeline = Pipeline(stages=[label_indexer] + indexers + [assembler])

# Transformez les données
processed_data = pipeline.fit(data).transform(data)

In [5]:
# Séparer les données en entraînement et test
train_data, test_data = processed_data.randomSplit([0.7, 0.3], seed=42)


In [6]:
from pyspark.ml.classification import RandomForestClassifier

# Initialisez le classificateur
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10)

# Entraînez le modèle
model = rf.fit(train_data)


In [7]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Prédictions sur les données de test
predictions = model.transform(test_data)

# Évaluation de la précision
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)

print(f"Précision du modèle : {accuracy * 100:.2f}%")

Précision du modèle : 100.00%


In [8]:
# Sauvegarder le modèle sur le disque
model.write().overwrite().save("mushroom_rf_model")

In [9]:
from pyspark.ml.classification import RandomForestClassificationModel

# Charger le modèle
loaded_model = RandomForestClassificationModel.load("mushroom_rf_model")

In [10]:
from pyspark.sql.functions import col

# Prédire les classes sur les données de test
predictions = loaded_model.transform(test_data)

# Ajouter une colonne "correct" qui vaut True si la prédiction est correcte, sinon False
predictions = predictions.withColumn("correct", col("label") == col("prediction"))

# Afficher les données de test avec les prédictions et l'indication de précision
predictions.select("label", "prediction", "correct", "features").show(10)

+-----+----------+-------+--------------------+
|label|prediction|correct|            features|
+-----+----------+-------+--------------------+
|  0.0|       0.0|   true|(22,[0,1,2,6,8,9,...|
|  0.0|       0.0|   true|(22,[0,1,2,6,8,9,...|
|  0.0|       0.0|   true|(22,[0,1,2,6,8,9,...|
|  0.0|       0.0|   true|(22,[0,1,2,6,8,9,...|
|  0.0|       0.0|   true|(22,[0,1,2,6,8,9,...|
|  0.0|       0.0|   true|(22,[0,1,2,6,8,9,...|
|  0.0|       0.0|   true|(22,[0,1,2,6,8,9,...|
|  0.0|       0.0|   true|(22,[0,1,2,6,8,9,...|
|  0.0|       0.0|   true|(22,[0,1,2,6,8,9,...|
|  0.0|       0.0|   true|(22,[0,1,2,6,8,9,...|
+-----+----------+-------+--------------------+
only showing top 10 rows



In [11]:
# Afficher les prédictions correctes
print("Prédictions correctes :")
predictions.filter(predictions["correct"] == True).select("label", "prediction", "features").show()

# Afficher les prédictions incorrectes
print("Prédictions incorrectes :")
predictions.filter(predictions["correct"] == False).select("label", "prediction", "features").show()

Prédictions correctes :
+-----+----------+--------------------+
|label|prediction|            features|
+-----+----------+--------------------+
|  0.0|       0.0|(22,[0,1,2,6,8,9,...|
|  0.0|       0.0|(22,[0,1,2,6,8,9,...|
|  0.0|       0.0|(22,[0,1,2,6,8,9,...|
|  0.0|       0.0|(22,[0,1,2,6,8,9,...|
|  0.0|       0.0|(22,[0,1,2,6,8,9,...|
|  0.0|       0.0|(22,[0,1,2,6,8,9,...|
|  0.0|       0.0|(22,[0,1,2,6,8,9,...|
|  0.0|       0.0|(22,[0,1,2,6,8,9,...|
|  0.0|       0.0|(22,[0,1,2,6,8,9,...|
|  0.0|       0.0|(22,[0,1,2,6,8,9,...|
|  0.0|       0.0|(22,[0,1,2,6,8,9,...|
|  0.0|       0.0|(22,[0,1,2,6,8,9,...|
|  0.0|       0.0|(22,[0,1,2,6,8,9,...|
|  0.0|       0.0|(22,[0,1,2,6,8,9,...|
|  0.0|       0.0|(22,[0,1,2,6,8,9,...|
|  0.0|       0.0|(22,[0,1,2,6,8,9,...|
|  0.0|       0.0|(22,[0,1,2,6,8,9,...|
|  0.0|       0.0|(22,[0,1,2,6,8,9,...|
|  0.0|       0.0|(22,[0,1,2,6,8,9,...|
|  0.0|       0.0|(22,[0,1,2,6,8,9,...|
+-----+----------+--------------------+
only showing top

In [12]:
# Afficher l'importance des caractéristiques
feature_importances = loaded_model.featureImportances
print("Importance des caractéristiques :", feature_importances)

Importance des caractéristiques : (22,[0,1,2,3,4,6,7,8,9,10,11,12,13,14,16,17,18,19,20,21],[6.73430939341632e-05,0.003008244801803672,0.020367368415827324,0.04716864956219263,0.21981570454821492,0.006567239773777263,0.04651639427302828,0.09521561419591489,0.010095238553742851,0.04015690044100515,0.11922966838016684,0.03917750980359718,0.0037602291507936813,0.01741671294246353,0.0003513704972491032,0.04018868682771317,0.04676864720365738,0.1633811240913697,0.03553154482872713,0.0452158086148213])


In [13]:
!pip install flask

Collecting flask
  Downloading flask-3.0.3-py3-none-any.whl.metadata (3.2 kB)
Collecting Werkzeug>=3.0.0 (from flask)
  Downloading werkzeug-3.0.6-py3-none-any.whl.metadata (3.7 kB)
Collecting itsdangerous>=2.1.2 (from flask)
  Downloading itsdangerous-2.2.0-py3-none-any.whl.metadata (1.9 kB)
Downloading flask-3.0.3-py3-none-any.whl (101 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hDownloading itsdangerous-2.2.0-py3-none-any.whl (16 kB)
Downloading werkzeug-3.0.6-py3-none-any.whl (227 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m228.0/228.0 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: Werkzeug, itsdangerous, flask
Successfully installed Werkzeug-3.0.6 flask-3.0.3 itsdangerous-2.2.0


In [20]:
# Importer les bibliothèques
from flask import Flask, request, jsonify
from pyspark.ml.classification import RandomForestClassificationModel
from pyspark.ml.linalg import Vectors
from werkzeug.serving import make_server
import threading

# Charger le modèle PySpark (Random Forest)
model = RandomForestClassificationModel.load("mushroom_rf_model")

# Initialiser l’application Flask
app = Flask(__name__)

# Créer le point d’entrée pour les prédictions
@app.route('/predict', methods=['POST'])
def predict():
    try:
        # Recevoir les données JSON du champignon à classifier
        data = request.json
        
        # Vérifiez que les données contiennent la clé 'features'
        if 'features' not in data:
            return jsonify({"error": "Invalid input format, 'features' key missing."}), 400

        # Convertir les caractéristiques en vecteur
        features = Vectors.dense(data["features"])
        
        # Faire la prédiction
        prediction = model.predict(features)
        
        # Retourner la prédiction
        return jsonify({"prediction": int(prediction)})
    except Exception as e:
        return jsonify({"error": str(e)})

# Lancer le serveur Flask dans un thread séparé pour le maintenir actif dans Jupyter
class FlaskThread(threading.Thread):
    def __init__(self, app):
        threading.Thread.__init__(self)
        self.server = make_server('0.0.0.0', 5000, app)
        self.ctx = app.app_context()
        self.ctx.push()

    def run(self):
        print("API démarrée sur http://localhost:5000")
        self.server.serve_forever()

    def shutdown(self):
        self.server.shutdown()

# Créer et démarrer le thread Flask
flask_thread = FlaskThread(app)
flask_thread.start()


Address already in use
Port 5000 is in use by another program. Either identify and stop that program, or start the server with a different port.


AttributeError: 'tuple' object has no attribute 'tb_frame'

INFO:werkzeug:172.17.0.1 - - [30/Oct/2024 12:33:34] "[31m[1mGET /predict HTTP/1.1[0m" 405 -


In [23]:
import requests

# Exemple de caractéristiques pour le test
features = [0.0,1.0, 4.0, 1.0, 4.0, 0.0, 0.0, 1.0, 7.0, 1.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 4.0]

# URL de l'API (vérifiez que localhost est correct, sinon remplacez-le par l'adresse IP de la machine Docker)
url = "http://localhost:5000/predict"

# Envoyer la requête POST avec les caractéristiques en JSON
try:
    response = requests.post(url, json={"features": features})
    
    # Vérifiez si la réponse est réussie (code 200)
    if response.status_code == 200:
        # Afficher la réponse de l'API
        print("Réponse de l'API :", response.json())
    else:
        print("Erreur lors de la requête :", response.status_code, response.text)

except requests.exceptions.RequestException as e:
    print("Une erreur s'est produite lors de la connexion à l'API :", str(e))

INFO:werkzeug:127.0.0.1 - - [30/Oct/2024 12:34:10] "POST /predict HTTP/1.1" 200 -


Réponse de l'API : {'prediction': 1}


In [25]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.39.0-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting cachetools<6,>=4.0 (from streamlit)
  Downloading cachetools-5.5.0-py3-none-any.whl.metadata (5.3 kB)
Collecting rich<14,>=10.14.0 (from streamlit)
  Downloading rich-13.9.3-py3-none-any.whl.metadata (18 kB)
Collecting tenacity<10,>=8.1.0 (from streamlit)
  Downloading tenacity-9.0.0-py3-none-any.whl.metadata (1.2 kB)
Collecting toml<2,>=0.10.1 (from streamlit)
  Downloading toml-0.10.2-py2.py3-none-any.whl.metadata (7.1 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting watchdog<6,>=2.1.5 (from streamlit)
  Downloading watchdog-5.0.3-py3-none-manylinux2014_x86_64.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.9/41.9 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting markdown-it-py>=2.2.0 (from rich<14,>=10.14.0->streamlit)
  Downloading markdown_it_py-3.0.0-py3-n

In [27]:
!pip install ipywidgets



In [28]:
import requests
import ipywidgets as widgets
from IPython.display import display

# Fonction pour envoyer la requête
def predict(features):
    url = "http://localhost:5000/predict"
    response = requests.post(url, json={"features": features})
    if response.status_code == 200:
        prediction = response.json().get("prediction")
        print(f"La prédiction est : {prediction}")
    else:
        print(f"Erreur lors de la requête: {response.status_code} - {response.text}")

# Créer des champs d'entrée pour les caractéristiques
feature_inputs = []
for i in range(22):  # Ajustez le nombre selon le nombre de caractéristiques
    feature_input = widgets.FloatText(value=0.0, description=f"Caractéristique {i + 1}:")
    feature_inputs.append(feature_input)
    display(feature_input)

# Créer un bouton pour envoyer la requête
button = widgets.Button(description="Envoyer")

# Fonction pour le bouton
def on_button_click(b):
    features = [input.value for input in feature_inputs]
    predict(features)

button.on_click(on_button_click)
display(button)


FloatText(value=0.0, description='Caractéristique 1:')

FloatText(value=0.0, description='Caractéristique 2:')

FloatText(value=0.0, description='Caractéristique 3:')

FloatText(value=0.0, description='Caractéristique 4:')

FloatText(value=0.0, description='Caractéristique 5:')

FloatText(value=0.0, description='Caractéristique 6:')

FloatText(value=0.0, description='Caractéristique 7:')

FloatText(value=0.0, description='Caractéristique 8:')

FloatText(value=0.0, description='Caractéristique 9:')

FloatText(value=0.0, description='Caractéristique 10:')

FloatText(value=0.0, description='Caractéristique 11:')

FloatText(value=0.0, description='Caractéristique 12:')

FloatText(value=0.0, description='Caractéristique 13:')

FloatText(value=0.0, description='Caractéristique 14:')

FloatText(value=0.0, description='Caractéristique 15:')

FloatText(value=0.0, description='Caractéristique 16:')

FloatText(value=0.0, description='Caractéristique 17:')

FloatText(value=0.0, description='Caractéristique 18:')

FloatText(value=0.0, description='Caractéristique 19:')

FloatText(value=0.0, description='Caractéristique 20:')

FloatText(value=0.0, description='Caractéristique 21:')

FloatText(value=0.0, description='Caractéristique 22:')

Button(description='Envoyer', style=ButtonStyle())