# Modellazione con PySpark MLlib

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import col

# Start the Spark session (getOrCreate reuses an already-active session if there is one)
spark = (
    SparkSession.builder
    .appName("SymptomClassification")
    .getOrCreate()
)


In [None]:
# Load the symptom-augmented dataset: the first CSV row is used as the header
# and column types are inferred by Spark.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("/content/drive/MyDrive/BigData/dataset_augmented.csv")
)

# Collect the symptom columns: every column whose name contains "Symptom_"
symptom_cols = [name for name in df.columns if "Symptom_" in name]


In [None]:
# Turn every symptom column into a binary presence flag:
# 1 when the original value is non-null, 0 when it is null.
from pyspark.sql.functions import when

# All columns are rebuilt in a single select() instead of calling withColumn()
# once per symptom: each withColumn() adds a projection to the query plan, and
# the PySpark documentation warns that doing so in a loop can generate very
# large plans. Column names and column order are kept exactly as they were;
# non-symptom columns are passed through unchanged.
df = df.select(
    *[
        when(col(name).isNotNull(), 1).otherwise(0).alias(name)
        if name in symptom_cols
        else col(name)
        for name in df.columns
    ]
)


In [None]:
# Index the Disease column: the string values are mapped to a numeric "label" column
indexer = StringIndexer(
    inputCol="Disease",
    outputCol="label",
)
indexer_model = indexer.fit(df)  # fitted on the full dataset, before the train/test split
df = indexer_model.transform(df)


In [None]:
# Assemble the symptom columns into a single "features" vector column
assembler = VectorAssembler(
    inputCols=symptom_cols,
    outputCol="features",
)
df = assembler.transform(df)


In [None]:
# Train/test split: weights 0.7 / 0.3, with a fixed seed so the split is repeatable
train_data, test_data = df.randomSplit(weights=[0.7, 0.3], seed=42)


In [None]:
# Random Forest classifier: 100 trees, trained on the "features" vector
# to predict the indexed "label" column.
rf = RandomForestClassifier(
    numTrees=100,
    labelCol="label",
    featuresCol="features",
)

# Fit on the training split, then score the held-out test split
model = rf.fit(train_data)
predictions = model.transform(test_data)


In [None]:
# Evaluation: accuracy of the predicted class against the true label on the test split
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy:", accuracy)

# Save the model (optional).
# The path is absolute (leading "/"), like the dataset path used when loading
# the CSV; without the leading slash it would be a relative path and would not
# point at the same /content/drive/MyDrive/BigData folder.
# NOTE: save() raises if the target path already exists; use
# model.write().overwrite().save(...) to deliberately replace a saved model.
model.save("/content/drive/MyDrive/BigData/rf_model_symptoms")
