In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import col
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

# Start the Spark session (getOrCreate reuses an already-running one if present)
spark = (
    SparkSession.builder
    .appName("Evaluation")
    .getOrCreate()
)


In [None]:
# Dataset with weighted symptoms: the first CSV row is the header and column
# types are inferred by Spark.
dataset_path = "/content/drive/MyDrive/BigData/dataset_weighted.csv"
df = spark.read.options(header="true", inferSchema="true").csv(dataset_path)

# Every column whose name contains "Symptom_" is used as a model feature.
symptom_cols = [name for name in df.columns if "Symptom_" in name]


In [None]:
# Encode the target, assemble the feature vector and split into train/test.
#
# This cell rebinds `df` to its own output. Without the drop below, running it
# a second time would fail because the "label" output column already exists.
# DataFrame.drop ignores names that are not in the schema, so the first run is
# unaffected.
source_df = df.drop("label", "features")

# Map each "Disease" string to a numeric class index stored in "label".
indexer = StringIndexer(inputCol="Disease", outputCol="label")
indexed_df = indexer.fit(source_df).transform(source_df)

# Pack all symptom columns into the single vector column expected by Spark ML.
assembler = VectorAssembler(inputCols=symptom_cols, outputCol="features")
df = assembler.transform(indexed_df)

# 70/30 split; the fixed seed keeps the partition repeatable between runs.
train_data, test_data = df.randomSplit([0.7, 0.3], seed=42)


In [None]:
# Train a 100-tree random forest on the training split.
# The seed is pinned explicitly: the forest is a randomized estimator, and
# without a fixed seed the fitted model (and every metric below) is not
# guaranteed to be repeatable across kernel restarts.
rf = RandomForestClassifier(featuresCol="features", labelCol="label", numTrees=100, seed=42)
model = rf.fit(train_data)

# Score the held-out split. The result is consumed by several separate Spark
# actions later (four evaluator calls plus a toPandas), so cache it to avoid
# re-running the whole load -> index -> assemble -> predict lineage each time.
predictions = model.transform(test_data).cache()


In [None]:
def _metric_evaluator(metric_name):
    """Return an evaluator that scores "prediction" against "label" with the given metric."""
    return MulticlassClassificationEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName=metric_name,
    )


# One evaluator per reported metric.
evaluator_acc = _metric_evaluator("accuracy")
evaluator_f1 = _metric_evaluator("f1")
evaluator_prec = _metric_evaluator("weightedPrecision")
evaluator_rec = _metric_evaluator("weightedRecall")

# Print the metrics on the test-set predictions, in a fixed order.
for heading, evaluator in (
    ("Accuracy:", evaluator_acc),
    ("F1-score:", evaluator_f1),
    ("Precision:", evaluator_prec),
    ("Recall:", evaluator_rec),
):
    print(heading, evaluator.evaluate(predictions))


In [None]:
# Convert the two columns needed for the sklearn metrics to pandas.
pdf = predictions.select("label", "prediction").toPandas()

# Recover the disease names behind the numeric class indices.
# NOTE(review): StringIndexer is expected to record the index -> string mapping
# in the "label" column metadata under "ml_attr"/"vals" - confirm on this Spark
# version. If it is missing, fall back to the raw indices seen in the data.
class_names = predictions.schema["label"].metadata.get("ml_attr", {}).get("vals")
if class_names:
    # Labels are doubles 0.0 .. n_classes-1; list every class explicitly.
    class_ids = np.arange(len(class_names), dtype=float)
else:
    class_ids = np.union1d(pdf["label"], pdf["prediction"])
    class_names = [str(class_id) for class_id in class_ids]

# Confusion matrix.
# Passing `labels` fixes the row/column order and keeps a row/column for every
# class, even one that never shows up in the test split. Without it the matrix
# is built only from the values present, so positions stop matching class indices.
cm = confusion_matrix(pdf["label"], pdf["prediction"], labels=class_ids)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=False,
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
fig.tight_layout()
plt.show()


In [None]:
# Full per-class report (precision / recall / F1 / support).
# The classes reported are exactly the ones sklearn would pick by default
# (sorted union of true and predicted values), so the rows and their order are
# unchanged; only the row names change from float indices to disease names.
present_ids = np.union1d(pdf["label"], pdf["prediction"])

# NOTE(review): StringIndexer is expected to record the index -> string mapping
# in the "label" column metadata under "ml_attr"/"vals" - confirm on this Spark
# version. If it is missing, keep the numeric indices as row names.
class_names = predictions.schema["label"].metadata.get("ml_attr", {}).get("vals")
if class_names:
    row_names = [class_names[int(class_id)] for class_id in present_ids]
else:
    row_names = [str(class_id) for class_id in present_ids]

print(
    classification_report(
        pdf["label"],
        pdf["prediction"],
        labels=present_ids,
        target_names=row_names,
    )
)
