# Gradient Boosted Trees (GBTClassifier) — implemented with RandomForestClassifier for multi-class labels


In [None]:
from pyspark.ml.classification import GBTClassifier, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

# Start the Spark session (getOrCreate reuses an already-active one)
session_builder = SparkSession.builder.appName("GBTClassification")
spark = session_builder.getOrCreate()


In [None]:
# Load the augmented dataset: the first CSV row is read as the header and
# Spark infers the column types
dataset_path = "/content/drive/MyDrive/BigData/dataset_augmented.csv"
csv_reader = spark.read.option("header", "true").option("inferSchema", "true")
df = csv_reader.csv(dataset_path)

# Identify the symptom columns: every column whose name contains "Symptom_"
symptom_cols = [name for name in df.columns if "Symptom_" in name]


In [None]:
# Convert each symptom column to binary (1 = value present, 0 = null)
for symptom in symptom_cols:
    is_present = col(symptom).isNotNull()
    df = df.withColumn(symptom, when(is_present, 1).otherwise(0))


In [None]:
# Index the labels: encode the "Disease" column as numeric indices in a new
# "label" column
indexer = StringIndexer(inputCol="Disease", outputCol="label")
label_model = indexer.fit(df)
df = label_model.transform(df)


In [None]:
# Build the feature vector: pack all symptom columns into one "features" column
assembler = VectorAssembler(
    inputCols=symptom_cols,
    outputCol="features",
)
df = assembler.transform(df)


In [None]:
# Split the dataset into training (70%) and test (30%) sets with a fixed seed
split_weights = [0.7, 0.3]
train_data, test_data = df.randomSplit(split_weights, seed=42)


In [None]:
# Random Forest classifier, used instead of GBTClassifier because the task is
# multi-class: Spark ML's GBTClassifier supports binary classification only.
rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="label",
    numTrees=100,
    seed=42,  # fixed seed, like the train/test split, so runs are reproducible
)

# Train on the training split, then score the held-out test split; transform()
# adds the "prediction" column to the test rows.
model = rf.fit(train_data)
predictions = model.transform(test_data)

In [None]:
# Evaluate accuracy by comparing the "prediction" column against "label"
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print("Accuracy:", accuracy)
