# Decision Tree con GridSearchCV

Questo notebook ottimizza un modello Decision Tree sui dati `selfMade` usando GridSearchCV su diversi valori di `max_depth` e criteri di split (`gini`, `entropy`).

In [15]:
import pandas as pd

# Feature matrices stay as DataFrames so the column names remain available later.
X_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/splitted/X_train.csv", "../data/splitted/X_test.csv")
)

# Targets are flattened to 1-D arrays (the CSVs are read as single-column frames).
y_train, y_test = (
    pd.read_csv(csv_path).values.ravel()
    for csv_path in ("../data/splitted/y_train.csv", "../data/splitted/y_test.csv")
)

In [16]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Search space: every depth from 2 to 19 combined with both split criteria.
param_grid = {
    'max_depth': [*range(2, 20)],
    'criterion': ['gini', 'entropy'],
}

# Track several metrics during cross-validation; each scorer is registered
# under its own scikit-learn scorer name.
scoring = {metric: metric for metric in ('accuracy', 'precision', 'recall', 'f1')}

clf = DecisionTreeClassifier(random_state=42)

# 5-fold cross-validation on all cores; the final estimator is refit on the
# configuration with the best mean accuracy.
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring=scoring,
    refit='accuracy',
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)

print("Migliori parametri trovati:", grid_search.best_params_)

Migliori parametri trovati: {'criterion': 'entropy', 'max_depth': 9}


In [17]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Evaluate the refit winner of the grid search on the held-out test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Scalar metrics, computed in the same order they are reported below.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

# One print call with a newline separator emits the same four lines.
print(
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1-score: {f1:.4f}",
    sep="\n",
)
print("\nConfusion Matrix:\n", cm)
print("\nClassification Report:\n", report)

Accuracy: 0.7274
Precision: 0.7494
Recall: 0.9114
F1-score: 0.8225

Confusion Matrix:
 [[ 50 110]
 [ 32 329]]

Classification Report:
               precision    recall  f1-score   support

       False       0.61      0.31      0.41       160
        True       0.75      0.91      0.82       361

    accuracy                           0.73       521
   macro avg       0.68      0.61      0.62       521
weighted avg       0.71      0.73      0.70       521



In [18]:
import os
import joblib

# Persist the test-set metrics as a text report and the fitted model as a joblib file.
results_dir = "../results/classification_selfMade/decision_tree/gridsearch"
model_path = "../models/decision_tree_gridsearch_acc.joblib"

# Create the output folders up front: open(..., "w") and joblib.dump do not
# create missing parent directories, so a fresh checkout would otherwise fail.
os.makedirs(results_dir, exist_ok=True)
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Explicit encoding keeps the report identical regardless of the platform default.
with open(os.path.join(results_dir, "metrics_gridsearch_acc.txt"), "w", encoding="utf-8") as f:
    f.write(f"Migliori parametri trovati (in base a accuracy): {grid_search.best_params_}\n\n")
    f.write(f"Accuracy: {accuracy:.4f}\n")
    f.write(f"Precision: {precision:.4f}\n")
    f.write(f"Recall: {recall:.4f}\n")
    f.write(f"F1-Score: {f1:.4f}\n")
    f.write("\nClassification Report:\n")
    f.write(report)
    f.write("\nConfusion Matrix:\n")
    f.write(str(cm))

# Left as the last expression so the cell still displays the written file path.
joblib.dump(best_model, model_path)

['../models/decision_tree_gridsearch_acc.joblib']

In [19]:
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

# Save a picture of the tree (preview only: the drawing is truncated at max_depth=3).
preview_dir = "../results/classification_selfMade/decision_tree/gridsearch"

# savefig does not create missing folders, so make sure the target exists.
os.makedirs(preview_dir, exist_ok=True)

plt.figure(figsize=(20, 10))
plot_tree(
    best_model,
    filled=True,
    max_depth=3,
    # Pass a plain list: some scikit-learn releases validate this parameter
    # strictly and reject a pandas Index.
    feature_names=list(X_train.columns),
    class_names=["False", "True"],
)
plt.savefig(os.path.join(preview_dir, "decision_tree_grid_preview_acc.png"))
plt.close()

In [20]:
import pandas as pd
import matplotlib.pyplot as plt

# Tabulate the cross-validation results: one row per (criterion, max_depth) pair.
results_df = pd.DataFrame(grid_search.cv_results_)

# One curve per split criterion: mean CV accuracy as a function of tree depth.
fig, ax = plt.subplots(figsize=(10, 6))
for criterion in results_df["param_criterion"].unique():
    subset = results_df.loc[results_df["param_criterion"] == criterion]
    ax.plot(
        subset["param_max_depth"],
        subset["mean_test_accuracy"],
        marker='o',
        label=criterion.capitalize(),
    )

ax.set_title("Confronto accuracy medio: Gini vs Entropy")
ax.set_xlabel("max_depth")
ax.set_ylabel("Mean accuracy (cross-validation)")
ax.legend()
ax.grid(True)
fig.tight_layout()
fig.savefig("../results/classification_selfMade/decision_tree/gridsearch/accuracy_comparison.png")
plt.close(fig)