# Decision Tree con GridSearchCV

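Questo notebook ottimizza un modello Decision Tree sui dati `selfMade` usando GridSearchCV (cross-validation a 5 fold) su diversi valori di `max_depth` (da 2 a 19) e sui criteri di split `gini` ed `entropy`; il modello migliore viene selezionato in base all'F1-score.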

In [1]:
import pandas as pd

# Feature matrices are kept as DataFrames so their column names stay
# available to later cells (the tree plot reads X_train.columns).
X_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/splitted/X_train.csv",
        "../data/splitted/X_test.csv",
    )
)

# Label files are flattened to 1-D arrays, the shape scikit-learn expects
# for targets. Presumably each file holds a single label column - verify.
y_train, y_test = (
    pd.read_csv(csv_path).to_numpy().ravel()
    for csv_path in (
        "../data/splitted/y_train.csv",
        "../data/splitted/y_test.csv",
    )
)

In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Search space: every tree depth from 2 to 19 combined with both split criteria.
param_grid = dict(
    max_depth=[*range(2, 20)],
    criterion=['gini', 'entropy'],
)

# Multi-metric scoring: each scorer is registered under its own name, so
# cv_results_ exposes mean_test_accuracy, mean_test_precision, and so on.
scoring = {
    metric_name: metric_name
    for metric_name in ('accuracy', 'precision', 'recall', 'f1')
}

# Fixed seed keeps the fitted trees reproducible across runs.
clf = DecisionTreeClassifier(random_state=42)

# 5-fold cross-validation on all available cores; with several scorers,
# `refit` names the one used to pick and retrain the best estimator.
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring=scoring,
    refit='f1',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print("Migliori parametri trovati:", grid_search.best_params_)

Migliori parametri trovati: {'criterion': 'entropy', 'max_depth': 7}


In [3]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

def evaluate_split(model, X, y, label):
    """Predict on one data split, print its classification metrics and return them.

    Replaces the two copy-pasted evaluation sections so that test and train
    metrics are computed and formatted in exactly the same way.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature matrix of the split.
        y: True labels of the split.
        label: Heading prefix printed before the metrics (e.g. ``"TEST"``).

    Returns:
        A ``(predictions, metrics)`` tuple, where ``metrics`` maps
        ``"accuracy"``, ``"precision"``, ``"recall"``, ``"f1"`` and
        ``"confusion_matrix"`` to the computed values.
    """
    predictions = model.predict(X)
    metrics = {
        "accuracy": accuracy_score(y, predictions),
        "precision": precision_score(y, predictions),
        "recall": recall_score(y, predictions),
        "f1": f1_score(y, predictions),
        "confusion_matrix": confusion_matrix(y, predictions),
    }

    print(f"{label} METRICS:")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall: {metrics['recall']:.4f}")
    print(f"F1-Score: {metrics['f1']:.4f}")
    print("Confusion Matrix:")
    print(metrics["confusion_matrix"])

    return predictions, metrics


# Estimator refitted on the full training set with the best F1 parameters.
best_model = grid_search.best_estimator_

# Held-out performance; the individual names are kept because the
# results-export cell writes them to disk.
y_pred, test_metrics = evaluate_split(best_model, X_test, y_test, "TEST")
accuracy = test_metrics["accuracy"]
precision = test_metrics["precision"]
recall = test_metrics["recall"]
f1 = test_metrics["f1"]
cm = test_metrics["confusion_matrix"]

# Training performance, shown next to the test figures to expose overfitting.
y_pred_train, train_metrics = evaluate_split(best_model, X_train, y_train, "TRAIN")

TEST METRICS:
Accuracy: 0.7260
Precision: 0.7599
Recall: 0.8835
F1-Score: 0.8171
Confusion Matrix:
[[ 89 151]
 [ 63 478]]
TRAIN METRICS:
Accuracy: 0.7589236683141132
Precision: 0.7807377049180327
Recall: 0.9064234734337827
F1-Score: 0.8388990825688073
Confusion Matrix:
[[ 239  321]
 [ 118 1143]]


In [4]:
import os
import joblib

# Output locations for the metrics report and the serialized model.
results_dir = "../results/classification_selfMade/decision_tree/gridsearch"
model_path = "../models/decision_tree_gridsearch_f1.joblib"

# Create the target folders up front so open() and joblib.dump() do not
# fail with FileNotFoundError when the folders are missing.
os.makedirs(results_dir, exist_ok=True)
os.makedirs(os.path.dirname(model_path), exist_ok=True)

with open(os.path.join(results_dir, "metrics_gridsearch_F1.txt"), "w") as f:
    f.write(f"Best params: {grid_search.best_params_}\n\n")
    f.write(f"Accuracy: {accuracy:.4f}\n")
    f.write(f"Precision: {precision:.4f}\n")
    f.write(f"Recall: {recall:.4f}\n")
    f.write(f"F1-Score: {f1:.4f}\n")
    f.write("\nClassification Report:\n")
    # The header used to be written with nothing under it; emit the
    # per-class report for the test predictions (classification_report is
    # imported together with the other metrics in the evaluation cell).
    f.write(classification_report(y_test, y_pred))
    f.write("\nConfusion Matrix:\n")
    f.write(str(cm))

# Persist the best estimator; left as the final expression so the cell
# still displays the list of written files.
joblib.dump(best_model, model_path)

['../models/decision_tree_gridsearch_f1.joblib']

In [5]:
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

# Render only the top three levels of the best tree as a readable preview.
fig, ax = plt.subplots(figsize=(20, 10))
plot_tree(
    best_model,
    filled=True,
    max_depth=3,
    # A plain list is accepted by every scikit-learn version, whereas a
    # pandas Index is rejected by parameter validation in some releases.
    feature_names=list(X_train.columns),
    # NOTE(review): assumes the negative class sorts first in
    # best_model.classes_ - confirm against the label encoding.
    class_names=["False", "True"],
    ax=ax,
)
fig.savefig("../results/classification_selfMade/decision_tree/gridsearch/decision_tree_grid_preview_f1.png")
# The figure is only written to disk; close it to free memory and keep it
# out of the inline output.
plt.close(fig)

In [6]:
import pandas as pd
import matplotlib.pyplot as plt

# One row per (criterion, max_depth) combination evaluated by the grid search.
results_df = pd.DataFrame(grid_search.cv_results_)

fig, ax = plt.subplots(figsize=(10, 6))

# One line per split criterion: cross-validated mean F1 against tree depth.
for criterion_name in results_df["param_criterion"].unique():
    per_criterion = results_df.loc[results_df["param_criterion"] == criterion_name]
    ax.plot(
        per_criterion["param_max_depth"],
        per_criterion["mean_test_f1"],
        marker='o',
        label=criterion_name.capitalize(),
    )

ax.set_title("Confronto F1-score medio: Gini vs Entropy")
ax.set_xlabel("max_depth")
ax.set_ylabel("Mean F1-Score (cross-validation)")
ax.legend()
ax.grid(True)
fig.tight_layout()

# Saved to disk only; closing the figure keeps it out of the inline output.
fig.savefig("../results/classification_selfMade/decision_tree/gridsearch/f1_score_comparison.png")
plt.close(fig)

In [7]:
# Rebuild the results table so this cell does not depend on the plotting cell.
results_df = pd.DataFrame(grid_search.cv_results_)

# Three best parameter combinations by cross-validated mean of each metric.
top_f1 = results_df.sort_values(by="mean_test_f1", ascending=False).iloc[:3]
top_acc = results_df.sort_values(by="mean_test_accuracy", ascending=False).iloc[:3]

# Columns identifying a candidate; the ranking metric is appended per table.
hyperparam_cols = ["param_max_depth", "param_criterion"]

for heading, ranking, metric_col in (
    ("Top 3 modelli per F1-score:", top_f1, "mean_test_f1"),
    ("Top 3 modelli per Accuracy:", top_acc, "mean_test_accuracy"),
):
    print(heading)
    display(ranking[hyperparam_cols + [metric_col]])



Top 3 modelli per F1-score:


Unnamed: 0,param_max_depth,param_criterion,mean_test_f1
23,7,entropy,0.820415
24,8,entropy,0.817496
21,5,entropy,0.817276


Top 3 modelli per Accuracy:


Unnamed: 0,param_max_depth,param_criterion,mean_test_accuracy
26,10,entropy,0.72653
23,7,entropy,0.725434
29,13,entropy,0.724325
