In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from joblib import Parallel, delayed
import matplotlib.pyplot as plt
import numpy as np
import warnings

# Suppress every warning so the cell output stays uncluttered.
warnings.filterwarnings("ignore")

# Breast-cancer dataset: features go into X, class labels into y.
data = load_breast_cancer()
X, y = data.data, data.target

print("Target names (classes):", data.target_names)

def evaluate_random_state_and_alpha(rs):
    """Cross-validate every candidate ``ccp_alpha`` for ONE random state.

    The module-level ``X`` / ``y`` arrays are split 70/30 using ``rs`` as the
    seed, and only the training part is used here. The cost-complexity
    pruning path of an entropy tree on that training part supplies the
    candidate alphas; each candidate is scored with 5-fold cross-validation.

    Args:
        rs: Seed used for the train/test split and for every tree built.

    Returns:
        A list of ``(rs, alpha, mean_cv_score)`` tuples, one per candidate
        alpha, in the order the pruning path reports them.
    """
    # The held-out part plays no role in model selection, so it is discarded.
    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=0.3, random_state=rs
    )

    # cost_complexity_pruning_path fits its own copy of the estimator, so an
    # explicit fit() beforehand would only duplicate work.
    base_tree = DecisionTreeClassifier(criterion='entropy', random_state=rs)
    path = base_tree.cost_complexity_pruning_path(X_train, y_train)

    # Floating-point round-off can yield tiny negative alphas, which
    # DecisionTreeClassifier rejects. Clamp them to zero so that a single
    # unlucky seed cannot abort the whole sweep.
    ccp_alphas = np.clip(path.ccp_alphas, 0.0, None)

    scores_for_alphas = []
    for alpha in ccp_alphas:
        model = DecisionTreeClassifier(
            criterion='entropy', random_state=rs, ccp_alpha=alpha
        )
        # n_jobs=1: this function is dispatched to joblib workers that already
        # occupy every core, so nested parallelism would only oversubscribe.
        scores = cross_val_score(model, X_train, y_train, cv=5, n_jobs=1)
        scores_for_alphas.append((rs, alpha, scores.mean()))

    return scores_for_alphas

# Number of consecutive seeds (0 .. N_RANDOM_STATES - 1) swept by the search.
N_RANDOM_STATES = 10001

# Run parallel evaluation across all random states (one task per seed).
all_results_nested = Parallel(n_jobs=-1, backend="loky")(
    delayed(evaluate_random_state_and_alpha)(rs)
    for rs in range(N_RANDOM_STATES)
)

# Flatten the per-seed lists into one list of (rs, alpha, mean_score) tuples.
results = [item for sublist in all_results_nested for item in sublist]

# Get best configuration: the tuple with the highest mean CV score.
# NOTE(review): the seed also decides the train/test split, so taking the
# maximum over thousands of seeds likely rewards a favourable split as much
# as a better model; treat the reported scores as optimistic.
best_rs, best_alpha, best_score = max(results, key=lambda record: record[2])

print(f"\nBest Random State: {best_rs}")
print(f"Best Alpha: {best_alpha:.4f}")
print(f"Highest Cross-Validation Accuracy: {best_score:.4f}")

# Train final model. The split uses the same test_size and seed as the
# search did, so it reproduces the training set the winning score came from.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=best_rs
)
final_model = DecisionTreeClassifier(
    criterion='entropy', random_state=best_rs, ccp_alpha=best_alpha
)
final_model.fit(X_train, y_train)

# Evaluate on test set
y_pred = final_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy with Best Model: {test_accuracy:.4f}")

# Plot the tree. The dataset exposes its names as NumPy arrays; they are
# converted to plain lists because some scikit-learn releases only accept
# lists for feature_names / class_names.
plt.figure(figsize=(20, 20))
plot_tree(final_model,
          filled=True,
          feature_names=list(data.feature_names),
          class_names=list(data.target_names),
          rounded=True,
          fontsize=12)
plt.title("Best Decision Tree with Optimized Random State and Pruning (ccp_alpha)", fontsize=16)
plt.tight_layout()
plt.show()


Target names (classes): ['malignant' 'benign']
