In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import numpy as np
import warnings

# Silence every warning category so the notebook output stays uncluttered.
warnings.filterwarnings("ignore")

# Breast-cancer dataset: feature matrix and class labels.
data = load_breast_cancer()
X, y = data.data, data.target

print("Target names (classes):", data.target_names)

# Running best over every train/test split evaluated below.
best_overall_accuracy = 0
best_random_state = 0
best_alpha = 0.0
best_model = None

# Loop over all random states from 0 to 10000 (inclusive). For each split,
# choose the pruning strength with the highest 5-fold cross-validation
# accuracy on the training portion, and remember the best split seen so far.
# NOTE(review): picking the split seed by its CV score is a selection over
# ~10k candidates, so the reported accuracies are likely optimistic.
for rs in range(0, 10001):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=rs)

    # cost_complexity_pruning_path fits its own copy of the estimator, so a
    # separate clf.fit() beforehand would only repeat the same work.
    clf = DecisionTreeClassifier(criterion='entropy', random_state=rs)
    path = clf.cost_complexity_pruning_path(X_train, y_train)

    # Floating-point round-off can yield tiny negative alphas, which
    # DecisionTreeClassifier rejects; clamp them to zero. np.unique then
    # drops exact duplicates and keeps the candidates in ascending order.
    ccp_alphas = np.unique(np.clip(path.ccp_alphas, 0.0, None))

    # (alpha, mean CV accuracy) for every candidate pruning strength.
    alpha_scores = [
        (
            alpha,
            cross_val_score(
                DecisionTreeClassifier(criterion='entropy', random_state=rs, ccp_alpha=alpha),
                X_train,
                y_train,
                cv=5,
            ).mean(),
        )
        for alpha in ccp_alphas
    ]

    # max() returns the first maximum, i.e. the smallest alpha among ties.
    current_best_alpha, current_best_score = max(alpha_scores, key=lambda x: x[1])

    # Strict '>' keeps the earliest seed when several splits tie.
    if current_best_score > best_overall_accuracy:
        best_overall_accuracy = current_best_score
        best_random_state = rs
        best_alpha = current_best_alpha
        best_model = DecisionTreeClassifier(criterion='entropy', random_state=rs, ccp_alpha=best_alpha)
        best_model.fit(X_train, y_train)

# Summary of the winning split and pruning strength.
print(
    f"\n Best Random State: {best_random_state}",
    f" Best Alpha: {best_alpha:.4f}",
    f" Highest Cross-Validation Accuracy: {best_overall_accuracy:.4f}",
    sep="\n",
)

# Rebuild the winning split (same seed, same test fraction) so the held-out
# rows match the ones the best model never saw during fitting.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=best_random_state,
)

# Score the best model on that held-out portion.
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f" Test Accuracy with Best Model: {test_accuracy:.4f}")

# Plot best tree
# The dataset stores feature and class names as NumPy arrays, while
# plot_tree's argument validation in some scikit-learn releases accepts
# only plain lists; convert them so the call works across versions.
plt.figure(figsize=(20, 20))
plot_tree(best_model,
          filled=True,
          feature_names=list(data.feature_names),
          class_names=list(data.target_names),
          rounded=True,
          fontsize=12)
plt.title("Best Decision Tree with Pruning and Optimal Random State", fontsize=16)
plt.tight_layout()
plt.show()


Target names (classes): ['malignant' 'benign']
