In [23]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix

In [2]:
# Load the breast cancer dataset bundled with scikit-learn; the returned
# object exposes the feature matrix as .data and the labels as .target.
BreastData = load_breast_cancer()

In [4]:
# Feature matrix: one row per sample, one column per feature.
X = BreastData["data"]
print('X shape is ' , X.shape)

X shape is  (569, 30)


In [5]:
# Target vector: one class label per sample, aligned with the rows of X.
y = BreastData["target"]
print('y shape is ' , y.shape)

y shape is  (569,)


In [7]:
# Hold out 33% of the samples for testing; the fixed seed keeps the shuffled
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=44,
    shuffle=True,
)

In [29]:
# Baseline decision tree: every hyperparameter is left at its default and
# only the random seed is pinned.
DecisionTreeClassifierModel = DecisionTreeClassifier(random_state=33) 

In [30]:
# Fit the baseline tree on the training split. As the last expression of the
# cell, the returned estimator is displayed as the cell output.
DecisionTreeClassifierModel.fit(X_train, y_train)

DecisionTreeClassifier(random_state=33)

In [31]:
# Score the baseline tree on the data it was fitted on and on the held-out
# test split; a large gap between the two indicates overfitting.
print(
    'DecisionTreeClassifierModel Train Score is : ',
    DecisionTreeClassifierModel.score(X_train, y_train),
)
print(
    'DecisionTreeClassifierModel Test Score is : ',
    DecisionTreeClassifierModel.score(X_test, y_test),
)

DecisionTreeClassifierModel Train Score is :  1.0
DecisionTreeClassifierModel Test Score is :  0.9148936170212766


In [32]:
# 5-fold cross-validation of the baseline tree, using the training split only
# so the test split stays untouched.
cv_scores = cross_val_score(DecisionTreeClassifierModel, X_train, y_train, cv=5)
mean_cv_score = np.mean(cv_scores)
std_cv_score = np.std(cv_scores)

print("Cross-validation scores:", cv_scores)
print("Mean CV score:", mean_cv_score)
print("Std CV score:", std_cv_score)

Cross-validation scores: [0.98701299 0.94736842 0.93421053 0.84210526 0.92105263]
Mean CV score: 0.9263499658236501
Std CV score: 0.04755795977251937


In [20]:
# Candidate hyperparameter values for the randomized search to sample from.
param_dist = {
    "max_depth": [3, None],
    "max_features": np.arange(1, X.shape[1] + 1),  # 1 .. number of columns in X
    "min_samples_leaf": np.arange(1, 10),  # 1 .. 9
    "criterion": ["gini", "entropy"],
}

In [24]:
# Randomized hyperparameter search over param_dist for the baseline tree:
# 20 sampled settings, each scored with 5-fold cross-validation, with a fixed
# seed so the same settings are sampled on every run.
rs = RandomizedSearchCV(
    DecisionTreeClassifierModel,
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    n_jobs=-1,
    random_state=42,
)

# Run the search on the training split. Without this call the search object is
# unfitted and reading rs.best_params_ later raises AttributeError on a fresh
# kernel.
rs.fit(X_train, y_train)

In [26]:
# Get the best hyperparameters found by the randomized search.
# NOTE: rs must already have been fitted (rs.fit(X_train, y_train)); the
# best_params_ attribute does not exist on an unfitted search object.
best_params = rs.best_params_
print("Best hyperparameters:", best_params)

Best hyperparameters: {'min_samples_leaf': 6, 'max_features': 5, 'max_depth': 3, 'criterion': 'entropy'}


In [28]:
# Build a fresh tree from the hyperparameters selected by the search and fit
# it on the whole training split. The fit call stays as the last expression so
# the fitted estimator is shown as the cell output.
# NOTE(review): random_state=42 here differs from the random_state=33 of the
# estimator that was passed to the search - confirm this is intended.
dt_best = DecisionTreeClassifier(**best_params, random_state=42)
dt_best.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=3, max_features=5,
                       min_samples_leaf=6, random_state=42)

In [33]:
# Predict labels for the held-out test split with the tuned tree.
y_predict = dt_best.predict(X_test)

In [35]:
# Confusion matrix of the tuned tree on the test split, called as
# confusion_matrix(y_true, y_pred).
# Uses y_predict, the predictions from dt_best.predict(X_test); the name
# y_pred is never defined in this notebook and raised NameError on a fresh
# kernel.
conf_matrix = confusion_matrix(y_test, y_predict)
print("Confusion matrix:\n", conf_matrix)


Confusion matrix:
 [[ 58  10]
 [  5 115]]
