In [1]:
#import necessary libraries
import pandas as pd                 #for data manipulation
import numpy as np                  #for numerical operations
from sklearn.model_selection import train_test_split  # For splitting the dataset

#ff you haven't already installed this package, you can uncomment this and run it in a notebook cell:
#!pip install ucimlrepo

from ucimlrepo import fetch_ucirepo  #to fetch datasets from the UCI Machine Learning Repository

from sklearn.tree import DecisionTreeClassifier      #decision Tree model
from sklearn.metrics import accuracy_score           #to evaluate model performance

In [2]:
#fetch the Breast Cancer Wisconsin (Diagnostic) dataset (ID: 17 in UCI ML Repo)
breast_cancer = fetch_ucirepo(id=17)

#extract features (X) and target (y) 
X = breast_cancer.data.features
y = breast_cancer.data.targets

#convert target values to binary numeric format: 'M' (malignant) -> 1, 'B' (benign) -> 0
y = y.replace({'M': 1, 'B': 0})

#split the data into training and testing sets (80% train, 20% test), set random_state for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [3]:
#try different max_depth values for the Decision Tree to evaluate performance
for depth in range(1, 11):
    #create a DecisionTreeClassifier with a specific max_depth
    clf = DecisionTreeClassifier(max_depth=depth, random_state=42)
    
    #train the classifier on training data
    clf.fit(X_train, y_train)
    
    #predict the labels on the test set
    y_pred = clf.predict(X_test)
    
    #calculate the accuracy of the predictions
    test_accuracy = accuracy_score(y_test, y_pred)
    
    #print test accuracy for each depth
    print(f"max_depth = {depth} | Test Accuracy: {test_accuracy:.2f}")

max_depth = 1 | Test Accuracy: 0.89
max_depth = 2 | Test Accuracy: 0.93
max_depth = 3 | Test Accuracy: 0.95
max_depth = 4 | Test Accuracy: 0.95
max_depth = 5 | Test Accuracy: 0.95
max_depth = 6 | Test Accuracy: 0.94
max_depth = 7 | Test Accuracy: 0.95
max_depth = 8 | Test Accuracy: 0.95
max_depth = 9 | Test Accuracy: 0.95
max_depth = 10 | Test Accuracy: 0.95


In [4]:
#select the best-performing depth (based on previous loop results)
best_depth = 4

#train a final Decision Tree using that depth
final_clf = DecisionTreeClassifier(max_depth=best_depth, random_state=42)
final_clf.fit(X_train, y_train)

#predict the labels on the test set
y_pred = final_clf.predict(X_test)

In [5]:
#create a list to store indices where predictions were incorrect
misclassified_indices = []

#loop through each prediction and compare with actual label
for i in range(len(y_pred)):
    if y_pred[i] != y_test.values[i]:
        misclassified_indices.append(i)

#print details of each misclassified sample
print("\n--- Misclassified Samples ---")
print(f"Total Misclassified: {len(misclassified_indices)}\n")

for idx in misclassified_indices:
    print(f"Index: {idx}")
    print(f"Predicted: {y_pred[idx]}, Actual: {y_test.values[idx]}")
    print("Feature values:")
    print(X_test.iloc[idx])
    print("-" * 40)  


--- Misclassified Samples ---
Total Misclassified: 6

Index: 8
Predicted: 1, Actual: [0]
Feature values:
radius1                13.340000
texture1               15.860000
perimeter1             86.490000
area1                 520.000000
smoothness1             0.107800
compactness1            0.153500
concavity1              0.116900
concave_points1         0.069870
symmetry1               0.194200
fractal_dimension1      0.069020
radius2                 0.286000
texture2                1.016000
perimeter2              1.535000
area2                  12.960000
smoothness2             0.006794
compactness2            0.035750
concavity2              0.039800
concave_points2         0.013830
symmetry2               0.021340
fractal_dimension2      0.004603
radius3                15.530000
texture3               23.190000
perimeter3             96.660000
area3                 614.900000
smoothness3             0.153600
compactness3            0.479100
concavity3              0.485800
con

In [7]:
#printing the number of values in the test set 
print(len(X_test))

114


In [15]:
#create a copy of the feature data
df = X.copy()
#add target column
df['target'] = y 

#split dataframe into two, one for benign and one for malignant
benign_df = df[df['target'] == 0]
malignant_df = df[df['target'] == 1]

#calculate the average feature values for each tumor
benign_avg = benign_df.drop(columns=['target']).mean()  
malignant_avg = malignant_df.drop(columns=['target']).mean()

#combine the averages into a dataframe to print side by side
comparison_df = pd.DataFrame({
    'Benign Average': benign_avg,
    'Malignant Average': malignant_avg
})

#print the side-by-side comparison
print(comparison_df)

                    Benign Average  Malignant Average
radius1                  12.146524          17.462830
texture1                 17.914762          21.604906
perimeter1               78.075406         115.365377
area1                   462.790196         978.376415
smoothness1               0.092478           0.102898
compactness1              0.080085           0.145188
concavity1                0.046058           0.160775
concave_points1           0.025717           0.087990
symmetry1                 0.174186           0.192909
fractal_dimension1        0.062867           0.062680
radius2                   0.284082           0.609083
texture2                  1.220380           1.210915
perimeter2                2.000321           4.323929
area2                    21.135148          72.672406
smoothness2               0.007196           0.006780
compactness2              0.021438           0.032281
concavity2                0.025997           0.041824
concave_points2           0.