In [54]:
#import all necessary libraries
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.stats import entropy

In [55]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"

In [56]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
column_names = ["ID", "Clump_Thickness", 
                "Uniformity_Cell_Size", "Uniformity_Cell_Shape", 
                "Marginal_Adhesion", "Single_Epithelial_Cell_Size",
                "Bare_Nuclei","Bland_Chromatin", 
                "Normal_Nucleoli", "Mitoses", "Class"]

In [57]:
df = pd.read_csv(url, names=column_names, na_values="?", dtype=str)
df = df.dropna()  
df = df.drop(columns=["ID"])  
df = df.astype(int) 

In [62]:
X = df.drop(columns=["Class"])
y = df["Class"].replace({2: 0, 4: 1}) 
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [63]:
Model= DecisionTreeClassifier(max_depth=2, min_samples_split=5, min_samples_leaf=2, random_state=42).fit(X_train, y_train)

In [64]:
def gini(p):
    return 1 - np.sum(np.square(p))

In [65]:
def entropy_func(p):
    return entropy(p, base=2)

In [66]:
def misclassification_error(p):
    return 1 - np.max(p)

In [67]:
feature_index = Model.tree_.feature[0]
threshold =Model.tree_.threshold[0] 
feature_name = X.columns[feature_index]

In [68]:
n_total =Model.tree_.value[0].sum()
p_root =Model.tree_.value[0][0]/n_total

In [69]:
gini_root = gini(p_root)
entropy_root = entropy_func(p_root)
mis_error_root = misclassification_error(p_root)

In [70]:
def information_gain(root_impurity, left_impurity, right_impurity, left_weight, right_weight):
    return root_impurity - (left_weight * left_impurity + right_weight * right_impurity)

In [71]:
left_node_samples =Model.tree_.value[1].sum()
right_node_samples =Model.tree_.value[2].sum()
left_p =Model.tree_.value[1][0] / left_node_samples
right_p = Model.tree_.value[2][0] / right_node_samples

In [72]:
gini_left =gini(left_p)
gini_right =gini(right_p)
entropy_left =entropy_func(left_p)
entropy_right =entropy_func(right_p)

In [73]:
left_weight = left_node_samples/n_total
right_weight = right_node_samples/n_total

In [74]:
info_gain = information_gain(gini_root, gini_left, gini_right, left_weight, right_weight)

In [75]:
print(f"First split feature: {feature_name}")
print(f"Decision boundary value: {threshold:.2f}")
print(f"Gini impurity: {gini_root:.4f}")
print(f"Entropy: {entropy_root:.4f}")
print(f"Misclassification error: {mis_error_root:.4f}")
print(f"Information gain: {info_gain:.4f}")

First split feature: Uniformity_Cell_Shape
Decision boundary value: 3.50
Gini impurity: 0.4549
Entropy: 0.9339
Misclassification error: 0.3498
Information gain: 0.2863
