In [1]:
import numpy as np

# Task 1 - Decision Trees

In [20]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import graphviz
from sklearn import tree
# Load the breast-cancer dataset and split off 20% of the samples for testing.
breast_cancer = load_breast_cancer()
X = breast_cancer.data
y = breast_cancer.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a Gini-criterion decision tree that is limited to a depth of 3.
clf = tree.DecisionTreeClassifier(
    criterion="gini", max_depth=3, random_state=42
).fit(X_train, y_train)

# Export the fitted tree as DOT source and render it under the name
# "DTC_breastcancer"; the value returned by render() is the cell's output.
dot_data = tree.export_graphviz(clf, out_file=None)
graph = graphviz.Source(dot_data)
graph.render("DTC_breastcancer")

array([0.02281065, 0.44444444, 0.        , 0.26035503, 0.29752066,
       0.        , 0.        , 0.        ])

## Task 1.1d - At what max depth do we have pure leaves?

In [26]:
# Grow a Gini tree one level at a time, starting at max_depth=1, until every
# leaf is pure (impurity 0) or the tree cannot grow any further.
try_tree = tree.DecisionTreeClassifier(criterion="gini",
                                       max_depth=1,
                                       random_state=42)
try_tree.fit(X_train, y_train)

while True:
    # Re-read the fitted structure after every fit.
    fitted = try_tree.tree_

    # Leaves are the nodes whose left and right child are both marked -1.
    is_leaf = (fitted.children_left == -1) & (fitted.children_right == -1)
    all_leaves_pure = fitted.impurity[is_leaf].sum() == 0

    # If the fitted tree is shallower than the allowed depth, the depth limit
    # was not what stopped it, so raising max_depth again would just refit the
    # same tree. Without this guard the loop never ends when pure leaves are
    # unreachable (e.g. identical samples that carry different labels).
    fully_grown = fitted.max_depth < try_tree.max_depth

    if all_leaves_pure or fully_grown:
        break

    try_tree.max_depth += 1
    try_tree.fit(X_train, y_train)

if all_leaves_pure:
    print(f"At max depth {try_tree.max_depth} there are only pure leaves")
else:
    print(f"The tree stops growing at depth {fitted.max_depth} "
          "without reaching only pure leaves")

At max depth 7 there are only pure leaves


## Task 1.1e - Compare accuracies of the two trees!

In [27]:
# Score both fitted trees on the test split.
acc_depth3 = clf.score(X_test, y_test)
acc_pure_leaves = try_tree.score(X_test, y_test)

print(f"Accuracy of tree with depth 3: {acc_depth3:.4f}")
print(f"Accuracy of tree with pure leaves (depth {try_tree.max_depth}): {acc_pure_leaves:.4f}")

# Choose the verdict first and print it once at the end:
#   less than 0.01 apart  -> treated as a tie
#   more than 0.05 apart  -> a clear winner
#   anything in between   -> a slight edge for the better tree
if abs(acc_depth3 - acc_pure_leaves) < 0.01:
    verdict = "\nTheir performance on the test set is nearly identical"
elif acc_depth3 > acc_pure_leaves + 0.05:
    verdict = "\nThe depth 3 tree clearly performs better"
elif acc_pure_leaves > acc_depth3 + 0.05:
    verdict = "\nThe deeper tree clearly performs better"
elif acc_depth3 > acc_pure_leaves:
    verdict = "\nAlmost the same, but depth 3 shows slightly better generalization"
else:
    verdict = "\nAlmost the same, but the deeper tree can get a few more samples right"

print(verdict)


Accuracy of tree with depth 3: 0.9474
Accuracy of tree with pure leaves (depth 7): 0.9474

Their performance on the test set is nearly identical


## Task 1.2 - Feature importance

In [29]:
# Feature importances reported by the fitted tree `clf`, alongside the
# dataset's feature names.
importances = clf.feature_importances_
feature_names = breast_cancer.feature_names

# Keep the (name, importance) pairs whose importance exceeds 5%.
important_features = [
    pair for pair in zip(feature_names, importances) if pair[1] > 0.05
]

# Report the selected features, most important first.
print("Features with importance > 5%:")
gini_ranked = sorted(important_features, key=lambda pair: pair[1], reverse=True)
for name, importance in gini_ranked:
    print(f"{name}: {importance:.2%}")


Features with importance > 5%:
mean concave points: 75.23%
worst concave points: 7.14%
worst radius: 5.69%
worst perimeter: 5.60%


In [30]:
# Calculate permutation importance
from sklearn.inspection import permutation_importance

# Permutation importance of `clf` on the test split, scored by accuracy with
# 50 repeats per feature and a fixed seed.
r = permutation_importance(
    clf,
    X_test,
    y_test,
    n_repeats=50,
    random_state=0,
    scoring='accuracy',
)

# Keep the (name, mean importance) pairs whose mean importance exceeds 5%.
perm_important_features = [
    pair for pair in zip(feature_names, r.importances_mean) if pair[1] > 0.05
]

print("\nFeatures with permutation importance > 5%:")
perm_ranked = sorted(perm_important_features, key=lambda pair: pair[1], reverse=True)
for name, importance in perm_ranked:
    print(f"{name}: {importance:.2%}")

# Put both measures side by side for the features selected by Gini importance.
print("\nComparing with Gini importance results:")
name_list = list(feature_names)  # built once; used to find each feature's position
for name, gini_imp in sorted(important_features, key=lambda pair: pair[1], reverse=True):
    perm_imp = r.importances_mean[name_list.index(name)]
    print(f"{name}:")
    print(f"  Gini importance: {gini_imp:.2%}")
    print(f"  Permutation importance: {perm_imp:.2%}")



Features with permutation importance > 5%:
area error: 15.05%
worst radius: 13.63%

Comparing with Gini importance results:
mean concave points:
  Gini importance: 75.23%
  Permutation importance: 1.05%
worst concave points:
  Gini importance: 7.14%
  Permutation importance: 1.51%
worst radius:
  Gini importance: 5.69%
  Permutation importance: 13.63%
worst perimeter:
  Gini importance: 5.60%
  Permutation importance: 2.30%
