<a href="https://colab.research.google.com/github/apekshamehta/machine-learning-examples/blob/main/decision_tree_implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Downloading scikit_learn-1.7.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (9.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m48.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.6.1
    Uninstalling scikit-learn-1.6.1:
      Successfully uninstalled scikit-learn-1.6.1
Successfully installed scikit-learn-1.7.2


In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report,fbeta_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer

import matplotlib.pyplot as plt

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# Print the current working directory (shell command run through IPython).
!pwd

/content


In [None]:
# Load the dataset from a CSV addressed relative to the working directory.
df = pd.read_csv(filepath_or_buffer="Breast_Cancer.csv")

In [None]:
# Sanity-check the loaded frame: row count, (rows, columns) shape, first rows.
row_count = len(df)
print("Dataset Length: ", row_count)
print("Dataset Shape: ", df.shape)
df.head(n=3)

Dataset Length:  4024
Dataset Shape:  (4024, 16)


Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Status
0,68,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,4,Positive,Positive,24,1,60,Alive
1,50,White,Married,T2,N2,IIIA,Moderately differentiated,2,Regional,35,Positive,Positive,14,5,62,Alive
2,58,White,Divorced,T3,N3,IIIC,Moderately differentiated,2,Regional,63,Positive,Positive,14,7,75,Alive


In [None]:
# Normalise the header: drop stray whitespace around every column name.
df.columns = [c.strip() for c in df.columns]

# Grade is kept as text. Values are stripped before the replacement runs, so
# the single "anaplastic; Grade IV" key already covers the variant with a
# leading space; a separate key for that variant could never match.
df["Grade"] = (
    df["Grade"].astype(str).str.strip().replace({"anaplastic; Grade IV": "4"})
)

In [None]:
# Column the model is trained to predict.
TARGET = "Status"

# Predictor columns pulled out of the full frame.
FEATURES = [
    "Age", "Tumor Size",
    "N Stage", "Grade",
    "Estrogen Status", "Progesterone Status",
]

# Work on a copy of the predictors so the source frame is left alone;
# the target is forced to string labels.
X = df.loc[:, FEATURES].copy()
y = df[TARGET].astype(str)
print(X)
print(y)

      Age  Tumor Size N Stage Grade Estrogen Status Progesterone Status
0      68           4      N1     3        Positive            Positive
1      50          35      N2     2        Positive            Positive
2      58          63      N3     2        Positive            Positive
3      58          18      N1     3        Positive            Positive
4      47          41      N1     3        Positive            Positive
...   ...         ...     ...   ...             ...                 ...
4019   62           9      N1     2        Positive            Positive
4020   56          46      N2     2        Positive            Positive
4021   68          22      N1     2        Positive            Negative
4022   58          44      N1     2        Positive            Positive
4023   46          30      N1     2        Positive            Positive

[4024 rows x 6 columns]
0       Alive
1       Alive
2       Alive
3       Alive
4       Alive
        ...  
4019    Alive
4020    Alive

In [None]:
# Hold out 20% of the rows for evaluation; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Split the feature names by dtype: numeric columns are detected from X,
# and every remaining feature is treated as categorical (FEATURES order kept).
num_cols = X.select_dtypes(include=["number"]).columns.tolist()
cat_cols = [name for name in FEATURES if name not in num_cols]

print("Numeric:", num_cols)
print("Categorical:", cat_cols)

# One-hot encode the categorical columns (categories unseen at fit time are
# ignored at transform time) and pass the numeric columns through unchanged.
preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", "passthrough", num_cols),
    ]
)



Numeric: ['Age', 'Tumor Size']
Categorical: ['N Stage', 'Grade', 'Estrogen Status', 'Progesterone Status']


In [None]:
# Derive the class weights from the training labels only, so the held-out
# test labels never influence how the model is fitted.
alive = (y_train == "Alive").sum()
dead = (y_train == "Dead").sum()
# Cast to a plain float so the weight dict holds a built-in number.
ratio = float(alive / dead)
print("Alive:", alive, "Dead:", dead, "Alive/Dead ratio:", ratio)
class_weight = {
    "Alive": 1.0,
    "Dead": ratio   # make Dead roughly as important as Alive overall
}
print("Class weight:", class_weight)

Alive: 3408 Dead: 616 Alive/Dead ratio: 5.532467532467533
Class weight: {'Alive': 1.0, 'Dead': np.float64(5.532467532467533)}


In [None]:
# Shallow, class-weighted tree: the depth cap keeps it readable and the
# minimum leaf size guards against tiny, overfitted leaves.
clf = DecisionTreeClassifier(
    max_depth=5,
    min_samples_leaf=10,
    class_weight=class_weight,
    random_state=42,
)

# Chain preprocessing and the classifier so both are fitted together.
model = Pipeline(steps=[("pre", preprocess), ("clf", clf)])

# Fit on the training split; the fitted pipeline is the cell's displayed value.
model.fit(X_train, y_train)

0,1,2
,steps,"[('pre', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,5
,min_samples_split,2
,min_samples_leaf,10
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [None]:
from sklearn.metrics import precision_recall_fscore_support,roc_auc_score

In [None]:
# Hard class predictions for the held-out rows.
y_pred = model.predict(X_test)

# Class probabilities; the column belonging to "Dead" feeds the ROC AUC.
proba = model.predict_proba(X_test)
classes = list(model.named_steps["clf"].classes_)
dead_idx = classes.index("Dead")
proba_dead = proba[:, dead_idx]

# Binary views (1 = Dead) of the true and predicted labels.
true_is_dead = (y_test == "Dead").astype(int)
pred_is_dead = (y_pred == "Dead").astype(int)

print("Confusion matrix [rows=true, cols=pred] (Dead, Alive):")
print(confusion_matrix(y_test, y_pred, labels=["Dead","Alive"]))

print(classification_report(y_test, y_pred, digits=3))
auc = roc_auc_score(true_is_dead, proba_dead)
print("ROC AUC (Dead vs Alive):", round(auc, 3))

# Precision, recall and F1 restricted to the Dead class.
p, r, f1, _ = precision_recall_fscore_support(
    y_test,
    y_pred,
    labels=["Dead"],
    average=None,
    zero_division=0,
)
p_dead, r_dead, f1_dead = (float(scores[0]) for scores in (p, r, f1))

# F2 for Dead: beta=2 weights recall more heavily than precision.
f2_dead = fbeta_score(true_is_dead, pred_is_dead, beta=2, zero_division=0)

print("\nDead class metrics:")
for label, score in (
    ("Precision (Dead):", p_dead),
    ("Recall (Dead):   ", r_dead),
    ("F1 (Dead):       ", f1_dead),
    ("F2 (Dead):       ", f2_dead),
):
    print(label, round(score, 3))

Confusion matrix [rows=true, cols=pred] (Dead, Alive):
[[ 75  48]
 [223 459]]
              precision    recall  f1-score   support

       Alive      0.905     0.673     0.772       682
        Dead      0.252     0.610     0.356       123

    accuracy                          0.663       805
   macro avg      0.579     0.641     0.564       805
weighted avg      0.805     0.663     0.709       805

ROC AUC (Dead vs Alive): 0.695

Dead class metrics:
Precision (Dead): 0.252
Recall (Dead):    0.61
F1 (Dead):        0.356
F2 (Dead):        0.475


In [None]:
# Label a row "Dead" only when its predicted Dead probability reaches a
# stricter cut-off (0.7 here); everything else is labelled "Alive".
threshold = 0.7
y_pred_strict = np.where(proba_dead >= threshold, "Dead", "Alive")


print("Confusion matrix with threshold =", threshold)
print(confusion_matrix(y_test, y_pred_strict, labels=["Dead","Alive"]))

# Dead-only precision/recall under the stricter rule.
strict_scores = precision_recall_fscore_support(
    y_test, y_pred_strict, labels=["Dead"], average=None, zero_division=0
)
p, r, f1, _ = strict_scores

for label, scores in (("Precision (Dead):", p), ("Recall (Dead):   ", r)):
    print(label, round(float(scores[0]), 3))

Confusion matrix with threshold = 0.7
[[ 31  92]
 [ 53 629]]
Precision (Dead): 0.369
Recall (Dead):    0.252


In [None]:
# Recompute the hard class labels for the held-out rows and print them.
y_pred = model.predict(X=X_test)
print(y_pred)

In [None]:
# Pull the fitted tree and the fitted one-hot encoder out of the pipeline.
tree = model.named_steps["clf"]
enc = model.named_steps["pre"].named_transformers_["cat"]

# Encoded column names come first, then the numeric names; this is the same
# order in which the ColumnTransformer lists its "cat" and "num" transformers.
if cat_cols:
    ohe_features = enc.get_feature_names_out(cat_cols).tolist()
else:
    ohe_features = []
all_features = [*ohe_features, *num_cols]

# One importance value per encoded column, labelled by column name.
importances = pd.Series(data=tree.feature_importances_, index=all_features)

def base_name(col, categorical=None):
    """Return the original feature that an encoded column name belongs to.

    Args:
        col: Column name from the importance index, either a one-hot name of
            the form "<feature>_<category>" or a plain column name.
        categorical: Optional iterable of categorical feature names to match
            against. Defaults to the notebook-level ``cat_cols``.

    Returns:
        The matching categorical feature name, or ``col`` unchanged when no
        categorical feature name followed by "_" is a prefix of it.
    """
    names = cat_cols if categorical is None else categorical
    # Match on the whole "<feature>_" prefix instead of splitting at the first
    # underscore, so feature names that themselves contain "_" still resolve.
    # Longest names are tried first so "A_B" wins over "A".
    for name in sorted(names, key=len, reverse=True):
        if col.startswith(f"{name}_"):
            return name
    return col

# Sum the per-column importances back onto their source feature and rank
# the features from most to least important.
source_feature = importances.index.map(base_name)
agg_importance = importances.groupby(source_feature).sum().sort_values(ascending=False)

print("\nAggregated feature importances (%):")
# Express each feature's share as a percentage of the total, to one decimal.
share_pct = agg_importance / agg_importance.sum() * 100
print(share_pct.round(1))


Aggregated feature importances (%):
N Stage                50.6
Grade                  11.6
Estrogen Status        11.0
Progesterone Status    10.3
Age                     8.7
Tumor Size              7.7
dtype: float64
