Decision Tree Classification


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix 


In [3]:
from ucimlrepo import fetch_ucirepo

# Fetch dataset id=2 through ucimlrepo (presumably the "Adult" census-income
# data, judging by the variable name and the columns previewed further down).
adult = fetch_ucirepo(id=2)


In [4]:
# Inspect the container returned by fetch_ucirepo and list its top-level keys.
print(type(adult))
print(adult.keys())

<class 'ucimlrepo.dotdict.dotdict'>
dict_keys(['data', 'metadata', 'variables'])


In [5]:
import pandas as pd

# The repository object exposes the predictors and the target as two
# separate DataFrames.
X, y = adult.data.features, adult.data.targets

# Join them column-wise so the cleaning steps treat features and labels together.
df = pd.concat([X, y], axis=1)



In [6]:
# Preview the first rows of the combined features + target frame.
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [7]:
# Flag the columns that contain at least one missing value.
df.isnull().any()

age               False
workclass          True
fnlwgt            False
education         False
education-num     False
marital-status    False
occupation         True
relationship      False
race              False
sex               False
capital-gain      False
capital-loss      False
hours-per-week    False
native-country     True
income            False
dtype: bool

In [8]:
# Count the missing values in each column.
df.isnull().sum()

age                 0
workclass         963
fnlwgt              0
education           0
education-num       0
marital-status      0
occupation        966
relationship        0
race                0
sex                 0
capital-gain        0
capital-loss        0
hours-per-week      0
native-country    274
income              0
dtype: int64

In [9]:
# Drop every row that has at least one missing value. The name `df` is
# rebound, so the unfiltered frame is no longer reachable after this cell.
# NOTE(review): only real NaN entries are removed here; confirm that no
# placeholder strings (e.g. '?') remain in the categorical columns.
df = df.dropna()

In [10]:
# Verify that no missing values remain after dropna().
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64

In [11]:
# Summary statistics for the numeric columns of the cleaned frame.
df.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,47621.0,47621.0,47621.0,47621.0,47621.0,47621.0
mean,38.640684,189727.1,10.090821,1091.137649,87.853489,40.60005
std,13.558961,105569.5,2.56832,7487.228336,404.010612,12.260345
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117584.0,9.0,0.0,0.0,40.0
50%,37.0,178282.0,10.0,0.0,0.0,40.0
75%,48.0,237720.0,12.0,0.0,0.0,45.0
max,90.0,1490400.0,16.0,99999.0,4356.0,99.0


In [12]:
# Preview the cleaned frame before the target labels are normalised.
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [13]:
# Normalise the target labels: delete every literal '.' and trim surrounding
# whitespace so that variants of the same label collapse into one class
# (presumably some rows are spelled like '>50K.' -- confirm against the raw data).
income_no_dots = df['income'].str.replace('.', '', regex=False)
df['income'] = income_no_dots.str.strip()
print(df['income'].value_counts())

income
<=50K    36080
>50K     11541
Name: count, dtype: int64


In [14]:
# Collect the object-dtype (string) columns. As the printed index shows, this
# includes the target column 'income' alongside the categorical features.
categorical_cols = df.select_dtypes(include='object').columns
print(categorical_cols)


Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'native-country', 'income'],
      dtype='object')


In [15]:
# Integer-encode every string column (the target 'income' included).
# A fresh LabelEncoder is fitted per column and stored in `encoders`, so each
# category <-> integer mapping stays available afterwards, e.g.
# encoders['income'].classes_ or encoders['sex'].inverse_transform([0, 1]).
# Re-fitting one shared encoder would retain only the last column's mapping.
encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le


In [16]:
# Check column dtypes and non-null counts after label encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47621 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   age             47621 non-null  int64
 1   workclass       47621 non-null  int64
 2   fnlwgt          47621 non-null  int64
 3   education       47621 non-null  int64
 4   education-num   47621 non-null  int64
 5   marital-status  47621 non-null  int64
 6   occupation      47621 non-null  int64
 7   relationship    47621 non-null  int64
 8   race            47621 non-null  int64
 9   sex             47621 non-null  int64
 10  capital-gain    47621 non-null  int64
 11  capital-loss    47621 non-null  int64
 12  hours-per-week  47621 non-null  int64
 13  native-country  47621 non-null  int64
 14  income          47621 non-null  int64
dtypes: int64(15)
memory usage: 5.8 MB


In [17]:
# Preview the label-encoded frame.
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,7,77516,9,13,4,1,1,4,1,2174,0,40,39,0
1,50,6,83311,9,13,2,4,0,4,1,0,0,13,39,0
2,38,4,215646,11,9,0,6,1,4,1,0,0,40,39,0
3,53,4,234721,1,7,2,6,0,2,1,0,0,40,39,0
4,28,4,338409,9,13,2,10,5,2,0,0,0,40,5,0


In [18]:
# Legend for the encoded target, followed by the class counts.
# The counts (36080 / 11541) match the '<=50K' / '>50K' counts printed before
# encoding, which is what ties code 0 to '<=50K' and code 1 to '>50K'.
print("0 - Income <= $50k")
print("1 - Income > $50k")
df['income'].value_counts()

0 - Income <= $50k
1 - Income > $50k


income
0    36080
1    11541
Name: count, dtype: int64

In [19]:
# Separate the encoded target from the predictor columns.
y = df['income']
X = df.drop('income', axis=1)

In [20]:
# Stratified 60 / 20 / 20 split: first carve out 60% for training, then cut
# the remaining 40% in half for validation and test. Both calls stratify on
# the label and use a fixed seed.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    train_size=0.60,
    stratify=y,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    train_size=0.50,
    stratify=y_temp,
    random_state=42,
)
print('Train: ', X_train.shape,'\nVal: ', X_val.shape ,'\nTest: ', X_test.shape)

Train:  (28572, 14) 
Val:  (9524, 14) 
Test:  (9525, 14)


In [21]:
import math
from copy import deepcopy

class TreeNode:
    """A single node of a binary decision tree.

    An internal node stores a split rule (``feature``, ``threshold``) and its
    two children; samples whose feature value is ``<= threshold`` belong to
    ``left``. A leaf node has ``is_leaf=True`` and carries the predicted
    label in ``prediction``. All constructor arguments are keyword-only.
    """

    def __init__(self, *, feature=None, threshold=None, left=None, right=None, is_leaf=False, prediction=None, depth=0, samples=0):
        # Split rule: column name or index, and the value compared with `<=`.
        self.feature, self.threshold = feature, threshold
        # Child subtrees (left receives values <= threshold).
        self.left, self.right = left, right
        # Leaf flag and the label returned when this node is a leaf.
        self.is_leaf, self.prediction = is_leaf, prediction
        # Bookkeeping: depth of the node and number of samples that reached it.
        self.depth, self.samples = depth, samples

def gini_impurity(y):
    """Return the Gini impurity ``1 - sum_k p_k**2`` of a label collection.

    Parameters
    ----------
    y : 1D array-like
        Class labels. Non-negative integer labels are counted with
        ``np.bincount``; any other label type (strings, floats, negative
        integers, booleans) is counted with ``np.unique``.

    Returns
    -------
    float
        0.0 for an empty or single-class input; larger values mean the
        classes are more evenly mixed.
    """
    labels = np.asarray(y)
    if labels.size == 0:
        # An empty node contains no mixed labels. Without this guard the
        # 0/0 normalisation below would report an impurity of 1.0.
        return 0.0
    if np.issubdtype(labels.dtype, np.integer) and labels.min() >= 0:
        counts = np.bincount(labels)
    else:
        # np.bincount only accepts non-negative integers.
        _, counts = np.unique(labels, return_counts=True)
    prob = counts / counts.sum()
    return 1.0 - np.sum(prob**2)

def entropy_impurity(y):
    """Return the Shannon entropy (in bits) of a label collection.

    Parameters
    ----------
    y : 1D array-like
        Class labels. Non-negative integer labels are counted with
        ``np.bincount``; any other label type (strings, floats, negative
        integers, booleans) is counted with ``np.unique``.

    Returns
    -------
    float
        ``-sum_k p_k * log2(p_k)`` over the classes that occur; 0 for an
        empty or single-class input.
    """
    labels = np.asarray(y)
    if labels.size == 0:
        return 0.0
    if np.issubdtype(labels.dtype, np.integer) and labels.min() >= 0:
        counts = np.bincount(labels)
    else:
        # np.bincount only accepts non-negative integers.
        _, counts = np.unique(labels, return_counts=True)
    # Classes with a zero count are dropped so log2(0) is never evaluated.
    probs = counts[counts > 0] / counts.sum()
    return -np.sum(probs * np.log2(probs))

def weighted_impurity(left_y, right_y, impurity_func):
    """Size-weighted average impurity of the two sides of a candidate split.

    Each side's impurity (as computed by ``impurity_func``) is weighted by
    the number of labels on that side and the sum is divided by the total
    number of labels.
    """
    n_left, n_right = len(left_y), len(right_y)
    total = n_left + n_right
    weighted_sum = n_left * impurity_func(left_y) + n_right * impurity_func(right_y)
    return weighted_sum / total

def majority_class(y):
    """Return the most frequent label in ``y``.

    ``np.unique`` returns the distinct labels in sorted order and ``argmax``
    picks the first maximum, so ties resolve to the smallest label.
    """
    labels, counts = np.unique(y, return_counts=True)
    winner = counts.argmax()
    return labels[winner]


In [22]:
def find_best_split(X_df, y_arr, impurity_func):
    """Exhaustively search all feature/threshold pairs for the purest binary split.

    For every column the candidate thresholds are the midpoints between
    consecutive distinct values; rows with ``value <= threshold`` go left.
    The candidate with the lowest ``weighted_impurity`` wins, and on ties the
    first one encountered (column order, then ascending threshold) is kept.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Numeric feature columns of the samples at the current node.
    y_arr : 1D array-like
        Labels aligned by position with the rows of ``X_df``.
    impurity_func : callable
        Maps a label array to an impurity value (e.g. Gini or entropy).

    Returns
    -------
    tuple
        ``(feature, threshold, impurity, left_mask, right_mask)`` where the
        masks are boolean arrays over the rows of ``X_df``. When no column
        offers a valid split the tuple is
        ``(None, None, float('inf'), None, None)``.
    """
    # The boolean masks below index the labels by position, so work on a
    # plain ndarray even if a pandas Series was passed in.
    y_arr = np.asarray(y_arr)

    best_feat, best_thr, best_impurity = None, None, float('inf')
    best_left_idx, best_right_idx = None, None

    for col in X_df.columns:
        col_vals = X_df[col].values
        # np.unique already returns the distinct values in ascending order.
        unique_vals = np.unique(col_vals)
        if len(unique_vals) < 2:
            # A constant (or empty) column cannot separate anything.
            continue

        # Promote to float64 before adding neighbours: summing in a narrow
        # integer dtype (int8, uint8, ...) can wrap around and yield a
        # midpoint outside the value range.
        as_float = unique_vals.astype(np.float64)
        thresholds = (as_float[:-1] + as_float[1:]) / 2.0

        for thr in thresholds:
            left_mask = col_vals <= thr
            right_mask = ~left_mask
            # With floats a midpoint can round onto a neighbour and leave
            # one side empty; such candidates are skipped.
            if left_mask.sum() == 0 or right_mask.sum() == 0:
                continue
            imp = weighted_impurity(y_arr[left_mask], y_arr[right_mask], impurity_func)
            if imp < best_impurity:
                best_impurity = imp
                best_feat = col
                best_thr = thr
                best_left_idx = left_mask
                best_right_idx = right_mask

    return best_feat, best_thr, best_impurity, best_left_idx, best_right_idx


In [23]:
def build_tree(X_df, y_arr, impurity='gini', max_depth=None, min_samples_split=2, min_impurity_decrease=0.0, depth=0):
    """Recursively grow a binary decision tree with pre-pruning.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Feature columns of the samples reaching this node.
    y_arr : 1D array-like
        Labels aligned by position with the rows of ``X_df``. A pandas
        Series is accepted; it is converted to an ndarray so that positional
        access never hits its index labels.
    impurity : {'gini', 'entropy'}
        Split criterion.
    max_depth : int or None
        Nodes at this depth become leaves; ``None`` disables the limit.
    min_samples_split : int
        Nodes with fewer samples than this become leaves.
    min_impurity_decrease : float
        Minimum reduction (node impurity minus the size-weighted impurity of
        the best split) required to split.
    depth : int
        Depth of the node being built; callers normally leave this at 0.

    Returns
    -------
    TreeNode
        Root of the (sub)tree.

    Raises
    ------
    ValueError
        If ``impurity`` is not a supported criterion or ``y_arr`` is empty.
    """
    impurity_funcs = {'gini': gini_impurity, 'entropy': entropy_impurity}
    if impurity not in impurity_funcs:
        # Previously any unrecognised value silently fell back to entropy.
        raise ValueError(f"impurity must be 'gini' or 'entropy', got {impurity!r}")
    impurity_func = impurity_funcs[impurity]

    y_arr = np.asarray(y_arr)
    if y_arr.size == 0:
        raise ValueError("cannot build a tree node from an empty label array")

    node = TreeNode(depth=depth, samples=len(y_arr))

    def as_leaf(prediction):
        # Turn the node under construction into a leaf and hand it back.
        node.is_leaf = True
        node.prediction = prediction
        return node

    # Stopping conditions, checked in the same order as before.
    if len(np.unique(y_arr)) == 1:
        return as_leaf(y_arr[0])

    depth_exhausted = max_depth is not None and depth >= max_depth
    if depth_exhausted or len(y_arr) < min_samples_split:
        return as_leaf(majority_class(y_arr))

    current_impurity = impurity_func(y_arr)
    feat, thr, best_imp, left_mask, right_mask = find_best_split(X_df, y_arr, impurity_func)
    if feat is None:
        # No column can separate the samples.
        return as_leaf(majority_class(y_arr))

    if current_impurity - best_imp < min_impurity_decrease:
        return as_leaf(majority_class(y_arr))

    # Internal node: record the rule and recurse into both partitions.
    node.feature = feat
    node.threshold = thr

    child_kwargs = dict(
        impurity=impurity,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_impurity_decrease=min_impurity_decrease,
        depth=depth + 1,
    )
    left_X = X_df[left_mask].reset_index(drop=True)
    right_X = X_df[right_mask].reset_index(drop=True)
    node.left = build_tree(left_X, y_arr[left_mask], **child_kwargs)
    node.right = build_tree(right_X, y_arr[right_mask], **child_kwargs)
    return node


In [24]:
def predict_single(node, x_row):
    """Route one sample from ``node`` down to a leaf and return its prediction.

    ``x_row`` is indexed by feature name (e.g. a pandas Series). At each
    internal node the sample goes left when its feature value is
    ``<= threshold`` and right otherwise.
    """
    current = node
    while True:
        if current.is_leaf:
            return current.prediction
        goes_left = x_row[current.feature] <= current.threshold
        current = current.left if goes_left else current.right

def predict(node, X_df):
    """Predict a label for every row of ``X_df`` with the tree rooted at ``node``.

    Rows are visited in positional order and each one is routed through the
    tree by ``predict_single``; the predictions come back as a NumPy array
    in the same order.
    """
    row_predictions = [predict_single(node, X_df.iloc[pos]) for pos in range(len(X_df))]
    return np.array(row_predictions)


In [25]:
# Show the training labels; the shuffled original row index is still attached,
# which is why the training cell passes `y_train.values` to build_tree.
print(y_train)

1177     1
36894    0
38198    1
21440    0
6995     0
        ..
16378    0
3444     0
34133    0
16818    1
34670    1
Name: income, Length: 28572, dtype: int64


In [26]:
# Hyper-parameter grid: both impurity criteria crossed with four depth limits
# (None means the depth is unbounded and only the other stopping rules apply).
configs = [
    {'impurity': criterion, 'max_depth': depth_limit}
    for criterion in ('gini', 'entropy')
    for depth_limit in (2, 4, 6, None)
]

# Train one tree per configuration and score it on the validation split.
# Each entry of `results` is (cfg, tree, validation accuracy, (precision, recall, f1, support)).
results = []
for cfg in configs:
    print("Training:", cfg)
    tree = build_tree(X_train, y_train.values, min_samples_split=5, **cfg)
    ypred_val = predict(tree, X_val)
    acc_val = accuracy_score(y_val, ypred_val)
    prf = precision_recall_fscore_support(y_val, ypred_val, average='binary', zero_division=0)
    results.append((cfg, tree, acc_val, prf))
    print("Val acc:", acc_val)


Training: {'impurity': 'gini', 'max_depth': 2}
Val acc: 0.8241285174296514
Training: {'impurity': 'gini', 'max_depth': 4}
Val acc: 0.8394582108357833
Training: {'impurity': 'gini', 'max_depth': 6}
Val acc: 0.8492230155396892
Training: {'impurity': 'gini', 'max_depth': None}
Val acc: 0.8139437211255774
Training: {'impurity': 'entropy', 'max_depth': 2}
Val acc: 0.8022889542209156
Training: {'impurity': 'entropy', 'max_depth': 4}
Val acc: 0.8394582108357833
Training: {'impurity': 'entropy', 'max_depth': 6}
Val acc: 0.8494330113397732
Training: {'impurity': 'entropy', 'max_depth': None}
Val acc: 0.8161486770264594


In [45]:
# Select the run with the highest validation accuracy. Each entry of `results`
# is (cfg, tree, acc_val, prf); max() keeps the earliest entry on ties.
best_cfg = max(results, key=lambda run: run[2])
best_params, best_tree = best_cfg[0], best_cfg[1]
best_depth, best_impurity = best_params['max_depth'], best_params['impurity']
print("Best depth:", best_depth)
print("Best impurity:", best_impurity)


Best depth: 6
Best impurity: entropy


In [46]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

# Score the selected tree once on the held-out test split.
test_preds = predict(best_tree, X_test)

acc = accuracy_score(y_test, test_preds)
precision, recall, f1, _ = precision_recall_fscore_support(
    y_test, test_preds, average='binary', zero_division=0
)
cm = confusion_matrix(y_test, test_preds)

# Report the metrics in a fixed order, then the confusion matrix.
print("\nTest set performance of best pre-pruned tree:")
for metric_label, metric_value in (
    ("Accuracy:", acc),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-score:", f1),
):
    print(metric_label, metric_value)
print("Confusion Matrix:\n", cm)



Test set performance of best pre-pruned tree:
Accuracy: 0.8528083989501313
Precision: 0.7650496785505553
Recall: 0.56691208315288
F1-score: 0.6512437810945274
Confusion Matrix:
 [[6814  402]
 [1000 1309]]


Comparison with scikit-learn's DecisionTreeClassifier


In [48]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix


In [50]:
# Use the same settings as your best custom tree
# (the conditional maps 'gini' to 'gini' and every other value to 'entropy').
sk_criterion = 'gini' if best_impurity == 'gini' else 'entropy'

clf = DecisionTreeClassifier(
    criterion=sk_criterion,
    max_depth=best_depth,      # best depth from your results
    min_samples_split=5,       # same as in your pre-pruning
    random_state=42            # same seed value as the train/val/test split
)


In [52]:
# Fit the classifier on the training data
# (labels are passed as a plain NumPy array, as in the custom-tree training cell).
clf.fit(X_train, y_train.values)


0,1,2
,criterion,'entropy'
,splitter,'best'
,max_depth,6
,min_samples_split,5
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [53]:
# Predict on the test set (the same X_test that was used to score the custom tree).
y_pred_sklearn = clf.predict(X_test)


In [54]:
# Test-set metrics for the scikit-learn tree, computed the same way as for
# the custom tree (binary averaging, zero_division=0).
acc_sklearn = accuracy_score(y_test, y_pred_sklearn)
precision_sklearn, recall_sklearn, f1_sklearn, _ = precision_recall_fscore_support(
    y_test, y_pred_sklearn, average='binary', zero_division=0
)
cm_sklearn = confusion_matrix(y_test, y_pred_sklearn)

# Report the metrics in a fixed order, then the confusion matrix.
print("=== scikit-learn Decision Tree Results ===")
for metric_label, metric_value in (
    ("Accuracy:", acc_sklearn),
    ("Precision:", precision_sklearn),
    ("Recall:", recall_sklearn),
    ("F1-score:", f1_sklearn),
):
    print(metric_label, metric_value)
print("Confusion Matrix:\n", cm_sklearn)


=== scikit-learn Decision Tree Results ===
Accuracy: 0.8528083989501313
Precision: 0.7650496785505553
Recall: 0.56691208315288
F1-score: 0.6512437810945274
Confusion Matrix:
 [[6814  402]
 [1000 1309]]


In [55]:
# Predictions using your custom tree
# (same call as the earlier `test_preds = predict(best_tree, X_test)`, so the
# row-by-row prediction over X_test is recomputed here).
y_pred_custom = predict(best_tree, X_test)


In [56]:
# Test-set metrics for the custom tree, stored under *_custom names so they
# can sit next to the *_sklearn values in the comparison table.
acc_custom = accuracy_score(y_test, y_pred_custom)
precision_custom, recall_custom, f1_custom, _ = precision_recall_fscore_support(
    y_test, y_pred_custom, average='binary', zero_division=0)
cm_custom = confusion_matrix(y_test, y_pred_custom)


In [57]:
import pandas as pd

# Side-by-side table of the test-set metrics: one row per metric, one column
# per implementation.
comparison = pd.DataFrame({
    "Metric": ["Accuracy", "Precision", "Recall", "F1-score"],
    "Custom Tree": [acc_custom, precision_custom, recall_custom, f1_custom],
    "Scikit-learn Tree": [acc_sklearn, precision_sklearn, recall_sklearn, f1_sklearn]
})

print(comparison)


      Metric  Custom Tree  Scikit-learn Tree
0   Accuracy     0.852808           0.852808
1  Precision     0.765050           0.765050
2     Recall     0.566912           0.566912
3   F1-score     0.651244           0.651244


In [58]:
# Print both confusion matrices for a direct comparison of the two trees.
print("\nCustom Tree Confusion Matrix:\n", cm_custom)
print("\nScikit-learn Tree Confusion Matrix:\n", cm_sklearn)



Custom Tree Confusion Matrix:
 [[6814  402]
 [1000 1309]]

Scikit-learn Tree Confusion Matrix:
 [[6814  402]
 [1000 1309]]


Thank You 
