DecisionTreeClassifier: a custom-built class that implements the decision tree classification logic.

src.shared imports: \
numpy as np \
pandas as pd \
matplotlib.pyplot as plt \
utilities such as load_dataset, metric calculations, etc.

In [None]:
import sys
from pathlib import Path

# Treat the directory two levels above the current working directory as the
# project root (parents[0] is the immediate parent, parents[1] the one above).
# NOTE(review): this depends on where the notebook kernel is started — confirm.
project_root = str(Path.cwd().parents[1])

# Put the root at the front of sys.path so the `src.*` imports below can be
# resolved; the membership check avoids duplicate entries when the cell is re-run.
if project_root not in sys.path:
    sys.path.insert(0, project_root)

from src.classification.decision_tree import DecisionTreeClassifier
# Wildcard import: presumably the source of the names used unqualified in the
# cells below (load_dataset, plt, train_test_split, Metrics) — verify against
# src/shared.
from src.shared import *

I chose a dataset suitable for classification. \
The moons dataset fits this criterion well and is perfect for demonstrating the decision tree classifier.

In [None]:
# --- 1. Load Data ---
df = load_dataset('moons')

# --- 2. Data Cleaning ---
# Nothing to do here: the dataset is used exactly as loaded.


# --- 3. Analysis ---
# Heading followed by the summary statistics of the frame.
print("Statistics:", df.describe(), sep="\n")

# Mean of every remaining column, computed separately for each label value.
moons_stats = df.groupby('label').mean()

# Heading (with its surrounding blank lines) followed by the per-label means.
print("\nAverage measurements per moon:\n", moons_stats, sep="\n")

# --- 4. Visualization ---
# 2-D scatter of X1 against X2, drawn once per label so that every label
# gets its own legend entry.
plt.figure(figsize=(10, 6))

# groupby yields (label value, sub-frame) pairs; no positional index is needed.
for name, group in df.groupby('label'):
    plt.scatter(
        x=group['X1'],
        y=group['X2'],
        label=name,
        edgecolors='black',
        alpha=0.8,
        s=100,
    )

plt.title("X1 vs X2 by Moon Label")
plt.xlabel('X1')
plt.ylabel('X2')

plt.grid(alpha=0.5)
plt.legend()
plt.tight_layout()
plt.show()


# 3-D scatter of X1, X2 and X3, again drawn once per label.
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')

# Marker appearance shared by every label group.
marker_style = dict(edgecolors='black', alpha=0.8, s=100)

for name, group in df.groupby('label'):
    ax.scatter(group['X1'], group['X2'], group['X3'], label=name, **marker_style)

ax.set_title("X1 vs X2 vs X3 by Moon Label")
ax.set_xlabel('X1')
ax.set_ylabel('X2')
ax.set_zlabel('X3')

ax.legend()
plt.tight_layout()
plt.show()


In [None]:
# --- 1. Select Features & Target ---
# Every column except 'label' is a feature; 'label' is the prediction target.
X = df.drop('label', axis=1).values
y = df['label'].values


# --- 2. Split Data for training & testing ---
# test_size=0.2 is presumably an 80/20 train/test split — confirm against the
# train_test_split helper.
# NOTE(review): random_state=None most likely means an unseeded shuffle, so the
# split (and the reported accuracy) can differ between runs — confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None)


# Min-max scale each feature column using statistics from the training split only,
# so no information from the test split leaks into preprocessing.
# NOTE(review): threshold-based decision trees are usually insensitive to feature
# scaling — confirm whether this step is needed for this implementation.
train_min = X_train.min(axis=0)
# The tiny epsilon keeps the divisor non-zero when a feature is constant in the training split.
train_range = X_train.max(axis=0) - train_min + 1e-15

X_train = (X_train - train_min) / train_range
# Test rows reuse the training min/range, so their scaled values may fall outside [0, 1].
X_test = (X_test - train_min) / train_range


# --- 3. Training ---
print(f"Training on {len(X_train)} samples")

# Classifier is constructed with no arguments, i.e. whatever defaults the class defines.
model = DecisionTreeClassifier()
model.fit(X_train, y_train)


# --- 4. Evaluation ---
print(f"Testing on {len(X_test)} samples")

print("\n--- Testing Results ---")
# Predict on the held-out split, which was scaled with the training statistics.
predictions = model.predict(X_test)

# Metrics.accuracy is presumably a fraction in [0, 1] (it is multiplied by 100
# for display) — verify against the Metrics helper.
accuracy = Metrics.accuracy(y_test, predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")