In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris, load_wine
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error

1. Custom Decision Tree (from scratch)

In [2]:
import numpy as np

class SimpleDecisionTree:
    """Minimal decision-tree classifier that splits on information gain.

    The fitted tree is stored in ``self.root`` as nested dicts. An internal
    node has the keys ``'feature'``, ``'threshold'``, ``'left'`` and
    ``'right'``; a leaf has the single key ``'label'``. Samples whose feature
    value is ``<= threshold`` go to the left child, all others to the right.

    Parameters
    ----------
    max_depth : int or None, default=None
        Maximum depth of the tree. ``None`` grows until every leaf is pure or
        no split can separate the remaining samples.
    """

    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.root = None

    def fit(self, X, y):
        """Build the decision tree using the training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix; values must support ``<=`` comparison.
        y : array-like of shape (n_samples,)
            Class labels. Any sortable label type is accepted.

        Returns
        -------
        SimpleDecisionTree
            ``self``, so that calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, ``y`` is not 1-D, their lengths differ, or
            there are no samples.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
        if y.ndim != 1:
            raise ValueError(f"y must be 1-dimensional, got {y.ndim} dimension(s)")
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y have inconsistent lengths: {X.shape[0]} != {y.shape[0]}"
            )
        if X.shape[0] == 0:
            raise ValueError("Cannot fit a tree on an empty dataset")

        self.root = self._grow_tree(X, y)
        return self

    def _grow_tree(self, X, y, current_depth=0):
        """Recursively build the subtree for the samples ``(X, y)``.

        Returns a leaf dict when the node is pure, the depth limit has been
        reached, or no threshold separates the samples into two non-empty
        groups; otherwise returns an internal-node dict.
        """
        n_samples, n_features = X.shape
        unique_labels, label_counts = np.unique(y, return_counts=True)
        # np.unique sorts its output, so ties go to the smallest label.
        majority_label = unique_labels[np.argmax(label_counts)]

        if len(unique_labels) == 1:
            return {'label': majority_label}
        if self.max_depth is not None and current_depth >= self.max_depth:
            return {'label': majority_label}

        best_gain = -np.inf
        best_split = None
        for feature_idx in range(n_features):
            column = X[:, feature_idx]
            for threshold in np.unique(column):
                left_mask = column <= threshold
                n_left = np.count_nonzero(left_mask)
                # A split that leaves one side empty separates nothing; taking
                # it would recurse on an identical node and an empty node.
                if n_left == 0 or n_left == n_samples:
                    continue

                gain = self._compute_info_gain(y, y[left_mask], y[~left_mask])
                if gain > best_gain:
                    best_gain = gain
                    best_split = {
                        'feature': feature_idx,
                        'threshold': threshold,
                        'left_indices': left_mask,
                    }

        # Happens when all remaining rows have identical feature values.
        if best_split is None:
            return {'label': majority_label}

        left_mask = best_split['left_indices']
        right_mask = ~left_mask
        left_subtree = self._grow_tree(X[left_mask], y[left_mask], current_depth + 1)
        right_subtree = self._grow_tree(X[right_mask], y[right_mask], current_depth + 1)

        return {
            'feature': best_split['feature'],
            'threshold': best_split['threshold'],
            'left': left_subtree,
            'right': right_subtree
        }

    def _compute_info_gain(self, parent, left, right):
        """Calculate information gain from a potential split.

        The gain is the parent's entropy minus the size-weighted average of
        the two children's entropies.
        """
        n_parent = len(parent)
        weighted_entropy = (
            (len(left) / n_parent) * self._calculate_entropy(left)
            + (len(right) / n_parent) * self._calculate_entropy(right)
        )
        return self._calculate_entropy(parent) - weighted_entropy

    def _calculate_entropy(self, y):
        """Compute the Shannon entropy (in bits) of a set of labels.

        Returns 0 for an empty set.
        """
        if len(y) == 0:
            return 0
        # Counts from np.unique are always positive, so log2 is well defined
        # without an epsilon and a pure node scores exactly zero.
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        return -np.sum(probabilities * np.log2(probabilities))

    def predict(self, X):
        """Predict labels for multiple samples.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        list
            One predicted label per row of ``X``.

        Raises
        ------
        ValueError
            If the tree has not been fitted yet.
        """
        if self.root is None:
            raise ValueError("This SimpleDecisionTree is not fitted yet; call fit() first")
        return [self._predict_one(sample, self.root) for sample in np.asarray(X)]

    def _predict_one(self, x, node):
        """Predict the label for a single sample by walking down from ``node``."""
        while 'label' not in node:
            if x[node['feature']] <= node['threshold']:
                node = node['left']
            else:
                node = node['right']
        return node['label']


2. Iris dataset: Custom vs Scikit-Learn Decision Tree

In [3]:
# Load Iris and hold out 20% of the samples for evaluation.
iris_data = load_iris()
features = iris_data.data
labels = iris_data.target
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, random_state=42, test_size=0.2
)

# From-scratch tree, capped at depth 3.
my_tree = SimpleDecisionTree(max_depth=3)
my_tree.fit(X_train, y_train)
y_pred_my_tree = my_tree.predict(X_test)
accuracy_my_tree = accuracy_score(y_test, y_pred_my_tree)

# scikit-learn reference tree with the same depth cap.
sk_tree = DecisionTreeClassifier(random_state=42, max_depth=3).fit(X_train, y_train)
y_pred_sk_tree = sk_tree.predict(X_test)
accuracy_sk_tree = accuracy_score(y_test, y_pred_sk_tree)

# Report both test-set accuracies side by side.
print("\n--- Iris Dataset: Decision Tree Comparison ---")
print(f"Custom Tree Accuracy: {accuracy_my_tree:.4f}")
print(f"Scikit-learn Tree Accuracy: {accuracy_sk_tree:.4f}")


--- Iris Dataset: Decision Tree Comparison ---
Custom Tree Accuracy: 1.0000
Scikit-learn Tree Accuracy: 1.0000


3. Ensemble Methods (Wine Dataset)

In [4]:
# Load the Wine dataset and hold out 20% of the samples for evaluation.
wine_data = load_wine()
X_features = wine_data.data
y_labels = wine_data.target
X_train, X_test, y_train, y_test = train_test_split(
    X_features, y_labels, random_state=42, test_size=0.2
)

# Baseline: a single decision tree with default settings.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)
f1_dt_score = f1_score(y_test, y_pred_dt, average='macro')

# Ensemble: a random forest with default settings.
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
f1_rf_score = f1_score(y_test, y_pred_rf, average='macro')

# Compare macro-averaged F1 on the held-out split.
print("\n--- Ensemble Methods on Wine Dataset ---")
print(f"Decision Tree F1 Score: {f1_dt_score:.4f}")
print(f"Random Forest F1 Score: {f1_rf_score:.4f}")


--- Ensemble Methods on Wine Dataset ---
Decision Tree F1 Score: 0.9425
Random Forest F1 Score: 1.0000


4. Hyperparameter Tuning (Random Forest Classifier)

In [5]:
# Candidate values for the exhaustive search (3 x 3 x 3 combinations).
rf_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
}

# Score every combination with 3-fold cross-validation on macro F1,
# using the training split produced by the preceding cell.
rf_grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    rf_params,
    cv=3,
    scoring='f1_macro',
)
rf_grid.fit(X_train, y_train)

print("\n--- Random Forest Hyperparameter Tuning ---")
print("Optimal Parameters:", rf_grid.best_params_)
print(f"Best Cross-Validated F1 Score: {rf_grid.best_score_:.4f}")



--- Random Forest Hyperparameter Tuning ---
Optimal Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
Best Cross-Validated F1 Score: 0.9863


5. Regression Models (Wine dataset features → continuous target)

In [7]:
# Treat the Wine class labels as a continuous target by casting them to float.
y_wine_reg = y_labels.astype(float)

# NOTE: this rebinds X_train / X_test / y_train / y_test (and y_pred_dt /
# y_pred_rf below) to their regression counterparts.
X_train, X_test, y_train, y_test = train_test_split(
    X_features, y_wine_reg, random_state=42, test_size=0.2
)

# Baseline regressor: a single decision tree with default settings.
dt_model_reg = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
y_pred_dt = dt_model_reg.predict(X_test)
mse_dt = mean_squared_error(y_test, y_pred_dt)

# Ensemble regressor: a random forest with default settings.
rf_model_reg = RandomForestRegressor(random_state=42).fit(X_train, y_train)
y_pred_rf = rf_model_reg.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)

print("\n--- Regression Models on Wine Dataset ---")
print(f"Decision Tree MSE: {mse_dt:.4f}")
print(f"Random Forest MSE: {mse_rf:.4f}")

# Search space for the randomized search; only n_iter=5 combinations are sampled.
rf_param_dist = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
}

# The scorer is negated MSE, so values closer to zero are better.
rf_random_search = RandomizedSearchCV(
    RandomForestRegressor(random_state=42),
    rf_param_dist,
    scoring='neg_mean_squared_error',
    n_iter=5,
    cv=3,
    random_state=42,
)
rf_random_search.fit(X_train, y_train)

print("\n--- Random Forest Regressor Hyperparameter Tuning ---")
print("Optimal Parameters:", rf_random_search.best_params_)
print(f"Best Cross-Validated Score (Neg MSE): {rf_random_search.best_score_:.4f}")



--- Regression Models on Wine Dataset ---
Decision Tree MSE: 0.1667
Random Forest MSE: 0.0648

--- Random Forest Regressor Hyperparameter Tuning ---
Optimal Parameters: {'n_estimators': 300, 'min_samples_split': 2, 'max_depth': 10}
Best Cross-Validated Score (Neg MSE): -0.0509
