# Model

In [10]:
import numpy as np

class DecisionTree:
    """ A class representing a decision tree model used in gradient boosting.

    Attributes:
        max_depth (int): Maximum depth of the tree.
        min_samples_split (int): Minimum number of samples required to split a node.
        tree (dict or float): The root of the tree, represented as a dictionary or a leaf value.
    """

    def __init__(self, max_depth=3, min_samples_split=2):
        """ Initializes the DecisionTree with maximum depth and minimum samples required to split.

        @params:
            max_depth (int): The maximum depth of the tree. Default is 3.
            min_samples_split (int): Minimum samples required to split a node. Default is 2.
        """
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None

    def train(self, X, y, grad, hess):
        """ Trains the decision tree using the given data, gradients, and hessians.

        @params:
            X (numpy.ndarray): A 2D array of shape (n_samples, n_features) with training data.
            y (numpy.ndarray): A 1D array of shape (n_samples,) with target values.
            grad (numpy.ndarray): A 1D array of gradients for each sample.
            hess (numpy.ndarray): A 1D array of hessians for each sample.
        """
        self.tree = self._build_tree(X, y, grad, hess)

    def _split(self, X, y, grad, hess):
        """ Finds the best split for the data to maximize the gain.

        @params:
            X (numpy.ndarray): A 2D array of shape (n_samples, n_features) with training data.
            y (numpy.ndarray): A 1D array of shape (n_samples,) with target values.
            grad (numpy.ndarray): A 1D array of gradients for each sample.
            hess (numpy.ndarray): A 1D array of hessians for each sample.
        @return:
            tuple: The best feature index and threshold for the split.
        """
        best_gain = -np.inf
        best_split = None
        # Iterate over all features to find the best split point
        for feature_index in range(X.shape[1]):
            thresholds = np.unique(X[:, feature_index])  # Unique thresholds for the feature
            for threshold in thresholds:
                left_mask = X[:, feature_index] < threshold
                right_mask = ~left_mask
                if sum(left_mask) >= self.min_samples_split and sum(right_mask) >= self.min_samples_split:
                    gain = self._gain(y, grad, hess, left_mask, right_mask)
                    if gain > best_gain:
                        best_gain = gain
                        best_split = (feature_index, threshold)
        return best_split

    def _gain(self, y, grad, hess, left_mask, right_mask):
        """ Calculates the gain of the proposed split.

        @params:
            y (numpy.ndarray): A 1D array of target values.
            grad (numpy.ndarray): A 1D array of gradients for each sample.
            hess (numpy.ndarray): A 1D array of hessians for each sample.
            left_mask (numpy.ndarray): A boolean mask for the left split.
            right_mask (numpy.ndarray): A boolean mask for the right split.
        @return:
            float: The calculated gain for the split.
        """
        # Compute sums of gradients and hessians for the left and right splits
        grad_left, grad_right = grad[left_mask], grad[right_mask]
        hess_left, hess_right = hess[left_mask], hess[right_mask]
        G_L, H_L = np.sum(grad_left), np.sum(hess_left)
        G_R, H_R = np.sum(grad_right), np.sum(hess_right)
        # Calculate the gain using the formula
        gain = 0.5 * ((G_L ** 2) / (H_L + 1e-10) + (G_R ** 2) / (H_R + 1e-10) - ((G_L + G_R) ** 2) / (H_L + H_R + 1e-10))
        return gain

    def _build_tree(self, X, y, grad, hess, depth=0):
        """ Recursively builds the decision tree based on the provided data.

        @params:
            X (numpy.ndarray): A 2D array of shape (n_samples, n_features) with training data.
            y (numpy.ndarray): A 1D array of target values.
            grad (numpy.ndarray): A 1D array of gradients for each sample.
            hess (numpy.ndarray): A 1D array of hessians for each sample.
            depth (int): The current depth of the tree.
        @return:
            dict or float: A dictionary representing the subtree or a leaf value.
        """
        if depth == self.max_depth or len(y) < self.min_samples_split:
            # Return leaf value if maximum depth is reached or samples are insufficient
            leaf_value = -np.sum(grad) / (np.sum(hess) + 1e-10)
            return leaf_value

        best_split = self._split(X, y, grad, hess)
        if not best_split:
            # Return leaf value if no valid split is found
            return -np.sum(grad) / (np.sum(hess) + 1e-10)

        feature_index, threshold = best_split
        left_mask = X[:, feature_index] < threshold
        right_mask = ~left_mask

        # Recursively build left and right subtrees
        left_subtree = self._build_tree(X[left_mask], y[left_mask], grad[left_mask], hess[left_mask], depth + 1)
        right_subtree = self._build_tree(X[right_mask], y[right_mask], grad[right_mask], hess[right_mask], depth + 1)
        return {"feature_index": feature_index, "threshold": threshold, "left": left_subtree, "right": right_subtree}

    def _predict(self, x, tree):
        """ Predicts the value for a single sample using the decision tree.

        @params:
            x (numpy.ndarray): A 1D array of feature values for a single sample.
            tree (dict or float): The decision tree or leaf value to predict with.
        @return:
            float: The predicted value for the sample.
        """
        if not isinstance(tree, dict):
            # Return the leaf value if the node is a leaf
            return tree

        feature_index = tree["feature_index"]
        threshold = tree["threshold"]
        # Traverse left or right subtree based on the feature value
        if x[feature_index] < threshold:
            return self._predict(x, tree["left"])
        else:
            return self._predict(x, tree["right"])

    def predict(self, X):
        """ Predicts the values for multiple samples using the decision tree.

        @params:
            X (numpy.ndarray): A 2D array of shape (n_samples, n_features) with input data.
        @return:
            numpy.ndarray: A 1D array of predicted values for each sample.
        """
        return np.array([self._predict(x, self.tree) for x in X])

    def print_tree(self, tree=None, depth=0):
        """ Prints the structure of the decision tree.

        @params:
            tree (dict or float, optional): The tree to print. If None, use the root tree.
            depth (int): The current depth in the tree for formatting purposes.
        """
        if tree is None:
            tree = self.tree

        indent = "\t" * depth  # Increase indentation based on the depth
        if not isinstance(tree, dict):  # If the node is a leaf
            print(f"{indent}{tree:.4f}")  # Print the leaf value
        else:  # If the node is not a leaf
            feature_index = tree["feature_index"]
            threshold = tree["threshold"]
            # Print the splitting condition
            print(f"{indent}{depth}:[f{feature_index}<{threshold:.8f}] yes={depth + 1},no={depth + 2},missing={depth + 2}")
            print(f"{indent}\tyes->")
            self.print_tree(tree["left"], depth + 1)
            print(f"{indent}\tno->")
            self.print_tree(tree["right"], depth + 1)
            

In [14]:
class XGBoost:
    """ A simple implementation of XGBoost for binary classification.

    Attributes:
        n_estimators (int): Number of boosting rounds (trees).
        max_depth (int): Maximum depth of each decision tree.
        min_samples_split (int): Minimum number of samples required to split an internal node.
        learning_rate (float): Step size shrinkage used in update to prevent overfitting.
        trees (list): List of trained decision tree models.
    """

    def __init__(self, n_estimators=10, max_depth=3, min_samples_split=2, learning_rate=0.1):
        """ Initializes the XGBoost model with the specified parameters.

        @params:
            n_estimators (int): Number of trees to fit. Default is 10.
            max_depth (int): Maximum depth of each tree. Default is 3.
            min_samples_split (int): Minimum samples required to split a node. Default is 2.
            learning_rate (float): Learning rate for the model. Default is 0.1.
        """
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.learning_rate = learning_rate
        self.trees = []

    def train(self, X, y):
        """ Trains the XGBoost model using the training data.

        @params:
            X (numpy.ndarray): A 2D array of shape (n_samples, n_features) with the training data.
            y (numpy.ndarray): A 1D array of shape (n_samples,) with the target labels.
        """
        # Initialize predictions with zeros
        y_pred = np.zeros_like(y, dtype=float)

        # Train each tree iteratively
        for i in range(self.n_estimators):
            # Compute gradients and hessians
            grad, hess = self._compute_gradients(y, y_pred)
            
            # Initialize and train a new decision tree
            tree = DecisionTree(max_depth=self.max_depth, min_samples_split=self.min_samples_split)
            tree.train(X, y, grad, hess)
            
            # Make predictions using the trained tree
            predictions = tree.predict(X)
            
            # Update predictions with the learning rate
            y_pred += self.learning_rate * predictions
            self.trees.append(tree)  # Store the trained tree
            
            # Print the structure of the current tree
            print(f"Tree {i + 1}:")
            tree.print_tree()

            # Calculate and print the cross-entropy loss
            loss = self._cross_entropy_loss(y, 1 / (1 + np.exp(-y_pred)))  # Apply sigmoid to get probabilities
            print(f"Loss after Tree {i + 1}: {loss}")
            
            print("\n")

    def _compute_gradients(self, y, y_pred):
        """ Computes the gradients and hessians for the loss function.

        @params:
            y (numpy.ndarray): A 1D array of target labels.
            y_pred (numpy.ndarray): A 1D array of current predictions.
        @return:
            tuple: A tuple containing:
                - grad (numpy.ndarray): A 1D array of gradients for each sample.
                - hess (numpy.ndarray): A 1D array of hessians for each sample.
        """
        # Compute the first derivative (gradient)
        grad = y_pred - y
        
        # For the squared error loss, the second derivative (hessian) is constant and equal to 1
        hess = np.ones_like(y)
        return grad, hess

    def _cross_entropy_loss(self, y, y_pred):
        """ Computes the cross-entropy loss between true and predicted labels.

        @params:
            y (numpy.ndarray): A 1D array of target labels.
            y_pred (numpy.ndarray): A 1D array of predicted probabilities.
        @return:
            float: The mean cross-entropy loss.
        """
        # Clip predicted probabilities to avoid log(0)
        y_pred = np.clip(y_pred, 1e-10, 1 - 1e-10)
        loss = -np.mean(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred))
        return loss

    def predict(self, X):
        """ Outputs predicted labels for the input data using the trained trees.

        @params:
            X (numpy.ndarray): A 2D array of shape (n_samples, n_features) with the input data.
        @return:
            numpy.ndarray: A 1D array of predicted values for each sample.
        """
        # Initialize predictions with zeros
        y_pred = np.zeros(X.shape[0])

        # Aggregate predictions from each tree
        for tree in self.trees:
            y_pred += self.learning_rate * tree.predict(X)
        return y_pred


# Main

In [15]:
import numpy as np
from sklearn.model_selection import train_test_split

# Load the dataset from a CSV file using numpy's genfromtxt
data = np.genfromtxt('breast+cancer+wisconsin+diagnostic/wdbc.data', delimiter=',', dtype=str)

# Extract features and convert them to float
X = data[:, 2:].astype(float)  # Features start from the 3rd column (index 2)
# Convert the labels to binary (Malignant -> 1, Benign -> 0)
y = np.where(data[:, 1] == 'M', 1, 0)

# Normalize the features using mean and standard deviation
X = (X - X.mean(axis=0)) / X.std(axis=0)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Use the XGBoost model we implemented earlier
model = XGBoost(n_estimators=5, max_depth=5, learning_rate=0.3)

# Trains the XGBoost model on the training data.
model.train(X_train, y_train)

# Generates predictions on the test data.
predictions = model.predict(X_test)

# Convert predictions to binary labels (0 or 1) using a threshold of 0.5
predictions = np.where(predictions >= 0.5, 1, 0)

# Calculate the accuracy of the model
accuracy = np.mean(predictions == y_test)
print("Model Accuracy:", accuracy)


Tree 1:
0:[f7<0.07482458] yes=1,no=2,missing=2
	yes->
	1:[f20<0.12855895] yes=2,no=3,missing=3
		yes->
		2:[f10<0.86558942] yes=3,no=4,missing=4
			yes->
			3:[f24<1.94769028] yes=4,no=5,missing=5
				yes->
				4:[f14<-1.23437635] yes=5,no=6,missing=6
					yes->
					0.1429
					no->
					0.0040
				no->
				0.5000
			no->
			0.6667
		no->
		2:[f1<-0.60728273] yes=3,no=4,missing=4
			yes->
			3:[f0<0.32454537] yes=4,no=5,missing=5
				yes->
				-0.0000
				no->
				4:[f0<0.61707999] yes=5,no=6,missing=6
					yes->
					-0.0000
					no->
					-0.0000
			no->
			3:[f17<-0.23782161] yes=4,no=5,missing=5
				yes->
				4:[f0<0.26206224] yes=5,no=6,missing=6
					yes->
					1.0000
					no->
					1.0000
				no->
				-0.0000
	no->
	1:[f27<0.48715636] yes=2,no=3,missing=3
		yes->
		2:[f22<0.26624928] yes=3,no=4,missing=4
			yes->
			3:[f1<0.47945762] yes=4,no=5,missing=5
				yes->
				4:[f0<-1.01032135] yes=5,no=6,missing=6
					yes->
					-0.0000
					no->
					-0.0000
				no->
				4:[f0<0.134

# Results of xgboost in sklearn

In [16]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load data from a CSV file
data = np.genfromtxt('breast+cancer+wisconsin+diagnostic/wdbc.data', delimiter=',', dtype=str)

# Extract features and convert them to float for numerical operations
X = data[:, 2:].astype(float)  # Features start from the 3rd column (index 2)

# Convert labels from 'M' (Malignant) and 'B' (Benign) to binary (1 and 0)
y = np.where(data[:, 1] == 'M', 1, 0)

# Normalize the features by subtracting the mean and dividing by the standard deviation
X = (X - X.mean(axis=0)) / X.std(axis=0)

# Split the dataset into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an XGBoost model with log loss as the evaluation metric
model = xgb.XGBClassifier(eval_metric='logloss')

# Fits the XGBoost model to the training data.
model.fit(X_train, y_train)

# Retrieve and print model parameters for reference
params = model.get_params()
n_estimators = params['n_estimators']
max_depth = params['max_depth']
learning_rate = params['learning_rate']

print("Number of estimators:", n_estimators)
print("Max depth:", max_depth)
print("Learning rate:", learning_rate)

# Retrieve the trained booster (underlying model) from XGBoost
booster = model.get_booster()

# Iterate over each tree in the model and print its structure
for i, tree in enumerate(booster.get_dump()):
    print(f"Tree {i + 1} structure:\n{tree}\n")

# Make predictions on the test data
y_pred = model.predict(X_test)

# Compute and print the accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)


Number of estimators: None
Max depth: None
Learning rate: None
Tree 1 structure:
0:[f27<0.42320466] yes=1,no=2,missing=2
	1:[f23<0.151913315] yes=3,no=4,missing=4
		3:[f13<0.0616690516] yes=7,no=8,missing=8
			7:[f1<0.740089118] yes=11,no=12,missing=12
				11:leaf=-0.466026366
				12:[f22<-0.20734556] yes=13,no=14,missing=14
					13:leaf=-0.357332468
					14:leaf=0.156942263
			8:leaf=0.0610413142
		4:[f26<-0.377509862] yes=9,no=10,missing=10
			9:leaf=0.0178343765
			10:leaf=0.624066651
	2:[f20<-0.293889403] yes=5,no=6,missing=6
		5:leaf=-0.035497915
		6:leaf=0.752050877


Tree 2 structure:
0:[f7<0.046967078] yes=1,no=2,missing=2
	1:[f20<0.118204817] yes=3,no=4,missing=4
		3:[f12<0.613808632] yes=7,no=8,missing=8
			7:[f14<-1.23437631] yes=13,no=14,missing=14
				13:leaf=-0.114261463
				14:[f24<1.49618065] yes=17,no=18,missing=18
					17:leaf=-0.403840512
					18:leaf=-0.0984430537
			8:leaf=0.0410378166
		4:[f1<-0.230297849] yes=9,no=10,missing=10
			9:leaf=-0.182360739
			10:leaf=0