In [290]:
# importing required libraries
from sklearn.datasets import load_iris
import numpy as np
import math
import random
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [327]:
class RandomForestClassifier():

    def __init__(self, num_trees=100, min_samples_split=2,max_depth=4, max_features='sqrt'):

        self.num_trees = num_trees
        self.max_depth = max_depth
        self.max_features = max_features
        self.base_learners = [] #holds list of decision tree objects
        self.min_samples_split = min_samples_split

    def bootstrap(self, dataset):

        """
        Makes a boostrapped dataset to train decision tree on. Bootstrapping data in an ensemble model introduces 
        randomness to avoid overfitting
        """

        bootstrapped_dataset = []

        for _ in range(len(dataset)):

            # grabs random row index from dataset
            random_row_idx = np.random.randint(0, len(dataset))

            bootstrapped_dataset.append(dataset[random_row_idx])

        return np.array(bootstrapped_dataset)

    def fit(self, dataset_features, dataset_targets):

        # finding the number of features to use to train decision trees
        if self.max_features == "sqrt":
            max_features = int(math.pow(len(dataset_features[0]), 0.5))
        elif self.max_features == "log2":
            max_features = int(math.log2(len(dataset_features[0])))
        else:
            max_features = self.max_features

        # check if max_features is greater than the number of features in dataset
        if max_features > dataset_features.shape[1]:
            raise ValueError(f"max_features ({max_features}) cannot be greater than the number of features in dataset({dataset_features.shape[1]})")
            
        # training base learners
        for _ in range(self.num_trees):

            # get a bootstrapped dataset
            bootstrapped_dataset = self.bootstrap(np.column_stack((dataset_features, dataset_targets)))
            training_dataset_x = bootstrapped_dataset[:, :-1]
            training_dataset_y = bootstrapped_dataset[:, -1]

            # get random max_features amount of features to train the model
            random_features = np.array(random.sample(range(len(dataset_features[0])), max_features))
            training_dataset_x = training_dataset_x[:, random_features]

            # instantiate and fit new decision tree model
            decision_tree = DecisionTreeClassifier(self.min_samples_split, self.max_depth)
            decision_tree.fit(training_dataset_x, training_dataset_y)

            # add the trained decision tree to the list of base_learners along with the features used to train it
            self.base_learners.append((decision_tree, random_features))

    def make_prediction(self, X):

        '''Get prediction for 1d array. Returns the most common predicted class from the decision tree base learners'''

        predicted_classes = [] # stores the predicted class from each decision tree

        for decision_tree, random_features in self.base_learners:

            # filter X to only include features that the decision tree was trained on 
            X_filtered = X[random_features]

            # get prediction of decision tree
            predicted_class = decision_tree.make_prediction(X_filtered, decision_tree.root)

            # append predicted class to predicted classes
            predicted_classes.append(predicted_class)


        # Majority voting for classification
        return Counter(predicted_classes).most_common(1)[0][0]

    def predict(self, X):

        return np.array([self.make_prediction(x) for x in X])


In [329]:
class Node():

    def __init__(self, feature_index=None, threshold=None, left_node=None, right_node=None, info_gained=None, value=None):

        # necessary attributes for a decision node
        self.feature_index = feature_index
        self.threshold = threshold
        self.left_node = left_node
        self.right_node = right_node
        self.info_gained = info_gained

        # necessary attributes for a leaf node
        self.value = value

class DecisionTreeClassifier():

    def __init__(self, min_samples_split=2, max_depth=4):

        # root will be used to traverse decision tree
        self.root = None

        # stopping conditions for build_tree
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth

    def build_tree(self, dataset, current_depth=0):


        X = dataset[:,:-1] # get all rows and all columns except the last one, which will be the column to classify
        y = dataset[:,-1] # get all entries of last column 

        num_samples, num_features = np.shape(X)

        # if either of these conditions is not met, the node returned will be a leaf node
        if num_samples >= self.min_samples_split and current_depth <= self.max_depth:

            best_split = self.get_best_split(dataset, num_samples, num_features)

            if "info_gained" in best_split and best_split["info_gained"] > 0:

            
                # recursively build left tree and right tree until sample is too small or depth is too high
                left_tree = self.build_tree(best_split["less_than"], current_depth+1)
                right_tree = self.build_tree(best_split["greater_than"], current_depth+1)
    
                return Node(best_split["feature_index"], best_split["threshold"], left_tree, right_tree, best_split["info_gained"])

        # else return a leaf node when data is pure

        leaf_node_value = self.calculate_leaf_node_value(y)
        return Node(value=leaf_node_value)
    
    def get_best_split(self, dataset, num_samples, num_features):

        best_split = {}
        max_info_gained = -float("inf") #instantiating at -inf because every value is greater than -inf

        # iteratively check every value of every feature as a candidate to be a split for decision node
        for feature_idx in range(num_features):

            possible_thresholds = np.unique(dataset[:, feature_idx]) # get unique list of feature values that can be used to split data into left and right node
            for threshold in possible_thresholds:

                # filter dataset based on category
                less_than = dataset[dataset[:, feature_idx] <= threshold] 
                greater_than = dataset[dataset[:, feature_idx] > threshold]

                # the threshold must split the data so that there is data above and below threshold, otherwise the split did nothing
                if len(less_than) > 0 and len(greater_than) > 0:

                    # extracting the class label column for each dataset
                    y = dataset[:,-1]
                    y_less_than = less_than[:,-1] 
                    y_greater_than = greater_than[:,-1]

                    # calculating the amount of information gained, we are trying to maximize it
                    info_gained = self.info_gained(y, y_less_than, y_greater_than)

                    if info_gained > max_info_gained:

                        # set new max info gained
                        best_split["feature_index"] = feature_idx
                        best_split["threshold"] = threshold
                        best_split["less_than"] = less_than
                        best_split["greater_than"] = greater_than
                        best_split["info_gained"] = info_gained
                        max_info_gained = info_gained
                        
        return best_split


    def info_gained(self, y, y_less_than, y_greater_than, mode="entropy"):

        class_labels = np.unique(y) # in our case this will be 1 and 0

        weight_less_than = len(y_less_than) / len(y)
        weight_greater_than = len(y_greater_than) / len(y)

        #calculates info gained
        info_gained = self.entropy(y) - (weight_less_than * self.entropy(y_less_than) + weight_greater_than * self.entropy(y_greater_than))

        return info_gained

    def entropy(self, y):

        class_labels = np.unique(y) # gets a list of possible classes 
        entropy = 0

        for class_label in class_labels:

            prob_of_class = np.count_nonzero(y == class_label) / len(y) # calculates probability that an element of y is the certain class
            entropy += -1*prob_of_class*np.log2(prob_of_class) # calculates entropy

        return entropy

    def calculate_leaf_node_value(self, y):

        '''Find most common Y value to be the leaf node value'''

        y = list(y)
        return max(y, key=y.count)

    def fit(self, X, y):

        data = np.concatenate((X,np.array([[entry] for entry in y])), axis=1)
        self.root = self.build_tree(data)

    def make_prediction(self, X, node):

        '''Get prediction for 1d array. Traverses through tree to find class that entry fits'''
        
        while node.value is None: # while we are on a decision node

            feature_value = X[node.feature_index] # only focus on feature of X that node analyzes

            if feature_value <= node.threshold:

                node = node.left_node
                
            else:
                
                node = node.right_node

        return node.value
        
    def predict(self, X):

        '''get multiple predictions for 2d array'''


        return np.array([self.make_prediction(row, self.root) for row in X])
        

In [331]:
iris = load_iris()

rf_model = DecisionTreeClassifier()
rf_model.fit(iris.data, iris.target)
rf_model.predict(np.array([[5.9,3,5.1,1.8]]))

array([2.])

In [333]:
iris = load_iris()

rf_model = RandomForestClassifier()
rf_model.fit(iris.data, iris.target)
rf_model.predict(np.array([[5.9,3,5.1,1.8]]))

array([2.])

In [351]:
iris = load_iris()
X = iris.data
y = iris.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Initialize and train the Random Forest model
rf_model = RandomForestClassifier(num_trees=15, max_features=4)  # Example parameters
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

Model Accuracy: 0.87
