In [45]:
import numpy as np
from collections import Counter

In [46]:


class Node:
    """One node of a binary decision tree.

    An internal node stores the split (``feature``, ``threshold``) and its two
    children; a leaf stores only ``value``.  A node is treated as a leaf
    whenever ``value`` is not ``None``.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        self.feature = feature      # column index tested at this node
        self.threshold = threshold  # rows with x[feature] <= threshold go left
        self.left = left            # subtree for the "<= threshold" side
        self.right = right          # subtree for the "> threshold" side
        self.value = value          # class label stored at a leaf


class decision_tree:
    """Classification tree grown by minimising weighted Gini impurity.

    At every split only a random subset of the features is examined, which is
    what makes the tree usable as a random-forest member.

    Args:
        max_dep: Maximum depth; nodes at this depth become majority-vote leaves.
        n_features_split: Number of features sampled per split.  ``None`` means
            ``int(sqrt(n_features))``.  Values above the number of available
            features are clamped to that number.
    """

    def __init__(self, max_dep, n_features_split=None):
        self.max_dep = max_dep
        self.root = None
        self.n_features_split = n_features_split  # number of features to try at each split

    def gini(self, y):
        """Return the Gini impurity ``1 - sum(p_c ** 2)`` of the labels ``y``.

        An empty ``y`` yields ``1``.
        """
        y = np.asarray(y).ravel()
        if y.size == 0:
            return 1
        _, counts = np.unique(y, return_counts=True)
        proportions = counts / y.size
        return 1 - np.sum(proportions ** 2)

    def weighted_gini(self, left, right):
        """Return the size-weighted mean of the Gini impurities of two label sets."""
        n = len(left) + len(right)
        return (len(left) / n) * self.gini(left) + (len(right) / n) * self.gini(right)

    def bestsplit(self, X, Y):
        """Search a random feature subset for the split with the lowest weighted Gini.

        Args:
            X: 2-D ndarray of shape ``(n_samples, n_features)``.
            Y: Labels aligned with the rows of ``X``.

        Returns:
            ``(best_gini, best_feature, best_threshold)``.  When no candidate
            leaves samples on both sides the result is ``(1, None, None)``.
        """
        n_features = X.shape[1]
        best_thresh, best_feat, best_gini = None, None, 1

        # number of features to consider
        if self.n_features_split is None:
            n_candidates = int(np.sqrt(n_features))
        else:
            n_candidates = self.n_features_split
        # Sampling without replacement cannot draw more features than exist.
        n_candidates = min(n_candidates, n_features)

        # choose random subset of features
        features = np.random.choice(n_features, n_candidates, replace=False)

        for feature in features:
            column = X[:, feature]
            # Every distinct observed value is tried as a "<=" threshold.
            for value in np.unique(column):
                left_mask = column <= value
                y_left, y_right = Y[left_mask], Y[~left_mask]
                if len(y_left) == 0 or len(y_right) == 0:
                    continue
                curr_gini = self.weighted_gini(y_left, y_right)
                if curr_gini < best_gini:
                    best_gini, best_feat, best_thresh = curr_gini, feature, value
        return best_gini, best_feat, best_thresh

    def build_tree(self, X, Y, depth=0):
        """Recursively grow the subtree for the samples ``(X, Y)``.

        Recursion stops with a leaf when the labels are pure, when ``depth``
        reaches ``max_dep``, or when ``bestsplit`` finds no usable split; in
        the latter two cases the leaf holds the most frequent label.
        """
        # np.unique flattens its input, so leaf values are always scalars even
        # if a caller hands in column-vector labels.
        classes, counts = np.unique(Y, return_counts=True)
        if len(classes) == 1:
            return Node(value=classes[0])

        majority = classes[np.argmax(counts)]
        if depth >= self.max_dep:
            return Node(value=majority)

        _, best_fea, best_thr = self.bestsplit(X, Y)
        if best_fea is None:
            return Node(value=majority)

        # bestsplit only reports splits with samples on both sides, so neither
        # partition below can be empty.
        l_mask = X[:, best_fea] <= best_thr
        r_mask = ~l_mask

        left_node = self.build_tree(X[l_mask], Y[l_mask], depth=depth + 1)
        right_node = self.build_tree(X[r_mask], Y[r_mask], depth=depth + 1)
        return Node(best_fea, best_thr, left_node, right_node)

    def predict_one(self, x, node):
        """Route one sample ``x`` down from ``node`` and return the leaf label as ``int``.

        Labels therefore have to be convertible with ``int()``.
        """
        if node.value is not None:
            return int(node.value)
        if x[node.feature] <= node.threshold:
            return self.predict_one(x, node.left)
        return self.predict_one(x, node.right)

    def fit(self, X, Y):
        """Grow the tree from training data.

        Args:
            X: 2-D array-like (ndarray, DataFrame, nested list) of features.
            Y: Labels, one per row of ``X``; a single-column 2-D input is
                flattened to 1-D.

        Raises:
            ValueError: If ``X`` is not 2-D, is empty, or its row count does
                not match the number of labels.
        """
        X = np.asarray(X)
        # Flatten so leaves store scalar labels instead of 1-element arrays.
        Y = np.asarray(Y).ravel()
        if X.ndim != 2:
            raise ValueError("X must be 2-dimensional")
        if len(Y) == 0:
            raise ValueError("cannot fit a tree on an empty data set")
        if len(X) != len(Y):
            raise ValueError("X and Y must contain the same number of samples")
        self.root = self.build_tree(X, Y, depth=0)

    def predict(self, X):
        """Return a list with the predicted ``int`` label for every row of ``X``.

        ``X`` may be any 2-D array-like (ndarray, DataFrame, nested list).
        """
        rows = np.asarray(X)
        return [self.predict_one(x, self.root) for x in rows]

    def print_tree(self, node=None, depth=0):
        """Print the subtree under ``node`` (default: the root), indented by depth."""
        if node is None:
            node = self.root
        indent = "  " * depth
        if node.value is not None:
            print(f"{indent}Leaf: Class={node.value}")
            return
        print(f"{indent}Feature[{node.feature}] <= {node.threshold}")
        self.print_tree(node.left, depth + 1)
        self.print_tree(node.right, depth + 1)


In [47]:
class randomforest:
    """Bagged ensemble of ``decision_tree`` classifiers combined by majority vote.

    Args:
        maxdep: Maximum depth handed to every tree.
        n_trees: Number of trees to grow.
        n_features_split: Optional number of features each tree samples per
            split; forwarded to ``decision_tree`` (``None`` keeps its default).
    """

    def __init__(self, maxdep, n_trees, n_features_split=None):
        self.maxdep = maxdep
        self.n_trees = n_trees
        self.n_features_split = n_features_split
        self.tree = []  # fitted trees, replaced on every call to fit()

    def fit(self, X, Y):
        """Grow ``n_trees`` trees, each on a bootstrap sample of the rows.

        Args:
            X: 2-D array-like of features (DataFrame or ndarray).
            Y: Labels, one per row of ``X``; a single-column 2-D input is
                flattened to 1-D before it is passed to the trees.

        Raises:
            ValueError: If the data set is empty or ``X`` and ``Y`` disagree
                on the number of samples.
        """
        features = np.asarray(X)
        labels = np.asarray(Y).ravel()
        n_samples = features.shape[0]
        if n_samples == 0:
            raise ValueError("cannot fit a forest on an empty data set")
        if len(labels) != n_samples:
            raise ValueError("X and Y must contain the same number of samples")

        # Collect into a fresh list so refitting replaces the old forest
        # instead of appending to it.
        trees = []
        for _ in range(self.n_trees):
            # Bootstrap: draw n_samples row indices with replacement.
            rows = np.random.choice(n_samples, n_samples, replace=True)
            tree = decision_tree(self.maxdep, self.n_features_split)
            tree.fit(features[rows], labels[rows])
            trees.append(tree)
        self.tree = trees

    def predict(self, X):
        """Return an array holding the most common tree prediction per sample.

        ``X`` is forwarded unchanged to each tree's ``predict``, so it must be
        whatever input that method accepts.  Ties go to the label that
        ``Counter.most_common`` lists first.

        Raises:
            ValueError: If the forest has not been fitted yet.
        """
        if not self.tree:
            raise ValueError("randomforest has no trees; call fit() first")
        # votes has shape (n_trees, n_samples); transpose to iterate per sample.
        votes = np.array([tree.predict(X) for tree in self.tree])
        return np.array([Counter(sample_votes).most_common(1)[0][0] for sample_votes in votes.T])
          



In [48]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
import pandas as pd

In [49]:
# Load the breast-cancer data set with pandas containers, then wrap the
# feature table and the target column in DataFrames.
data = load_breast_cancer(as_frame=True)
dt, tar = pd.DataFrame(data.data), pd.DataFrame(data.target)

In [50]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    dt,
    tar,
    test_size=0.2,
    random_state=33,
    shuffle=True,
)

In [51]:
# Forest of 10 trees, each limited to depth 4.
rn_for = randomforest(maxdep=4, n_trees=10)

In [52]:
# Train the forest on the training split.
rn_for.fit(X=x_train, Y=y_train)

In [53]:
# Majority-vote predictions for the held-out rows.
pred = rn_for.predict(X=x_test)

  return int(node.value)


In [54]:
from sklearn.metrics import accuracy_score,precision_score,confusion_matrix,f1_score,recall_score

In [55]:
# Report each evaluation metric on the held-out split, one line per metric
# (the confusion matrix label carries its own newline).
for label, metric in (
    ("accuracy:", accuracy_score),
    ("precision:", precision_score),
    ("confusion matrix:\n", confusion_matrix),
    ("f1 score:", f1_score),
    ("recall score", recall_score),
):
    print(label, metric(y_test, pred))

accuracy: 0.9385964912280702
precision: 0.9444444444444444
confusion matrix:
 [[39  4]
 [ 3 68]]
f1 score: 0.951048951048951
recall score 0.9577464788732394
