# Random Forest From scratch with full explanation of why we our using each and every line of code 🌴-🌴-🌴-🌴-🌴-o/p 🤔 🫷🫷

# Need of using Random Forest is also explained in the last ⬇️⬇️⬇️⬇️⬇️ 👍👍

In [38]:
## Import important libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from collections import Counter
import random


# load the dataset and split it into dependent and independent features 👍👍

In [39]:
# Load dataset
df = pd.read_csv('https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [40]:
# Select useful features
df = df[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']]

# Features and target
X = df.drop('Survived', axis=1).values
y = df['Survived'].values

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle the missing values But Why??? 😠 -- To increase oue model accuracy,performance,To reduce bias,To prevent inaccurate accuracy 😦🥹

In [41]:
# Handle missing values
df['Age'].fillna(df['Age'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)


# handle the categorical features But Why???? 😠 --- because "Gradient Descent" works properly and smoothly when the data is scaled 😦🥹

In [42]:
# Encode categorical feature
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

# Let's build the decision tree by finding the best feature and threshold at each node through iteration in each feature and each data of that feature using Entropy and Information gain 👍🫡

In [43]:
from collections import Counter
import numpy as np

class DecisionTree:
    def __init__(self, max_depth=10, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None

    def fit(self, X, y):
        self.tree = self._build_tree(X, y)

    def _build_tree(self, X, y, depth=0):
        num_samples, num_features = X.shape

        # Stopping conditions
        if depth >= self.max_depth or num_samples < self.min_samples_split or len(set(y)) == 1:
            return self._majority_vote(y)

        best_feature, best_threshold = self._best_split(X, y)

        # If no valid split found
        if best_feature is None:
            return self._majority_vote(y)

        left_indices = X[:, best_feature] <= best_threshold
        right_indices = X[:, best_feature] > best_threshold

        # 🛑 Avoid creating empty splits
        if len(y[left_indices]) == 0 or len(y[right_indices]) == 0:
            return self._majority_vote(y)

        # Recursive splitting
        left_subtree = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right_subtree = self._build_tree(X[right_indices], y[right_indices], depth + 1)

        return (best_feature, best_threshold, left_subtree, right_subtree)

    def _best_split(self, X, y):
        best_gain = -1
        best_feature, best_threshold = None, None

        for feature in range(X.shape[1]):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                gain = self._information_gain(y, X[:, feature], threshold)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = threshold

        return best_feature, best_threshold

    def _information_gain(self, y, feature_values, threshold):
        parent_entropy = self._entropy(y)

        left_indices = feature_values <= threshold
        right_indices = feature_values > threshold

        if len(y[left_indices]) == 0 or len(y[right_indices]) == 0:
            return 0

        n = len(y)
        n_left = len(y[left_indices])
        n_right = len(y[right_indices])

        left_entropy = self._entropy(y[left_indices])
        right_entropy = self._entropy(y[right_indices])

        child_entropy = (n_left / n) * left_entropy + (n_right / n) * right_entropy
        ig = parent_entropy - child_entropy
        return ig

    def _entropy(self, y):
        hist = np.bincount(y)
        ps = hist / len(y)
        return -np.sum([p * np.log2(p) for p in ps if p > 0])

    def _majority_vote(self, y):
        if len(y) == 0:
            return 0  # or raise an error, depending on your use case
        return Counter(y).most_common(1)[0][0]

    def predict(self, X):
        return np.array([self._predict(inputs, self.tree) for inputs in X])

    def _predict(self, x, tree):
        if not isinstance(tree, tuple):
            return tree

        feature, threshold, left, right = tree
        if x[feature] <= threshold:
            return self._predict(x, left)
        else:
            return self._predict(x, right)

# Now let's build the structure of random forest by initializing the number of trees and maximum depth of each tree  👍 🫡

In [44]:
class RandomForest:
    def __init__(self, n_trees=10, max_depth=10, min_samples_split=2, sample_size=None):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.sample_size = sample_size
        self.trees = []

    def _bootstrap_sample(self, X, y):
        n_samples = X.shape[0]
        idxs = np.random.choice(n_samples, self.sample_size or n_samples, replace=True)
        return X[idxs], y[idxs]

    def fit(self, X, y):
        self.trees = []
        for _ in range(self.n_trees):
            tree = DecisionTree(max_depth=self.max_depth, min_samples_split=self.min_samples_split)
            X_sample, y_sample = self._bootstrap_sample(X, y)
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

    def predict(self, X):
        tree_preds = np.array([tree.predict(X) for tree in self.trees])
        # Majority vote
        return [Counter(col).most_common(1)[0][0] for col in tree_preds.T]

# Finally,initialize and trian the model 👍👍👍

In [45]:
# Initialize and train Random Forest
forest = RandomForest(n_trees=10, max_depth=10)
forest.fit(X_train, y_train)

# Make predictions
y_pred = forest.predict(X_test)

# Accuracy
accuracy = np.mean(y_pred == y_test)
print("Accuracy of Random Forest from scratch:", accuracy)

  left_indices = feature_values <= threshold
  right_indices = feature_values > threshold
  left_indices = X[:, best_feature] <= best_threshold
  right_indices = X[:, best_feature] > best_threshold


Accuracy of Random Forest from scratch: 0.7932960893854749


In [46]:
# hence , the accuracy of our model is nearly ~81%

# Why do we use Random Forest Model when We have already Decision Tree like model 🤔🤔🤔 🧠🧠🧠
# It's better than Decision Tree because it doesn't construct only one tree in-depth like decision tree,it construct number of trees and then average the predictions based on "Majority Voting" made by each tree and hence prevents----"OVERFITTING".😯😯 🫡
# we can control the number of trees and depth of each tree in Random Forest 😯😯
# In this way ,we can move from an overfitting model to a Generalised model having low bias and low varinace 😯😯
# Inshort---- (Low Bias,High Variance) is conevrted into-----------------> (Low Bias,Low Varinace) 🫡🫡🫡