# In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.datasets import make_classification
from sklearn.utils import check_random_state
import numpy as np

def generate_sample_indices(random_state, n_samples):
    """
    Draw row indices uniformly with replacement (the in-bag set of a bootstrap).

    Parameters:
    - random_state: Seed or generator; normalised through ``check_random_state``.
    - n_samples: Exclusive upper bound of the indices and also the number of draws.

    Returns:
    - Array of ``n_samples`` integers in ``[0, n_samples)``; repeats are possible.
    """
    rng = check_random_state(random_state)
    return rng.randint(0, n_samples, n_samples)

def generate_unsampled_indices(random_state, n_samples):
    """
    List the rows that a bootstrap draw never selects (the out-of-bag set).

    The bootstrap is regenerated with ``generate_sample_indices`` from the same
    ``random_state`` and every index with a zero hit count is returned.

    Parameters:
    - random_state: Seed or generator forwarded to ``generate_sample_indices``.
    - n_samples: Total number of rows in the dataset.

    Returns:
    - Ascending array of the indices that were not drawn.
    """
    drawn = generate_sample_indices(random_state, n_samples)
    # minlength guarantees one counter per row even if the last rows were never drawn
    hit_counts = np.bincount(drawn, minlength=n_samples)
    return np.arange(n_samples)[hit_counts == 0]

class CustomRandomForestClassifier(RandomForestClassifier):
    """
    Random forest whose trees vote with weights derived from their out-of-bag loss.

    After the regular forest fit, the bootstrap of every tree is regenerated from
    the tree's ``random_state`` and the tree is scored on the rows it did not
    see. A tree's voting weight is ``exp(-oob_loss)``.

    Attributes:
    - in_bag_indices_: List with one array of in-bag row indices per tree.
    - oob_indices_: List with one array of out-of-bag row indices per tree.
    - tree_weights_: List with one voting weight per tree.

    NOTE(review): only single-output classification is handled. The bootstrap
    reconstruction assumes ``max_samples`` is None (each tree draws as many rows
    as the training set has) and that scikit-learn stores each tree's bootstrap
    seed in ``estimator.random_state`` — confirm against the installed version.
    """

    def fit(self, X, y, sample_weight=None):
        """
        Fit the forest, then compute per-tree in-bag/OOB indices and weights.

        Parameters:
        - X: Training features (array-like, e.g. numpy array or pandas DataFrame).
        - y: Target labels (array-like); any label values the parent accepts.
        - sample_weight: Optional per-row weights, forwarded to the parent ``fit``.

        Returns:
        - self: The fitted model.
        """
        super().fit(X, y, sample_weight=sample_weight)

        # Rows are selected by position below; plain arrays make that work for
        # DataFrames / Series / lists as well as numpy input.
        X_rows = np.asarray(X)
        # The trees inside a forest are trained on class *indices*, so their
        # predict() output must be compared with y encoded the same way
        # (position of each label in the sorted classes_ array).
        y_encoded = np.searchsorted(self.classes_, np.asarray(y))
        n_samples = X_rows.shape[0]

        self.in_bag_indices_ = []
        self.oob_indices_ = []
        self.tree_weights_ = []

        for estimator in self.estimators_:
            if self.bootstrap:
                seed = estimator.random_state
                in_bag_indices = generate_sample_indices(seed, n_samples)
                oob_indices = generate_unsampled_indices(seed, n_samples)
            else:
                # Without bootstrapping every tree is trained on all rows,
                # so there is nothing out of bag.
                in_bag_indices = np.arange(n_samples)
                oob_indices = np.array([], dtype=in_bag_indices.dtype)

            self.in_bag_indices_.append(in_bag_indices)
            self.oob_indices_.append(oob_indices)

            if len(oob_indices) > 0:
                oob_predictions = estimator.predict(X_rows[oob_indices])
                oob_loss = mean_squared_error(y_encoded[oob_indices], oob_predictions)
                self.tree_weights_.append(np.exp(-oob_loss))
            else:
                # No held-out rows: the tree cannot be scored.
                self.tree_weights_.append(0.0)

        return self

    def predict(self, X):
        """
        Predict class labels by OOB-weighted soft voting over the trees.

        Parameters:
        - X: Features to predict for (array-like).

        Returns:
        - Array of predicted class labels taken from ``classes_``.

        Raises:
        - ValueError: If the forest has not been fitted.
        """
        if not hasattr(self, "estimators_"):
            raise ValueError("The forest is not fitted yet!")

        # NOTE(review): private scikit-learn helper; it is the validation the
        # parent class runs before querying its trees. It converts DataFrames
        # and lists to an array so that ``X.shape`` is available.
        X = self._validate_X_predict(X)

        weights = np.asarray(self.tree_weights_, dtype=float)
        if not weights.any():
            # Every tree has weight 0 (e.g. bootstrap=False): an all-zero vote
            # would always pick classes_[0], so fall back to a uniform vote.
            weights = np.ones(len(self.estimators_))

        weighted_preds = np.zeros((X.shape[0], len(self.classes_)))
        for tree, weight in zip(self.estimators_, weights):
            weighted_preds += weight * tree.predict_proba(X)

        final_preds = np.argmax(weighted_preds, axis=1)
        return self.classes_[final_preds]

# Example usage: fit the weighted forest on synthetic data and predict the training rows
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
clf = CustomRandomForestClassifier(oob_score=True).fit(X, y)
predictions = clf.predict(X)

# Pull out, tree by tree, the rows that were in-bag and the rows that were out-of-bag
for i, tree in enumerate(clf.estimators_):
    bag_rows = clf.in_bag_indices_[i]
    oob_rows = clf.oob_indices_[i]
    in_bag_samples_X, in_bag_samples_y = X[bag_rows], y[bag_rows]
    oob_samples_X, oob_samples_y = X[oob_rows], y[oob_rows]

# In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.datasets import make_classification
from sklearn.utils import check_random_state
import numpy as np

# Function to generate indices for random samples from a dataset
def generate_sample_indices(random_state, n_samples):
    """
    Draw row indices uniformly with replacement (the in-bag set of a bootstrap).

    Parameters:
    - random_state: Seed or generator; normalised through ``check_random_state``.
    - n_samples: Exclusive upper bound of the indices and also the number of draws.

    Returns:
    - Array of ``n_samples`` integers in ``[0, n_samples)``; repeats are possible.
    """
    rng = check_random_state(random_state)
    return rng.randint(0, n_samples, n_samples)

# Function to generate indices for samples that are not selected (out-of-bag samples)
def generate_unsampled_indices(random_state, n_samples):
    """
    List the rows that a bootstrap draw never selects (the out-of-bag set).

    The bootstrap is regenerated with ``generate_sample_indices`` from the same
    ``random_state`` and every index with a zero hit count is returned.

    Parameters:
    - random_state: Seed or generator forwarded to ``generate_sample_indices``.
    - n_samples: Total number of rows in the dataset.

    Returns:
    - Ascending array of the indices that were not drawn.
    """
    drawn = generate_sample_indices(random_state, n_samples)
    # minlength guarantees one counter per row even if the last rows were never drawn
    hit_counts = np.bincount(drawn, minlength=n_samples)
    return np.arange(n_samples)[hit_counts == 0]

# Custom RandomForestClassifier class
class CustomRandomForestClassifier(RandomForestClassifier):
    """
    Random forest whose trees vote with weights derived from their out-of-bag loss.

    This class extends the RandomForestClassifier from scikit-learn. After the
    regular forest fit, the bootstrap of every tree is regenerated from the
    tree's ``random_state`` and the tree is scored on the rows it did not see.
    A tree's voting weight is ``exp(-oob_loss)``.

    Methods:
    - fit(X, y, sample_weight=None): Fit the model to the training data.
    - predict(X): Make predictions using the fitted model.

    Attributes:
    - in_bag_indices_: A list with one array of in-bag row indices per tree.
    - oob_indices_: A list with one array of out-of-bag row indices per tree.
    - tree_weights_: A list with one voting weight per tree.

    Example Usage:
    - X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
    - clf = CustomRandomForestClassifier(oob_score=True)
    - clf.fit(X, y)
    - predictions = clf.predict(X)

    NOTE(review): only single-output classification is handled. The bootstrap
    reconstruction assumes ``max_samples`` is None (each tree draws as many rows
    as the training set has) and that scikit-learn stores each tree's bootstrap
    seed in ``estimator.random_state`` — confirm against the installed version.
    """

    def fit(self, X, y, sample_weight=None):
        """
        Fit the custom random forest model to the training data.

        Parameters:
        - X: Input features (numpy array, pandas DataFrame or other array-like).
        - y: Target labels (numpy array, pandas Series or other array-like).
        - sample_weight: Optional per-row weights, forwarded to the parent ``fit``.

        Returns:
        - self: The fitted model.
        """
        # Fit the model using the superclass RandomForestClassifier
        super().fit(X, y, sample_weight=sample_weight)

        # Rows are selected by position below; plain arrays make that work for
        # DataFrames / Series / lists as well as numpy input.
        X_rows = np.asarray(X)
        # The trees inside a forest are trained on class *indices*, so their
        # predict() output must be compared with y encoded the same way
        # (position of each label in the sorted classes_ array).
        y_encoded = np.searchsorted(self.classes_, np.asarray(y))
        n_samples = X_rows.shape[0]

        # Per-tree in-bag indices, out-of-bag indices and out-of-bag loss weights
        self.in_bag_indices_ = []
        self.oob_indices_ = []
        self.tree_weights_ = []

        for estimator in self.estimators_:
            if self.bootstrap:
                seed = estimator.random_state
                in_bag_indices = generate_sample_indices(seed, n_samples)
                oob_indices = generate_unsampled_indices(seed, n_samples)
            else:
                # Without bootstrapping every tree is trained on all rows,
                # so there is nothing out of bag.
                in_bag_indices = np.arange(n_samples)
                oob_indices = np.array([], dtype=in_bag_indices.dtype)

            self.in_bag_indices_.append(in_bag_indices)
            self.oob_indices_.append(oob_indices)

            if len(oob_indices) > 0:
                oob_predictions = estimator.predict(X_rows[oob_indices])
                oob_loss = mean_squared_error(y_encoded[oob_indices], oob_predictions)
                self.tree_weights_.append(np.exp(-oob_loss))
            else:
                # No held-out rows: the tree cannot be scored.
                self.tree_weights_.append(0.0)

        return self

    def predict(self, X):
        """
        Make predictions using the fitted custom random forest model.

        Each tree's class probabilities are scaled by its weight and summed; the
        class with the largest total wins.

        Parameters:
        - X: Input features for making predictions (array-like).

        Returns:
        - final_preds: Predicted class labels taken from ``classes_``.

        Raises:
        - ValueError: If the forest has not been fitted.
        """
        # Check if the forest is fitted
        if not hasattr(self, "estimators_"):
            raise ValueError("The forest is not fitted yet!")

        # NOTE(review): private scikit-learn helper; it is the validation the
        # parent class runs before querying its trees. It converts DataFrames
        # and lists to an array so that ``X.shape`` is available.
        X = self._validate_X_predict(X)

        weights = np.asarray(self.tree_weights_, dtype=float)
        if not weights.any():
            # Every tree has weight 0 (e.g. bootstrap=False): an all-zero vote
            # would always pick classes_[0], so fall back to a uniform vote.
            weights = np.ones(len(self.estimators_))

        # Aggregate predictions from all trees, weighted by their out-of-bag loss-based weights
        weighted_preds = np.zeros((X.shape[0], len(self.classes_)))
        for tree, weight in zip(self.estimators_, weights):
            weighted_preds += weight * tree.predict_proba(X)

        final_preds = np.argmax(weighted_preds, axis=1)
        return self.classes_[final_preds]

# Example usage: fit the weighted forest on synthetic data and predict the training rows
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
clf = CustomRandomForestClassifier(oob_score=True).fit(X, y)
predictions = clf.predict(X)

# Pull out, tree by tree, the rows that were in-bag and the rows that were out-of-bag
for i, tree in enumerate(clf.estimators_):
    bag_rows = clf.in_bag_indices_[i]
    oob_rows = clf.oob_indices_[i]
    in_bag_samples_X, in_bag_samples_y = X[bag_rows], y[bag_rows]
    oob_samples_X, oob_samples_y = X[oob_rows], y[oob_rows]