In [241]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split

class CustomRandomForestClassifier:
    """
    A random forest wrapper that reports how scores evolve as trees are added.

    The forest is trained once with ``fit``; the scoring methods then walk over
    the first ``n_trees`` fitted trees, accumulate their class probabilities and
    evaluate the requested scoring functions after each additional tree.

    Attributes:
        n_trees (int): The number of trees to use during scoring.
        random_state (int): Random seed for reproducibility.
        rf_classifier (RandomForestClassifier): The underlying random forest classifier
            (``None`` until ``fit`` has been called).
        y_pred (numpy.ndarray): Cumulative positive-class probabilities left behind by
            the most recent ``custom_scores`` call (``None`` before that).

    Methods:
        fit(X, y, n_estimators, max_depth, min_samples_split, min_samples_leaf):
            Fits the random forest classifier to the training data.

        custom_predict(X):
            Returns the individual predictions of each of the first ``n_trees`` trees.

        custom_scores(X, y, score_fns):
            Scores the cumulative positive-class probability after each tree.

        binary_custom_scores(X, y, score_fns):
            Cumulative scores for binary classification (probability and label based).

        multi_custom_scores_with_voting(X, y, score_fns, y_test):
            Cumulative label-based scores for multi-class classification.

    Example usage:
    # Create a CustomRandomForestClassifier
    rf = CustomRandomForestClassifier(n_trees=150, random_state=42)

    # Fit the classifier to your training data with specified parameters
    rf.fit(X_train, y_train, n_estimators=150, max_depth=10, min_samples_split=2, min_samples_leaf=1)

    # Calculate cumulative scores for binary classification
    score_fns = [roc_auc_score, accuracy_score, f1_score]
    binary_scores = rf.binary_custom_scores(X_test, y_test, score_fns=score_fns)

    # Print the cumulative scores for binary classification
    for score_name, scores in binary_scores.items():
        print(f"{score_name} Scores:")
        for i, score_value in enumerate(scores, start=1):
            print(f"For {i} trees - Cumulative {score_name}: {score_value:.4f}")
    """

    def __init__(self, n_trees=None, random_state=None):
        """
        Initializes a CustomRandomForestClassifier instance.

        Args:
            n_trees (int): The number of trees to use during scoring.
            random_state (int): Random seed for reproducibility.
        """
        self.n_trees = n_trees
        self.random_state = random_state
        self.rf_classifier = None
        self.y_pred = None

    def fit(self, X, y, n_estimators=150, max_depth=10, min_samples_split=2, min_samples_leaf=1):
        """
        Fits the random forest classifier to the training data.

        Args:
            X (array-like): Input features for training.
            y (array-like): Target labels for training.
            n_estimators (int): The number of trees in the random forest.
            max_depth (int): Maximum depth of each tree.
            min_samples_split (int): Minimum samples required to split an internal node.
            min_samples_leaf (int): Minimum samples required to be a leaf node.

        Returns:
            CustomRandomForestClassifier: This instance, so calls can be chained.
        """
        self.rf_classifier = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            random_state=self.random_state
        )
        self.rf_classifier.fit(X, y)
        return self

    def _selected_trees(self):
        """
        Returns the first ``n_trees`` fitted trees after validating the instance state.

        Raises:
            ValueError: If ``n_trees`` was not provided.
            NotFittedError: If ``fit`` has not been called yet.
        """
        if self.n_trees is None:
            raise ValueError("Number of trees used during training is not available.")
        if self.rf_classifier is None:
            # NotFittedError derives from both ValueError and AttributeError, so code
            # that handled the former bare AttributeError keeps working.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError("This CustomRandomForestClassifier is not fitted yet; call fit() first.")
        return self.rf_classifier.estimators_[:self.n_trees]

    def custom_predict(self, X):
        """
        Predict with each of the first ``n_trees`` trees individually.

        Args:
            X (array-like): Input features for prediction.

        Returns:
            list: One array of per-sample predictions for each tree, in tree order.

        Raises:
            ValueError: If ``n_trees`` was not provided.
        """
        return [tree.predict(X) for tree in self._selected_trees()]

    def custom_scores(self, X, y, score_fns):
        """
        Score the cumulative positive-class probability after each additional tree.

        Every scoring function receives ``(y, cumulative_probability)``, where the
        second argument is the running *sum* of column 1 of ``predict_proba`` over
        the trees seen so far. This therefore only suits scoring functions that
        accept continuous scores (e.g. roc_auc_score).

        Args:
            X (array-like): Input features for prediction.
            y (array-like): Ground truth labels.
            score_fns (list): List of scoring functions to use (e.g., [roc_auc_score]).

        Returns:
            dict: Maps each scoring function's ``__name__`` to a list holding one
                score per tree count (1, 2, ..., n_trees).

        Raises:
            ValueError: If ``n_trees`` was not provided.
        """
        trees = self._selected_trees()

        self.y_pred = np.zeros(len(y))
        cumulative_scores = {score_fn.__name__: [] for score_fn in score_fns}

        for tree in trees:
            self.y_pred += tree.predict_proba(X)[:, 1]

            for score_fn in score_fns:
                cumulative_scores[score_fn.__name__].append(score_fn(y, self.y_pred))

        return cumulative_scores

    def binary_custom_scores(self, X, y, score_fns):
        """
        Calculate cumulative accuracy, F1 score, and ROC AUC scores for binary classification tasks.

        A function named ``roc_auc_score`` is given the running sum of positive-class
        probabilities (ranking metrics are unaffected by the missing division). Every
        other function is given hard label predictions: the mean probability
        thresholded at 0.5 and mapped onto the forest's ``classes_``, so label
        encodings other than {0, 1} are scored correctly too.

        Args:
            X (array-like): Input features for prediction.
            y (array-like): Ground truth labels for binary classification.
            score_fns (list): List of scoring functions to use (e.g., [accuracy_score, f1_score, roc_auc_score]).

        Returns:
            dict: A dictionary containing cumulative scores for each scoring function.

        Raises:
            ValueError: If ``n_trees`` was not provided.
        """
        trees = self._selected_trees()
        classes = np.asarray(self.rf_classifier.classes_)

        proba_sum = np.zeros(len(y))
        scores = {score_fn.__name__: [] for score_fn in score_fns}

        for n_used, tree in enumerate(trees, start=1):
            proba_sum += tree.predict_proba(X)[:, 1]

            # A mean of exactly 0.5 resolves to the first class, matching the
            # round-half-to-even behaviour of the np.round used previously.
            predicted_labels = classes[(proba_sum / n_used > 0.5).astype(int)]

            for score_fn in score_fns:
                score_name = score_fn.__name__
                if score_name == 'roc_auc_score':
                    score_value = score_fn(y, proba_sum)
                else:
                    score_value = score_fn(y, predicted_labels)
                scores[score_name].append(score_value)

        return scores

    def multi_custom_scores_with_voting(self, X, y, score_fns, y_test):
        """
        Calculate cumulative scores for multi-class classification tasks.

        The class probabilities of the trees are summed (soft voting) and, after
        each additional tree, the class with the highest accumulated probability is
        taken as the prediction. Predictions are expressed as original class labels
        via the forest's ``classes_``. Scoring functions receive hard labels, so
        probability-based metrics such as roc_auc_score are not suitable here.

        Args:
            X (array-like): Input features for prediction.
            y (array-like): Unused; kept for backward compatibility. The set of
                classes is taken from the fitted forest instead of from labels.
            score_fns (list): List of scoring functions to use (e.g., [accuracy_score, f1_score, etc.]).
                A function named ``f1_score`` is called with ``average='weighted'``.
            y_test (array-like): Ground truth labels for the samples in ``X``.

        Returns:
            tuple: A tuple containing:
                - dict: A dictionary containing cumulative scores for each scoring function.
                - numpy.ndarray: An array of final predicted class labels after voting.

        Raises:
            ValueError: If ``n_trees`` was not provided.
        """
        trees = self._selected_trees()
        classes = np.asarray(self.rf_classifier.classes_)

        cumulative_scores = {score_fn.__name__: [] for score_fn in score_fns}

        # One column per class known to the forest, so a class that happens to be
        # absent from y / y_test can no longer cause a shape mismatch.
        cumulative_predictions = np.zeros((len(y_test), len(classes)), dtype=float)

        for tree in trees:
            cumulative_predictions += tree.predict_proba(X)

            # Evaluate the vote once per tree rather than once per scoring function
            voted_labels = classes[np.argmax(cumulative_predictions, axis=1)]

            for score_fn in score_fns:
                score_name = score_fn.__name__
                # Calculate cumulative score with 'weighted' averaging
                if score_name == 'f1_score':
                    cumulative_score = score_fn(y_test, voted_labels, average='weighted')
                else:
                    cumulative_score = score_fn(y_test, voted_labels)
                cumulative_scores[score_name].append(cumulative_score)

        # Final prediction: class with the highest accumulated probability
        final_predictions = classes[np.argmax(cumulative_predictions, axis=1)]

        return cumulative_scores, final_predictions



# Define the list of scoring functions outside the class
score_fns = [roc_auc_score, accuracy_score, f1_score]

# Generate toy data for binary classification
X, y = make_classification(n_samples=500, n_features=10, n_classes=2, n_clusters_per_class=1, random_state=42)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a CustomRandomForestClassifier
rf = CustomRandomForestClassifier(n_trees=150, random_state=42)

# Fit the classifier to your training data with specified parameters
rf.fit(X_train, y_train, n_estimators=150, max_depth=10, min_samples_split=2, min_samples_leaf=1)

if len(np.unique(y_test)) == 2:
    # Calculate cumulative scores (ROC AUC, accuracy, F1) for binary classification
    roc_auc_scores = rf.binary_custom_scores(X_test, y_test, score_fns=score_fns)
    cumulative_scores = roc_auc_scores
else:
    # The multi-class path scores hard label predictions, which roc_auc_score
    # cannot evaluate for multi-class targets, so it is left out here.
    label_score_fns = [score_fn for score_fn in score_fns if score_fn is not roc_auc_score]

    # `y` is documented as the training labels; `y_test` carries the test labels.
    multi_scores, final_predictions = rf.multi_custom_scores_with_voting(
        X_test, y_train, score_fns=label_score_fns, y_test=y_test
    )
    cumulative_scores = multi_scores

# Print the cumulative scores for each number of trees (same format for both paths)
for score_name, scores in cumulative_scores.items():
    print(f"{score_name} Scores:")
    for i, score_value in enumerate(scores, start=1):
        print(f"For {i} trees - Cumulative {score_name}: {score_value:.4f}")

roc_auc_score Scores:
For 1 trees - Cumulative roc_auc_score: 0.8337
For 2 trees - Cumulative roc_auc_score: 0.9188
For 3 trees - Cumulative roc_auc_score: 0.9319
For 4 trees - Cumulative roc_auc_score: 0.9388
For 5 trees - Cumulative roc_auc_score: 0.9368
For 6 trees - Cumulative roc_auc_score: 0.9388
For 7 trees - Cumulative roc_auc_score: 0.9390
For 8 trees - Cumulative roc_auc_score: 0.9517
For 9 trees - Cumulative roc_auc_score: 0.9508
For 10 trees - Cumulative roc_auc_score: 0.9533
For 11 trees - Cumulative roc_auc_score: 0.9559
For 12 trees - Cumulative roc_auc_score: 0.9535
For 13 trees - Cumulative roc_auc_score: 0.9537
For 14 trees - Cumulative roc_auc_score: 0.9521
For 15 trees - Cumulative roc_auc_score: 0.9488
For 16 trees - Cumulative roc_auc_score: 0.9519
For 17 trees - Cumulative roc_auc_score: 0.9517
For 18 trees - Cumulative roc_auc_score: 0.9510
For 19 trees - Cumulative roc_auc_score: 0.9488
For 20 trees - Cumulative roc_auc_score: 0.9521
For 21 trees - Cumulative r

In [238]:
# Baseline: a stock RandomForestClassifier with the same hyper-parameters,
# trained on the training split (fit returns the fitted estimator itself).
rf = RandomForestClassifier(
    n_estimators=150,
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
).fit(X_train, y_train)

# Hard label predictions for the held-out split
y_pred = rf.predict(X_test)

# Accuracy, and F1 averaged over classes weighted by their support
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')

# Report both metrics
print("Accuracy:", accuracy)
print("F1-score (weighted):", f1)

Accuracy: 0.92
F1-score (weighted): 0.9184920634920634


In [242]:
# Create a RandomForestClassifier (baseline with the same hyper-parameters as the custom wrapper)
rf = RandomForestClassifier(random_state=42, n_estimators=150, max_depth=10, min_samples_split=2, min_samples_leaf=1)

# Fit the classifier to your training data
rf.fit(X_train, y_train)

# Make hard label predictions on the test data
y_pred = rf.predict(X_test)

# Calculate accuracy and F1-score.
# f1_score is called without `average`, i.e. with its default binary averaging
# (F1 of the positive class only), so the printed label says "binary".
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1-score (binary):", f1)

# Replace the hard labels with the predicted probability of the positive class
y_pred = rf.predict_proba(X_test)[:, 1]  # Probability of positive class (class 1)

# Calculate ROC AUC from the probabilities (a ranking metric needs scores, not labels)
roc_auc = roc_auc_score(y_test, y_pred)

# Print the ROC AUC score
print("ROC AUC Score:", roc_auc)

Accuracy: 0.92
F1-score (weighted): 0.9310344827586206
ROC AUC Score: 0.9532843737250102
