In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.datasets import make_classification
from sklearn.utils import check_random_state
import numpy as np

def generate_sample_indices(random_state, n_samples):
    """Draw the bootstrap (in-bag) row indices for one tree.

    ``random_state`` is anything accepted by ``check_random_state``; the
    result holds ``n_samples`` indices drawn from ``[0, n_samples)``.
    """
    rng = check_random_state(random_state)
    return rng.randint(low=0, high=n_samples, size=n_samples)

def generate_unsampled_indices(random_state, n_samples):
    """Return the out-of-bag (OOB) row indices, i.e. rows the bootstrap draw never picked.

    The draw is regenerated through ``generate_sample_indices`` with the
    same ``random_state`` and ``n_samples``.
    """
    drawn = generate_sample_indices(random_state, n_samples)
    # How many times each row index 0..n_samples-1 was drawn.
    times_drawn = np.bincount(drawn, minlength=n_samples)
    return np.arange(n_samples)[times_drawn == 0]

class CustomRandomForestClassifier(RandomForestClassifier):
    """Random forest whose trees vote with weights derived from their out-of-bag loss.

    After the regular scikit-learn fit, the bootstrap draw of every tree is
    regenerated from the tree's ``random_state`` so the in-bag and
    out-of-bag (OOB) row indices are available, and each tree gets the
    weight ``exp(-L_oob)`` where ``L_oob`` is its mean squared error on its
    own OOB rows.

    NOTE(review): the regenerated draw always has ``n_samples`` entries, so
    it presumably only matches the trees' real training rows when the forest
    uses ``bootstrap=True`` and ``max_samples=None`` -- confirm before using
    other settings.

    Attributes set by ``fit``:
    - in_bag_indices_: list with one array of in-bag row indices per tree.
    - oob_indices_: list with one array of OOB row indices per tree.
    - tree_weights_: list with one weight per tree (0.0 if a tree has no OOB rows).
    """

    def fit(self, X, y, sample_weight=None):
        """Fit the forest, then record per-tree in-bag/OOB indices and OOB-based weights.

        Parameters:
        - X: Training features (array-like, DataFrame or sparse matrix).
        - y: Training labels (array-like or Series).
        - sample_weight: Optional per-sample weights, forwarded to the parent ``fit``.

        Returns:
        - self: The fitted model.
        """
        super().fit(X, y, sample_weight=sample_weight)

        # Use plain positional row access: X[idx] / y[idx] on a DataFrame or
        # Series would select by column / index label instead of row position.
        X_rows = X.tocsr() if hasattr(X, "tocsr") else np.asarray(X)
        y_rows = np.asarray(y)
        if self.n_outputs_ == 1:
            # The fitted trees predict positions in self.classes_ rather than
            # the original labels, so the loss is computed in that encoded space.
            y_rows = np.searchsorted(self.classes_, y_rows.ravel())
        n_samples = X_rows.shape[0]

        self.in_bag_indices_ = []
        self.oob_indices_ = []
        self.tree_weights_ = []

        for estimator in self.estimators_:
            seed = estimator.random_state
            in_bag_indices = generate_sample_indices(seed, n_samples)
            oob_indices = generate_unsampled_indices(seed, n_samples)

            self.in_bag_indices_.append(in_bag_indices)
            self.oob_indices_.append(oob_indices)

            if len(oob_indices) > 0:
                oob_predictions = estimator.predict(X_rows[oob_indices])
                oob_loss = mean_squared_error(y_rows[oob_indices], oob_predictions)
                self.tree_weights_.append(np.exp(-oob_loss))
            else:
                # No OOB rows: the tree cannot be scored and gets no vote.
                self.tree_weights_.append(0.0)

        return self

    def predict(self, X):
        """Predict class labels from the OOB-weighted sum of the trees' class probabilities.

        Parameters:
        - X: Input features for making predictions.

        Returns:
        - Array with one predicted class label per row of X.

        Raises:
        - ValueError: If the forest has not been fitted.
        """
        if not hasattr(self, "estimators_"):
            raise ValueError("The forest is not fitted yet!")

        # Convert once so the individual trees do not each re-validate a
        # DataFrame / list input.
        X_rows = X if hasattr(X, "tocsr") else np.asarray(X)

        weighted_preds = np.zeros((X_rows.shape[0], len(self.classes_)))
        for tree, weight in zip(self.estimators_, self.tree_weights_):
            weighted_preds += weight * tree.predict_proba(X_rows)

        return self.classes_[np.argmax(weighted_preds, axis=1)]

# Example usage: fit the custom forest on a synthetic 1000 x 20 classification
# problem and predict on the same (training) rows.
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
clf = CustomRandomForestClassifier(oob_score=True)
clf.fit(X, y)
predictions = clf.predict(X)

# Accessing in-bag and OOB data for each tree.
# The four variables are overwritten on every iteration, so after the loop
# they hold the rows belonging to the last tree only.
for i, tree in enumerate(clf.estimators_):
    in_bag_samples_X = X[clf.in_bag_indices_[i]]
    in_bag_samples_y = y[clf.in_bag_indices_[i]]
    oob_samples_X = X[clf.oob_indices_[i]]
    oob_samples_y = y[clf.oob_indices_[i]]

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.datasets import make_classification
from sklearn.utils import check_random_state
import numpy as np

# Function to generate indices for random samples from a dataset
def generate_sample_indices(random_state, n_samples):
    """
    Draw the bootstrap (in-bag) row indices for one tree.

    Parameters:
    - random_state: Seed or generator, as accepted by check_random_state.
    - n_samples: The total number of samples in the dataset.

    Returns:
    - An array of n_samples indices drawn from [0, n_samples).
    """
    rng = check_random_state(random_state)
    return rng.randint(low=0, high=n_samples, size=n_samples)

# Function to generate indices for samples that are not selected (out-of-bag samples)
def generate_unsampled_indices(random_state, n_samples):
    """
    Return the out-of-bag row indices, i.e. the rows the bootstrap draw never picked.

    Parameters:
    - random_state: Seed or generator, passed on to generate_sample_indices.
    - n_samples: The total number of samples in the dataset.

    Returns:
    - An array with the indices in [0, n_samples) that were not drawn.
    """
    drawn = generate_sample_indices(random_state, n_samples)
    # How many times each row index 0..n_samples-1 was drawn.
    times_drawn = np.bincount(drawn, minlength=n_samples)
    return np.arange(n_samples)[times_drawn == 0]

# Custom RandomForestClassifier class
class CustomRandomForestClassifier(RandomForestClassifier):
    """
    RandomForestClassifier whose trees vote with out-of-bag (OOB) based weights.

    After the regular scikit-learn fit, the bootstrap draw of every tree is
    regenerated from the tree's random_state, and each tree gets the weight
    exp(-L_oob), where L_oob is its mean squared error on its own OOB rows.

    NOTE(review): the regenerated draw always has n_samples entries, so it
    presumably only matches the trees' real training rows when the forest uses
    bootstrap=True and max_samples=None -- confirm before using other settings.

    Methods:
    - fit(X, y, sample_weight=None): Fit the model to the training data.
    - predict(X): Make predictions using the fitted model.

    Attributes:
    - in_bag_indices_: A list with one array of in-bag row indices per tree.
    - oob_indices_: A list with one array of out-of-bag row indices per tree.
    - tree_weights_: A list with one weight per tree (0.0 if a tree has no OOB rows).

    Example Usage:
    - X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
    - clf = CustomRandomForestClassifier(oob_score=True)
    - clf.fit(X, y)
    - predictions = clf.predict(X)
    """

    def fit(self, X, y, sample_weight=None):
        """
        Fit the custom random forest model to the training data.

        Parameters:
        - X: Input features (numpy array, pandas DataFrame or sparse matrix).
        - y: Target labels (numpy array or pandas Series).
        - sample_weight: Optional per-sample weights, forwarded to the parent fit.

        Returns:
        - self: The fitted model.
        """
        super().fit(X, y, sample_weight=sample_weight)

        # Use plain positional row access: X[idx] / y[idx] on a DataFrame or
        # Series would select by column / index label instead of row position.
        X_rows = X.tocsr() if hasattr(X, "tocsr") else np.asarray(X)
        y_rows = np.asarray(y)
        if self.n_outputs_ == 1:
            # The fitted trees predict positions in self.classes_ rather than
            # the original labels, so the loss is computed in that encoded space.
            y_rows = np.searchsorted(self.classes_, y_rows.ravel())
        n_samples = X_rows.shape[0]

        self.in_bag_indices_ = []
        self.oob_indices_ = []
        self.tree_weights_ = []

        for estimator in self.estimators_:
            seed = estimator.random_state
            in_bag_indices = generate_sample_indices(seed, n_samples)
            oob_indices = generate_unsampled_indices(seed, n_samples)

            self.in_bag_indices_.append(in_bag_indices)
            self.oob_indices_.append(oob_indices)

            if len(oob_indices) > 0:
                oob_predictions = estimator.predict(X_rows[oob_indices])
                oob_loss = mean_squared_error(y_rows[oob_indices], oob_predictions)
                self.tree_weights_.append(np.exp(-oob_loss))
            else:
                # No OOB rows: the tree cannot be scored and gets no vote.
                self.tree_weights_.append(0.0)

        return self

    def predict(self, X):
        """
        Predict class labels from the OOB-weighted sum of the trees' class probabilities.

        Parameters:
        - X: Input features for making predictions.

        Returns:
        - An array with one predicted class label per row of X.

        Raises:
        - ValueError: If the forest has not been fitted.
        """
        if not hasattr(self, "estimators_"):
            raise ValueError("The forest is not fitted yet!")

        # Convert once so the individual trees do not each re-validate a
        # DataFrame / list input.
        X_rows = X if hasattr(X, "tocsr") else np.asarray(X)

        weighted_preds = np.zeros((X_rows.shape[0], len(self.classes_)))
        for tree, weight in zip(self.estimators_, self.tree_weights_):
            weighted_preds += weight * tree.predict_proba(X_rows)

        return self.classes_[np.argmax(weighted_preds, axis=1)]

# Example usage: fit the custom forest on a synthetic 1000 x 20 classification
# problem and predict on the same (training) rows.
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
clf = CustomRandomForestClassifier(oob_score=True)
clf.fit(X, y)
predictions = clf.predict(X)

# Accessing in-bag and out-of-bag data for each tree.
# The four variables are overwritten on every iteration, so after the loop
# they hold the rows belonging to the last tree only.
for i, tree in enumerate(clf.estimators_):
    in_bag_samples_X = X[clf.in_bag_indices_[i]]
    in_bag_samples_y = y[clf.in_bag_indices_[i]]
    oob_samples_X = X[clf.oob_indices_[i]]
    oob_samples_y = y[clf.oob_indices_[i]]

# The updated one
And since I am at it, I have two feature requests:
I would like these functions/classes to become part of a library/module which we can easily reuse in the future. (by importing a module)
As I mentioned, I would like to try out different weighting schemes; exp(-L_oob) is just a special case. So maybe we can add a parameter to the predict function (such as `def predict(self, X, weights="expOOB")`) which can then take on different values in the future?

In [1]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.utils import check_random_state

# Function to generate indices for random samples from a dataset
def generate_sample_indices(random_state, n_samples):
    """
    Draw the bootstrap (in-bag) row indices for one tree.

    Parameters:
    - random_state: Seed or generator, as accepted by check_random_state.
    - n_samples: The total number of samples in the dataset.

    Returns:
    - An array of n_samples indices drawn from [0, n_samples).
    """
    rng = check_random_state(random_state)
    return rng.randint(low=0, high=n_samples, size=n_samples)

# Function to generate indices for samples that are not selected (out-of-bag samples)
def generate_unsampled_indices(random_state, n_samples):
    """
    Return the out-of-bag row indices, i.e. the rows the bootstrap draw never picked.

    Parameters:
    - random_state: Seed or generator, passed on to generate_sample_indices.
    - n_samples: The total number of samples in the dataset.

    Returns:
    - An array with the indices in [0, n_samples) that were not drawn.
    """
    drawn = generate_sample_indices(random_state, n_samples)
    # How many times each row index 0..n_samples-1 was drawn.
    times_drawn = np.bincount(drawn, minlength=n_samples)
    return np.arange(n_samples)[times_drawn == 0]

# Custom RandomForestClassifier class
class CustomRandomForestClassifier(RandomForestClassifier):
    """
    RandomForestClassifier with per-tree in-bag / out-of-bag (OOB) bookkeeping
    and a selectable weighting scheme at prediction time.

    After the regular scikit-learn fit, the bootstrap draw of every tree is
    regenerated from the tree's random_state, and each tree gets the weight
    exp(-L_oob), where L_oob is its mean squared error on its own OOB rows.

    NOTE(review): the regenerated draw always has n_samples entries, so it
    presumably only matches the trees' real training rows when the forest uses
    bootstrap=True and max_samples=None -- confirm before using other settings.

    Methods:
    - fit(X, y, sample_weight=None): Fit the model to the training data.
    - predict(X, weights): Make predictions with the chosen weighting scheme.

    Attributes:
    - in_bag_indices_ (list): One array of in-bag row indices per tree.
    - oob_indices_ (list): One array of out-of-bag row indices per tree.
    - tree_weights_ (list): One exp(-L_oob) weight per tree (0.0 if a tree has
      no OOB rows); used by the 'expOOB' weighting scheme.

    Example Usage:
        from sklearn.datasets import make_classification

        X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
        clf = CustomRandomForestClassifier(oob_score=True)
        clf.fit(X, y)

        # Predict using different weighting schemes
        predictions_default = clf.predict(X)  # Defaults to 'uniform' weighting
        predictions_expOOB = clf.predict(X, weights="expOOB")
        predictions_uniform = clf.predict(X, weights="uniform")
    """

    def fit(self, X, y, sample_weight=None):
        """
        Fit the custom random forest model to the training data.

        Parameters:
        - X: Input features (numpy array, pandas DataFrame or sparse matrix).
        - y: Target labels (numpy array or pandas Series).
        - sample_weight: Optional per-sample weights, forwarded to the parent fit.

        Returns:
        - self: The fitted model instance.
        """
        super().fit(X, y, sample_weight=sample_weight)

        # Use plain positional row access: X[idx] / y[idx] on a DataFrame or
        # Series would select by column / index label instead of row position.
        X_rows = X.tocsr() if hasattr(X, "tocsr") else np.asarray(X)
        y_rows = np.asarray(y)
        if self.n_outputs_ == 1:
            # The fitted trees predict positions in self.classes_ rather than
            # the original labels, so the loss is computed in that encoded space.
            y_rows = np.searchsorted(self.classes_, y_rows.ravel())
        n_samples = X_rows.shape[0]

        self.in_bag_indices_ = []
        self.oob_indices_ = []
        self.tree_weights_ = []

        for estimator in self.estimators_:
            seed = estimator.random_state
            in_bag_indices = generate_sample_indices(seed, n_samples)
            oob_indices = generate_unsampled_indices(seed, n_samples)

            self.in_bag_indices_.append(in_bag_indices)
            self.oob_indices_.append(oob_indices)

            if len(oob_indices) > 0:
                oob_predictions = estimator.predict(X_rows[oob_indices])
                oob_loss = mean_squared_error(y_rows[oob_indices], oob_predictions)
                self.tree_weights_.append(np.exp(-oob_loss))
            else:
                # No OOB rows: the tree cannot be scored and gets no vote.
                self.tree_weights_.append(0.0)

        return self

    def predict(self, X, weights=None):
        """
        Make predictions using the fitted custom random forest model.

        Parameters:
        - X: Input features for making predictions (numpy array or pandas DataFrame).
        - weights (optional): The weighting scheme to use for aggregating predictions. Supported values:
          "expOOB" (weights based on the exponential of the negative out-of-bag error) and "uniform"
          (equal weighting). Defaults to "uniform" if not specified or if an unknown value is passed.

        Returns:
        - final_preds: An array of predicted class labels.

        Raises:
        - ValueError: If the forest has not been fitted.
        """
        if not hasattr(self, "estimators_"):
            raise ValueError("The forest is not fitted yet!")

        # Convert once so the individual trees do not each re-validate a
        # DataFrame / list input.
        X_rows = X if hasattr(X, "tocsr") else np.asarray(X)
        weighted_preds = np.zeros((X_rows.shape[0], len(self.classes_)))

        if weights == "expOOB":
            for tree, weight in zip(self.estimators_, self.tree_weights_):
                weighted_preds += weight * tree.predict_proba(X_rows)
        else:
            # None, "uniform" and any unrecognised value all mean equal weighting.
            n_trees = len(self.estimators_)
            for tree in self.estimators_:
                weighted_preds += tree.predict_proba(X_rows) / n_trees

        final_preds = np.argmax(weighted_preds, axis=1)
        return self.classes_[final_preds]

# Example usage: runs only when this code is executed as the main module
# (not when it is imported).
if __name__ == "__main__":
    from sklearn.datasets import make_classification

    # Fit on a synthetic 1000 x 20 problem and predict on the same (training)
    # rows with each weighting scheme; the default call uses uniform weighting.
    X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
    clf = CustomRandomForestClassifier(oob_score=True)
    clf.fit(X, y)
    predictions_default = clf.predict(X)
    predictions_expOOB = clf.predict(X, weights="expOOB")
    predictions_uniform = clf.predict(X, weights="uniform")

    print("Default Predictions:", predictions_default)
    print("ExpOOB Predictions:", predictions_expOOB)
    print("Uniform Predictions:", predictions_uniform)

Default Predictions: [1 0 1 1 0 0 1 0 1 0 1 0 0 0 1 1 0 0 0 1 1 0 0 1 1 0 0 1 1 1 1 1 1 1 1 1 1
 1 0 1 1 1 1 0 0 1 0 0 1 0 1 0 0 1 0 1 0 0 1 0 0 0 0 1 0 1 0 1 1 0 1 0 1 0
 1 0 1 1 1 1 1 0 0 1 0 1 0 0 0 1 1 1 0 1 0 1 1 0 1 1 1 1 1 1 0 1 1 0 1 0 0
 1 1 1 0 0 0 0 0 1 1 1 0 0 0 1 1 0 1 1 0 1 0 0 0 1 0 1 1 1 0 1 0 1 0 1 0 0
 0 1 1 1 0 0 0 1 0 1 1 1 1 0 0 1 0 1 0 1 0 0 1 0 0 0 0 1 0 0 0 1 1 0 1 1 1
 1 1 0 0 1 1 0 1 0 0 1 0 1 0 1 0 0 1 1 1 0 0 1 1 1 1 0 1 1 1 1 1 0 0 0 1 1
 1 1 0 0 0 0 1 1 1 0 1 1 0 0 1 1 1 1 1 0 0 0 0 1 0 0 0 1 0 1 0 0 1 1 0 1 0
 1 0 0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 1 1 1 0 0 1 1 1 1 1 0 1 0 1 1 1 0 0 0 0
 1 0 1 1 1 1 0 1 0 1 1 0 0 1 0 1 0 0 1 1 1 1 1 1 1 1 1 1 0 1 1 0 0 0 0 0 1
 1 1 1 1 0 1 1 0 1 0 0 0 1 0 1 1 0 0 1 0 1 1 0 1 0 0 0 0 0 1 1 0 0 1 0 1 1
 0 0 0 1 0 1 0 0 0 1 0 0 1 1 0 1 0 1 1 1 1 0 0 1 0 1 0 0 1 0 1 1 0 1 1 0 1
 1 1 1 0 1 1 0 0 0 1 1 1 1 0 0 1 0 0 1 0 0 0 0 0 1 1 1 1 1 0 1 0 1 0 0 0 1
 1 1 1 0 0 0 1 0 1 1 0 1 1 1 1 0 1 1 0 1 1 0 0 0 1 0 0 0 0 0 1 0 0 0 1 1 1
 0 1

In [2]:
!pip install imodels

Collecting imodels
  Downloading imodels-1.4.1-py3-none-any.whl (231 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.2/231.2 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: imodels
Successfully installed imodels-1.4.1


## Benchmarking Data

In [3]:
!pip install pmlb

Collecting pmlb
  Downloading pmlb-1.0.1.post3-py3-none-any.whl (19 kB)
Installing collected packages: pmlb
Successfully installed pmlb-1.0.1.post3


In [None]:
from imodels.util.data_util import get_clean_dataset
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

def evaluate_datasets(datasets, random_state=42):
    """
    Benchmark the custom random forest on several datasets with both weighting schemes.

    Each dataset is split 80/20 into train and test parts, one forest is
    fitted on the training part, and the test part is scored once per scheme.
    The scores are computed from the class labels returned by ``predict``,
    not from class probabilities.

    Parameters:
    - datasets (list): A list of dataset names understood by get_clean_dataset.
    - random_state (int): Seed used for both the split and the forest.

    Returns:
    - roc_auc_scores_default (list): (dataset_name, ROC AUC) pairs for "uniform" weighting.
    - roc_auc_scores_expOOB (list): (dataset_name, ROC AUC) pairs for "expOOB" weighting.
    """
    uniform_scores, exp_oob_scores = [], []

    for dataset_name in datasets:
        X, y, _ = get_clean_dataset(dataset_name)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.20, random_state=random_state
        )

        forest = CustomRandomForestClassifier(oob_score=True, random_state=random_state)
        forest.fit(X_train, y_train)

        # Same fitted forest, scored once per weighting scheme.
        for scheme, bucket in (("uniform", uniform_scores), ("expOOB", exp_oob_scores)):
            predicted_labels = forest.predict(X_test, weights=scheme)
            bucket.append((dataset_name, roc_auc_score(y_test, predicted_labels)))

    return uniform_scores, exp_oob_scores

# Datasets to evaluate (names are passed to get_clean_dataset inside evaluate_datasets)
dataset_names = ["diabetes", "breast_cancer", "heart", "haberman", "fico", "enhancer", "credit_g", "juvenile_clean"]

# Evaluate datasets with the default random_state of evaluate_datasets
roc_auc_scores_default, roc_auc_scores_expOOB = evaluate_datasets(dataset_names)

# Print ROC AUC scores for each dataset: uniform-weighting results first ...
for dataset_name, score in roc_auc_scores_default:
    print(f'Default ROC AUC for {dataset_name}: {score}')

# ... then a separator line, followed by the expOOB-weighting results.
print("="*100)

for dataset_name, score in roc_auc_scores_expOOB:
    print(f'expOOB ROC AUC for {dataset_name}: {score}')

  and should_run_async(code)


fetching diabetes from pmlb
fetching heart from imodels
fetching fico from imodels
fetching credit_g from imodels
Default ROC AUC for diabetes: 0.7255555555555555
Default ROC AUC for breast_cancer: 0.7142857142857144
Default ROC AUC for heart: 0.8203463203463204
Default ROC AUC for haberman: 0.5656565656565656
Default ROC AUC for fico: 0.6987820775720646
Default ROC AUC for enhancer: 0.6695426457107301
Default ROC AUC for credit_g: 0.7053732419761991
Default ROC AUC for juvenile_clean: 0.729199372056515
expOOB ROC AUC for diabetes: 0.7298148148148148
expOOB ROC AUC for breast_cancer: 0.7142857142857144
expOOB ROC AUC for heart: 0.8441558441558441
expOOB ROC AUC for haberman: 0.5656565656565656
expOOB ROC AUC for fico: 0.6990878398172019
expOOB ROC AUC for enhancer: 0.6728983504087167
expOOB ROC AUC for credit_g: 0.6714749368914533
expOOB ROC AUC for juvenile_clean: 0.7401883830455258


# With probabilities (the professor's desired code)

It looks pretty good, just one thing has to be changed.
I had mentioned this in my message on slack last Monday:
"you should not do the argmax computation in the end, instead it should return probabilities, so we can compute AUC-ROC values"
To get proper AUC scores one should pass the probabilities, not the thresholded 0/1 labels

In [7]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.utils import check_random_state

# Function to generate indices for random samples from a dataset
def generate_sample_indices(random_state, n_samples):
    """
    Draw the bootstrap (in-bag) row indices for one tree.

    Parameters:
    - random_state: Seed or generator, as accepted by check_random_state.
    - n_samples: The total number of samples in the dataset.

    Returns:
    - An array of n_samples indices drawn from [0, n_samples).
    """
    rng = check_random_state(random_state)
    return rng.randint(low=0, high=n_samples, size=n_samples)

# Function to generate indices for samples that are not selected (out-of-bag samples)
def generate_unsampled_indices(random_state, n_samples):
    """
    Return the out-of-bag row indices, i.e. the rows the bootstrap draw never picked.

    Parameters:
    - random_state: Seed or generator, passed on to generate_sample_indices.
    - n_samples: The total number of samples in the dataset.

    Returns:
    - An array with the indices in [0, n_samples) that were not drawn.
    """
    drawn = generate_sample_indices(random_state, n_samples)
    # How many times each row index 0..n_samples-1 was drawn.
    times_drawn = np.bincount(drawn, minlength=n_samples)
    return np.arange(n_samples)[times_drawn == 0]

# Custom RandomForestClassifier class
class CustomRandomForestClassifier(RandomForestClassifier):
    """
    RandomForestClassifier with per-tree in-bag / out-of-bag (OOB) bookkeeping
    and a selectable weighting scheme for predict and predict_proba.

    After the regular scikit-learn fit, the bootstrap draw of every tree is
    regenerated from the tree's random_state, each tree gets the raw weight
    exp(-L_oob) (L_oob = mean squared error on its own OOB rows), and the
    weights are normalised to sum to 1.

    NOTE(review): the regenerated draw always has n_samples entries, so it
    presumably only matches the trees' real training rows when the forest uses
    bootstrap=True and max_samples=None -- confirm before using other settings.

    Attributes:
    - in_bag_indices_ (list): One array of in-bag row indices per tree.
    - oob_indices_ (list): One array of out-of-bag row indices per tree.
    - tree_weights_ (list): One normalised weight per tree.
    """

    def fit(self, X, y, sample_weight=None):
        """
        Fit the custom random forest model to the training data.

        Parameters:
        - X: Input features (numpy array, pandas DataFrame or sparse matrix).
        - y: Target labels (numpy array or pandas Series).
        - sample_weight: Optional per-sample weights, forwarded to the parent fit.

        Returns:
        - self: The fitted model instance.
        """
        super().fit(X, y, sample_weight=sample_weight)

        # Use plain positional row access: X[idx] / y[idx] on a DataFrame or
        # Series would select by column / index label instead of row position.
        X_rows = X.tocsr() if hasattr(X, "tocsr") else np.asarray(X)
        y_rows = np.asarray(y)
        if self.n_outputs_ == 1:
            # The fitted trees predict positions in self.classes_ rather than
            # the original labels, so the loss is computed in that encoded space.
            y_rows = np.searchsorted(self.classes_, y_rows.ravel())
        n_samples = X_rows.shape[0]

        self.in_bag_indices_ = []
        self.oob_indices_ = []
        raw_weights = []

        for estimator in self.estimators_:
            seed = estimator.random_state
            in_bag_indices = generate_sample_indices(seed, n_samples)
            oob_indices = generate_unsampled_indices(seed, n_samples)

            self.in_bag_indices_.append(in_bag_indices)
            self.oob_indices_.append(oob_indices)

            if len(oob_indices) > 0:
                oob_predictions = estimator.predict(X_rows[oob_indices])
                oob_loss = mean_squared_error(y_rows[oob_indices], oob_predictions)
                raw_weights.append(np.exp(-oob_loss))
            else:
                # No OOB rows: the tree cannot be scored and gets no vote.
                raw_weights.append(0.0)

        # Normalize tree weights so the weighted probabilities sum to 1 per row.
        total_weight = np.sum(raw_weights)
        if total_weight > 0:
            self.tree_weights_ = [weight / total_weight for weight in raw_weights]
        else:
            # No tree could be scored: fall back to equal weights instead of
            # leaving all-zero weights that would yield all-zero probabilities.
            n_trees = len(raw_weights)
            self.tree_weights_ = [1.0 / n_trees] * n_trees

        return self

    def predict(self, X, weights=None):
        """
        Make predictions using the fitted custom random forest model.

        Parameters:
        - X: Input features for making predictions (numpy array or pandas DataFrame).
        - weights (optional): The weighting scheme to use for aggregating predictions. Supported values:
          "expOOB" (weights based on the exponential of the negative out-of-bag error) and "uniform"
          (equal weighting). Defaults to "uniform" if not specified or if an unknown value is passed.

        Returns:
        - final_preds: An array of predicted class labels.

        Raises:
        - ValueError: If the forest has not been fitted.
        """
        # The label is the class with the highest aggregated probability.
        weighted_preds = self.predict_proba(X, weights=weights)
        final_preds = np.argmax(weighted_preds, axis=1)
        return self.classes_[final_preds]

    def predict_proba(self, X, weights=None):
        """
        Return class probabilities aggregated over the trees.

        Parameters:
        - X: Input features for making predictions (numpy array or pandas DataFrame).
        - weights (optional): "expOOB" to combine the trees with tree_weights_, or "uniform" for a
          plain average. Defaults to "uniform" if not specified or if an unknown value is passed.

        Returns:
        - weighted_preds: An array of shape (n_rows, n_classes); columns follow the order of classes_.

        Raises:
        - ValueError: If the forest has not been fitted.
        """
        if not hasattr(self, "estimators_"):
            raise ValueError("The forest is not fitted yet!")

        # Convert once so the individual trees do not each re-validate a
        # DataFrame / list input.
        X_rows = X if hasattr(X, "tocsr") else np.asarray(X)
        weighted_preds = np.zeros((X_rows.shape[0], len(self.classes_)))

        if weights == "expOOB":
            for tree, weight in zip(self.estimators_, self.tree_weights_):
                weighted_preds += weight * tree.predict_proba(X_rows)
        else:
            # None, "uniform" and any unrecognised value all mean equal weighting.
            n_trees = len(self.estimators_)
            for tree in self.estimators_:
                weighted_preds += tree.predict_proba(X_rows) / n_trees

        return weighted_preds

  and should_run_async(code)


In [8]:
from imodels.util.data_util import get_clean_dataset
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

def evaluate_datasets(datasets, random_state=42):
    """
    Benchmark the custom random forest on several datasets with both weighting schemes.

    Each dataset is split 80/20 into train and test parts, one forest is
    fitted on the training part, and ROC AUC is computed on the test part from
    the second column of ``predict_proba`` (taken to be the positive class).

    Parameters:
    - datasets (list): A list of dataset names understood by get_clean_dataset.
    - random_state (int): Seed used for both the split and the forest.

    Returns:
    - roc_auc_scores_default (list): (dataset_name, ROC AUC) pairs for "uniform" weighting.
    - roc_auc_scores_expOOB (list): (dataset_name, ROC AUC) pairs for "expOOB" weighting.
    """
    uniform_scores, exp_oob_scores = [], []

    for dataset_name in datasets:
        X, y, _ = get_clean_dataset(dataset_name)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.20, random_state=random_state
        )

        forest = CustomRandomForestClassifier(oob_score=True, random_state=random_state)
        forest.fit(X_train, y_train)

        # Same fitted forest, scored once per weighting scheme.
        for scheme, bucket in (("uniform", uniform_scores), ("expOOB", exp_oob_scores)):
            positive_proba = forest.predict_proba(X_test, weights=scheme)[:, 1]
            bucket.append((dataset_name, roc_auc_score(y_test, positive_proba)))

    return uniform_scores, exp_oob_scores

# Datasets to evaluate (names are passed to get_clean_dataset inside evaluate_datasets)
dataset_names = ["diabetes", "breast_cancer", "heart", "haberman", "fico", "enhancer", "credit_g", "juvenile_clean"]

# Evaluate datasets with the default random_state of evaluate_datasets
roc_auc_scores_default, roc_auc_scores_expOOB = evaluate_datasets(dataset_names)

# Print ROC AUC scores for each dataset: uniform-weighting results first ...
for dataset_name, score in roc_auc_scores_default:
    print(f'Default ROC AUC for {dataset_name}: {score}')

# ... then a separator line, followed by the expOOB-weighting results.
print("="*100)

for dataset_name, score in roc_auc_scores_expOOB:
    print(f'expOOB ROC AUC for {dataset_name}: {score}')

  and should_run_async(code)


fetching diabetes from pmlb
fetching heart from imodels
fetching fico from imodels
fetching credit_g from imodels
Default ROC AUC for diabetes: 0.830925925925926
Default ROC AUC for breast_cancer: 0.8282312925170068
Default ROC AUC for heart: 0.9155844155844156
Default ROC AUC for haberman: 0.6243686868686869
Default ROC AUC for fico: 0.76640817245723
Default ROC AUC for enhancer: 0.82537748709253
Default ROC AUC for credit_g: 0.8158432503906718
Default ROC AUC for juvenile_clean: 0.896691220867045
expOOB ROC AUC for diabetes: 0.8303703703703703
expOOB ROC AUC for breast_cancer: 0.8282312925170068
expOOB ROC AUC for heart: 0.9163059163059162
expOOB ROC AUC for haberman: 0.6237373737373737
expOOB ROC AUC for fico: 0.7662287431157722
expOOB ROC AUC for enhancer: 0.8250545034839483
expOOB ROC AUC for credit_g: 0.816925111191249
expOOB ROC AUC for juvenile_clean: 0.8975020270153709
