<a href="https://colab.research.google.com/github/bdi2357/StatisticalRebalancing/blob/main/statistical_rebalance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE

def statistical_rebalance(data, target_name, classifier, threshold=0.5, margin=0.1):
    """Retrain *classifier* after oversampling rows near its decision boundary.

    The function holds out 30% of ``data`` (``random_state=42``), fits
    ``classifier`` on the remaining 70%, and selects the training rows whose
    predicted probability (column 1 of ``predict_proba``) lies strictly
    between ``threshold - margin`` and ``threshold + margin``. SMOTE is run
    on that boundary subset only, the newly generated rows are appended to
    the training split, and the classifier is refitted on the result.

    Args:
        data: DataFrame holding the feature columns and the target column.
        target_name: Name of the target column in ``data``.
        classifier: Estimator exposing ``fit`` and ``predict_proba``. It is
            fitted in place (twice).
        threshold: Centre of the probability band treated as "near the
            decision boundary".
        margin: Half-width of that probability band.

    Returns:
        A tuple ``(classifier, X_train_balanced, y_train_balanced)`` where the
        last two items are NumPy arrays containing the training split followed
        by the synthetic rows (if any were generated).

    Notes:
        * Column 1 of ``predict_proba`` is assumed to correspond to the class
          of interest (binary target) -- confirm for other label layouts.
        * If the boundary subset is empty, contains a single class, or has
          fewer than two minority rows, SMOTE cannot interpolate; a warning
          is issued and the training split is returned without synthetic rows.
    """
    import warnings

    X = data.drop(target_name, axis=1)
    y = data[target_name]

    # Only the training part is used here; the held-out part is discarded.
    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    # Initial fit, used solely to locate rows close to the decision boundary.
    classifier.fit(X_train, y_train)
    probs = classifier.predict_proba(X_train)[:, 1]
    boundary_mask = (probs > (threshold - margin)) & (probs < (threshold + margin))

    X_boundary = X_train[boundary_mask]
    y_boundary = y_train[boundary_mask]
    n_boundary = len(y_boundary)

    X_parts = [X_train]
    y_parts = [y_train]

    class_counts = y_boundary.value_counts()
    minority_count = int(class_counts.min()) if class_counts.size else 0

    if class_counts.size >= 2 and minority_count >= 2:
        # SMOTE needs k_neighbors + 1 minority rows; shrink k below the
        # default of 5 when the boundary region is sparse instead of failing.
        k_neighbors = min(5, minority_count - 1)
        smote = SMOTE(
            sampling_strategy='minority',
            k_neighbors=k_neighbors,
            random_state=42,
        )
        X_resampled, y_resampled = smote.fit_resample(X_boundary, y_boundary)

        # fit_resample returns the input rows first, followed by the generated
        # rows. Keep only the generated tail so the boundary rows already
        # present in X_train are not added a second time.
        X_synthetic = np.asarray(X_resampled)[n_boundary:]
        y_synthetic = np.asarray(y_resampled)[n_boundary:]
        if len(y_synthetic):
            X_parts.append(X_synthetic)
            y_parts.append(y_synthetic)
    else:
        warnings.warn(
            "statistical_rebalance: not enough samples of two classes near "
            "the decision boundary to apply SMOTE; training data left "
            "unchanged.",
            stacklevel=2,
        )

    # Combine the original training rows with the synthetic rows.
    X_train_balanced = np.vstack(X_parts)
    y_train_balanced = np.hstack(y_parts)

    # Retrain the model on the augmented training set.
    classifier.fit(X_train_balanced, y_train_balanced)

    return classifier, X_train_balanced, y_train_balanced


