<a href="https://colab.research.google.com/github/adipai/statistical-data-pruning-analysis/blob/main/automation_breast_cancer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
!pip install pmlb



In [28]:
!pip install sdv



In [29]:
!pip install DataSynthesizer



In [54]:
# All imports here
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from pmlb import fetch_data
import matplotlib.pyplot as plt
import seaborn as sns
import random
import time
import math

from sdv.datasets.local import load_csvs
from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer
from DataSynthesizer.DataDescriber import DataDescriber
from DataSynthesizer.DataGenerator import DataGenerator
from DataSynthesizer.lib.utils import display_bayesian_network
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SVMSMOTE
from imblearn.over_sampling import RandomOverSampler
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix

## Data preprocessing

In [31]:
def preprocess_data_train(X_train):
    """Impute and standardize the training features.

    Fits a mean ``SimpleImputer`` and then a ``StandardScaler`` on
    ``X_train`` and applies both, in that order.

    Args:
        X_train: 2-D numeric feature matrix accepted by the scikit-learn
            transformers (NumPy array or DataFrame).

    Returns:
        tuple: ``(X_train, scaler, imputer)`` -- the transformed training
        matrix followed by the fitted scaler and the fitted imputer, so the
        same transformation can be replayed on held-out data.
    """
    # Handle missing data: NaNs are replaced by the per-column training mean.
    imputer = SimpleImputer(strategy='mean')
    X_train = imputer.fit_transform(X_train)

    # Normalize numeric columns using statistics of the (imputed) training set.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)

    return X_train, scaler, imputer

def preprocess_data_test(X_test, scaler, imputer):
    """Apply already-fitted preprocessing to held-out features.

    Nothing is re-fitted here: the imputer and scaler passed in are only
    used through ``transform``, so no test-set statistics leak into the
    preprocessing.

    Args:
        X_test: 2-D numeric feature matrix with the same columns the
            transformers were fitted on.
        scaler: Fitted scaler exposing ``transform``.
        imputer: Fitted imputer exposing ``transform``.

    Returns:
        The imputed and then scaled version of ``X_test``.
    """
    # Handle missing data with the fill values learned on the training set.
    X_test = imputer.transform(X_test)

    # Normalize numeric columns with the training-set statistics.
    X_test = scaler.transform(X_test)

    return X_test

## Experiments

### Dataset 1: Breast cancer

In [32]:
# Fetch the 'breast_cancer' dataset through pmlb and show per-column summary statistics.
breast_cancer = fetch_data('breast_cancer')
breast_cancer.describe()

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,target
count,286.0,286.0,286.0,286.0,286.0,286.0,286.0,286.0,286.0,286.0
mean,2.664336,1.073427,4.062937,1.073427,1.167832,2.048951,0.468531,2.772727,0.237762,0.297203
std,1.011818,0.98668,2.151187,1.935321,0.443052,0.738217,0.499883,1.099006,0.426459,0.457828
min,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,3.0,0.0,1.0,2.0,0.0,2.0,0.0,0.0
50%,3.0,2.0,4.0,0.0,1.0,2.0,0.0,3.0,0.0,0.0
75%,3.0,2.0,5.0,1.0,1.0,3.0,1.0,3.0,0.0,1.0
max,5.0,2.0,10.0,6.0,2.0,3.0,1.0,5.0,1.0,1.0


## Training and testing using ML models

In [33]:
# Generic function to test synthetic data using LR, SVM, DT

def _save_roc_plot(name, fpr, tpr, roc_auc):
    """Write the ROC curve of one classifier to '<name>_auc_roc_curve.png'."""
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.2f})')
    ax.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal for reference
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'{name} - AUC-ROC Curve')
    ax.legend(loc='lower right')
    fig.savefig(f'{name}_auc_roc_curve.png', dpi=300)
    plt.close(fig)  # release the figure; this function is called inside long loops


def _save_confusion_matrix_plot(name, cm):
    """Write the confusion-matrix heatmap of one classifier to '<name>_confusion_matrix.png'."""
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    ax.set_title(f'{name} - Confusion Matrix')
    fig.savefig(f'{name}_confusion_matrix.png', dpi=300)
    plt.close(fig)


def evaluate_models(X_train, X_test, y_train, y_test, save_plots=True):
    """Train LR, SVM and a decision tree and score each on the test split.

    Args:
        X_train, y_train: Training features and binary labels.
        X_test, y_test: Held-out features and binary labels.
        save_plots: When True (default, the original behaviour) a ROC curve
            and a confusion-matrix PNG are written to the working directory
            for every classifier. The file names depend only on the
            classifier name, so each call overwrites the previous call's
            images. Pass False to skip the plotting in batch experiments.

    Returns:
        dict: classifier name -> dict with the keys "Accuracy", "Precision",
        "Recall", "F1 Score", "ROC AUC" and "Confusion Matrix".
    """
    # Initialize classifiers (library defaults, unseeded)
    classifiers = {
        "Logistic Regression": LogisticRegression(),
        "SVM": SVC(),
        "Decision Tree": DecisionTreeClassifier()
    }

    # Results dictionary to store evaluation metrics
    results = {}

    for name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # Scores for the ROC curve: class-1 probability when the model exposes
        # predict_proba, otherwise the signed decision-function margin.
        if hasattr(clf, "predict_proba"):
            y_prob = clf.predict_proba(X_test)[:, 1]
        else:
            y_prob = clf.decision_function(X_test)
        fpr, tpr, _ = roc_curve(y_test, y_prob)
        roc_auc = auc(fpr, tpr)

        cm = confusion_matrix(y_test, y_pred)

        results[name] = {
            "Accuracy": accuracy_score(y_test, y_pred),
            "Precision": precision_score(y_test, y_pred),
            "Recall": recall_score(y_test, y_pred),
            "F1 Score": f1_score(y_test, y_pred),
            "ROC AUC": roc_auc,
            "Confusion Matrix": cm
        }

        if save_plots:
            _save_roc_plot(name, fpr, tpr, roc_auc)
            _save_confusion_matrix_plot(name, cm)

    return results


In [34]:

# Separate the label column from the feature columns.
y = breast_cancer['target']
X = breast_cancer.drop('target', axis=1)

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)


In [35]:
# Baseline: fit the three classifiers on the split above (no scaling, pruning or over-sampling applied).
results = evaluate_models(X_train, X_test, y_train, y_test)

In [36]:
# Dump the nested metrics dict (classifier name -> metric name -> value) as plain text.
print(results)

{'Logistic Regression': {'Accuracy': 0.7931034482758621, 'Precision': 0.8, 'Recall': 0.4444444444444444, 'F1 Score': 0.5714285714285714, 'ROC AUC': 0.726388888888889, 'Confusion Matrix': array([[38,  2],
       [10,  8]])}, 'SVM': {'Accuracy': 0.7241379310344828, 'Precision': 0.75, 'Recall': 0.16666666666666666, 'F1 Score': 0.27272727272727276, 'ROC AUC': 0.6847222222222222, 'Confusion Matrix': array([[39,  1],
       [15,  3]])}, 'Decision Tree': {'Accuracy': 0.6724137931034483, 'Precision': 0.4782608695652174, 'Recall': 0.6111111111111112, 'F1 Score': 0.5365853658536586, 'ROC AUC': 0.6555555555555556, 'Confusion Matrix': array([[28, 12],
       [ 7, 11]])}}


# RRP Over-sampling

In [52]:
def cluster(candidates, enough, res):
    """Recursively bisect ``candidates`` until every group is smaller than ``enough``.

    Args:
        candidates: Sequence of rows (each row's last element is the label).
        enough: Groups with fewer rows than this are accepted as leaves.
        res: List the leaf groups are appended to (mutated in place).

    Returns:
        ``res``, with one entry appended per leaf group.
    """
    # A group of 0 or 1 rows cannot be bisected any further. Without this
    # floor, ``enough <= 1`` keeps splitting a single row into an empty half
    # and itself, recursing until the interpreter's recursion limit is hit.
    if len(candidates) < max(enough, 2):
        res.append(candidates)
        return res

    # The pivots returned by split() are not needed here, only the two halves.
    _, _, east_items, west_items = split(candidates)
    res = cluster(east_items, enough, res)
    res = cluster(west_items, enough, res)

    return res


def split(candidates):
    """Bisect ``candidates`` along the axis spanned by two far-apart rows.

    A random row is picked, the row farthest from it becomes the "east"
    pivot and the row farthest from that one becomes the "west" pivot.
    Every row is then ranked by its cosine-rule projection onto the
    west-to-east axis and the ranking is cut in the middle.

    Returns:
        tuple: ``(east_pivot, west_pivot, east_items, west_items)`` where
        ``east_items`` is the half with the smaller projections and
        ``west_items`` is the rest. If both pivots coincide (distance 0),
        the input is simply cut in half in its current order.
    """
    seed_row = random.choice(candidates)
    east_pivot = find_farest(seed_row, candidates)
    west_pivot = find_farest(east_pivot, candidates)
    span = cal_distance(east_pivot, west_pivot)
    half = len(candidates) // 2

    # Degenerate case: no axis to project onto.
    if span == 0:
        return east_pivot, west_pivot, candidates[:half], candidates[half:]

    scored = []
    for row in candidates:
        to_west = cal_distance(row, west_pivot)
        to_east = cal_distance(row, east_pivot)
        projection = (to_west ** 2 + span ** 2 - to_east ** 2) / (2 * span)
        scored.append((projection, row))

    # Stable sort on the projection only (rows themselves are never compared).
    ranked = [row for _, row in sorted(scored, key=lambda pair: pair[0])]

    return east_pivot, west_pivot, ranked[:half], ranked[half:]


def find_farest(pivot, candidates):
    """Return the candidate with the largest distance to ``pivot``.

    The first candidate reaching the maximum wins. If no candidate has a
    distance strictly above zero, ``pivot`` itself is returned.
    """
    distances = [cal_distance(pivot, point) for point in candidates]

    best_pos, best_dist = None, 0
    for pos, dist in enumerate(distances):
        if dist > best_dist:
            best_pos, best_dist = pos, dist

    if best_pos is None:
        return pivot
    return candidates[best_pos]


def cal_distance(p1, p2):
    """Euclidean distance between two rows, ignoring each row's last element."""
    squared_total = 0
    for a, b in zip(p1[:-1], p2[:-1]):
        squared_total += (a - b) ** 2
    return math.sqrt(squared_total)


def process_mixed_cluster(cluster):
    """
    Generate synthetic positive rows from a cluster containing both classes.

    in DE operation, use current-to-best to mutate the candidates
    v_i = x_i + F * (x_b - x_i) + F_min * (x_r1 - x_r2)

    Each row's last element is read as the class label (1 = positive,
    0 = negative); all preceding elements are treated as features.

    Returns a list of NumPy arrays, each holding the mutated features
    followed by the label 1. The list is empty when the cluster holds no
    positive row.
    """
    # F: pull towards the positive "best" row; Fmin: weight of the extra
    # difference vector; CR: crossover rate (1.0, so the uniform draw from
    # [0, 1) is always below it and every feature gets mutated).
    DE_params = {"F": 0.8, "Fmin": 0.1, "CR": 1.0}
    pos_point = [(idx, item) for idx, item in enumerate(cluster) if item[-1] == 1]
    neg_point = [(idx, item) for idx, item in enumerate(cluster) if item[-1] == 0]

    candidate_l = []
    if len(pos_point) == 1:
        # only 1 pos point in cluster, then mutate all neg points toward the pos point
        xb = pos_point[0][1]
        # R is the feature index that is mutated regardless of the crossover draw
        R = random.choice(range(len(xb)-1))

        for _, xi in neg_point:
            new_candidate = []
            for i in range(len(xi)-1):
                ri = np.random.uniform(low=0.0, high=1.0, size=1)[0]

                if ri < DE_params["CR"] or i == R:
                    new_candidate.append(xi[i] + DE_params["F"] * (xb[i] - xi[i]))
                else:
                    new_candidate.append(xi[i])

            new_candidate.append(1)
            candidate_l.append(np.array(new_candidate))
    else:
        # zero, or two or more, pos points in cluster (with zero the loop below
        # does nothing): every pos point acts as the "best" point xb, every
        # other row as the current point xi, and two further rows are drawn
        # at random as support points
        for idx1, xb in pos_point:
            R = random.choice(range(len(xb)-1))
            for idx2, xi in enumerate(cluster):
                if not idx1 == idx2:
                    # support points must differ (by position) from both xb and xi
                    available_points = []
                    for idx3, p in enumerate(cluster):
                        if not idx3 == idx2 and not idx3 == idx1:
                            available_points.append(p)
                    # raises ValueError if fewer than 2 rows are left to draw from
                    [xr1, xr2] = random.sample(available_points, 2)

                    new_candidate = []
                    # if xr1 and xr2 are both negative class, drop the support term and only
                    # move xi towards xb
                    if xr1[-1] == 0 and xr2[-1] == 0:
                        for i in range(len(xi)-1):
                            ri = np.random.uniform(low=0.0, high=1.0, size=1)[0]

                            if ri < DE_params["CR"] or i == R:
                                new_candidate.append(xi[i] + DE_params["F"] * (xb[i] - xi[i]))
                            else:
                                new_candidate.append(xi[i])
                    else:
                        for i in range(len(xi)-1):
                            ri = np.random.uniform(low=0.0, high=1.0, size=1)[0]

                            if ri < DE_params["CR"] or i == R:
                                # orient the support difference so that it is taken from a
                                # positive support point (xr1 if positive, otherwise xr2)
                                if xr1[-1] == 1:
                                    new_candidate.append(xi[i] + DE_params["F"] * (xb[i] - xi[i]) + DE_params["Fmin"] * (xr1[i] - xr2[i]))
                                else:
                                    new_candidate.append(xi[i] + DE_params["F"] * (xb[i] - xi[i]) + DE_params["Fmin"] * (xr2[i] - xr1[i]))
                            else:
                                new_candidate.append(xi[i])

                    # every generated row is labelled positive
                    new_candidate.append(1)
                    candidate_l.append(np.array(new_candidate))

    return candidate_l


def process_positive_cluster(cluster):
    """
    Generate synthetic positive rows from the positive rows of a cluster.

    in DE operation, use best to mutate the candidates
    v_i = x_b + F * (x_r1 - x_r2)

    One candidate is produced for every unordered triple of positive rows;
    the roles (base, first donor, second donor) inside a triple are assigned
    at random. Each returned NumPy array holds the mutated features followed
    by the label 1. Fewer than three positive rows yield an empty list.
    """
    F, CR = 0.8, 1.0
    positives = [row for row in cluster if row[-1] == 1]
    n_pos = len(positives)

    # All index triples a < b < c, generated lazily in lexicographic order.
    triples = (
        (a, b, c)
        for a in range(n_pos - 2)
        for b in range(a + 1, n_pos - 1)
        for c in range(b + 1, n_pos)
    )

    generated = []
    for triple in triples:
        roles = random.sample(list(triple), 3)
        base, donor_a, donor_b = (positives[k] for k in roles)

        n_features = len(base) - 1
        # Feature index that is mutated regardless of the crossover draw.
        forced = random.choice(range(n_features))

        mutated = []
        for j in range(n_features):
            draw = np.random.uniform(low=0.0, high=1.0, size=1)[0]
            if draw >= CR and j != forced:
                mutated.append(base[j])
            else:
                mutated.append(base[j] + F * (donor_a[j] - donor_b[j]))

        mutated.append(1)
        generated.append(np.array(mutated))

    return generated


def process_mixed_cluster_extra(cluster):
    """
    Generate additional synthetic positive rows from a cluster.

    in DE operation, use current-to-best-extra to mutate the candidates
    v_i = x_i + F * (x_b - x_i) + F_xc * (x_r1 - x_r2) + F_xc * (x_r3 - x_r4)

    Every positive row acts as the "best" row x_b, every row that differs
    from it as the current row x_i, and for each such pair 20 candidates are
    produced, each with four freshly sampled support rows.

    Pairs for which fewer than four support rows remain (rows equal to x_b
    or x_i are excluded) are skipped; sampling four rows from a smaller pool
    is impossible and used to raise ValueError.

    Returns a list of plain Python lists, each holding the mutated features
    followed by the label 1.
    """
    DE_params = {"F": 0.8, "CR": 1.0, "F_xc": 0.1}
    n_support = 4        # x_r1 .. x_r4
    rounds_per_pair = 20
    pos_point = [item for item in cluster if item[-1] == 1]

    candidate_l = []
    for xb in pos_point:
        # R is the feature index that is mutated regardless of the crossover draw
        R = random.choice(range(len(xb)-1))

        for xi in cluster:
            if np.array_equal(xb, xi):
                continue

            # Support rows are compared by value, so duplicates of xb / xi are excluded too.
            available_points = [
                p for p in cluster
                if not np.array_equal(p, xi) and not np.array_equal(p, xb)
            ]
            if len(available_points) < n_support:
                continue

            for _ in range(rounds_per_pair):
                [xr1, xr2, xr3, xr4] = random.sample(available_points, n_support)

                new_candidate = []
                for i in range(len(xi)-1):
                    ri = np.random.uniform(low=0.0, high=1.0, size=1)[0]

                    if ri < DE_params["CR"] or i == R:
                        new_candidate.append(xi[i] + DE_params["F"] * (xb[i] - xi[i]) + DE_params["F_xc"] * (xr1[i] - xr2[i]) + DE_params["F_xc"] * (xr3[i] - xr4[i]))
                    else:
                        new_candidate.append(xi[i])

                new_candidate.append(1)
                candidate_l.append(new_candidate)

    return candidate_l


def RandomProjectionOversampling(X_train, y_train, threshold):
    """Generate synthetic class-1 rows via recursive projection clustering + DE mutation.

    The labelled training table is recursively bisected into clusters of
    fewer than ``threshold`` rows. Clusters in which class 1 holds more than
    half of the rows feed ``process_positive_cluster``; all others feed
    ``process_mixed_cluster``. If that is not enough to close the gap
    between the class-0 and class-1 counts, ``process_mixed_cluster_extra``
    supplies additional rows.

    Args:
        X_train: Feature DataFrame. It is not modified.
        y_train: Label Series with values 0 / 1; its ``name`` becomes the
            label column name.
        threshold: Maximum cluster size (exclusive) passed to ``cluster``.

    Returns:
        tuple: ``(rt, new_data_df, train_df)`` -- seconds spent clustering
        and mutating, the synthetic rows (same columns as ``train_df``), and
        the original features+label table with a fresh 0..n-1 index.
        Fewer rows than the class gap are returned when the generators
        cannot supply enough candidates.
    """
    # Build the labelled table as a new frame so the caller's X_train keeps
    # its columns and index.
    train_df = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)
    tar = y_train.name
    col_names = train_df.columns

    class_counts = train_df[tar].value_counts()
    # Number of class-1 rows missing for balance; never negative.
    n_data_to_generate = max(int(class_counts.get(0, 0) - class_counts.get(1, 0)), 0)

    data = train_df.to_numpy()
    start_time = time.time()
    res = cluster(data, threshold, [])

    new_data_negative_cluster = []
    new_data_positive_cluster = []
    for c in res:
        if sum([item[-1] for item in c]) > len(c)//2:
            new_data_positive_cluster += process_positive_cluster(c)
        else:
            new_data_negative_cluster += process_mixed_cluster(c)

    rt = time.time() - start_time

    n_from_mixed = n_data_to_generate - len(new_data_positive_cluster)
    if n_from_mixed < 0:
        # Positive clusters alone over-deliver: keep a random subset rather
        # than asking random.sample for a negative number of rows.
        new_data = random.sample(new_data_positive_cluster, n_data_to_generate)
    elif len(new_data_negative_cluster) >= n_from_mixed:
        new_data = new_data_positive_cluster + random.sample(new_data_negative_cluster, n_from_mixed)
    else:
        extra_data = []
        for c in res:
            extra_data += process_mixed_cluster_extra(c)

        rest_data_to_generate = n_from_mixed - len(new_data_negative_cluster)
        # Cap at what is available so a shortfall yields fewer rows, not a ValueError.
        rest_data_to_generate = min(rest_data_to_generate, len(extra_data))
        new_data = new_data_negative_cluster + new_data_positive_cluster + random.sample(extra_data, rest_data_to_generate)

    if new_data:
        new_data_df = pd.DataFrame(np.array(new_data), columns=col_names)
    else:
        # np.array([]) is 1-D and cannot be paired with the column list.
        new_data_df = pd.DataFrame(columns=col_names)

    return rt, new_data_df, train_df

# SDV - Oversampling

In [37]:
def do_sdv(X_train, y_train):
  """Fit a Gaussian-copula synthesizer on the minority class and sample from it.

  Returns:
      tuple: ``(synthetic_rows, train_df)`` -- as many synthetic rows as the
      difference between the largest and smallest class counts, and the
      features+label table the synthesizer's input was taken from.
  """
  train_df = pd.concat([X_train, y_train], axis=1)

  label_counts = y_train.value_counts()
  minority_label = label_counts.idxmin()
  # Rows needed for the smallest class to match the largest one.
  n_missing = label_counts.max() - label_counts.min()

  # The label is the last column of train_df (y_train was concatenated last).
  is_minority = train_df.iloc[:, -1] == minority_label
  minority_rows = train_df[is_minority]

  table_metadata = SingleTableMetadata()
  table_metadata.detect_from_dataframe(minority_rows)

  copula = GaussianCopulaSynthesizer(table_metadata)
  copula.fit(minority_rows)

  # Reset the sampler state before drawing the synthetic rows.
  copula.reset_sampling()
  synthetic_rows = copula.sample(num_rows=n_missing)
  return synthetic_rows, train_df

# Function to add synthetic data to the main DataFrame based on percentage
def add_synthetic_data(main_df, synthetic_df, percentage, seed=42):
    """Append a random fraction of ``synthetic_df`` to ``main_df``.

    Args:
        main_df: DataFrame that comes first in the result.
        synthetic_df: Pool of synthetic rows to draw from, without replacement.
        percentage: Fraction of the pool to use (the row count is truncated
            towards zero).
        seed: Random state for the draw; NumPy's global RNG is also seeded
            with it.

    Returns:
        A new DataFrame with a fresh 0..n-1 index.
    """
    n_rows = int(len(synthetic_df) * percentage)
    np.random.seed(seed)
    drawn = synthetic_df.sample(n=n_rows, replace=False, random_state=seed)
    return pd.concat([main_df, drawn], ignore_index=True)

# Random Over-Sampling

In [38]:
def find_minority_data(X, y):
    """Split ``(X, y)`` into the rarest class and everything else.

    Returns:
        tuple: ``(X_min, y_min, X_remaining, y_remaining, min_label)``.
        When several classes share the lowest count, the smallest label wins.
    """
    labels, counts = np.unique(y, return_counts=True)
    # np.unique returns the labels sorted, so the first minimum found by
    # argmin belongs to the smallest label among ties.
    min_label = labels[np.argmin(counts)]

    minority_idx = np.nonzero(y == min_label)[0]
    others_idx = np.nonzero(y != min_label)[0]

    return X[minority_idx], y[minority_idx], X[others_idx], y[others_idx], min_label

def random_oversampling(X_train, y_train, oversampling_ratios, seed=42):
  """Randomly over-sample the minority class at several ratios.

  A ratio of 1.0 adds as many minority rows as there are more rows outside
  the minority class than inside it; smaller ratios add the corresponding
  (truncated) fraction.

  Returns:
      tuple: two lists (resampled X, resampled y) with one entry per
      distinct ratio, in first-occurrence order.
  """
  X_minority, _, X_remaining, _, min_label = find_minority_data(X_train, y_train)
  n_minority = len(X_minority)
  balance_gap = len(X_remaining) - n_minority

  extra_rows = [int(balance_gap * ratio) for ratio in oversampling_ratios]

  resampled_X, resampled_y = {}, {}
  for ratio, n_extra in zip(oversampling_ratios, extra_rows):
    # Absolute target count for the minority class; other classes are untouched.
    target_counts = {min_label: n_minority + n_extra}
    sampler = RandomOverSampler(sampling_strategy=target_counts, random_state = seed)
    X_up, y_up = sampler.fit_resample(X_train, y_train)
    resampled_X[ratio] = X_up
    resampled_y[ratio] = y_up

  return list(resampled_X.values()), list(resampled_y.values())


# SMOTE Over-Sampling

In [39]:
def find_minority_data(X, y):
    """Split ``(X, y)`` into the rarest class and everything else.

    NOTE: this re-defines the identically named function from the random
    over-sampling cell with the same behaviour.

    Returns:
        tuple: ``(X_min, y_min, X_remaining, y_remaining, min_label)``.
        When several classes share the lowest count, the smallest label wins.
    """
    labels, counts = np.unique(y, return_counts=True)
    # Labels come back sorted, so the first minimal count is the smallest tied label.
    min_label = labels[np.argmin(counts)]

    in_minority = np.nonzero(y == min_label)[0]
    X_min = X[in_minority]
    y_min = y[in_minority]

    # Other class samples
    not_in_minority = np.nonzero(y != min_label)[0]
    X_remaining = X[not_in_minority]
    y_remaining = y[not_in_minority]

    return X_min, y_min, X_remaining, y_remaining, min_label

def smote_oversampling(X_train, y_train, oversampling_ratios, seed=42):
  """Over-sample the minority class with SMOTE at several ratios.

  A ratio of 1.0 adds as many minority rows as there are more rows outside
  the minority class than inside it; smaller ratios add the corresponding
  (truncated) fraction.

  Returns:
      tuple: two lists (resampled X, resampled y) with one entry per
      distinct ratio, in first-occurrence order.
  """
  X_minority, _, X_remaining, _, min_label = find_minority_data(X_train, y_train)
  n_minority = len(X_minority)
  balance_gap = len(X_remaining) - n_minority

  extra_rows = [int(balance_gap * ratio) for ratio in oversampling_ratios]

  resampled_X, resampled_y = {}, {}
  for ratio, n_extra in zip(oversampling_ratios, extra_rows):
    # Absolute target count for the minority class; other classes are untouched.
    target_counts = {min_label: n_minority + n_extra}
    sampler = SMOTE(sampling_strategy=target_counts, random_state = seed)
    X_up, y_up = sampler.fit_resample(X_train, y_train)
    resampled_X[ratio] = X_up
    resampled_y[ratio] = y_up

  return list(resampled_X.values()), list(resampled_y.values())


# SVM-SMOTE Over-Sampling

In [40]:
def find_minority_data(X, y):
    """Split ``(X, y)`` into the rarest class and everything else.

    NOTE: this re-defines the identically named function from the earlier
    over-sampling cells with the same behaviour.

    Returns:
        tuple: ``(X_min, y_min, X_remaining, y_remaining, min_label)``.
        When several classes share the lowest count, the smallest label wins.
    """
    labels, counts = np.unique(y, return_counts=True)
    rarest_pos = np.argmin(counts)  # first minimum == smallest tied label (labels are sorted)
    min_label = labels[rarest_pos]

    # Positions of minority rows and of all other rows, in original order.
    minority_rows = np.nonzero(y == min_label)[0]
    other_rows = np.nonzero(y != min_label)[0]

    minority_part = (X[minority_rows], y[minority_rows])
    other_part = (X[other_rows], y[other_rows])

    return minority_part + other_part + (min_label,)

def svm_smote_oversampling(X_train, y_train, oversampling_ratios, seed=42):
  """Over-sample the minority class with SVM-SMOTE at several ratios.

  A ratio of 1.0 adds as many minority rows as there are more rows outside
  the minority class than inside it; smaller ratios add the corresponding
  (truncated) fraction.

  Returns:
      tuple: two lists (resampled X, resampled y) with one entry per
      distinct ratio, in first-occurrence order.
  """
  X_minority, _, X_remaining, _, min_label = find_minority_data(X_train, y_train)
  n_minority = len(X_minority)
  balance_gap = len(X_remaining) - n_minority

  extra_rows = [int(balance_gap * ratio) for ratio in oversampling_ratios]

  resampled_X, resampled_y = {}, {}
  for ratio, n_extra in zip(oversampling_ratios, extra_rows):
    # Absolute target count for the minority class; other classes are untouched.
    target_counts = {min_label: n_minority + n_extra}
    sampler = SVMSMOTE(sampling_strategy=target_counts, random_state = seed)
    X_up, y_up = sampler.fit_resample(X_train, y_train)
    resampled_X[ratio] = X_up
    resampled_y[ratio] = y_up

  return list(resampled_X.values()), list(resampled_y.values())


# Intelligent Pruning

In [41]:
def find_majority_data(X, y):
    """Split ``(X, y)`` into the most frequent class and everything else.

    Returns:
        tuple: ``(X_maj, y_maj, X_remaining, y_remaining, min_count)`` where
        ``min_count`` is the size of the smallest class. When several classes
        share the highest count, the largest label wins.
    """
    labels, counts = np.unique(y, return_counts=True)
    # Comparing (count, label) pairs breaks count ties in favour of the larger label.
    _, max_label = max(zip(counts, labels))

    majority_idx = np.nonzero(y == max_label)[0]
    # Exclude majority class samples
    others_idx = np.nonzero(y != max_label)[0]

    smallest_class_size = counts.min()
    return X[majority_idx], y[majority_idx], X[others_idx], y[others_idx], smallest_class_size

def do_clustering(X, y, labels):
  """Group the rows of ``X`` and ``y`` by their cluster label.

  Returns:
      tuple: two dicts (label -> list of X rows, label -> list of y values),
      both ordered from the largest cluster to the smallest. Clusters of
      equal size keep the order in which their label first appeared.
  """
  groups_X, groups_y = {}, {}
  for position, label in enumerate(labels):
    groups_X.setdefault(label, []).append(X[position])
    groups_y.setdefault(label, []).append(y[position])

  def largest_first(groups):
    # reverse=True keeps the relative order of equally sized clusters.
    return dict(sorted(groups.items(), key=lambda entry: len(entry[1]), reverse=True))

  return largest_first(groups_X), largest_first(groups_y)


def intelligent_prune_data(pruning_samps, pruning_ratios, clustered_X, clustered_y, per_cluster_pruning_ratio=0.7, seed=42):
  """Randomly remove samples from clusters, taking most from the first clusters.

  For every ``(pruning_samp, pruning_ratio)`` pair the clusters are visited
  in dict order. Each cluster gives up ``per_cluster_pruning_ratio`` of the
  amount still to be pruned, or half of its rows when it is too small for
  that. Whatever is still outstanding afterwards is removed one sample at a
  time, cycling over the clusters.

  Args:
      pruning_samps: Number of samples to remove, one entry per ratio.
      pruning_ratios: Keys under which each result is stored.
      clustered_X: dict label -> list of feature rows (not modified).
      clustered_y: dict label -> list of targets, aligned with ``clustered_X``.
      per_cluster_pruning_ratio: Share of the outstanding amount taken from
          each cluster in the first pass.
      seed: Seed for the ``random`` module (set globally).

  Returns:
      tuple: two defaultdicts (ratio -> surviving X rows, ratio -> surviving
      y values). If the clusters hold fewer samples than requested, pruning
      stops once every cluster is empty.
  """
  random.seed(seed)
  pruning_ratios_X_maj, pruning_ratios_y_maj = defaultdict(list), defaultdict(list)

  for pruning_samp, pruning_ratio in zip(pruning_samps, pruning_ratios):
    prune_samps = pruning_samp
    clustered_X_new = defaultdict(list)
    clustered_y_new = defaultdict(list)

    # First pass: proportional pruning, cluster by cluster.
    for label, values_X in clustered_X.items():
      num_samples_to_prune = int(prune_samps * per_cluster_pruning_ratio)
      if num_samples_to_prune > len(values_X):
        # Cluster too small for its share: take half of it instead.
        num_samples_to_prune = len(values_X) // 2
      prune_samps -= num_samples_to_prune

      # A set makes the membership test below O(1) per row.
      dropped = set(random.sample(range(len(values_X)), num_samples_to_prune))
      clustered_X_new[label] = [row for i, row in enumerate(values_X) if i not in dropped]
      clustered_y_new[label] = [val for i, val in enumerate(clustered_y[label]) if i not in dropped]

    # Second pass: remove the remainder one sample per cluster, round-robin.
    while prune_samps > 0:
      pruned_this_round = False
      for label, values_X in clustered_X_new.items():
        if prune_samps <= 0:
          break
        if not values_X:
          # Nothing left in this cluster; sampling from it would raise ValueError.
          continue

        victim = random.sample(range(len(values_X)), 1)[0]
        clustered_X_new[label] = [row for i, row in enumerate(values_X) if i != victim]
        clustered_y_new[label] = [val for i, val in enumerate(clustered_y_new[label]) if i != victim]

        prune_samps -= 1
        pruned_this_round = True

      if not pruned_this_round:
        # Every cluster is empty (or there are none): stop instead of looping forever.
        break

    for label in clustered_X_new:
      pruning_ratios_X_maj[pruning_ratio].extend(clustered_X_new[label])
      pruning_ratios_y_maj[pruning_ratio].extend(clustered_y_new[label])

  return pruning_ratios_X_maj, pruning_ratios_y_maj

def combine_data(pruning_ratios, pruning_ratios_X_maj, pruning_ratios_y_maj, X_remaining, y_remaining):
  """Re-attach the untouched samples to each pruned majority set.

  Returns:
      tuple: two defaultdicts (ratio -> X rows, ratio -> y values); for each
      ratio the pruned majority samples come first, followed by
      ``X_remaining`` / ``y_remaining``.
  """
  combined_X, combined_y = defaultdict(list), defaultdict(list)

  for ratio in pruning_ratios:
    combined_X[ratio] += [*pruning_ratios_X_maj[ratio], *X_remaining]
    combined_y[ratio] += [*pruning_ratios_y_maj[ratio], *y_remaining]

  return combined_X, combined_y

def do_intelligent_pruning(X, y, ratio, per_cluster_pruning_ratio=0.7, seed=42, n_clusters=3):
  """Under-sample the majority class with cluster-aware random pruning.

  The majority-class rows are grouped with K-Means; samples are then removed
  by ``intelligent_prune_data`` and the untouched rows of the other classes
  are appended again.

  Args:
      X: NumPy feature matrix.
      y: NumPy label vector.
      ratio: Fraction of the class imbalance (majority count minus the
          smallest class count) to prune away; 1.0 removes the full gap.
      per_cluster_pruning_ratio: Forwarded to ``intelligent_prune_data``.
      seed: Seed forwarded to ``intelligent_prune_data``.
      n_clusters: Number of K-Means clusters (default 3).

  Returns:
      tuple: two single-element lists ``([X_pruned], [y_pruned])``; callers
      index ``[0]`` to get the data.
  """
  X_maj, y_maj, X_remaining, y_remaining, min_class_samples = find_majority_data(X, y)

  # NOTE(review): the clustering uses a fixed random_state of 42 and does not
  # follow `seed`; kept as is so existing results stay reproducible.
  kmeans = KMeans(n_clusters=n_clusters, random_state = 42)
  kmeans.fit(X_maj)
  clustered_X, clustered_y = do_clustering(X_maj, y_maj, kmeans.labels_)

  # Removing this many majority samples would equalize it with the smallest class.
  pruning_best = len(X_maj) - min_class_samples
  pruning_samps = [int(pruning_best * ratio)]
  pruning_ratios = [ratio]

  pruning_ratios_X_maj, pruning_ratios_y_maj = intelligent_prune_data(
      pruning_samps, pruning_ratios, clustered_X, clustered_y,
      per_cluster_pruning_ratio=per_cluster_pruning_ratio, seed=seed)

  pruning_ratios_X, pruning_ratios_y = combine_data(pruning_ratios, pruning_ratios_X_maj, pruning_ratios_y_maj, X_remaining, y_remaining)

  return list(pruning_ratios_X.values()), list(pruning_ratios_y.values())


# Random Pruning

In [42]:
"""
inputs:
X: np.array of features
y: np.array of class labels
ratio: float from 0.0 up to 1.0, the fraction of the class imbalance to prune away
"""
def random_prune_data(X, y, ratio, seed = 42):
  """Randomly under-sample every class that is larger than the smallest one.

  Samples are removed one at a time, cycling over the classes in sorted
  label order and skipping classes that are already down to the size of the
  smallest class.

  Args:
      X: NumPy feature matrix.
      y: NumPy label vector.
      ratio: Fraction of the total surplus (sum over classes of
          ``count - smallest_count``) to remove. Values above 1.0 are
          treated as 1.0, since no class is pruned below the smallest one.
      seed: Seed for NumPy's global RNG (set as a side effect).

  Returns:
      tuple: ``(X_kept, y_kept)`` with the surviving rows in their original
      order.
  """
  np.random.seed(seed)

  labels, counts = np.unique(y, return_counts=True)
  if labels.size == 0:
    return X, y

  min_count = counts.min()
  # Row positions still kept, per class.
  kept_indexes = {label: np.where(y == label)[0] for label in labels}

  total_prunable = int((counts - min_count).sum())
  # Clamp: asking for more than the surplus could never be satisfied and
  # would keep the loop below spinning forever.
  prune_amount = min(int(ratio * total_prunable), total_prunable)

  while prune_amount > 0:
    for label in labels:
      if len(kept_indexes[label]) > min_count and prune_amount > 0:
        victim = np.random.choice(len(kept_indexes[label]))
        kept_indexes[label] = np.delete(kept_indexes[label], victim)
        prune_amount -= 1

  # Integer index arrays throughout; sorting restores the original row order.
  kept = np.sort(np.concatenate([kept_indexes[label] for label in labels]))

  return X[kept], y[kept]

In [43]:
# Candidate ratios 0.0, 0.2, ..., 1.0 (NumPy floats).
# NOTE: `ratios` is reassigned to an explicit list in the configuration cell further down.
ratios = list(np.arange(0, 1.1, 0.2))

In [44]:
# Maps the classifier names used as keys by evaluate_models() to the short codes written to the result files.
learners = {'Logistic Regression' : 'LR', 'SVM': 'SVM', 'Decision Tree': 'DT' }

# Full feature table / label column; the experiment loops re-split these for every seed.
X = breast_cancer.drop('target', axis=1)
y = breast_cancer['target']

# Seeds used as random_state for the train/test splits (one run per seed).
states = [82, 15, 4, 95, 36, 32, 29, 18, 14, 87]
# Ratios passed to the pruning functions; replaces the `ratios` list built earlier with np.arange.
ratios = [0.2, 0.4, 0.6, 0.8, 1.0]

In [45]:
def split_data(X, y, random_state):
  """Return an 80/20 train/test split as ``(X_train, X_test, y_train, y_test)``."""
  parts = train_test_split(X, y, test_size = 0.2, random_state = random_state)
  X_train, X_test, y_train, y_test = parts
  return X_train, X_test, y_train, y_test

# Calling Intelligent Pruning

In [46]:
cols = ['algo', 'ratio', 'seed', 'learner', 'acc', 'prec', 'rec', 'f1', 'auc_roc']
algorithm = 'intelligent_pruning'
file_name = f'{algorithm}_results'
per_cluster_pruning_ratios = [0.5, 0.7, 0.9, 1]

# Rows are collected in a plain list and turned into a DataFrame once at the
# end; concatenating a one-row frame per result copies the whole table each time.
records = []

for rand_state in states:
  # The split depends only on the seed, so it is computed once per seed
  # instead of once per (per-cluster ratio, ratio) combination.
  X_train, X_test, y_train, y_test = split_data(X, y, rand_state)
  X_train_np, y_train_np = X_train.to_numpy(), y_train.to_numpy()
  # Plain arrays for the test set as well: the transformers below are fitted
  # on arrays, and feeding them a DataFrame triggers feature-name warnings.
  X_test_np, intelligent_pruned_y_test = X_test.to_numpy(), y_test.to_numpy()

  for per_cluster_pruning_ratio in per_cluster_pruning_ratios:
    for ratio in ratios:
      pruned_X_list, pruned_y_list = do_intelligent_pruning(
          X_train_np, y_train_np, ratio, per_cluster_pruning_ratio=per_cluster_pruning_ratio)

      # do_intelligent_pruning returns single-element lists; [0] is the data.
      intelligent_pruned_X_train, scaler, imputer = preprocess_data_train(np.array(pruned_X_list[0]))
      intelligent_pruned_y_train = np.array(pruned_y_list[0])
      intelligent_pruned_X_test = preprocess_data_test(X_test_np, scaler, imputer)

      results = evaluate_models(intelligent_pruned_X_train, intelligent_pruned_X_test, intelligent_pruned_y_train, intelligent_pruned_y_test)
      for key, item in results.items():
        records.append({
            'algo': algorithm,
            'ratio': ratio,
            'per_cluster_pruning_ratio': per_cluster_pruning_ratio,
            'seed': rand_state,
            'learner': learners[key],
            'acc': item['Accuracy'],
            'prec': item['Precision'],
            'rec': item['Recall'],
            'f1': item['F1 Score'],
            'auc_roc': item['ROC AUC'],
        })

# 'per_cluster_pruning_ratio' goes last, matching the column order the
# row-by-row concat produced.
df = pd.DataFrame(records, columns=cols + ['per_cluster_pruning_ratio'])
df.to_csv(file_name, index=False)




# Calling Random Pruning

In [47]:
# Random-pruning baseline: for every seed and ratio, randomly prune the training
# split, fit the preprocessing on the pruned data, evaluate the learners, and
# write one row per learner to `random_pruning_results`.
cols = ['algo', 'ratio', 'seed', 'learner', 'acc', 'prec', 'rec', 'f1', 'auc_roc']
algorithm = 'random_pruning'
file_name = f'{algorithm}_results'

# Rows are gathered as plain dicts and turned into a DataFrame once at the end;
# growing a DataFrame with pd.concat inside the loop re-copies all previous rows
# on every iteration.
records = []

for rand_state in states:
  for ratio in ratios:
    X_train, X_test, y_train, y_test = split_data(X, y,rand_state)

    # The same seed drives both the train/test split and the random pruning.
    random_pruned_X_train, random_pruned_y_train = random_prune_data(X_train.to_numpy(), y_train.to_numpy(), ratio, seed=rand_state)

    # The imputer and scaler fitted on the pruned training data are reused for the test split.
    preprocessed_random_pruned_X_train, scaler, imputer = preprocess_data_train(random_pruned_X_train)
    preprocessed_X_test = preprocess_data_test(X_test, scaler, imputer)

    # Labels are used as returned by the pruning helper; only the features are replaced.
    random_pruned_X_train = preprocessed_random_pruned_X_train
    random_pruned_X_test, random_pruned_y_test = preprocessed_X_test, y_test.to_numpy()

    print(f"Train data pruned randomly at {ratio * 100}%")
    results = evaluate_models(random_pruned_X_train, random_pruned_X_test, random_pruned_y_train, random_pruned_y_test)

    # One record per evaluated learner; `learners` maps the result key to the stored learner label.
    for key, item in results.items():
      records.append({
        'algo': algorithm,
        'ratio': ratio,
        'seed': rand_state,
        'learner': learners[key],
        'acc': item['Accuracy'],
        'prec': item['Precision'],
        'rec': item['Recall'],
        'f1': item['F1 Score'],
        'auc_roc': item['ROC AUC'],
      })

# `columns=cols` fixes the column order and still yields a header-only file if no rows were produced.
df = pd.DataFrame(records, columns = cols)
df.to_csv(file_name, index=False)




Train data pruned randomly at 20.0%




Train data pruned randomly at 40.0%




Train data pruned randomly at 60.0%




Train data pruned randomly at 80.0%




Train data pruned randomly at 100.0%




Train data pruned randomly at 20.0%




Train data pruned randomly at 40.0%




Train data pruned randomly at 60.0%




Train data pruned randomly at 80.0%




Train data pruned randomly at 100.0%




Train data pruned randomly at 20.0%




Train data pruned randomly at 40.0%




Train data pruned randomly at 60.0%




Train data pruned randomly at 80.0%




Train data pruned randomly at 100.0%




Train data pruned randomly at 20.0%




Train data pruned randomly at 40.0%




Train data pruned randomly at 60.0%




Train data pruned randomly at 80.0%




Train data pruned randomly at 100.0%




Train data pruned randomly at 20.0%




Train data pruned randomly at 40.0%




Train data pruned randomly at 60.0%




Train data pruned randomly at 80.0%




Train data pruned randomly at 100.0%




Train data pruned randomly at 20.0%




Train data pruned randomly at 40.0%




Train data pruned randomly at 60.0%




Train data pruned randomly at 80.0%




Train data pruned randomly at 100.0%




Train data pruned randomly at 20.0%




Train data pruned randomly at 40.0%




Train data pruned randomly at 60.0%




Train data pruned randomly at 80.0%




Train data pruned randomly at 100.0%




Train data pruned randomly at 20.0%




Train data pruned randomly at 40.0%




Train data pruned randomly at 60.0%




Train data pruned randomly at 80.0%




Train data pruned randomly at 100.0%




Train data pruned randomly at 20.0%




Train data pruned randomly at 40.0%




Train data pruned randomly at 60.0%




Train data pruned randomly at 80.0%




Train data pruned randomly at 100.0%




Train data pruned randomly at 20.0%




Train data pruned randomly at 40.0%




Train data pruned randomly at 60.0%




Train data pruned randomly at 80.0%




Train data pruned randomly at 100.0%


# Calling SDV-Oversampling

In [48]:
# SDV (Gaussian copula) oversampling experiment: for every seed and ratio, build
# synthetic rows from the training split, mix them into the training data,
# evaluate the learners, and write one row per learner to `gaussian_copula_results`.
cols = ['algo', 'ratio', 'seed', 'learner', 'acc', 'prec', 'rec', 'f1', 'auc_roc']
algorithm = 'gaussian_copula'
file_name = f'{algorithm}_results'

# Rows are gathered as plain dicts and turned into a DataFrame once at the end;
# growing a DataFrame with pd.concat inside the loop re-copies all previous rows
# on every iteration.
records = []

for rand_state in states:
  for ratio in ratios:
    X_train, X_test, y_train, y_test = split_data(X, y,rand_state)

    # NOTE(review): sd1 is presumably the synthetic sample and train_df the
    # training frame including the 'target' column -- confirm against do_sdv.
    sd1, train_df = do_sdv(X_train, y_train)

    # Add synthetic data at the current ratio to the training DataFrame
    combined_df = add_synthetic_data(train_df, sd1, ratio, seed=rand_state)
    y_train_sdv = combined_df['target']
    X_train_sdv = combined_df.drop('target', axis=1)

    # The imputer and scaler fitted on the augmented training data are reused for the test split.
    preprocessed_X_train_sdv, scaler, imputer = preprocess_data_train(X_train_sdv)
    preprocessed_X_test_sdv = preprocess_data_test(X_test, scaler, imputer)

    X_train_sdv, y_train_sdv = preprocessed_X_train_sdv, y_train_sdv.to_numpy()
    X_test_sdv, y_test_sdv = preprocessed_X_test_sdv, y_test.to_numpy()

    print(f"Train data combined with {ratio * 100}% synthetic data of minority class")
    results = evaluate_models(X_train_sdv, X_test_sdv, y_train_sdv, y_test_sdv)

    # One record per evaluated learner; `learners` maps the result key to the stored learner label.
    for key, item in results.items():
      records.append({
        'algo': algorithm,
        'ratio': ratio,
        'seed': rand_state,
        'learner': learners[key],
        'acc': item['Accuracy'],
        'prec': item['Precision'],
        'rec': item['Recall'],
        'f1': item['F1 Score'],
        'auc_roc': item['ROC AUC'],
      })

# `columns=cols` fixes the column order and still yields a header-only file if no rows were produced.
df = pd.DataFrame(records, columns = cols)
df.to_csv(file_name, index=False)



Train data combined with 20.0% synthetic data of minority class




Train data combined with 40.0% synthetic data of minority class




Train data combined with 60.0% synthetic data of minority class




Train data combined with 80.0% synthetic data of minority class




Train data combined with 100.0% synthetic data of minority class




Train data combined with 20.0% synthetic data of minority class




Train data combined with 40.0% synthetic data of minority class




Train data combined with 60.0% synthetic data of minority class




Train data combined with 80.0% synthetic data of minority class




Train data combined with 100.0% synthetic data of minority class




Train data combined with 20.0% synthetic data of minority class




Train data combined with 40.0% synthetic data of minority class




Train data combined with 60.0% synthetic data of minority class




Train data combined with 80.0% synthetic data of minority class




Train data combined with 100.0% synthetic data of minority class




Train data combined with 20.0% synthetic data of minority class




Train data combined with 40.0% synthetic data of minority class




Train data combined with 60.0% synthetic data of minority class




Train data combined with 80.0% synthetic data of minority class




Train data combined with 100.0% synthetic data of minority class




Train data combined with 20.0% synthetic data of minority class




Train data combined with 40.0% synthetic data of minority class




Train data combined with 60.0% synthetic data of minority class




Train data combined with 80.0% synthetic data of minority class




Train data combined with 100.0% synthetic data of minority class




Train data combined with 20.0% synthetic data of minority class




Train data combined with 40.0% synthetic data of minority class




Train data combined with 60.0% synthetic data of minority class




Train data combined with 80.0% synthetic data of minority class




Train data combined with 100.0% synthetic data of minority class




Train data combined with 20.0% synthetic data of minority class




Train data combined with 40.0% synthetic data of minority class




Train data combined with 60.0% synthetic data of minority class




Train data combined with 80.0% synthetic data of minority class




Train data combined with 100.0% synthetic data of minority class




Train data combined with 20.0% synthetic data of minority class




Train data combined with 40.0% synthetic data of minority class




Train data combined with 60.0% synthetic data of minority class




Train data combined with 80.0% synthetic data of minority class




Train data combined with 100.0% synthetic data of minority class




Train data combined with 20.0% synthetic data of minority class




Train data combined with 40.0% synthetic data of minority class




Train data combined with 60.0% synthetic data of minority class




Train data combined with 80.0% synthetic data of minority class




Train data combined with 100.0% synthetic data of minority class




Train data combined with 20.0% synthetic data of minority class




Train data combined with 40.0% synthetic data of minority class




Train data combined with 60.0% synthetic data of minority class




Train data combined with 80.0% synthetic data of minority class




Train data combined with 100.0% synthetic data of minority class


# Calling SMOTE-Oversampling

In [49]:
# SMOTE oversampling experiment: for every seed and ratio, oversample the
# training split, fit the preprocessing on the resampled data, evaluate the
# learners, and write one row per learner to `smote_results`.
cols = ['algo', 'ratio', 'seed', 'learner', 'acc', 'prec', 'rec', 'f1', 'auc_roc']
algorithm = 'smote'
file_name = f'{algorithm}_results'

# Rows are gathered as plain dicts and turned into a DataFrame once at the end;
# growing a DataFrame with pd.concat inside the loop re-copies all previous rows
# on every iteration.
records = []

for rand_state in states:
  for ratio in ratios:
    X_train, X_test, y_train, y_test = split_data(X, y,rand_state)

    # The helper receives a one-element ratio list; element [0] of each output is
    # used below (presumably one resampled set per requested ratio -- confirm
    # against smote_oversampling).
    X_train_smote, y_train_smote = smote_oversampling(X_train.to_numpy(), y_train.to_numpy(), [ratio])

    # The imputer and scaler fitted on the resampled training data are reused for the test split.
    preprocessed_X_train_smote, scaler, imputer = preprocess_data_train((np.array(X_train_smote))[0])
    preprocessed_X_test_smote = preprocess_data_test(X_test, scaler, imputer)

    X_train_smote, y_train_smote = preprocessed_X_train_smote, (np.array(y_train_smote))[0]
    X_test_smote, y_test_smote = preprocessed_X_test_smote, y_test.to_numpy()

    print(f"Train data combined with {ratio * 100}% synthetic data of minority class:")
    results = evaluate_models(X_train_smote, X_test_smote, y_train_smote, y_test_smote)

    # One record per evaluated learner; `learners` maps the result key to the stored learner label.
    for key, item in results.items():
      records.append({
        'algo': algorithm,
        'ratio': ratio,
        'seed': rand_state,
        'learner': learners[key],
        'acc': item['Accuracy'],
        'prec': item['Precision'],
        'rec': item['Recall'],
        'f1': item['F1 Score'],
        'auc_roc': item['ROC AUC'],
      })

# `columns=cols` fixes the column order and still yields a header-only file if no rows were produced.
df = pd.DataFrame(records, columns = cols)
df.to_csv(file_name, index=False)



Train data combined with 20.0% synthetic data of minority class:




Train data combined with 40.0% synthetic data of minority class:




Train data combined with 60.0% synthetic data of minority class:




Train data combined with 80.0% synthetic data of minority class:




Train data combined with 100.0% synthetic data of minority class:




Train data combined with 20.0% synthetic data of minority class:




Train data combined with 40.0% synthetic data of minority class:




Train data combined with 60.0% synthetic data of minority class:




Train data combined with 80.0% synthetic data of minority class:




Train data combined with 100.0% synthetic data of minority class:




Train data combined with 20.0% synthetic data of minority class:




Train data combined with 40.0% synthetic data of minority class:




Train data combined with 60.0% synthetic data of minority class:




Train data combined with 80.0% synthetic data of minority class:




Train data combined with 100.0% synthetic data of minority class:




Train data combined with 20.0% synthetic data of minority class:




Train data combined with 40.0% synthetic data of minority class:




Train data combined with 60.0% synthetic data of minority class:




Train data combined with 80.0% synthetic data of minority class:




Train data combined with 100.0% synthetic data of minority class:




Train data combined with 20.0% synthetic data of minority class:




Train data combined with 40.0% synthetic data of minority class:




Train data combined with 60.0% synthetic data of minority class:




Train data combined with 80.0% synthetic data of minority class:




Train data combined with 100.0% synthetic data of minority class:




Train data combined with 20.0% synthetic data of minority class:




Train data combined with 40.0% synthetic data of minority class:




Train data combined with 60.0% synthetic data of minority class:




Train data combined with 80.0% synthetic data of minority class:




Train data combined with 100.0% synthetic data of minority class:




Train data combined with 20.0% synthetic data of minority class:




Train data combined with 40.0% synthetic data of minority class:




Train data combined with 60.0% synthetic data of minority class:




Train data combined with 80.0% synthetic data of minority class:




Train data combined with 100.0% synthetic data of minority class:




Train data combined with 20.0% synthetic data of minority class:




Train data combined with 40.0% synthetic data of minority class:




Train data combined with 60.0% synthetic data of minority class:




Train data combined with 80.0% synthetic data of minority class:




Train data combined with 100.0% synthetic data of minority class:




Train data combined with 20.0% synthetic data of minority class:




Train data combined with 40.0% synthetic data of minority class:




Train data combined with 60.0% synthetic data of minority class:




Train data combined with 80.0% synthetic data of minority class:




Train data combined with 100.0% synthetic data of minority class:




Train data combined with 20.0% synthetic data of minority class:




Train data combined with 40.0% synthetic data of minority class:




Train data combined with 60.0% synthetic data of minority class:




Train data combined with 80.0% synthetic data of minority class:




Train data combined with 100.0% synthetic data of minority class:


# Calling Random-Oversampling

In [50]:
# Random-oversampling experiment: for every seed and ratio, oversample the
# training split, fit the preprocessing on the resampled data, evaluate the
# learners, and write one row per learner to `random_oversampling_results`.
cols = ['algo', 'ratio', 'seed', 'learner', 'acc', 'prec', 'rec', 'f1', 'auc_roc']
algorithm = 'random_oversampling'
file_name = f'{algorithm}_results'

# Rows are gathered as plain dicts and turned into a DataFrame once at the end;
# growing a DataFrame with pd.concat inside the loop re-copies all previous rows
# on every iteration.
records = []

for rand_state in states:
  for ratio in ratios:
    X_train, X_test, y_train, y_test = split_data(X, y,rand_state)

    # The helper receives a one-element ratio list; element [0] of each output is
    # used below (presumably one resampled set per requested ratio -- confirm
    # against random_oversampling).
    X_train_random, y_train_random = random_oversampling(X_train.to_numpy(), y_train.to_numpy(), [ratio])

    # The imputer and scaler fitted on the resampled training data are reused for the test split.
    preprocessed_X_train_random, scaler, imputer = preprocess_data_train((np.array(X_train_random)[0]))
    preprocessed_X_test_random = preprocess_data_test(X_test, scaler, imputer)

    X_train_random, y_train_random = preprocessed_X_train_random, (np.array(y_train_random))[0]
    X_test_random, y_test_random = preprocessed_X_test_random, y_test.to_numpy()

    print(f"Train data combined with {ratio * 100}% synthetic data of minority class:")
    results = evaluate_models(X_train_random, X_test_random, y_train_random, y_test_random)

    # One record per evaluated learner; `learners` maps the result key to the stored learner label.
    for key, item in results.items():
      records.append({
        'algo': algorithm,
        'ratio': ratio,
        'seed': rand_state,
        'learner': learners[key],
        'acc': item['Accuracy'],
        'prec': item['Precision'],
        'rec': item['Recall'],
        'f1': item['F1 Score'],
        'auc_roc': item['ROC AUC'],
      })

# `columns=cols` fixes the column order and still yields a header-only file if no rows were produced.
df = pd.DataFrame(records, columns = cols)
df.to_csv(file_name, index=False)



Train data combined with 20.0% synthetic data of minority class:




Train data combined with 40.0% synthetic data of minority class:




Train data combined with 60.0% synthetic data of minority class:




Train data combined with 80.0% synthetic data of minority class:




Train data combined with 100.0% synthetic data of minority class:




Train data combined with 20.0% synthetic data of minority class:




Train data combined with 40.0% synthetic data of minority class:




Train data combined with 60.0% synthetic data of minority class:




Train data combined with 80.0% synthetic data of minority class:




Train data combined with 100.0% synthetic data of minority class:




Train data combined with 20.0% synthetic data of minority class:




Train data combined with 40.0% synthetic data of minority class:




Train data combined with 60.0% synthetic data of minority class:




Train data combined with 80.0% synthetic data of minority class:




Train data combined with 100.0% synthetic data of minority class:




Train data combined with 20.0% synthetic data of minority class:




Train data combined with 40.0% synthetic data of minority class:




Train data combined with 60.0% synthetic data of minority class:




Train data combined with 80.0% synthetic data of minority class:




Train data combined with 100.0% synthetic data of minority class:




Train data combined with 20.0% synthetic data of minority class:




Train data combined with 40.0% synthetic data of minority class:




Train data combined with 60.0% synthetic data of minority class:




Train data combined with 80.0% synthetic data of minority class:




Train data combined with 100.0% synthetic data of minority class:




Train data combined with 20.0% synthetic data of minority class:




Train data combined with 40.0% synthetic data of minority class:




Train data combined with 60.0% synthetic data of minority class:




Train data combined with 80.0% synthetic data of minority class:




Train data combined with 100.0% synthetic data of minority class:




Train data combined with 20.0% synthetic data of minority class:




Train data combined with 40.0% synthetic data of minority class:




Train data combined with 60.0% synthetic data of minority class:




Train data combined with 80.0% synthetic data of minority class:




Train data combined with 100.0% synthetic data of minority class:




Train data combined with 20.0% synthetic data of minority class:




Train data combined with 40.0% synthetic data of minority class:




Train data combined with 60.0% synthetic data of minority class:




Train data combined with 80.0% synthetic data of minority class:




Train data combined with 100.0% synthetic data of minority class:




Train data combined with 20.0% synthetic data of minority class:




Train data combined with 40.0% synthetic data of minority class:




Train data combined with 60.0% synthetic data of minority class:




Train data combined with 80.0% synthetic data of minority class:




Train data combined with 100.0% synthetic data of minority class:




Train data combined with 20.0% synthetic data of minority class:




Train data combined with 40.0% synthetic data of minority class:




Train data combined with 60.0% synthetic data of minority class:




Train data combined with 80.0% synthetic data of minority class:




Train data combined with 100.0% synthetic data of minority class:


# Calling SVM-SMOTE Over-Sampling

In [51]:
# SVM-SMOTE oversampling experiment: for every seed and ratio, oversample the
# training split, fit the preprocessing on the resampled data, evaluate the
# learners, and write one row per learner to `svm_smote_results`.
cols = ['algo', 'ratio', 'seed', 'learner', 'acc', 'prec', 'rec', 'f1', 'auc_roc']
algorithm = 'svm_smote'
file_name = f'{algorithm}_results'

# Rows are gathered as plain dicts and turned into a DataFrame once at the end;
# growing a DataFrame with pd.concat inside the loop re-copies all previous rows
# on every iteration.
records = []

for rand_state in states:
  for ratio in ratios:
    X_train, X_test, y_train, y_test = split_data(X, y,rand_state)

    # The helper receives a one-element ratio list; element [0] of each output is
    # used below (presumably one resampled set per requested ratio -- confirm
    # against svm_smote_oversampling).
    X_train_svm_smote, y_train_svm_smote = svm_smote_oversampling(X_train.to_numpy(), y_train.to_numpy(), [ratio])

    # The imputer and scaler fitted on the resampled training data are reused for the test split.
    preprocessed_X_train_svm_smote, scaler, imputer = preprocess_data_train((np.array(X_train_svm_smote))[0])
    preprocessed_X_test_svm_smote = preprocess_data_test(X_test, scaler, imputer)

    X_train_svm_smote, y_train_svm_smote = preprocessed_X_train_svm_smote, (np.array(y_train_svm_smote))[0]
    X_test_svm_smote, y_test_svm_smote = preprocessed_X_test_svm_smote, y_test.to_numpy()

    print(f"Train data combined with {ratio * 100}% synthetic data of minority class:")
    results = evaluate_models(X_train_svm_smote, X_test_svm_smote, y_train_svm_smote, y_test_svm_smote)

    # One record per evaluated learner; `learners` maps the result key to the stored learner label.
    for key, item in results.items():
      records.append({
        'algo': algorithm,
        'ratio': ratio,
        'seed': rand_state,
        'learner': learners[key],
        'acc': item['Accuracy'],
        'prec': item['Precision'],
        'rec': item['Recall'],
        'f1': item['F1 Score'],
        'auc_roc': item['ROC AUC'],
      })

# `columns=cols` fixes the column order and still yields a header-only file if no rows were produced.
df = pd.DataFrame(records, columns = cols)
df.to_csv(file_name, index=False)



Train data combined with 20.0% synthetic data of minority class:




Train data combined with 40.0% synthetic data of minority class:




Train data combined with 60.0% synthetic data of minority class:




Train data combined with 80.0% synthetic data of minority class:




Train data combined with 100.0% synthetic data of minority class:




Train data combined with 20.0% synthetic data of minority class:




Train data combined with 40.0% synthetic data of minority class:




Train data combined with 60.0% synthetic data of minority class:




Train data combined with 80.0% synthetic data of minority class:




Train data combined with 100.0% synthetic data of minority class:




Train data combined with 20.0% synthetic data of minority class:




Train data combined with 40.0% synthetic data of minority class:




Train data combined with 60.0% synthetic data of minority class:




Train data combined with 80.0% synthetic data of minority class:




Train data combined with 100.0% synthetic data of minority class:




Train data combined with 20.0% synthetic data of minority class:




Train data combined with 40.0% synthetic data of minority class:




Train data combined with 60.0% synthetic data of minority class:




Train data combined with 80.0% synthetic data of minority class:




Train data combined with 100.0% synthetic data of minority class:




Train data combined with 20.0% synthetic data of minority class:




Train data combined with 40.0% synthetic data of minority class:




Train data combined with 60.0% synthetic data of minority class:




Train data combined with 80.0% synthetic data of minority class:




Train data combined with 100.0% synthetic data of minority class:




Train data combined with 20.0% synthetic data of minority class:




Train data combined with 40.0% synthetic data of minority class:




Train data combined with 60.0% synthetic data of minority class:




Train data combined with 80.0% synthetic data of minority class:




Train data combined with 100.0% synthetic data of minority class:




Train data combined with 20.0% synthetic data of minority class:




Train data combined with 40.0% synthetic data of minority class:




Train data combined with 60.0% synthetic data of minority class:




Train data combined with 80.0% synthetic data of minority class:




Train data combined with 100.0% synthetic data of minority class:




Train data combined with 20.0% synthetic data of minority class:




Train data combined with 40.0% synthetic data of minority class:




Train data combined with 60.0% synthetic data of minority class:




Train data combined with 80.0% synthetic data of minority class:




Train data combined with 100.0% synthetic data of minority class:




Train data combined with 20.0% synthetic data of minority class:




Train data combined with 40.0% synthetic data of minority class:




Train data combined with 60.0% synthetic data of minority class:




Train data combined with 80.0% synthetic data of minority class:




Train data combined with 100.0% synthetic data of minority class:




Train data combined with 20.0% synthetic data of minority class:




Train data combined with 40.0% synthetic data of minority class:




Train data combined with 60.0% synthetic data of minority class:




Train data combined with 80.0% synthetic data of minority class:




Train data combined with 100.0% synthetic data of minority class:


# Calling RRP Over-Sampling

In [55]:
# RRP oversampling experiment: for every seed and ratio, generate new rows with
# RandomProjectionOversampling, mix them into the training data, evaluate the
# learners, and write one row per learner to `RRP_results`.
cols = ['algo', 'ratio', 'seed', 'learner', 'acc', 'prec', 'rec', 'f1', 'auc_roc']
algorithm = 'RRP'
file_name = f'{algorithm}_results'

# Rows are gathered as plain dicts and turned into a DataFrame once at the end;
# growing a DataFrame with pd.concat inside the loop re-copies all previous rows
# on every iteration.
records = []

for rand_state in states:
  for ratio in ratios:
    X_train, X_test, y_train, y_test = split_data(X, y,rand_state)

    # Copies are handed to the oversampler, presumably to guard the split frames
    # against in-place modification -- TODO confirm in RandomProjectionOversampling.
    X_train_copy, y_train_copy = X_train.copy(), y_train.copy()

    # `rt` is not used in this cell. NOTE(review): the meaning of threshold=10 is
    # defined by RandomProjectionOversampling -- confirm there.
    rt, new_data_df, train_df = RandomProjectionOversampling(X_train=X_train_copy,
                                                             y_train=y_train_copy,
                                                             threshold=10)

    # Add synthetic data at the current ratio to the training DataFrame
    combined_df = add_synthetic_data(train_df, new_data_df, ratio, seed=rand_state)
    y_train_rrp = combined_df['target']
    X_train_rrp = combined_df.drop('target', axis=1)

    # The imputer and scaler fitted on the augmented training data are reused for the test split.
    preprocessed_X_train_rrp, scaler, imputer = preprocess_data_train(X_train_rrp)
    preprocessed_X_test_rrp = preprocess_data_test(X_test, scaler, imputer)

    X_train_rrp, y_train_rrp = preprocessed_X_train_rrp, y_train_rrp.to_numpy()
    X_test_rrp, y_test_rrp = preprocessed_X_test_rrp, y_test.to_numpy()

    print(f"Train data combined with {ratio * 100}% synthetic data of minority class")
    results = evaluate_models(X_train_rrp, X_test_rrp, y_train_rrp, y_test_rrp)

    # One record per evaluated learner; `learners` maps the result key to the stored learner label.
    for key, item in results.items():
      records.append({
        'algo': algorithm,
        'ratio': ratio,
        'seed': rand_state,
        'learner': learners[key],
        'acc': item['Accuracy'],
        'prec': item['Precision'],
        'rec': item['Recall'],
        'f1': item['F1 Score'],
        'auc_roc': item['ROC AUC'],
      })

# `columns=cols` fixes the column order and still yields a header-only file if no rows were produced.
df = pd.DataFrame(records, columns = cols)
df.to_csv(file_name, index=False)

Train data combined with 20.0% synthetic data of minority class
Train data combined with 40.0% synthetic data of minority class
Train data combined with 60.0% synthetic data of minority class
Train data combined with 80.0% synthetic data of minority class
Train data combined with 100.0% synthetic data of minority class
Train data combined with 20.0% synthetic data of minority class
Train data combined with 40.0% synthetic data of minority class
Train data combined with 60.0% synthetic data of minority class
Train data combined with 80.0% synthetic data of minority class
Train data combined with 100.0% synthetic data of minority class
Train data combined with 20.0% synthetic data of minority class
Train data combined with 40.0% synthetic data of minority class
Train data combined with 60.0% synthetic data of minority class
Train data combined with 80.0% synthetic data of minority class
Train data combined with 100.0% synthetic data of minority class
Train data combined with 20.0% synthe