<a href="https://colab.research.google.com/github/adipai/statistical-data-pruning-analysis/blob/main/churn_sampling_results.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pmlb



In [None]:
!pip install sdv



In [None]:
!pip install DataSynthesizer



In [None]:
# All imports here
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from pmlb import fetch_data
import matplotlib.pyplot as plt
import seaborn as sns
import random
import time

from sdv.datasets.local import load_csvs
from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer
from DataSynthesizer.DataDescriber import DataDescriber
from DataSynthesizer.DataGenerator import DataGenerator
from DataSynthesizer.lib.utils import display_bayesian_network
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SVMSMOTE
from imblearn.over_sampling import RandomOverSampler
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix

## Data preprocessing

In [None]:
def preprocess_data_train(X_train):
    """Impute missing values and standardize the training features.

    A mean-strategy ``SimpleImputer`` and a ``StandardScaler`` are fitted on
    ``X_train`` and applied in that order.

    Args:
        X_train: Numeric 2-D feature matrix accepted by scikit-learn.

    Returns:
        Tuple ``(X_train, scaler, imputer)``: the transformed training matrix
        followed by the fitted scaler and imputer, so the same transformation
        can be replayed on the test split with ``preprocess_data_test``.
    """
    # Replace missing entries with the per-column mean of the training data.
    imputer = SimpleImputer(strategy='mean')
    X_train = imputer.fit_transform(X_train)

    # Normalize numeric columns using statistics from the training data only.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)

    return X_train, scaler, imputer

def preprocess_data_test(X_test, scaler, imputer):
    """Apply an already-fitted imputer and scaler to the test features.

    Nothing is re-fitted here; only ``transform`` is called, so the test split
    is processed with the statistics learned on the training split.

    Args:
        X_test: Numeric 2-D feature matrix with the same columns the
            transformers were fitted on.
        scaler: Fitted scaler, as returned by ``preprocess_data_train``.
        imputer: Fitted imputer, as returned by ``preprocess_data_train``.

    Returns:
        The imputed and scaled test matrix.
    """
    # Handle missing data with the training-set column means.
    X_test = imputer.transform(X_test)

    # Normalize numeric columns with the training-set mean and variance.
    X_test = scaler.transform(X_test)

    return X_test

## Experiments

### Dataset 1: Breast cancer

In [None]:
# Load the PMLB 'breast_cancer' dataset as a DataFrame and summarize its columns.
breast_cancer = fetch_data('breast_cancer')
breast_cancer.describe()

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,target
count,286.0,286.0,286.0,286.0,286.0,286.0,286.0,286.0,286.0,286.0
mean,2.664336,1.073427,4.062937,1.073427,1.167832,2.048951,0.468531,2.772727,0.237762,0.297203
std,1.011818,0.98668,2.151187,1.935321,0.443052,0.738217,0.499883,1.099006,0.426459,0.457828
min,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,3.0,0.0,1.0,2.0,0.0,2.0,0.0,0.0
50%,3.0,2.0,4.0,0.0,1.0,2.0,0.0,3.0,0.0,0.0
75%,3.0,2.0,5.0,1.0,1.0,3.0,1.0,3.0,0.0,1.0
max,5.0,2.0,10.0,6.0,2.0,3.0,1.0,5.0,1.0,1.0


## Training and testing using ML models

In [None]:
# Generic function to test synthetic data using LR, SVM, DT

def evaluate_models(X_train, X_test, y_train, y_test, seed=42):
    """Train LR, SVM and decision-tree classifiers and score them on the test split.

    For every classifier the function computes accuracy, precision, recall,
    F1, ROC AUC and the confusion matrix, and saves two figures in the current
    working directory: ``'<name>_auc_roc_curve.png'`` and
    ``'<name>_confusion_matrix.png'``. The file names depend only on the
    classifier name, so each call overwrites the figures of the previous call.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
        seed: Random state for the decision tree. Without it the tree's
            metrics change from run to run.

    Returns:
        dict mapping classifier name to a dict with the keys ``"Accuracy"``,
        ``"Precision"``, ``"Recall"``, ``"F1 Score"``, ``"ROC AUC"`` and
        ``"Confusion Matrix"``.
    """
    # NOTE(review): assumes a binary target - the metrics use scikit-learn's
    # default averaging and the ROC curve uses column 1 of predict_proba.
    classifiers = {
        "Logistic Regression": LogisticRegression(),
        "SVM": SVC(),
        "Decision Tree": DecisionTreeClassifier(random_state=seed)
    }

    # Results dictionary to store evaluation metrics
    results = {}

    for name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # Threshold-based metrics on the hard predictions.
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        # ROC needs a continuous score: use class probabilities when the
        # classifier exposes them, otherwise its decision function.
        if hasattr(clf, "predict_proba"):
            y_prob = clf.predict_proba(X_test)[:, 1]
        else:
            y_prob = clf.decision_function(X_test)
        fpr, tpr, _ = roc_curve(y_test, y_prob)
        roc_auc = auc(fpr, tpr)

        cm = confusion_matrix(y_test, y_pred)

        results[name] = {
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1,
            "ROC AUC": roc_auc,
            "Confusion Matrix": cm
        }

        # ROC curve with the chance diagonal for reference.
        fig, ax = plt.subplots(figsize=(8, 6))
        ax.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.2f})')
        ax.plot([0, 1], [0, 1], 'k--')
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title(f'{name} - AUC-ROC Curve')
        ax.legend(loc='lower right')
        fig.savefig(f'{name}_auc_roc_curve.png', dpi=300)
        # Close explicitly: this function is called many times in sweeps.
        plt.close(fig)

        # Confusion matrix heatmap.
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
        ax.set_xlabel('Predicted')
        ax.set_ylabel('True')
        ax.set_title(f'{name} - Confusion Matrix')
        fig.savefig(f'{name}_confusion_matrix.png', dpi=300)
        plt.close(fig)

    return results


In [None]:

# Separate the feature columns from the 'target' label column.
X = breast_cancer.drop(columns='target')
y = breast_cancer['target']

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Baseline: score the three classifiers on the split as-is (no pruning, oversampling or scaling applied).
results = evaluate_models(X_train, X_test, y_train, y_test)

In [None]:
# Show the metrics dictionary returned by evaluate_models.
print(results)

{'Logistic Regression': {'Accuracy': 0.7931034482758621, 'Precision': 0.8, 'Recall': 0.4444444444444444, 'F1 Score': 0.5714285714285714, 'ROC AUC': 0.726388888888889, 'Confusion Matrix': array([[38,  2],
       [10,  8]])}, 'SVM': {'Accuracy': 0.7241379310344828, 'Precision': 0.75, 'Recall': 0.16666666666666666, 'F1 Score': 0.27272727272727276, 'ROC AUC': 0.6847222222222222, 'Confusion Matrix': array([[39,  1],
       [15,  3]])}, 'Decision Tree': {'Accuracy': 0.6379310344827587, 'Precision': 0.43478260869565216, 'Recall': 0.5555555555555556, 'F1 Score': 0.4878048780487805, 'ROC AUC': 0.6340277777777779, 'Confusion Matrix': array([[27, 13],
       [ 8, 10]])}}


# SDV - Oversampling

In [None]:
def do_sdv(X_train, y_train):
  """Fit an SDV Gaussian copula on the minority class and sample synthetic rows.

  The features and labels are joined into one table, the rows of the least
  frequent label are used to fit a ``GaussianCopulaSynthesizer``, and as many
  synthetic rows are sampled as the gap between the largest and smallest class.

  Args:
    X_train: Feature DataFrame.
    y_train: Label Series aligned with ``X_train``.

  Returns:
    Tuple ``(synthetic_rows, train_df)``: the sampled synthetic table and the
    joined features-plus-label training table.
  """
  train_df = pd.concat([X_train, y_train], axis=1)

  label_counts = y_train.value_counts()
  rare_label = label_counts.idxmin()
  # Number of rows needed to bring the minority class up to the majority size.
  rows_needed = label_counts.max() - label_counts.min()

  # The label is the last column because y_train was concatenated last.
  rare_rows = train_df[train_df.iloc[:, -1] == rare_label]

  table_metadata = SingleTableMetadata()
  table_metadata.detect_from_dataframe(rare_rows)

  copula = GaussianCopulaSynthesizer(table_metadata)
  copula.fit(rare_rows)

  # Reset the sampler before drawing the synthetic rows.
  copula.reset_sampling()
  synthetic_rows = copula.sample(num_rows=rows_needed)
  return synthetic_rows, train_df

# Function to add synthetic data to the main DataFrame based on percentage
def add_synthetic_data(main_df, synthetic_df, percentage, seed=42):
    """Append a random share of the synthetic rows to the main table.

    Args:
        main_df: DataFrame that receives the extra rows.
        synthetic_df: DataFrame of synthetic rows to draw from.
        percentage: Share of ``synthetic_df`` to use, as a multiplier of its
            length (0.5 takes half of the rows); the count is truncated to an int.
        seed: Random state for the draw.

    Returns:
        A new DataFrame with ``main_df`` first and the drawn synthetic rows
        after it, re-indexed from 0.
    """
    take = int(percentage * len(synthetic_df))

    # Draw without replacement so no synthetic row is added twice.
    picked = synthetic_df.sample(n=take, replace=False, random_state=seed)

    return pd.concat([main_df, picked], ignore_index=True)

# Random Over-Sampling

In [None]:
def find_minority_data(X, y):
    """Split a dataset into the least frequent class and everything else.

    Args:
        X: Feature matrix, one row per sample (NumPy array or pandas object).
        y: Labels aligned with ``X`` (NumPy array or pandas object).

    Returns:
        Tuple ``(X_min, y_min, X_remaining, y_remaining, min_label)`` of NumPy
        arrays plus the minority label. When several labels share the lowest
        count, the smallest label is chosen.
    """
    # The row positions computed below are only valid for NumPy indexing; a
    # DataFrame/Series would be indexed by column or index label instead.
    X, y = np.asarray(X), np.asarray(y)

    labels, counts = np.unique(y, return_counts=True)
    min_label = min(zip(counts, labels))[1]

    indices_with_min_label = np.where(y == min_label)[0]
    X_min, y_min = X[indices_with_min_label], y[indices_with_min_label]

    # Other class samples
    indices_without_min_label = np.where(y != min_label)[0]
    X_remaining, y_remaining = X[indices_without_min_label], y[indices_without_min_label]

    return X_min, y_min, X_remaining, y_remaining, min_label

def random_oversampling(X_train, y_train, oversampling_ratios, seed=42):
  """Random-oversample the minority class at several ratios.

  A ratio of 1 adds enough minority samples to match the number of
  non-minority samples; smaller ratios add that fraction of the gap.

  Args:
    X_train: Training features (passed to ``find_minority_data`` and to
      ``RandomOverSampler.fit_resample``).
    y_train: Training labels aligned with ``X_train``.
    oversampling_ratios: Iterable of fractions of the class gap to fill.
    seed: Random state for the sampler.

  Returns:
    Tuple ``(X_list, y_list)`` with one resampled dataset per entry of
    ``oversampling_ratios``, in the same order.
  """
  X_minority, _, X_remaining, _, min_label = find_minority_data(X_train, y_train)
  # Extra minority samples required to match the non-minority sample count.
  ideal_samps = len(X_remaining) - len(X_minority)

  # Plain lists (not a dict keyed by ratio) so that repeated ratios are kept
  # and the output always lines up with the input.
  oversampled_X_train, oversampled_y_train = [], []
  for oversampling_ratio in oversampling_ratios:
    oversampling_samp = int(ideal_samps * oversampling_ratio)
    sampling_strategy = {min_label: len(X_minority) + oversampling_samp}
    X_train_upsampled, y_train_upsampled = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=seed).fit_resample(X_train, y_train)

    oversampled_X_train.append(X_train_upsampled)
    oversampled_y_train.append(y_train_upsampled)

  return oversampled_X_train, oversampled_y_train


# SMOTE Over-Sampling

In [None]:
def find_minority_data(X, y):
    """Split a dataset into the least frequent class and everything else.

    NOTE(review): this helper is defined identically in several cells of this
    notebook; consider keeping a single definition.

    Args:
        X: Feature matrix, one row per sample (NumPy array or pandas object).
        y: Labels aligned with ``X`` (NumPy array or pandas object).

    Returns:
        Tuple ``(X_min, y_min, X_remaining, y_remaining, min_label)`` of NumPy
        arrays plus the minority label. When several labels share the lowest
        count, the smallest label is chosen.
    """
    # The row positions computed below are only valid for NumPy indexing; a
    # DataFrame/Series would be indexed by column or index label instead.
    X, y = np.asarray(X), np.asarray(y)

    labels, counts = np.unique(y, return_counts=True)
    min_label = min(zip(counts, labels))[1]

    indices_with_min_label = np.where(y == min_label)[0]
    X_min, y_min = X[indices_with_min_label], y[indices_with_min_label]

    # Other class samples
    indices_without_min_label = np.where(y != min_label)[0]
    X_remaining, y_remaining = X[indices_without_min_label], y[indices_without_min_label]

    return X_min, y_min, X_remaining, y_remaining, min_label

def smote_oversampling(X_train, y_train, oversampling_ratios, seed=42):
  """Oversample the minority class with SMOTE at several ratios.

  A ratio of 1 adds enough minority samples to match the number of
  non-minority samples; smaller ratios add that fraction of the gap.

  Args:
    X_train: Training features (passed to ``find_minority_data`` and to
      ``SMOTE.fit_resample``).
    y_train: Training labels aligned with ``X_train``.
    oversampling_ratios: Iterable of fractions of the class gap to fill.
    seed: Random state for the sampler.

  Returns:
    Tuple ``(X_list, y_list)`` with one resampled dataset per entry of
    ``oversampling_ratios``, in the same order.
  """
  X_minority, _, X_remaining, _, min_label = find_minority_data(X_train, y_train)
  # Extra minority samples required to match the non-minority sample count.
  ideal_samps = len(X_remaining) - len(X_minority)

  # Plain lists (not a dict keyed by ratio) so that repeated ratios are kept
  # and the output always lines up with the input.
  oversampled_X_train, oversampled_y_train = [], []
  for oversampling_ratio in oversampling_ratios:
    oversampling_samp = int(ideal_samps * oversampling_ratio)
    sampling_strategy = {min_label: len(X_minority) + oversampling_samp}
    X_train_upsampled, y_train_upsampled = SMOTE(sampling_strategy=sampling_strategy, random_state=seed).fit_resample(X_train, y_train)

    oversampled_X_train.append(X_train_upsampled)
    oversampled_y_train.append(y_train_upsampled)

  return oversampled_X_train, oversampled_y_train


# SVM-SMOTE Over-Sampling

In [None]:
def find_minority_data(X, y):
    """Split a dataset into the least frequent class and everything else.

    NOTE(review): this helper is defined identically in several cells of this
    notebook; consider keeping a single definition.

    Args:
        X: Feature matrix, one row per sample (NumPy array or pandas object).
        y: Labels aligned with ``X`` (NumPy array or pandas object).

    Returns:
        Tuple ``(X_min, y_min, X_remaining, y_remaining, min_label)`` of NumPy
        arrays plus the minority label. When several labels share the lowest
        count, the smallest label is chosen.
    """
    # The row positions computed below are only valid for NumPy indexing; a
    # DataFrame/Series would be indexed by column or index label instead.
    X, y = np.asarray(X), np.asarray(y)

    labels, counts = np.unique(y, return_counts=True)
    min_label = min(zip(counts, labels))[1]

    indices_with_min_label = np.where(y == min_label)[0]
    X_min, y_min = X[indices_with_min_label], y[indices_with_min_label]

    # Other class samples
    indices_without_min_label = np.where(y != min_label)[0]
    X_remaining, y_remaining = X[indices_without_min_label], y[indices_without_min_label]

    return X_min, y_min, X_remaining, y_remaining, min_label

def svm_smote_oversampling(X_train, y_train, oversampling_ratios, seed=42):
  """Oversample the minority class with SVM-SMOTE at several ratios.

  A ratio of 1 adds enough minority samples to match the number of
  non-minority samples; smaller ratios add that fraction of the gap.

  Args:
    X_train: Training features (passed to ``find_minority_data`` and to
      ``SVMSMOTE.fit_resample``).
    y_train: Training labels aligned with ``X_train``.
    oversampling_ratios: Iterable of fractions of the class gap to fill.
    seed: Random state for the sampler.

  Returns:
    Tuple ``(X_list, y_list)`` with one resampled dataset per entry of
    ``oversampling_ratios``, in the same order.
  """
  X_minority, _, X_remaining, _, min_label = find_minority_data(X_train, y_train)
  # Extra minority samples required to match the non-minority sample count.
  ideal_samps = len(X_remaining) - len(X_minority)

  # Plain lists (not a dict keyed by ratio) so that repeated ratios are kept
  # and the output always lines up with the input.
  oversampled_X_train, oversampled_y_train = [], []
  for oversampling_ratio in oversampling_ratios:
    oversampling_samp = int(ideal_samps * oversampling_ratio)
    sampling_strategy = {min_label: len(X_minority) + oversampling_samp}
    X_train_upsampled, y_train_upsampled = SVMSMOTE(sampling_strategy=sampling_strategy, random_state=seed).fit_resample(X_train, y_train)

    oversampled_X_train.append(X_train_upsampled)
    oversampled_y_train.append(y_train_upsampled)

  return oversampled_X_train, oversampled_y_train


# Intelligent Pruning

In [None]:
def find_majority_data(X, y):
    """Split a dataset into the most frequent class and everything else.

    Args:
        X: Feature matrix, one row per sample (NumPy array or pandas object).
        y: Labels aligned with ``X`` (NumPy array or pandas object).

    Returns:
        Tuple ``(X_maj, y_maj, X_remaining, y_remaining, min_count)`` where the
        first four items are NumPy arrays and ``min_count`` is the size of the
        smallest class. When several labels share the highest count, the
        largest label is treated as the majority.
    """
    # The row positions computed below are only valid for NumPy indexing; a
    # DataFrame/Series would be indexed by column or index label instead.
    X, y = np.asarray(X), np.asarray(y)

    labels, counts = np.unique(y, return_counts=True)
    max_label = max(zip(counts, labels))[1]

    indices_with_max_label = np.where(y == max_label)[0]
    X_maj, y_maj = X[indices_with_max_label], y[indices_with_max_label]

    # Exclude majority class samples
    indices_without_max_label = np.where(y != max_label)[0]
    X_remaining, y_remaining = X[indices_without_max_label], y[indices_without_max_label]

    return X_maj, y_maj, X_remaining, y_remaining, min(counts)

def do_clustering(X, y, labels):
  """Group samples by cluster label, largest cluster first.

  Args:
    X: Indexable collection of samples.
    y: Indexable collection of targets aligned with ``X``.
    labels: Cluster label of each sample, in the same order.

  Returns:
    Tuple ``(clusters_X, clusters_y)`` of dicts mapping cluster label to the
    list of its samples / targets. Both dicts are ordered by descending
    cluster size; equally sized clusters keep first-seen order.
  """
  by_cluster_X, by_cluster_y = defaultdict(list), defaultdict(list)
  for position, cluster_id in enumerate(labels):
      by_cluster_X[cluster_id].append(X[position])
      by_cluster_y[cluster_id].append(y[position])

  def _largest_first(groups):
      # sorted() is stable, also with reverse=True, so ties keep insertion order.
      ordered = sorted(groups.items(), key=lambda item: len(item[1]), reverse=True)
      return dict(ordered)

  return _largest_first(by_cluster_X), _largest_first(by_cluster_y)


def intelligent_prune_data(pruning_samps, pruning_ratios, clustered_X, clustered_y, per_cluster_pruning_ratio=0.7, seed=42):
  """Remove a given number of samples from clustered data, cluster by cluster.

  For every requested amount the clusters are visited in dict order. Each
  cluster gives up ``per_cluster_pruning_ratio`` of the amount still to be
  removed (or half of the cluster when that is more than the cluster holds).
  Whatever is left after that pass is removed one random sample at a time,
  cycling over the clusters.

  Args:
    pruning_samps: Number of samples to remove, one entry per ratio.
    pruning_ratios: Keys under which the results are stored; zipped with
      ``pruning_samps``.
    clustered_X: dict mapping cluster label to the list of its samples.
    clustered_y: dict mapping cluster label to the list of its targets,
      aligned with ``clustered_X``.
    per_cluster_pruning_ratio: Share of the outstanding amount taken from
      each cluster in the first pass.
    seed: Seed for Python's global ``random`` generator.

  Returns:
    Tuple ``(X_by_ratio, y_by_ratio)`` of defaultdicts mapping each pruning
    ratio to the flat list of surviving samples / targets. The inputs are
    not modified.
  """
  random.seed(seed)
  pruning_ratios_X_maj, pruning_ratios_y_maj = defaultdict(list), defaultdict(list)

  for pruning_samp, pruning_ratio in zip(pruning_samps, pruning_ratios):
    prune_samps = pruning_samp
    clustered_X_new = defaultdict(list)
    clustered_y_new = defaultdict(list)

    # Pass 1: take a proportional chunk out of each cluster.
    for label, values_X in clustered_X.items():
      num_samples_to_prune = int(prune_samps * per_cluster_pruning_ratio)
      if num_samples_to_prune > len(values_X):
        # Never wipe out a cluster that is smaller than its quota.
        num_samples_to_prune = len(values_X) // 2
      prune_samps -= num_samples_to_prune

      # A set keeps the membership test below O(1) per sample.
      indices_to_prune = set(random.sample(range(len(values_X)), num_samples_to_prune))

      clustered_X_new[label] = [x for i, x in enumerate(values_X) if i not in indices_to_prune]
      clustered_y_new[label] = [t for i, t in enumerate(clustered_y[label]) if i not in indices_to_prune]

    # Pass 2: remove the remainder one sample at a time, round-robin.
    while prune_samps > 0:
      if not any(clustered_X_new.values()):
        # Every cluster is empty: more removals were requested than samples
        # exist, so stop instead of looping forever.
        break
      for label, values_X in clustered_X_new.items():
        if prune_samps <= 0:
          break
        if not values_X:
          # Pass 1 can empty a cluster (quota == cluster size); sampling from
          # an empty range would raise ValueError.
          continue

        drop_at = random.sample(range(len(values_X)), 1)[0]
        values_y = clustered_y_new[label]
        clustered_X_new[label] = values_X[:drop_at] + values_X[drop_at + 1:]
        clustered_y_new[label] = values_y[:drop_at] + values_y[drop_at + 1:]

        prune_samps -= 1

    # Flatten the surviving clusters under this ratio's key.
    for label in clustered_X_new:
      pruning_ratios_X_maj[pruning_ratio].extend(clustered_X_new[label])
      pruning_ratios_y_maj[pruning_ratio].extend(clustered_y_new[label])

  return pruning_ratios_X_maj, pruning_ratios_y_maj

def combine_data(pruning_ratios, pruning_ratios_X_maj, pruning_ratios_y_maj, X_remaining, y_remaining):
  """Re-attach the untouched samples to each pruned majority set.

  Args:
    pruning_ratios: Ratios to assemble datasets for.
    pruning_ratios_X_maj: Mapping ratio -> pruned majority samples.
    pruning_ratios_y_maj: Mapping ratio -> pruned majority targets.
    X_remaining: Samples of the other classes, appended unchanged.
    y_remaining: Targets of the other classes, appended unchanged.

  Returns:
    Tuple ``(X_by_ratio, y_by_ratio)`` of defaultdicts mapping each ratio to
    a list holding the pruned majority entries followed by the remaining ones.
  """
  merged_X, merged_y = defaultdict(list), defaultdict(list)

  for ratio_key in pruning_ratios:
    # Majority survivors first, then the samples that were never pruned.
    merged_X[ratio_key] += [*pruning_ratios_X_maj[ratio_key], *X_remaining]
    merged_y[ratio_key] += [*pruning_ratios_y_maj[ratio_key], *y_remaining]

  return merged_X, merged_y

def do_intelligent_pruning(X, y, ratio, per_cluster_pruning_ratio=0.7, seed=42, n_clusters=3):
  """Prune the majority class cluster-wise and rebuild the dataset.

  The majority class is clustered with KMeans, ``ratio`` of the gap between
  the majority size and the smallest class size is removed through
  ``intelligent_prune_data``, and the other classes are appended unchanged.

  Args:
    X: Feature matrix (NumPy array), one row per sample.
    y: Labels (NumPy array) aligned with ``X``.
    ratio: Fraction of the class gap to remove (1 balances the classes).
    per_cluster_pruning_ratio: Forwarded to ``intelligent_prune_data``.
    seed: Random state for KMeans and for the pruning draw.
    n_clusters: Number of KMeans clusters; capped at the majority class size.

  Returns:
    Tuple ``(X_list, y_list)`` of one-element lists; element 0 holds the
    pruned samples / labels for ``ratio``.
  """
  X_maj, y_maj, X_remaining, y_remaining, min_class_samples = find_majority_data(X, y)

  # KMeans cannot form more clusters than it has samples.
  kmeans = KMeans(n_clusters=min(n_clusters, len(X_maj)), random_state=seed)
  kmeans.fit(X_maj)
  clustered_X, clustered_y = do_clustering(X_maj, y_maj, kmeans.labels_)

  # Removing this many majority samples would match the smallest class size.
  pruning_best = len(X_maj) - min_class_samples
  pruning_samps = [int(pruning_best * ratio)]
  pruning_ratios = [ratio]

  pruning_ratios_X_maj, pruning_ratios_y_maj = intelligent_prune_data(
      pruning_samps, pruning_ratios, clustered_X, clustered_y,
      per_cluster_pruning_ratio=per_cluster_pruning_ratio, seed=seed)

  pruning_ratios_X, pruning_ratios_y = combine_data(pruning_ratios, pruning_ratios_X_maj, pruning_ratios_y_maj, X_remaining, y_remaining)

  return list(pruning_ratios_X.values()), list(pruning_ratios_y.values())


# Random Pruning

In [None]:
def random_prune_data(X, y, ratio, seed = 42):
  """Randomly drop samples from the over-represented classes.

  Every class larger than the smallest one has an excess (its size minus the
  smallest class size). ``ratio`` of the total excess is removed one random
  sample at a time, cycling over the classes that are still above the
  smallest class size.

  Args:
    X: np.array of features, one row per sample.
    y: np.array of labels aligned with ``X``.
    ratio: Fraction of the total excess to remove; 0 keeps everything and 1
      fully balances the classes. Larger values are treated as 1.
    seed: Seed for NumPy's global random generator.

  Returns:
    Tuple ``(X_pruned, y_pruned)`` with the surviving rows in their original
    order.
  """
  np.random.seed(seed)
  # Positional indexing below requires NumPy arrays.
  X, y = np.asarray(X), np.asarray(y)

  labels, counts = np.unique(y, return_counts=True)
  min_count = min(counts)

  # Row positions still kept for each class.
  kept_indexes = {label: np.where(y == label)[0] for label in labels}

  total_excess = sum(len(indexes) - min_count for indexes in kept_indexes.values())
  # Cap at the total excess: asking for more left the loop below with nothing
  # eligible to remove and it never terminated.
  prune_amount = min(int(ratio * total_excess), total_excess)

  while prune_amount > 0:
    for label in labels:
      if prune_amount > 0 and len(kept_indexes[label]) > min_count:
        random_index = np.random.choice(len(kept_indexes[label]))
        kept_indexes[label] = np.delete(kept_indexes[label], random_index)
        prune_amount -= 1

  # Merge the per-class survivors back into original row order.
  new_arr = np.sort(np.concatenate(list(kept_indexes.values())))

  return X[new_arr], y[new_arr]

In [None]:
# Pruning ratios 0.2, 0.4, ..., 1.0. np.arange with a fractional step accumulates
# float error (0.6000000000000001), so round to one decimal to keep clean values and keys.
ratios = list(np.round(np.arange(0.2, 1.1, 0.2), 1))

# Calling Intelligent Pruning

In [None]:
# Sweep intelligent pruning over every per-cluster ratio and every pruning ratio,
# re-fitting the preprocessing and the classifiers on each pruned training set.
results_intelligent_pruning = dict()
# results_intelligent_pruning is keyed by pruning ratio only, so every per-cluster sweep
# overwrites the previous one there. This dict keeps all runs, keyed by
# (per_cluster_pruning_ratio, ratio).
results_intelligent_pruning_by_cluster_ratio = dict()
per_cluster_pruning_ratios = [0.5, 0.7, 0.9, 1]

for per_cluster_pruning_ratio in per_cluster_pruning_ratios:
  print(f'For per-cluster pruning ratio {per_cluster_pruning_ratio}')
  for ratio in ratios:
    X_train_copy, y_train_copy = X_train.copy(), y_train.copy()

    intelligent_pruned_X_train, intelligent_pruned_y_train = do_intelligent_pruning(X_train_copy.to_numpy(), y_train_copy.to_numpy(), ratio, per_cluster_pruning_ratio=per_cluster_pruning_ratio)

    # do_intelligent_pruning returns one-element lists; [0] unwraps the data for this ratio.
    preprocessed_intelligent_pruned_X_train, scaler, imputer = preprocess_data_train((np.array(intelligent_pruned_X_train))[0])
    # The imputer and scaler were fitted on a plain array, so hand them a plain array too;
    # passing the DataFrame makes scikit-learn warn about unexpected feature names.
    preprocessed_X_test = preprocess_data_test(X_test.to_numpy(), scaler, imputer)

    intelligent_pruned_X_train, intelligent_pruned_y_train = preprocessed_intelligent_pruned_X_train, (np.array(intelligent_pruned_y_train))[0]
    intelligent_pruned_X_test, intelligent_pruned_y_test = preprocessed_X_test, y_test.to_numpy()
    # Round before printing so float drift in the ratio does not leak into the log line.
    print(f"Train data pruned intelligently at {round(ratio * 100, 1)}% :")
    # NOTE(review): `results` is also the name of the baseline metrics computed earlier in
    # the notebook; this loop rebinds it on every iteration.
    results = evaluate_models(intelligent_pruned_X_train, intelligent_pruned_X_test, intelligent_pruned_y_train, intelligent_pruned_y_test)
    print(results)
    results_intelligent_pruning[ratio] = results
    results_intelligent_pruning_by_cluster_ratio[(per_cluster_pruning_ratio, ratio)] = results
    print("_______________________________________________________________________________")

For per-cluster pruning ratio 0.5
Train data pruned intelligently at 20.0% :




{'Logistic Regression': {'Accuracy': 0.7758620689655172, 'Precision': 0.7272727272727273, 'Recall': 0.4444444444444444, 'F1 Score': 0.5517241379310345, 'ROC AUC': 0.7152777777777778, 'Confusion Matrix': array([[37,  3],
       [10,  8]])}, 'SVM': {'Accuracy': 0.7758620689655172, 'Precision': 0.7777777777777778, 'Recall': 0.3888888888888889, 'F1 Score': 0.5185185185185185, 'ROC AUC': 0.8111111111111111, 'Confusion Matrix': array([[38,  2],
       [11,  7]])}, 'Decision Tree': {'Accuracy': 0.6379310344827587, 'Precision': 0.42857142857142855, 'Recall': 0.5, 'F1 Score': 0.4615384615384615, 'ROC AUC': 0.6194444444444444, 'Confusion Matrix': array([[28, 12],
       [ 9,  9]])}}
_______________________________________________________________________________
Train data pruned intelligently at 40.0% :




{'Logistic Regression': {'Accuracy': 0.7758620689655172, 'Precision': 0.6923076923076923, 'Recall': 0.5, 'F1 Score': 0.5806451612903226, 'ROC AUC': 0.7527777777777778, 'Confusion Matrix': array([[36,  4],
       [ 9,  9]])}, 'SVM': {'Accuracy': 0.7931034482758621, 'Precision': 0.8, 'Recall': 0.4444444444444444, 'F1 Score': 0.5714285714285714, 'ROC AUC': 0.7513888888888889, 'Confusion Matrix': array([[38,  2],
       [10,  8]])}, 'Decision Tree': {'Accuracy': 0.7241379310344828, 'Precision': 0.5714285714285714, 'Recall': 0.4444444444444444, 'F1 Score': 0.5, 'ROC AUC': 0.6708333333333333, 'Confusion Matrix': array([[34,  6],
       [10,  8]])}}
_______________________________________________________________________________
Train data pruned intelligently at 60.00000000000001% :




{'Logistic Regression': {'Accuracy': 0.7758620689655172, 'Precision': 0.6666666666666666, 'Recall': 0.5555555555555556, 'F1 Score': 0.606060606060606, 'ROC AUC': 0.7597222222222222, 'Confusion Matrix': array([[35,  5],
       [ 8, 10]])}, 'SVM': {'Accuracy': 0.7758620689655172, 'Precision': 0.6923076923076923, 'Recall': 0.5, 'F1 Score': 0.5806451612903226, 'ROC AUC': 0.7972222222222223, 'Confusion Matrix': array([[36,  4],
       [ 9,  9]])}, 'Decision Tree': {'Accuracy': 0.6724137931034483, 'Precision': 0.47619047619047616, 'Recall': 0.5555555555555556, 'F1 Score': 0.5128205128205129, 'ROC AUC': 0.6402777777777777, 'Confusion Matrix': array([[29, 11],
       [ 8, 10]])}}
_______________________________________________________________________________
Train data pruned intelligently at 80.0% :




{'Logistic Regression': {'Accuracy': 0.7586206896551724, 'Precision': 0.625, 'Recall': 0.5555555555555556, 'F1 Score': 0.5882352941176471, 'ROC AUC': 0.7125, 'Confusion Matrix': array([[34,  6],
       [ 8, 10]])}, 'SVM': {'Accuracy': 0.7413793103448276, 'Precision': 0.6, 'Recall': 0.5, 'F1 Score': 0.5454545454545454, 'ROC AUC': 0.7652777777777778, 'Confusion Matrix': array([[34,  6],
       [ 9,  9]])}, 'Decision Tree': {'Accuracy': 0.5862068965517241, 'Precision': 0.38461538461538464, 'Recall': 0.5555555555555556, 'F1 Score': 0.4545454545454546, 'ROC AUC': 0.5777777777777778, 'Confusion Matrix': array([[24, 16],
       [ 8, 10]])}}
_______________________________________________________________________________
Train data pruned intelligently at 100.0% :




{'Logistic Regression': {'Accuracy': 0.6724137931034483, 'Precision': 0.48, 'Recall': 0.6666666666666666, 'F1 Score': 0.5581395348837209, 'ROC AUC': 0.7583333333333333, 'Confusion Matrix': array([[27, 13],
       [ 6, 12]])}, 'SVM': {'Accuracy': 0.7413793103448276, 'Precision': 0.56, 'Recall': 0.7777777777777778, 'F1 Score': 0.6511627906976745, 'ROC AUC': 0.8236111111111112, 'Confusion Matrix': array([[29, 11],
       [ 4, 14]])}, 'Decision Tree': {'Accuracy': 0.6724137931034483, 'Precision': 0.48148148148148145, 'Recall': 0.7222222222222222, 'F1 Score': 0.5777777777777777, 'ROC AUC': 0.7041666666666666, 'Confusion Matrix': array([[26, 14],
       [ 5, 13]])}}
_______________________________________________________________________________
For per-cluster pruning ratio 0.7
Train data pruned intelligently at 20.0% :




{'Logistic Regression': {'Accuracy': 0.8103448275862069, 'Precision': 0.8888888888888888, 'Recall': 0.4444444444444444, 'F1 Score': 0.5925925925925926, 'ROC AUC': 0.7305555555555555, 'Confusion Matrix': array([[39,  1],
       [10,  8]])}, 'SVM': {'Accuracy': 0.7758620689655172, 'Precision': 0.7777777777777778, 'Recall': 0.3888888888888889, 'F1 Score': 0.5185185185185185, 'ROC AUC': 0.8041666666666667, 'Confusion Matrix': array([[38,  2],
       [11,  7]])}, 'Decision Tree': {'Accuracy': 0.6724137931034483, 'Precision': 0.47619047619047616, 'Recall': 0.5555555555555556, 'F1 Score': 0.5128205128205129, 'ROC AUC': 0.6604166666666667, 'Confusion Matrix': array([[29, 11],
       [ 8, 10]])}}
_______________________________________________________________________________
Train data pruned intelligently at 40.0% :




{'Logistic Regression': {'Accuracy': 0.7931034482758621, 'Precision': 0.8, 'Recall': 0.4444444444444444, 'F1 Score': 0.5714285714285714, 'ROC AUC': 0.7291666666666667, 'Confusion Matrix': array([[38,  2],
       [10,  8]])}, 'SVM': {'Accuracy': 0.7758620689655172, 'Precision': 0.7777777777777778, 'Recall': 0.3888888888888889, 'F1 Score': 0.5185185185185185, 'ROC AUC': 0.7541666666666668, 'Confusion Matrix': array([[38,  2],
       [11,  7]])}, 'Decision Tree': {'Accuracy': 0.6206896551724138, 'Precision': 0.4, 'Recall': 0.4444444444444444, 'F1 Score': 0.4210526315789474, 'ROC AUC': 0.611111111111111, 'Confusion Matrix': array([[28, 12],
       [10,  8]])}}
_______________________________________________________________________________
Train data pruned intelligently at 60.00000000000001% :




{'Logistic Regression': {'Accuracy': 0.7758620689655172, 'Precision': 0.7272727272727273, 'Recall': 0.4444444444444444, 'F1 Score': 0.5517241379310345, 'ROC AUC': 0.7083333333333334, 'Confusion Matrix': array([[37,  3],
       [10,  8]])}, 'SVM': {'Accuracy': 0.7586206896551724, 'Precision': 0.6666666666666666, 'Recall': 0.4444444444444444, 'F1 Score': 0.5333333333333333, 'ROC AUC': 0.7777777777777778, 'Confusion Matrix': array([[36,  4],
       [10,  8]])}, 'Decision Tree': {'Accuracy': 0.6551724137931034, 'Precision': 0.45454545454545453, 'Recall': 0.5555555555555556, 'F1 Score': 0.5, 'ROC AUC': 0.6472222222222223, 'Confusion Matrix': array([[28, 12],
       [ 8, 10]])}}
_______________________________________________________________________________
Train data pruned intelligently at 80.0% :




{'Logistic Regression': {'Accuracy': 0.7586206896551724, 'Precision': 0.625, 'Recall': 0.5555555555555556, 'F1 Score': 0.5882352941176471, 'ROC AUC': 0.7055555555555556, 'Confusion Matrix': array([[34,  6],
       [ 8, 10]])}, 'SVM': {'Accuracy': 0.7413793103448276, 'Precision': 0.6153846153846154, 'Recall': 0.4444444444444444, 'F1 Score': 0.5161290322580646, 'ROC AUC': 0.7375, 'Confusion Matrix': array([[35,  5],
       [10,  8]])}, 'Decision Tree': {'Accuracy': 0.603448275862069, 'Precision': 0.4, 'Recall': 0.5555555555555556, 'F1 Score': 0.46511627906976744, 'ROC AUC': 0.5902777777777778, 'Confusion Matrix': array([[25, 15],
       [ 8, 10]])}}
_______________________________________________________________________________
Train data pruned intelligently at 100.0% :




{'Logistic Regression': {'Accuracy': 0.603448275862069, 'Precision': 0.4074074074074074, 'Recall': 0.6111111111111112, 'F1 Score': 0.4888888888888889, 'ROC AUC': 0.6444444444444444, 'Confusion Matrix': array([[24, 16],
       [ 7, 11]])}, 'SVM': {'Accuracy': 0.7068965517241379, 'Precision': 0.5263157894736842, 'Recall': 0.5555555555555556, 'F1 Score': 0.5405405405405405, 'ROC AUC': 0.7375, 'Confusion Matrix': array([[31,  9],
       [ 8, 10]])}, 'Decision Tree': {'Accuracy': 0.7241379310344828, 'Precision': 0.5454545454545454, 'Recall': 0.6666666666666666, 'F1 Score': 0.6, 'ROC AUC': 0.7041666666666666, 'Confusion Matrix': array([[30, 10],
       [ 6, 12]])}}
_______________________________________________________________________________
For per-cluster pruning ratio 0.9
Train data pruned intelligently at 20.0% :




{'Logistic Regression': {'Accuracy': 0.8103448275862069, 'Precision': 0.8888888888888888, 'Recall': 0.4444444444444444, 'F1 Score': 0.5925925925925926, 'ROC AUC': 0.7291666666666666, 'Confusion Matrix': array([[39,  1],
       [10,  8]])}, 'SVM': {'Accuracy': 0.7758620689655172, 'Precision': 0.7777777777777778, 'Recall': 0.3888888888888889, 'F1 Score': 0.5185185185185185, 'ROC AUC': 0.7541666666666667, 'Confusion Matrix': array([[38,  2],
       [11,  7]])}, 'Decision Tree': {'Accuracy': 0.6206896551724138, 'Precision': 0.4, 'Recall': 0.4444444444444444, 'F1 Score': 0.4210526315789474, 'ROC AUC': 0.5722222222222222, 'Confusion Matrix': array([[28, 12],
       [10,  8]])}}
_______________________________________________________________________________
Train data pruned intelligently at 40.0% :




{'Logistic Regression': {'Accuracy': 0.8275862068965517, 'Precision': 1.0, 'Recall': 0.4444444444444444, 'F1 Score': 0.6153846153846153, 'ROC AUC': 0.7069444444444444, 'Confusion Matrix': array([[40,  0],
       [10,  8]])}, 'SVM': {'Accuracy': 0.7758620689655172, 'Precision': 0.7272727272727273, 'Recall': 0.4444444444444444, 'F1 Score': 0.5517241379310345, 'ROC AUC': 0.7305555555555556, 'Confusion Matrix': array([[37,  3],
       [10,  8]])}, 'Decision Tree': {'Accuracy': 0.6379310344827587, 'Precision': 0.42105263157894735, 'Recall': 0.4444444444444444, 'F1 Score': 0.43243243243243246, 'ROC AUC': 0.5979166666666665, 'Confusion Matrix': array([[29, 11],
       [10,  8]])}}
_______________________________________________________________________________
Train data pruned intelligently at 60.00000000000001% :




{'Logistic Regression': {'Accuracy': 0.8103448275862069, 'Precision': 0.8181818181818182, 'Recall': 0.5, 'F1 Score': 0.6206896551724137, 'ROC AUC': 0.7041666666666666, 'Confusion Matrix': array([[38,  2],
       [ 9,  9]])}, 'SVM': {'Accuracy': 0.7758620689655172, 'Precision': 0.7272727272727273, 'Recall': 0.4444444444444444, 'F1 Score': 0.5517241379310345, 'ROC AUC': 0.7486111111111111, 'Confusion Matrix': array([[37,  3],
       [10,  8]])}, 'Decision Tree': {'Accuracy': 0.6206896551724138, 'Precision': 0.4, 'Recall': 0.4444444444444444, 'F1 Score': 0.4210526315789474, 'ROC AUC': 0.5847222222222224, 'Confusion Matrix': array([[28, 12],
       [10,  8]])}}
_______________________________________________________________________________
Train data pruned intelligently at 80.0% :




{'Logistic Regression': {'Accuracy': 0.5862068965517241, 'Precision': 0.36363636363636365, 'Recall': 0.4444444444444444, 'F1 Score': 0.39999999999999997, 'ROC AUC': 0.5736111111111111, 'Confusion Matrix': array([[26, 14],
       [10,  8]])}, 'SVM': {'Accuracy': 0.7241379310344828, 'Precision': 0.5833333333333334, 'Recall': 0.3888888888888889, 'F1 Score': 0.4666666666666666, 'ROC AUC': 0.6736111111111112, 'Confusion Matrix': array([[35,  5],
       [11,  7]])}, 'Decision Tree': {'Accuracy': 0.6206896551724138, 'Precision': 0.4090909090909091, 'Recall': 0.5, 'F1 Score': 0.45, 'ROC AUC': 0.6062500000000001, 'Confusion Matrix': array([[27, 13],
       [ 9,  9]])}}
_______________________________________________________________________________
Train data pruned intelligently at 100.0% :




{'Logistic Regression': {'Accuracy': 0.41379310344827586, 'Precision': 0.3, 'Recall': 0.6666666666666666, 'F1 Score': 0.41379310344827586, 'ROC AUC': 0.4416666666666667, 'Confusion Matrix': array([[12, 28],
       [ 6, 12]])}, 'SVM': {'Accuracy': 0.43103448275862066, 'Precision': 0.3076923076923077, 'Recall': 0.6666666666666666, 'F1 Score': 0.42105263157894735, 'ROC AUC': 0.46805555555555556, 'Confusion Matrix': array([[13, 27],
       [ 6, 12]])}, 'Decision Tree': {'Accuracy': 0.29310344827586204, 'Precision': 0.24444444444444444, 'Recall': 0.6111111111111112, 'F1 Score': 0.3492063492063492, 'ROC AUC': 0.3805555555555556, 'Confusion Matrix': array([[ 6, 34],
       [ 7, 11]])}}
_______________________________________________________________________________
For per-cluster pruning ratio 1
Train data pruned intelligently at 20.0% :




{'Logistic Regression': {'Accuracy': 0.8275862068965517, 'Precision': 1.0, 'Recall': 0.4444444444444444, 'F1 Score': 0.6153846153846153, 'ROC AUC': 0.7208333333333333, 'Confusion Matrix': array([[40,  0],
       [10,  8]])}, 'SVM': {'Accuracy': 0.7758620689655172, 'Precision': 0.7777777777777778, 'Recall': 0.3888888888888889, 'F1 Score': 0.5185185185185185, 'ROC AUC': 0.7569444444444445, 'Confusion Matrix': array([[38,  2],
       [11,  7]])}, 'Decision Tree': {'Accuracy': 0.7068965517241379, 'Precision': 0.5263157894736842, 'Recall': 0.5555555555555556, 'F1 Score': 0.5405405405405405, 'ROC AUC': 0.6666666666666667, 'Confusion Matrix': array([[31,  9],
       [ 8, 10]])}}
_______________________________________________________________________________
Train data pruned intelligently at 40.0% :




{'Logistic Regression': {'Accuracy': 0.8275862068965517, 'Precision': 1.0, 'Recall': 0.4444444444444444, 'F1 Score': 0.6153846153846153, 'ROC AUC': 0.7055555555555556, 'Confusion Matrix': array([[40,  0],
       [10,  8]])}, 'SVM': {'Accuracy': 0.7758620689655172, 'Precision': 0.7272727272727273, 'Recall': 0.4444444444444444, 'F1 Score': 0.5517241379310345, 'ROC AUC': 0.7319444444444445, 'Confusion Matrix': array([[37,  3],
       [10,  8]])}, 'Decision Tree': {'Accuracy': 0.6724137931034483, 'Precision': 0.47058823529411764, 'Recall': 0.4444444444444444, 'F1 Score': 0.45714285714285713, 'ROC AUC': 0.6097222222222223, 'Confusion Matrix': array([[31,  9],
       [10,  8]])}}
_______________________________________________________________________________
Train data pruned intelligently at 60.00000000000001% :




{'Logistic Regression': {'Accuracy': 0.8448275862068966, 'Precision': 0.9090909090909091, 'Recall': 0.5555555555555556, 'F1 Score': 0.6896551724137931, 'ROC AUC': 0.7041666666666666, 'Confusion Matrix': array([[39,  1],
       [ 8, 10]])}, 'SVM': {'Accuracy': 0.7586206896551724, 'Precision': 0.6666666666666666, 'Recall': 0.4444444444444444, 'F1 Score': 0.5333333333333333, 'ROC AUC': 0.726388888888889, 'Confusion Matrix': array([[36,  4],
       [10,  8]])}, 'Decision Tree': {'Accuracy': 0.6206896551724138, 'Precision': 0.4, 'Recall': 0.4444444444444444, 'F1 Score': 0.4210526315789474, 'ROC AUC': 0.5916666666666666, 'Confusion Matrix': array([[28, 12],
       [10,  8]])}}
_______________________________________________________________________________
Train data pruned intelligently at 80.0% :




{'Logistic Regression': {'Accuracy': 0.603448275862069, 'Precision': 0.4, 'Recall': 0.5555555555555556, 'F1 Score': 0.46511627906976744, 'ROC AUC': 0.5583333333333335, 'Confusion Matrix': array([[25, 15],
       [ 8, 10]])}, 'SVM': {'Accuracy': 0.5172413793103449, 'Precision': 0.32142857142857145, 'Recall': 0.5, 'F1 Score': 0.391304347826087, 'ROC AUC': 0.5694444444444444, 'Confusion Matrix': array([[21, 19],
       [ 9,  9]])}, 'Decision Tree': {'Accuracy': 0.46551724137931033, 'Precision': 0.2903225806451613, 'Recall': 0.5, 'F1 Score': 0.3673469387755102, 'ROC AUC': 0.4875, 'Confusion Matrix': array([[18, 22],
       [ 9,  9]])}}
_______________________________________________________________________________
Train data pruned intelligently at 100.0% :




{'Logistic Regression': {'Accuracy': 0.6896551724137931, 'Precision': 0.5, 'Recall': 0.6666666666666666, 'F1 Score': 0.5714285714285715, 'ROC AUC': 0.7319444444444445, 'Confusion Matrix': array([[28, 12],
       [ 6, 12]])}, 'SVM': {'Accuracy': 0.7586206896551724, 'Precision': 0.5909090909090909, 'Recall': 0.7222222222222222, 'F1 Score': 0.65, 'ROC AUC': 0.8375, 'Confusion Matrix': array([[31,  9],
       [ 5, 13]])}, 'Decision Tree': {'Accuracy': 0.6379310344827587, 'Precision': 0.42105263157894735, 'Recall': 0.4444444444444444, 'F1 Score': 0.43243243243243246, 'ROC AUC': 0.6048611111111111, 'Confusion Matrix': array([[29, 11],
       [10,  8]])}}
_______________________________________________________________________________


# Calling Random Pruning

In [None]:
# Evaluate the models on randomly pruned training data, once per ratio.
# Per-ratio metric dictionaries are stored in `results_random_pruning`.
results_random_pruning = dict()
for ratio in ratios:
  # The imputer and scaler are fitted on the pruned training subset only
  # (see preprocess_data_train) and then handed to preprocess_data_test.
  random_pruned_X_train, random_pruned_y_train = random_prune_data(X_train.to_numpy(), y_train.to_numpy(), ratio)
  preprocessed_random_pruned_X_train, scaler, imputer = preprocess_data_train(random_pruned_X_train)
  preprocessed_X_test = preprocess_data_test(X_test, scaler, imputer)

  # From here on the train/test names refer to the preprocessed features.
  random_pruned_X_train = preprocessed_random_pruned_X_train
  random_pruned_X_test, random_pruned_y_test = preprocessed_X_test, y_test.to_numpy()

  # One-decimal formatting keeps float artefacts such as
  # "60.00000000000001%" out of the printed report.
  print(f"Train data pruned randomly at {ratio * 100:.1f}% :")
  results = evaluate_models(random_pruned_X_train, random_pruned_X_test, random_pruned_y_train, random_pruned_y_test)
  print(results)
  results_random_pruning[ratio] = results
  print("_______________________________________________________________________________")

Train data pruned randomly at 20.0% :




{'Logistic Regression': {'Accuracy': 0.7931034482758621, 'Precision': 0.8, 'Recall': 0.4444444444444444, 'F1 Score': 0.5714285714285714, 'ROC AUC': 0.7236111111111111, 'Confusion Matrix': array([[38,  2],
       [10,  8]])}, 'SVM': {'Accuracy': 0.7586206896551724, 'Precision': 0.7, 'Recall': 0.3888888888888889, 'F1 Score': 0.5, 'ROC AUC': 0.8027777777777778, 'Confusion Matrix': array([[37,  3],
       [11,  7]])}, 'Decision Tree': {'Accuracy': 0.7413793103448276, 'Precision': 0.5789473684210527, 'Recall': 0.6111111111111112, 'F1 Score': 0.5945945945945946, 'ROC AUC': 0.7055555555555556, 'Confusion Matrix': array([[32,  8],
       [ 7, 11]])}}
_______________________________________________________________________________
Train data pruned randomly at 40.0% :




{'Logistic Regression': {'Accuracy': 0.7758620689655172, 'Precision': 0.7272727272727273, 'Recall': 0.4444444444444444, 'F1 Score': 0.5517241379310345, 'ROC AUC': 0.7194444444444446, 'Confusion Matrix': array([[37,  3],
       [10,  8]])}, 'SVM': {'Accuracy': 0.7931034482758621, 'Precision': 0.8, 'Recall': 0.4444444444444444, 'F1 Score': 0.5714285714285714, 'ROC AUC': 0.8097222222222222, 'Confusion Matrix': array([[38,  2],
       [10,  8]])}, 'Decision Tree': {'Accuracy': 0.6724137931034483, 'Precision': 0.47619047619047616, 'Recall': 0.5555555555555556, 'F1 Score': 0.5128205128205129, 'ROC AUC': 0.6347222222222222, 'Confusion Matrix': array([[29, 11],
       [ 8, 10]])}}
_______________________________________________________________________________
Train data pruned randomly at 60.00000000000001% :




{'Logistic Regression': {'Accuracy': 0.7758620689655172, 'Precision': 0.7272727272727273, 'Recall': 0.4444444444444444, 'F1 Score': 0.5517241379310345, 'ROC AUC': 0.673611111111111, 'Confusion Matrix': array([[37,  3],
       [10,  8]])}, 'SVM': {'Accuracy': 0.7758620689655172, 'Precision': 0.7272727272727273, 'Recall': 0.4444444444444444, 'F1 Score': 0.5517241379310345, 'ROC AUC': 0.8097222222222222, 'Confusion Matrix': array([[37,  3],
       [10,  8]])}, 'Decision Tree': {'Accuracy': 0.6724137931034483, 'Precision': 0.47619047619047616, 'Recall': 0.5555555555555556, 'F1 Score': 0.5128205128205129, 'ROC AUC': 0.6402777777777777, 'Confusion Matrix': array([[29, 11],
       [ 8, 10]])}}
_______________________________________________________________________________
Train data pruned randomly at 80.0% :




{'Logistic Regression': {'Accuracy': 0.7758620689655172, 'Precision': 0.6666666666666666, 'Recall': 0.5555555555555556, 'F1 Score': 0.606060606060606, 'ROC AUC': 0.6583333333333333, 'Confusion Matrix': array([[35,  5],
       [ 8, 10]])}, 'SVM': {'Accuracy': 0.7413793103448276, 'Precision': 0.6153846153846154, 'Recall': 0.4444444444444444, 'F1 Score': 0.5161290322580646, 'ROC AUC': 0.788888888888889, 'Confusion Matrix': array([[35,  5],
       [10,  8]])}, 'Decision Tree': {'Accuracy': 0.7586206896551724, 'Precision': 0.6, 'Recall': 0.6666666666666666, 'F1 Score': 0.631578947368421, 'ROC AUC': 0.7333333333333333, 'Confusion Matrix': array([[32,  8],
       [ 6, 12]])}}
_______________________________________________________________________________
Train data pruned randomly at 100.0% :




{'Logistic Regression': {'Accuracy': 0.6724137931034483, 'Precision': 0.4782608695652174, 'Recall': 0.6111111111111112, 'F1 Score': 0.5365853658536586, 'ROC AUC': 0.6722222222222223, 'Confusion Matrix': array([[28, 12],
       [ 7, 11]])}, 'SVM': {'Accuracy': 0.7241379310344828, 'Precision': 0.55, 'Recall': 0.6111111111111112, 'F1 Score': 0.5789473684210527, 'ROC AUC': 0.7875, 'Confusion Matrix': array([[31,  9],
       [ 7, 11]])}, 'Decision Tree': {'Accuracy': 0.6379310344827587, 'Precision': 0.4444444444444444, 'Recall': 0.6666666666666666, 'F1 Score': 0.5333333333333333, 'ROC AUC': 0.6416666666666666, 'Confusion Matrix': array([[25, 15],
       [ 6, 12]])}}
_______________________________________________________________________________


# Calling SDV-Oversampling

In [None]:
# do_sdv returns the synthetic data (`sd1`) and the training frame (`train_df`)
# that add_synthetic_data combines at each ratio below.
sd1, train_df = do_sdv(X_train, y_train)
results_syn_sdv = dict()

# Add synthetic data at different percentages to the main DataFrame
for ratio in ratios:
    combined_df = add_synthetic_data(train_df, sd1, ratio)
    # Split the combined frame back into labels and features.
    y_train_sdv = combined_df['target']
    X_train_sdv = combined_df.drop(columns='target')

    # The imputer and scaler are fitted on the combined training data
    # (see preprocess_data_train) and then handed to preprocess_data_test.
    preprocessed_X_train_sdv, scaler, imputer = preprocess_data_train(X_train_sdv)
    preprocessed_X_test_sdv = preprocess_data_test(X_test, scaler, imputer)

    X_train_sdv, y_train_sdv = preprocessed_X_train_sdv, y_train_sdv.to_numpy()
    X_test_sdv, y_test_sdv = preprocessed_X_test_sdv, y_test.to_numpy()

    # One-decimal formatting keeps float artefacts such as
    # "60.00000000000001%" out of the printed report.
    print(f"Train data combined with {ratio * 100:.1f}% synthetic data of minority class:")
    results = evaluate_models(X_train_sdv, X_test_sdv, y_train_sdv, y_test_sdv)
    results_syn_sdv[ratio] = results
    print(results)
    print("_______________________________________________________________________________")



Train data combined with 20.0% synthetic data of minority class:
{'Logistic Regression': {'Accuracy': 0.7931034482758621, 'Precision': 0.75, 'Recall': 0.5, 'F1 Score': 0.6, 'ROC AUC': 0.7291666666666666, 'Confusion Matrix': array([[37,  3],
       [ 9,  9]])}, 'SVM': {'Accuracy': 0.7586206896551724, 'Precision': 0.6666666666666666, 'Recall': 0.4444444444444444, 'F1 Score': 0.5333333333333333, 'ROC AUC': 0.7944444444444444, 'Confusion Matrix': array([[36,  4],
       [10,  8]])}, 'Decision Tree': {'Accuracy': 0.5862068965517241, 'Precision': 0.36363636363636365, 'Recall': 0.4444444444444444, 'F1 Score': 0.39999999999999997, 'ROC AUC': 0.5652777777777778, 'Confusion Matrix': array([[26, 14],
       [10,  8]])}}
_______________________________________________________________________________
Train data combined with 40.0% synthetic data of minority class:
{'Logistic Regression': {'Accuracy': 0.8103448275862069, 'Precision': 0.7333333333333333, 'Recall': 0.6111111111111112, 'F1 Score': 0.66

# Calling SMOTE-Oversampling

In [None]:
# Evaluate the models on training data returned by smote_oversampling,
# once per ratio. Per-ratio metric dictionaries are stored in `results_smote`.
results_smote = dict()

for ratio in ratios:

    # smote_oversampling is called with a one-element ratio list, so only the
    # first entry of each returned sequence is used. Index it directly instead
    # of stacking the whole sequence into a new array first.
    X_train_smote, y_train_smote = smote_oversampling(X_train.to_numpy(), y_train.to_numpy(), [ratio])
    preprocessed_X_train_smote, scaler, imputer = preprocess_data_train(np.asarray(X_train_smote[0]))
    preprocessed_X_test_smote = preprocess_data_test(X_test, scaler, imputer)

    # From here on the train/test names refer to the preprocessed features.
    X_train_smote, y_train_smote = preprocessed_X_train_smote, np.asarray(y_train_smote[0])
    X_test_smote, y_test_smote = preprocessed_X_test_smote, y_test.to_numpy()

    # One-decimal formatting keeps float artefacts such as
    # "60.00000000000001%" out of the printed report.
    print(f"Train data combined with {ratio * 100:.1f}% synthetic data of minority class:")
    print(len(X_train_smote), len(y_train_smote))
    results = evaluate_models(X_train_smote, X_test_smote, y_train_smote, y_test_smote)
    results_smote[ratio] = results
    print(results)
    print("_______________________________________________________________________________")



Train data combined with 20.0% synthetic data of minority class:
246 246
{'Logistic Regression': {'Accuracy': 0.8103448275862069, 'Precision': 0.8181818181818182, 'Recall': 0.5, 'F1 Score': 0.6206896551724137, 'ROC AUC': 0.7541666666666667, 'Confusion Matrix': array([[38,  2],
       [ 9,  9]])}, 'SVM': {'Accuracy': 0.7758620689655172, 'Precision': 0.7272727272727273, 'Recall': 0.4444444444444444, 'F1 Score': 0.5517241379310345, 'ROC AUC': 0.7944444444444445, 'Confusion Matrix': array([[37,  3],
       [10,  8]])}, 'Decision Tree': {'Accuracy': 0.6206896551724138, 'Precision': 0.4, 'Recall': 0.4444444444444444, 'F1 Score': 0.4210526315789474, 'ROC AUC': 0.5722222222222222, 'Confusion Matrix': array([[28, 12],
       [10,  8]])}}
_______________________________________________________________________________
Train data combined with 40.0% synthetic data of minority class:
265 265




{'Logistic Regression': {'Accuracy': 0.7931034482758621, 'Precision': 0.7142857142857143, 'Recall': 0.5555555555555556, 'F1 Score': 0.6250000000000001, 'ROC AUC': 0.7652777777777778, 'Confusion Matrix': array([[36,  4],
       [ 8, 10]])}, 'SVM': {'Accuracy': 0.7931034482758621, 'Precision': 0.75, 'Recall': 0.5, 'F1 Score': 0.6, 'ROC AUC': 0.8, 'Confusion Matrix': array([[37,  3],
       [ 9,  9]])}, 'Decision Tree': {'Accuracy': 0.6724137931034483, 'Precision': 0.48, 'Recall': 0.6666666666666666, 'F1 Score': 0.5581395348837209, 'ROC AUC': 0.6618055555555555, 'Confusion Matrix': array([[27, 13],
       [ 6, 12]])}}
_______________________________________________________________________________
Train data combined with 60.00000000000001% synthetic data of minority class:
284 284




{'Logistic Regression': {'Accuracy': 0.7586206896551724, 'Precision': 0.6111111111111112, 'Recall': 0.6111111111111112, 'F1 Score': 0.6111111111111112, 'ROC AUC': 0.7583333333333333, 'Confusion Matrix': array([[33,  7],
       [ 7, 11]])}, 'SVM': {'Accuracy': 0.7068965517241379, 'Precision': 0.5294117647058824, 'Recall': 0.5, 'F1 Score': 0.5142857142857143, 'ROC AUC': 0.8, 'Confusion Matrix': array([[32,  8],
       [ 9,  9]])}, 'Decision Tree': {'Accuracy': 0.6551724137931034, 'Precision': 0.45, 'Recall': 0.5, 'F1 Score': 0.4736842105263158, 'ROC AUC': 0.598611111111111, 'Confusion Matrix': array([[29, 11],
       [ 9,  9]])}}
_______________________________________________________________________________
Train data combined with 80.0% synthetic data of minority class:
303 303




{'Logistic Regression': {'Accuracy': 0.7241379310344828, 'Precision': 0.5454545454545454, 'Recall': 0.6666666666666666, 'F1 Score': 0.6, 'ROC AUC': 0.7430555555555556, 'Confusion Matrix': array([[30, 10],
       [ 6, 12]])}, 'SVM': {'Accuracy': 0.6896551724137931, 'Precision': 0.5, 'Recall': 0.5555555555555556, 'F1 Score': 0.5263157894736842, 'ROC AUC': 0.7791666666666667, 'Confusion Matrix': array([[30, 10],
       [ 8, 10]])}, 'Decision Tree': {'Accuracy': 0.7068965517241379, 'Precision': 0.5238095238095238, 'Recall': 0.6111111111111112, 'F1 Score': 0.5641025641025642, 'ROC AUC': 0.6805555555555556, 'Confusion Matrix': array([[30, 10],
       [ 7, 11]])}}
_______________________________________________________________________________
Train data combined with 100.0% synthetic data of minority class:
322 322




{'Logistic Regression': {'Accuracy': 0.6724137931034483, 'Precision': 0.48, 'Recall': 0.6666666666666666, 'F1 Score': 0.5581395348837209, 'ROC AUC': 0.7374999999999999, 'Confusion Matrix': array([[27, 13],
       [ 6, 12]])}, 'SVM': {'Accuracy': 0.7068965517241379, 'Precision': 0.5238095238095238, 'Recall': 0.6111111111111112, 'F1 Score': 0.5641025641025642, 'ROC AUC': 0.7361111111111112, 'Confusion Matrix': array([[30, 10],
       [ 7, 11]])}, 'Decision Tree': {'Accuracy': 0.6206896551724138, 'Precision': 0.4166666666666667, 'Recall': 0.5555555555555556, 'F1 Score': 0.4761904761904762, 'ROC AUC': 0.6097222222222222, 'Confusion Matrix': array([[26, 14],
       [ 8, 10]])}}
_______________________________________________________________________________


# Calling Random-Oversampling

In [None]:
# Evaluate the models on training data returned by random_oversampling,
# once per ratio. Per-ratio metric dictionaries are stored in `results_random`.
results_random = dict()

for ratio in ratios:

    # random_oversampling is called with a one-element ratio list, so only the
    # first entry of each returned sequence is used. Index it directly instead
    # of stacking the whole sequence into a new array first.
    X_train_random, y_train_random = random_oversampling(X_train.to_numpy(), y_train.to_numpy(), [ratio])

    preprocessed_X_train_random, scaler, imputer = preprocess_data_train(np.asarray(X_train_random[0]))
    preprocessed_X_test_random = preprocess_data_test(X_test, scaler, imputer)

    # From here on the train/test names refer to the preprocessed features.
    X_train_random, y_train_random = preprocessed_X_train_random, np.asarray(y_train_random[0])
    X_test_random, y_test_random = preprocessed_X_test_random, y_test.to_numpy()

    # One-decimal formatting keeps float artefacts such as
    # "60.00000000000001%" out of the printed report.
    print(f"Train data combined with {ratio * 100:.1f}% synthetic data of minority class:")
    print(len(X_train_random), len(y_train_random))
    results = evaluate_models(X_train_random, X_test_random, y_train_random, y_test_random)
    results_random[ratio] = results
    print(results)
    print("_______________________________________________________________________________")



Train data combined with 20.0% synthetic data of minority class:
246 246
{'Logistic Regression': {'Accuracy': 0.7586206896551724, 'Precision': 0.6666666666666666, 'Recall': 0.4444444444444444, 'F1 Score': 0.5333333333333333, 'ROC AUC': 0.7152777777777777, 'Confusion Matrix': array([[36,  4],
       [10,  8]])}, 'SVM': {'Accuracy': 0.7758620689655172, 'Precision': 0.7272727272727273, 'Recall': 0.4444444444444444, 'F1 Score': 0.5517241379310345, 'ROC AUC': 0.7763888888888889, 'Confusion Matrix': array([[37,  3],
       [10,  8]])}, 'Decision Tree': {'Accuracy': 0.7413793103448276, 'Precision': 0.6, 'Recall': 0.5, 'F1 Score': 0.5454545454545454, 'ROC AUC': 0.7222222222222222, 'Confusion Matrix': array([[34,  6],
       [ 9,  9]])}}
_______________________________________________________________________________
Train data combined with 40.0% synthetic data of minority class:
265 265




{'Logistic Regression': {'Accuracy': 0.7758620689655172, 'Precision': 0.6923076923076923, 'Recall': 0.5, 'F1 Score': 0.5806451612903226, 'ROC AUC': 0.7361111111111112, 'Confusion Matrix': array([[36,  4],
       [ 9,  9]])}, 'SVM': {'Accuracy': 0.7758620689655172, 'Precision': 0.6923076923076923, 'Recall': 0.5, 'F1 Score': 0.5806451612903226, 'ROC AUC': 0.7902777777777777, 'Confusion Matrix': array([[36,  4],
       [ 9,  9]])}, 'Decision Tree': {'Accuracy': 0.7413793103448276, 'Precision': 0.6, 'Recall': 0.5, 'F1 Score': 0.5454545454545454, 'ROC AUC': 0.6944444444444444, 'Confusion Matrix': array([[34,  6],
       [ 9,  9]])}}
_______________________________________________________________________________
Train data combined with 60.00000000000001% synthetic data of minority class:
284 284




{'Logistic Regression': {'Accuracy': 0.7758620689655172, 'Precision': 0.6666666666666666, 'Recall': 0.5555555555555556, 'F1 Score': 0.606060606060606, 'ROC AUC': 0.7277777777777777, 'Confusion Matrix': array([[35,  5],
       [ 8, 10]])}, 'SVM': {'Accuracy': 0.7586206896551724, 'Precision': 0.6428571428571429, 'Recall': 0.5, 'F1 Score': 0.5625000000000001, 'ROC AUC': 0.7749999999999999, 'Confusion Matrix': array([[35,  5],
       [ 9,  9]])}, 'Decision Tree': {'Accuracy': 0.6896551724137931, 'Precision': 0.5, 'Recall': 0.6111111111111112, 'F1 Score': 0.55, 'ROC AUC': 0.6680555555555556, 'Confusion Matrix': array([[29, 11],
       [ 7, 11]])}}
_______________________________________________________________________________
Train data combined with 80.0% synthetic data of minority class:
303 303




{'Logistic Regression': {'Accuracy': 0.7413793103448276, 'Precision': 0.5882352941176471, 'Recall': 0.5555555555555556, 'F1 Score': 0.5714285714285715, 'ROC AUC': 0.7388888888888888, 'Confusion Matrix': array([[33,  7],
       [ 8, 10]])}, 'SVM': {'Accuracy': 0.7413793103448276, 'Precision': 0.5882352941176471, 'Recall': 0.5555555555555556, 'F1 Score': 0.5714285714285715, 'ROC AUC': 0.7930555555555555, 'Confusion Matrix': array([[33,  7],
       [ 8, 10]])}, 'Decision Tree': {'Accuracy': 0.6896551724137931, 'Precision': 0.5, 'Recall': 0.4444444444444444, 'F1 Score': 0.47058823529411764, 'ROC AUC': 0.6222222222222222, 'Confusion Matrix': array([[32,  8],
       [10,  8]])}}
_______________________________________________________________________________
Train data combined with 100.0% synthetic data of minority class:
322 322




{'Logistic Regression': {'Accuracy': 0.7241379310344828, 'Precision': 0.5454545454545454, 'Recall': 0.6666666666666666, 'F1 Score': 0.6, 'ROC AUC': 0.7444444444444445, 'Confusion Matrix': array([[30, 10],
       [ 6, 12]])}, 'SVM': {'Accuracy': 0.7241379310344828, 'Precision': 0.5555555555555556, 'Recall': 0.5555555555555556, 'F1 Score': 0.5555555555555556, 'ROC AUC': 0.8069444444444445, 'Confusion Matrix': array([[32,  8],
       [ 8, 10]])}, 'Decision Tree': {'Accuracy': 0.6896551724137931, 'Precision': 0.5, 'Recall': 0.5555555555555556, 'F1 Score': 0.5263157894736842, 'ROC AUC': 0.6527777777777778, 'Confusion Matrix': array([[30, 10],
       [ 8, 10]])}}
_______________________________________________________________________________


# Calling SVM-SMOTE Over-Sampling

In [None]:
# Evaluate the models on training data returned by svm_smote_oversampling,
# once per ratio. Per-ratio metric dictionaries are stored in `results_svm_smote`.
results_svm_smote = dict()

for ratio in ratios:

    # svm_smote_oversampling is called with a one-element ratio list, so only
    # the first entry of each returned sequence is used. Index it directly
    # instead of stacking the whole sequence into a new array first.
    X_train_svm_smote, y_train_svm_smote = svm_smote_oversampling(X_train.to_numpy(), y_train.to_numpy(), [ratio])

    preprocessed_X_train_svm_smote, scaler, imputer = preprocess_data_train(np.asarray(X_train_svm_smote[0]))
    preprocessed_X_test_svm_smote = preprocess_data_test(X_test, scaler, imputer)

    # From here on the train/test names refer to the preprocessed features.
    X_train_svm_smote, y_train_svm_smote = preprocessed_X_train_svm_smote, np.asarray(y_train_svm_smote[0])
    X_test_svm_smote, y_test_svm_smote = preprocessed_X_test_svm_smote, y_test.to_numpy()

    # One-decimal formatting keeps float artefacts such as
    # "60.00000000000001%" out of the printed report.
    print(f"Train data combined with {ratio * 100:.1f}% synthetic data of minority class:")
    print(len(X_train_svm_smote), len(y_train_svm_smote))
    results = evaluate_models(X_train_svm_smote, X_test_svm_smote, y_train_svm_smote, y_test_svm_smote)
    results_svm_smote[ratio] = results
    print(results)
    print("_______________________________________________________________________________")



Train data combined with 20.0% synthetic data of minority class:
246 246
{'Logistic Regression': {'Accuracy': 0.7931034482758621, 'Precision': 0.75, 'Recall': 0.5, 'F1 Score': 0.6, 'ROC AUC': 0.7222222222222221, 'Confusion Matrix': array([[37,  3],
       [ 9,  9]])}, 'SVM': {'Accuracy': 0.7758620689655172, 'Precision': 0.7777777777777778, 'Recall': 0.3888888888888889, 'F1 Score': 0.5185185185185185, 'ROC AUC': 0.7916666666666666, 'Confusion Matrix': array([[38,  2],
       [11,  7]])}, 'Decision Tree': {'Accuracy': 0.6206896551724138, 'Precision': 0.4090909090909091, 'Recall': 0.5, 'F1 Score': 0.45, 'ROC AUC': 0.5875000000000001, 'Confusion Matrix': array([[27, 13],
       [ 9,  9]])}}
_______________________________________________________________________________
Train data combined with 40.0% synthetic data of minority class:
265 265




{'Logistic Regression': {'Accuracy': 0.8103448275862069, 'Precision': 0.7692307692307693, 'Recall': 0.5555555555555556, 'F1 Score': 0.6451612903225806, 'ROC AUC': 0.7430555555555555, 'Confusion Matrix': array([[37,  3],
       [ 8, 10]])}, 'SVM': {'Accuracy': 0.7931034482758621, 'Precision': 0.8, 'Recall': 0.4444444444444444, 'F1 Score': 0.5714285714285714, 'ROC AUC': 0.7805555555555556, 'Confusion Matrix': array([[38,  2],
       [10,  8]])}, 'Decision Tree': {'Accuracy': 0.6551724137931034, 'Precision': 0.45454545454545453, 'Recall': 0.5555555555555556, 'F1 Score': 0.5, 'ROC AUC': 0.6277777777777778, 'Confusion Matrix': array([[28, 12],
       [ 8, 10]])}}
_______________________________________________________________________________
Train data combined with 60.00000000000001% synthetic data of minority class:
284 284




{'Logistic Regression': {'Accuracy': 0.7586206896551724, 'Precision': 0.6111111111111112, 'Recall': 0.6111111111111112, 'F1 Score': 0.6111111111111112, 'ROC AUC': 0.7680555555555556, 'Confusion Matrix': array([[33,  7],
       [ 7, 11]])}, 'SVM': {'Accuracy': 0.8103448275862069, 'Precision': 0.8181818181818182, 'Recall': 0.5, 'F1 Score': 0.6206896551724137, 'ROC AUC': 0.8125, 'Confusion Matrix': array([[38,  2],
       [ 9,  9]])}, 'Decision Tree': {'Accuracy': 0.6724137931034483, 'Precision': 0.48, 'Recall': 0.6666666666666666, 'F1 Score': 0.5581395348837209, 'ROC AUC': 0.6708333333333334, 'Confusion Matrix': array([[27, 13],
       [ 6, 12]])}}
_______________________________________________________________________________
Train data combined with 80.0% synthetic data of minority class:
303 303




{'Logistic Regression': {'Accuracy': 0.7413793103448276, 'Precision': 0.5789473684210527, 'Recall': 0.6111111111111112, 'F1 Score': 0.5945945945945946, 'ROC AUC': 0.7583333333333334, 'Confusion Matrix': array([[32,  8],
       [ 7, 11]])}, 'SVM': {'Accuracy': 0.7931034482758621, 'Precision': 0.75, 'Recall': 0.5, 'F1 Score': 0.6, 'ROC AUC': 0.7875, 'Confusion Matrix': array([[37,  3],
       [ 9,  9]])}, 'Decision Tree': {'Accuracy': 0.6896551724137931, 'Precision': 0.5, 'Recall': 0.6666666666666666, 'F1 Score': 0.5714285714285715, 'ROC AUC': 0.6833333333333332, 'Confusion Matrix': array([[28, 12],
       [ 6, 12]])}}
_______________________________________________________________________________
Train data combined with 100.0% synthetic data of minority class:
322 322




{'Logistic Regression': {'Accuracy': 0.7068965517241379, 'Precision': 0.5238095238095238, 'Recall': 0.6111111111111112, 'F1 Score': 0.5641025641025642, 'ROC AUC': 0.7250000000000001, 'Confusion Matrix': array([[30, 10],
       [ 7, 11]])}, 'SVM': {'Accuracy': 0.7068965517241379, 'Precision': 0.5294117647058824, 'Recall': 0.5, 'F1 Score': 0.5142857142857143, 'ROC AUC': 0.7652777777777777, 'Confusion Matrix': array([[32,  8],
       [ 9,  9]])}, 'Decision Tree': {'Accuracy': 0.6206896551724138, 'Precision': 0.4, 'Recall': 0.4444444444444444, 'F1 Score': 0.4210526315789474, 'ROC AUC': 0.5722222222222222, 'Confusion Matrix': array([[28, 12],
       [10,  8]])}}
_______________________________________________________________________________
