<a href="https://colab.research.google.com/github/adipai/statistical-data-pruning-analysis/blob/main/churn_sampling_results.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pmlb

Collecting pmlb
  Downloading pmlb-1.0.1.post3-py3-none-any.whl (19 kB)
Installing collected packages: pmlb
Successfully installed pmlb-1.0.1.post3


In [2]:
!pip install sdv

Collecting sdv
  Downloading sdv-1.11.0-py3-none-any.whl (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.6/125.6 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting boto3<2,>=1.15.0 (from sdv)
  Downloading boto3-1.34.80-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting botocore<2,>=1.18 (from sdv)
  Downloading botocore-1.34.80-py3-none-any.whl (12.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m35.6 MB/s[0m eta [36m0:00:00[0m
Collecting copulas<0.10,>=0.9.0 (from sdv)
  Downloading copulas-0.9.2-py2.py3-none-any.whl (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.6/54.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ctgan<0.10,>=0.9.0 (from sdv)
  Downloading ctgan-0.9.1-py3-none-any.whl (24 kB)
Collecting deepecho<0.6,>=0.5 (from sdv)
  Downl

In [3]:
!pip install DataSynthesizer

Collecting DataSynthesizer
  Downloading DataSynthesizer-0.1.13-py2.py3-none-any.whl (24 kB)
Installing collected packages: DataSynthesizer
Successfully installed DataSynthesizer-0.1.13


In [4]:
# All imports here
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from pmlb import fetch_data
import matplotlib.pyplot as plt
import seaborn as sns
import random
import time

from sdv.datasets.local import load_csvs
from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer
from DataSynthesizer.DataDescriber import DataDescriber
from DataSynthesizer.DataGenerator import DataGenerator
from DataSynthesizer.lib.utils import display_bayesian_network
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SVMSMOTE
from imblearn.over_sampling import RandomOverSampler
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix

## Data preprocessing

In [24]:
def preprocess_data_train(X_train):

    # Normalize numeric columns
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)

    return X_train, scaler

def preprocess_data_test(X_test, scaler):

    # Normalize numeric columns
    X_test = scaler.transform(X_test)

    return X_test

## Experiments

### Dataset 1: Churn

In [26]:
churn = fetch_data('churn')
churn.rename(columns={'state':'X'}, inplace=True)
churn.drop(columns=['phone number', 'voice mail plan'], inplace=True)
churn.describe()

Unnamed: 0,X,account length,area code,international plan,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,number customer service calls,target
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,25.9984,100.2586,436.9114,0.0946,7.7552,180.2889,100.0294,30.649668,200.63656,100.191,17.054322,200.39162,99.9192,9.017732,10.26178,4.4352,2.771196,1.5704,0.1414
std,14.80348,39.69456,42.209182,0.292691,13.546393,53.894699,19.831197,9.162069,50.551309,19.826496,4.296843,50.527789,19.958686,2.273763,2.761396,2.456788,0.745514,1.306363,0.348469
min,0.0,1.0,408.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,13.0,73.0,408.0,0.0,0.0,143.7,87.0,24.43,166.375,87.0,14.14,166.9,87.0,7.51,8.5,3.0,2.3,1.0,0.0
50%,26.0,100.0,415.0,0.0,0.0,180.1,100.0,30.62,201.0,100.0,17.09,200.4,100.0,9.02,10.3,4.0,2.78,1.0,0.0
75%,39.0,127.0,415.0,0.0,17.0,216.2,113.0,36.75,234.1,114.0,19.9,234.7,113.0,10.56,12.0,6.0,3.24,2.0,0.0
max,50.0,243.0,510.0,1.0,52.0,351.5,165.0,59.76,363.7,170.0,30.91,395.0,175.0,17.77,20.0,20.0,5.4,9.0,1.0


## Training and testing using ML models

In [11]:
# Generic function to test synthetic data using LR, SVM, DT

def evaluate_models(X_train, X_test, y_train, y_test, random_state=42):

    # Initialize classifiers
    classifiers = {
        "Logistic Regression": LogisticRegression(random_state=random_state),
        "SVM": SVC(random_state=random_state),
        "Decision Tree": DecisionTreeClassifier(random_state=random_state)
    }

    # Results dictionary to store evaluation metrics
    results = {}

    # Iterate over classifiers
    for name, clf in classifiers.items():
        # Fit classifier
        clf.fit(X_train, y_train)

        # Predictions
        y_pred = clf.predict(X_test)

        # Evaluation metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        # AUC-ROC
        if hasattr(clf, "predict_proba"):
            y_prob = clf.predict_proba(X_test)[:,1]
        else:
            y_prob = clf.decision_function(X_test)
        fpr, tpr, thresholds = roc_curve(y_test, y_prob)
        roc_auc = auc(fpr, tpr)

        # Confusion matrix
        cm = confusion_matrix(y_test, y_pred)

        # Store results
        results[name] = {
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1,
            "ROC AUC": roc_auc,
            "Confusion Matrix": cm
        }

        # Plot AUC-ROC curve
        plt.figure(figsize=(8, 6))
        plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.2f})')
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'{name} - AUC-ROC Curve')
        plt.legend(loc='lower right')
        plt.savefig(f'{name}_auc_roc_curve.png', dpi=300)
        plt.close()

        # Plot confusion matrix
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.title(f'{name} - Confusion Matrix')
        plt.savefig(f'{name}_confusion_matrix.png', dpi=300)
        plt.close()

    return results


In [27]:

y = churn['target']
X = churn.drop('target', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)


In [28]:
results = evaluate_models(X_train, X_test, y_train, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))


In [29]:
print(results)

{'Logistic Regression': {'Accuracy': 0.864, 'Precision': 0.56, 'Recall': 0.10071942446043165, 'F1 Score': 0.17073170731707316, 'ROC AUC': 0.7499561326548517, 'Confusion Matrix': array([[850,  11],
       [125,  14]])}, 'SVM': {'Accuracy': 0.861, 'Precision': 0.0, 'Recall': 0.0, 'F1 Score': 0.0, 'ROC AUC': 0.7722992337837047, 'Confusion Matrix': array([[861,   0],
       [139,   0]])}, 'Decision Tree': {'Accuracy': 0.922, 'Precision': 0.7132867132867133, 'Recall': 0.7338129496402878, 'F1 Score': 0.7234042553191491, 'ROC AUC': 0.84309695101062, 'Confusion Matrix': array([[820,  41],
       [ 37, 102]])}}


# SDV - Oversampling

In [15]:
def do_sdv(X_train, y_train):
  train_df = pd.concat([X_train, y_train], axis=1)
  class_counts = y_train.value_counts()

  # Find minority class label
  minority_class_label = class_counts.idxmin()

  # Filter rows with minority class label
  minority_df = train_df[train_df.iloc[:, -1] == minority_class_label]

  # Calculate counts of majority and minority classes
  majority_count = class_counts.max()
  minority_count = class_counts.min()

  metadata_data = SingleTableMetadata()
  metadata_data.detect_from_dataframe(minority_df)
  metadata_data.remove_primary_key()
  # Generate synthetic data using GaussianCopulaSynthesizer
  synthesizer_breast_data = GaussianCopulaSynthesizer(metadata_data)
  synthesizer_breast_data.fit(minority_df)

  # Print sample synthetic data
  synthesizer_breast_data.reset_sampling()
  sd1 = synthesizer_breast_data.sample(num_rows=majority_count-minority_count)
  return sd1, train_df

# Function to add synthetic data to the main DataFrame based on percentage
def add_synthetic_data(main_df, synthetic_df, percentage, seed=42):
    # Calculate number of rows to sample
    num_rows = int(len(synthetic_df) * percentage)

    # Sample the specified percentage of synthetic data
    sampled_synthetic_data = synthetic_df.sample(n=num_rows, replace=False, random_state=seed)
    # print(sampled_synthetic_data)

    # Concatenate sampled synthetic data with main DataFrame
    combined_df = pd.concat([main_df, sampled_synthetic_data], ignore_index=True)
    # print(combined_df)
    return combined_df

# Random Over-Sampling

In [16]:
def find_minority_data(X, y):
    labels, counts = np.unique(y, return_counts=True)
    min_label = min(zip(counts, labels))[1]
    indices_with_min_label = np.where(y == min_label)[0]
    X_min, y_min = X[indices_with_min_label], y[indices_with_min_label]

    # Other class samples
    indices_without_min_label = np.where(y != min_label)[0]
    X_remaining, y_remaining = X[indices_without_min_label], y[indices_without_min_label]

    return X_min, y_min, X_remaining, y_remaining, min_label

def random_oversampling(X_train, y_train, oversampling_ratios, seed=42):

  oversampled_X_train_ratios = dict()
  oversampled_y_train_ratios = dict()
  X_minority, y_minority, X_remaining, y_remaining, min_label = find_minority_data(X_train, y_train)
  ideal_samps = len(X_remaining) - len(X_minority)

  oversampling_samps = [int(ideal_samps * (oversampling_ratio)) for oversampling_ratio in oversampling_ratios]
  for oversampling_samp, oversampling_ratio in zip(oversampling_samps, oversampling_ratios):

    sampling_strategy = {min_label: len(X_minority) + oversampling_samp}
    X_train_upsampled, y_train_upsampled = RandomOverSampler(sampling_strategy=sampling_strategy, random_state = seed).fit_resample(X_train, y_train)

    oversampled_X_train_ratios[oversampling_ratio] = X_train_upsampled
    oversampled_y_train_ratios[oversampling_ratio] = y_train_upsampled

  return list(oversampled_X_train_ratios.values()), list(oversampled_y_train_ratios.values())


# SMOTE Over-Sampling

In [17]:
def find_minority_data(X, y):
    labels, counts = np.unique(y, return_counts=True)
    min_label = min(zip(counts, labels))[1]
    indices_with_min_label = np.where(y == min_label)[0]
    X_min, y_min = X[indices_with_min_label], y[indices_with_min_label]

    # Other class samples
    indices_without_min_label = np.where(y != min_label)[0]
    X_remaining, y_remaining = X[indices_without_min_label], y[indices_without_min_label]

    return X_min, y_min, X_remaining, y_remaining, min_label

def smote_oversampling(X_train, y_train, oversampling_ratios, seed=42):

  oversampled_X_train_ratios = dict()
  oversampled_y_train_ratios = dict()
  X_minority, y_minority, X_remaining, y_remaining, min_label = find_minority_data(X_train, y_train)
  ideal_samps = len(X_remaining) - len(X_minority)

  oversampling_samps = [int(ideal_samps * (oversampling_ratio)) for oversampling_ratio in oversampling_ratios]
  for oversampling_samp, oversampling_ratio in zip(oversampling_samps, oversampling_ratios):

    sampling_strategy = {min_label: len(X_minority) + oversampling_samp}
    X_train_upsampled, y_train_upsampled = SMOTE(sampling_strategy=sampling_strategy, random_state = seed).fit_resample(X_train, y_train)

    oversampled_X_train_ratios[oversampling_ratio] = X_train_upsampled
    oversampled_y_train_ratios[oversampling_ratio] = y_train_upsampled

  return list(oversampled_X_train_ratios.values()), list(oversampled_y_train_ratios.values())


# SVM-SMOTE Over-Sampling

In [18]:
def find_minority_data(X, y):
    labels, counts = np.unique(y, return_counts=True)
    min_label = min(zip(counts, labels))[1]
    indices_with_min_label = np.where(y == min_label)[0]
    X_min, y_min = X[indices_with_min_label], y[indices_with_min_label]

    # Other class samples
    indices_without_min_label = np.where(y != min_label)[0]
    X_remaining, y_remaining = X[indices_without_min_label], y[indices_without_min_label]

    return X_min, y_min, X_remaining, y_remaining, min_label

def svm_smote_oversampling(X_train, y_train, oversampling_ratios, seed=42):

  oversampled_X_train_ratios = dict()
  oversampled_y_train_ratios = dict()
  X_minority, y_minority, X_remaining, y_remaining, min_label = find_minority_data(X_train, y_train)
  ideal_samps = len(X_remaining) - len(X_minority)

  oversampling_samps = [int(ideal_samps * (oversampling_ratio)) for oversampling_ratio in oversampling_ratios]
  for oversampling_samp, oversampling_ratio in zip(oversampling_samps, oversampling_ratios):

    sampling_strategy = {min_label: len(X_minority) + oversampling_samp}
    X_train_upsampled, y_train_upsampled = SVMSMOTE(sampling_strategy=sampling_strategy, random_state = seed).fit_resample(X_train, y_train)

    oversampled_X_train_ratios[oversampling_ratio] = X_train_upsampled
    oversampled_y_train_ratios[oversampling_ratio] = y_train_upsampled

  return list(oversampled_X_train_ratios.values()), list(oversampled_y_train_ratios.values())


# Intelligent Pruning

In [19]:
def find_majority_data(X, y):
    labels, counts = np.unique(y, return_counts=True)
    max_label = max(zip(counts, labels))[1]
    indices_with_max_label = np.where(y == max_label)[0]
    X_maj, y_maj = X[indices_with_max_label], y[indices_with_max_label]

    # Exclude majority class samples
    indices_without_max_label = np.where(y != max_label)[0]
    X_remaining, y_remaining = X[indices_without_max_label], y[indices_without_max_label]

    return X_maj, y_maj, X_remaining, y_remaining, min(counts)

def do_clustering(X, y, labels):
  clustered_X = defaultdict(list)
  clustered_y = defaultdict(list)

  for i, label in enumerate(labels):
      clustered_X[label].append(X[i])
      clustered_y[label].append(y[i])

  # Sort clustered_X and clustered_y in descending order based on the length of values in each dictionary
  sorted_clustered_X = dict(sorted(clustered_X.items(), key=lambda x: -len(x[1])))
  sorted_clustered_y = dict(sorted(clustered_y.items(), key=lambda x: -len(x[1])))

  return sorted_clustered_X, sorted_clustered_y


def intelligent_prune_data(pruning_samps, pruning_ratios, clustered_X, clustered_y, per_cluster_pruning_ratio=0.7, seed=42):
  random.seed(seed)
  pruning_ratios_X_maj, pruning_ratios_y_maj = defaultdict(list), defaultdict(list)
  for pruning_samp, pruning_ratio in zip(pruning_samps, pruning_ratios):
    samps = 0
    # print("For Pruning samps: ", pruning_samp)
    prune_samps = pruning_samp
    # print(prune_samps)
    clustered_X_new = defaultdict(list)
    clustered_y_new = defaultdict(list)
    # Iterate over the sorted dictionaries
    for label, values_X in clustered_X.items():
        # Calculate the number of samples to prune
        num_samples_to_prune = int(prune_samps * per_cluster_pruning_ratio)
        if(num_samples_to_prune > len(values_X)):
          num_samples_to_prune = len(values_X)//2
          prune_samps -= num_samples_to_prune
        else:
          prune_samps -= num_samples_to_prune

        # Randomly choose samples to prune
        indices_to_prune = random.sample(range(len(values_X)), num_samples_to_prune)

        # Prune the samples from clustered_X and clustered_y
        clustered_X_new[label] = [values_X[i] for i in range(len(values_X)) if i not in indices_to_prune]
        clustered_y_new[label] = [clustered_y[label][i] for i in range(len(clustered_y[label])) if i not in indices_to_prune]

    iter = 0
    while(prune_samps > 0):
        if(iter>=100):
          break
        for label, values_X in clustered_X_new.items():
          if(prune_samps <=0 or len(values_X) <= 0):
            break
          # print(len(values_X))
          index_to_prune = random.sample(range(len(values_X)), 1)
          clustered_X_new[label] = [values_X[i] for i in range(len(values_X)) if i not in index_to_prune]
          clustered_y_new[label] = [clustered_y_new[label][i] for i in range(len(clustered_y_new[label])) if i not in index_to_prune]

          prune_samps -= 1
        iter += 1

    for label in clustered_X_new:
        pruning_ratios_X_maj[pruning_ratio].extend(clustered_X_new[label])
        pruning_ratios_y_maj[pruning_ratio].extend(clustered_y_new[label])

  return pruning_ratios_X_maj, pruning_ratios_y_maj

def combine_data(pruning_ratios, pruning_ratios_X_maj, pruning_ratios_y_maj, X_remaining, y_remaining):

  pruning_ratios_X, pruning_ratios_y = defaultdict(list), defaultdict(list)
  for pruning_ratio in pruning_ratios:
    pruning_ratios_X[pruning_ratio].extend(pruning_ratios_X_maj[pruning_ratio])
    pruning_ratios_X[pruning_ratio].extend(X_remaining)

    pruning_ratios_y[pruning_ratio].extend(pruning_ratios_y_maj[pruning_ratio])
    pruning_ratios_y[pruning_ratio].extend(y_remaining)

  return pruning_ratios_X, pruning_ratios_y

def do_intelligent_pruning(X, y, ratio, per_cluster_pruning_ratio=0.7, seed=42):

  X_maj, y_maj, X_remaining, y_remaining, min_class_samples = find_majority_data(X, y)
  kmeans = KMeans(n_clusters=3, random_state = 42)
  kmeans.fit(X_maj)
  labels = kmeans.labels_
  clustered_X, clustered_y = do_clustering(X_maj, y_maj, labels)

  pruning_best = len(X_maj)-min_class_samples
  pruning_samps = [int(pruning_best * ratio)]
  pruning_ratios = [ratio]

  pruning_ratios_X_maj, pruning_ratios_y_maj = intelligent_prune_data(pruning_samps, pruning_ratios, clustered_X, clustered_y, \
                                                                      per_cluster_pruning_ratio=per_cluster_pruning_ratio, seed=seed)

  pruning_ratios_X, pruning_ratios_y = combine_data(pruning_ratios, pruning_ratios_X_maj, pruning_ratios_y_maj, X_remaining, y_remaining)

  return list(pruning_ratios_X.values()), list(pruning_ratios_y.values())


# Random Pruning

In [20]:
"""
inputs:
X: np.array
y: np.array
percentage: from 0% upto 100%, enter int value
"""
def random_prune_data(X, y, ratio, seed = 42):
  # preprocessed_X, scaler, imputer = preprocess_data_train(X)
  # preprocessed_X_test = preprocess_data_test(X_test, scaler, imputer)

  # X_train, y_train = preprocessed_X_train.to_numpy(), y_train.to_numpy()
  # X_test, y_test = preprocessed_X_test.to_numpy(), y_test.to_numpy()
  np.random.seed(seed)
  labels_count = {}
  labels = np.unique(y)
  for label in labels:
    labels_count[label] = np.count_nonzero(y == label)
  max_label = min_label = labels[0]
  for label in labels_count:
    if labels_count[label] > labels_count[max_label]:
      max_label = label
    if labels_count[label] < labels_count[min_label]:
      min_label = label

  # print("Max", max_label, labels_count[max_label])
  # print("Min", min_label, labels_count[min_label])

  prune_counts = {}
  prune_indexes = {}
  for label in labels_count:
    prune_counts[label] = labels_count[label] - labels_count[min_label]
    prune_indexes[label] = np.where(y == label)[0]

  prune_amount = int(ratio * sum(map(lambda x: x[1], prune_counts.items())))
  prune_it = {}

  while prune_amount > 0:
    for label in labels:
      if (len(prune_indexes[label]) - labels_count[min_label]) > 0 and prune_amount > 0:
        random_index = np.random.choice(len(prune_indexes[label]))
        random_item = prune_indexes[label][random_index]
        prune_indexes[label] = np.delete(prune_indexes[label], random_index)
        if prune_it.get(label, None) is None:
          prune_it[label] = np.array([])
        prune_it[label] = np.append(prune_it[label], [random_item])
        prune_amount -= 1



  formatted_indexes = np.array([])
  for label in prune_indexes:
    formatted_indexes = np.append(formatted_indexes, prune_indexes[label])
  formatted_indexes = np.sort(formatted_indexes)
  new_arr = np.array([np.int64(i) for i in formatted_indexes])

  return X[new_arr], y[new_arr]

In [21]:
ratios = [ratio for ratio in np.arange(0.2, 1.1, 0.2)]

# Calling Intelligent Pruning

In [25]:
results_intelligent_pruning = dict()
per_cluster_pruning_ratios = [0.5, 0.7, 0.9, 1]

for per_cluster_pruning_ratio in per_cluster_pruning_ratios:
  print(f'For per-cluster pruning ratio {per_cluster_pruning_ratio}')
  for ratio in ratios:
    X_train_copy, y_train_copy = X_train.copy(), y_train.copy()

    intelligent_pruned_X_train, intelligent_pruned_y_train = do_intelligent_pruning(X_train_copy.to_numpy(), y_train_copy.to_numpy(), ratio, per_cluster_pruning_ratio=per_cluster_pruning_ratio)

    print(len(intelligent_pruned_X_train[0]))
    preprocessed_intelligent_pruned_X_train, scaler = preprocess_data_train((np.array(intelligent_pruned_X_train))[0])
    preprocessed_X_test = preprocess_data_test(X_test, scaler)

    intelligent_pruned_X_train, intelligent_pruned_y_train = preprocessed_intelligent_pruned_X_train, (np.array(intelligent_pruned_y_train))[0]
    intelligent_pruned_X_test, intelligent_pruned_y_test = preprocessed_X_test, y_test.to_numpy()
    print(f"Train data pruned intelligently at {ratio * 100}% :")
    results = evaluate_models(intelligent_pruned_X_train, intelligent_pruned_X_test, intelligent_pruned_y_train, intelligent_pruned_y_test)
    print(results)
    results_intelligent_pruning[ratio] = results
    print("_______________________________________________________________________________")

For per-cluster pruning ratio 0.5




3428
Train data pruned intelligently at 20.0% :




{'Logistic Regression': {'Accuracy': 0.868, 'Precision': 0.547945205479452, 'Recall': 0.28776978417266186, 'F1 Score': 0.37735849056603776, 'ROC AUC': 0.8276974239423792, 'Confusion Matrix': array([[828,  33],
       [ 99,  40]])}, 'SVM': {'Accuracy': 0.927, 'Precision': 0.8666666666666667, 'Recall': 0.5611510791366906, 'F1 Score': 0.6812227074235807, 'ROC AUC': 0.9093073972877447, 'Confusion Matrix': array([[849,  12],
       [ 61,  78]])}, 'Decision Tree': {'Accuracy': 0.917, 'Precision': 0.6818181818181818, 'Recall': 0.7553956834532374, 'F1 Score': 0.7167235494880545, 'ROC AUC': 0.8492425571737732, 'Confusion Matrix': array([[812,  49],
       [ 34, 105]])}}
_______________________________________________________________________________




2855
Train data pruned intelligently at 40.0% :




{'Logistic Regression': {'Accuracy': 0.864, 'Precision': 0.5142857142857142, 'Recall': 0.38848920863309355, 'F1 Score': 0.4426229508196722, 'ROC AUC': 0.8271292373766493, 'Confusion Matrix': array([[810,  51],
       [ 85,  54]])}, 'SVM': {'Accuracy': 0.933, 'Precision': 0.8529411764705882, 'Recall': 0.6258992805755396, 'F1 Score': 0.7219917012448133, 'ROC AUC': 0.9061573041218594, 'Confusion Matrix': array([[846,  15],
       [ 52,  87]])}, 'Decision Tree': {'Accuracy': 0.914, 'Precision': 0.6666666666666666, 'Recall': 0.762589928057554, 'F1 Score': 0.7114093959731543, 'ROC AUC': 0.8505167991042706, 'Confusion Matrix': array([[808,  53],
       [ 33, 106]])}}
_______________________________________________________________________________




2282
Train data pruned intelligently at 60.00000000000001% :




{'Logistic Regression': {'Accuracy': 0.845, 'Precision': 0.44366197183098594, 'Recall': 0.45323741007194246, 'F1 Score': 0.4483985765124555, 'ROC AUC': 0.8221408935569313, 'Confusion Matrix': array([[782,  79],
       [ 76,  63]])}, 'SVM': {'Accuracy': 0.925, 'Precision': 0.7461538461538462, 'Recall': 0.697841726618705, 'F1 Score': 0.7211895910780669, 'ROC AUC': 0.89994067463799, 'Confusion Matrix': array([[828,  33],
       [ 42,  97]])}, 'Decision Tree': {'Accuracy': 0.888, 'Precision': 0.5721925133689839, 'Recall': 0.7697841726618705, 'F1 Score': 0.6564417177914109, 'ROC AUC': 0.838434478897718, 'Confusion Matrix': array([[781,  80],
       [ 32, 107]])}}
_______________________________________________________________________________




1709
Train data pruned intelligently at 80.0% :




{'Logistic Regression': {'Accuracy': 0.778, 'Precision': 0.33976833976833976, 'Recall': 0.6330935251798561, 'F1 Score': 0.4422110552763819, 'ROC AUC': 0.7928375069978859, 'Confusion Matrix': array([[690, 171],
       [ 51,  88]])}, 'SVM': {'Accuracy': 0.871, 'Precision': 0.5242718446601942, 'Recall': 0.7769784172661871, 'F1 Score': 0.6260869565217391, 'ROC AUC': 0.8809482031099859, 'Confusion Matrix': array([[763,  98],
       [ 31, 108]])}, 'Decision Tree': {'Accuracy': 0.824, 'Precision': 0.4312267657992565, 'Recall': 0.8345323741007195, 'F1 Score': 0.5686274509803921, 'ROC AUC': 0.828416012834332, 'Confusion Matrix': array([[708, 153],
       [ 23, 116]])}}
_______________________________________________________________________________




1389
Train data pruned intelligently at 100.0% :




{'Logistic Regression': {'Accuracy': 0.78, 'Precision': 0.34600760456273766, 'Recall': 0.6546762589928058, 'F1 Score': 0.4527363184079602, 'ROC AUC': 0.8093650515127968, 'Confusion Matrix': array([[689, 172],
       [ 48,  91]])}, 'SVM': {'Accuracy': 0.85, 'Precision': 0.4765957446808511, 'Recall': 0.8057553956834532, 'F1 Score': 0.5989304812834224, 'ROC AUC': 0.8815999465236173, 'Confusion Matrix': array([[738, 123],
       [ 27, 112]])}, 'Decision Tree': {'Accuracy': 0.807, 'Precision': 0.40425531914893614, 'Recall': 0.8201438848920863, 'F1 Score': 0.5415676959619953, 'ROC AUC': 0.8125109668362871, 'Confusion Matrix': array([[693, 168],
       [ 25, 114]])}}
_______________________________________________________________________________
For per-cluster pruning ratio 0.7




3428
Train data pruned intelligently at 20.0% :




{'Logistic Regression': {'Accuracy': 0.868, 'Precision': 0.5466666666666666, 'Recall': 0.2949640287769784, 'F1 Score': 0.38317757009345793, 'ROC AUC': 0.8258006834950157, 'Confusion Matrix': array([[827,  34],
       [ 98,  41]])}, 'SVM': {'Accuracy': 0.932, 'Precision': 0.9080459770114943, 'Recall': 0.5683453237410072, 'F1 Score': 0.6991150442477876, 'ROC AUC': 0.9095914905706097, 'Confusion Matrix': array([[853,   8],
       [ 60,  79]])}, 'Decision Tree': {'Accuracy': 0.916, 'Precision': 0.6729559748427673, 'Recall': 0.7697841726618705, 'F1 Score': 0.7181208053691275, 'ROC AUC': 0.8546946414993442, 'Confusion Matrix': array([[809,  52],
       [ 32, 107]])}}
_______________________________________________________________________________




2855
Train data pruned intelligently at 40.0% :




{'Logistic Regression': {'Accuracy': 0.854, 'Precision': 0.4690265486725664, 'Recall': 0.381294964028777, 'F1 Score': 0.42063492063492064, 'ROC AUC': 0.81921640388038, 'Confusion Matrix': array([[801,  60],
       [ 86,  53]])}, 'SVM': {'Accuracy': 0.924, 'Precision': 0.7837837837837838, 'Recall': 0.6258992805755396, 'F1 Score': 0.696, 'ROC AUC': 0.9028902313689118, 'Confusion Matrix': array([[837,  24],
       [ 52,  87]])}, 'Decision Tree': {'Accuracy': 0.89, 'Precision': 0.5792349726775956, 'Recall': 0.762589928057554, 'F1 Score': 0.658385093167702, 'ROC AUC': 0.8365795168743055, 'Confusion Matrix': array([[784,  77],
       [ 33, 106]])}}
_______________________________________________________________________________




2282
Train data pruned intelligently at 60.00000000000001% :




{'Logistic Regression': {'Accuracy': 0.817, 'Precision': 0.3910891089108911, 'Recall': 0.5683453237410072, 'F1 Score': 0.4633431085043988, 'ROC AUC': 0.7901469764954587, 'Confusion Matrix': array([[738, 123],
       [ 60,  79]])}, 'SVM': {'Accuracy': 0.883, 'Precision': 0.5647058823529412, 'Recall': 0.6906474820143885, 'F1 Score': 0.6213592233009709, 'ROC AUC': 0.8844325236674773, 'Confusion Matrix': array([[787,  74],
       [ 43,  96]])}, 'Decision Tree': {'Accuracy': 0.848, 'Precision': 0.47280334728033474, 'Recall': 0.8129496402877698, 'F1 Score': 0.5978835978835979, 'ROC AUC': 0.8333040884365678, 'Confusion Matrix': array([[735, 126],
       [ 26, 113]])}}
_______________________________________________________________________________




1709
Train data pruned intelligently at 80.0% :




{'Logistic Regression': {'Accuracy': 0.81, 'Precision': 0.37185929648241206, 'Recall': 0.5323741007194245, 'F1 Score': 0.4378698224852071, 'ROC AUC': 0.7938485448574939, 'Confusion Matrix': array([[736, 125],
       [ 65,  74]])}, 'SVM': {'Accuracy': 0.887, 'Precision': 0.5730337078651685, 'Recall': 0.7338129496402878, 'F1 Score': 0.6435331230283912, 'ROC AUC': 0.8830204129379422, 'Confusion Matrix': array([[785,  76],
       [ 37, 102]])}, 'Decision Tree': {'Accuracy': 0.83, 'Precision': 0.4396887159533074, 'Recall': 0.8129496402877698, 'F1 Score': 0.5707070707070707, 'ROC AUC': 0.822851126764094, 'Confusion Matrix': array([[717, 144],
       [ 26, 113]])}}
_______________________________________________________________________________




1985
Train data pruned intelligently at 100.0% :




{'Logistic Regression': {'Accuracy': 0.836, 'Precision': 0.4251497005988024, 'Recall': 0.5107913669064749, 'F1 Score': 0.46405228758169936, 'ROC AUC': 0.8335798260346426, 'Confusion Matrix': array([[765,  96],
       [ 68,  71]])}, 'SVM': {'Accuracy': 0.909, 'Precision': 0.6538461538461539, 'Recall': 0.7338129496402878, 'F1 Score': 0.6915254237288135, 'ROC AUC': 0.8988043015065299, 'Confusion Matrix': array([[807,  54],
       [ 37, 102]])}, 'Decision Tree': {'Accuracy': 0.892, 'Precision': 0.5828877005347594, 'Recall': 0.7841726618705036, 'F1 Score': 0.6687116564417177, 'ROC AUC': 0.846790163687865, 'Confusion Matrix': array([[783,  78],
       [ 30, 109]])}}
_______________________________________________________________________________
For per-cluster pruning ratio 0.9




3428
Train data pruned intelligently at 20.0% :




{'Logistic Regression': {'Accuracy': 0.868, 'Precision': 0.5443037974683544, 'Recall': 0.30935251798561153, 'F1 Score': 0.3944954128440367, 'ROC AUC': 0.8237034066126889, 'Confusion Matrix': array([[825,  36],
       [ 96,  43]])}, 'SVM': {'Accuracy': 0.93, 'Precision': 0.8791208791208791, 'Recall': 0.5755395683453237, 'F1 Score': 0.6956521739130435, 'ROC AUC': 0.9098755838534747, 'Confusion Matrix': array([[850,  11],
       [ 59,  80]])}, 'Decision Tree': {'Accuracy': 0.917, 'Precision': 0.675, 'Recall': 0.7769784172661871, 'F1 Score': 0.7224080267558529, 'ROC AUC': 0.8582917638015024, 'Confusion Matrix': array([[809,  52],
       [ 31, 108]])}}
_______________________________________________________________________________




2855
Train data pruned intelligently at 40.0% :




{'Logistic Regression': {'Accuracy': 0.85, 'Precision': 0.456, 'Recall': 0.41007194244604317, 'F1 Score': 0.4318181818181818, 'ROC AUC': 0.805864019585725, 'Confusion Matrix': array([[793,  68],
       [ 82,  57]])}, 'SVM': {'Accuracy': 0.917, 'Precision': 0.7222222222222222, 'Recall': 0.6546762589928058, 'F1 Score': 0.6867924528301887, 'ROC AUC': 0.8973587680378344, 'Confusion Matrix': array([[826,  35],
       [ 48,  91]])}, 'Decision Tree': {'Accuracy': 0.907, 'Precision': 0.6263736263736264, 'Recall': 0.8201438848920863, 'F1 Score': 0.7102803738317757, 'ROC AUC': 0.8705829761278085, 'Confusion Matrix': array([[793,  68],
       [ 25, 114]])}}
_______________________________________________________________________________




2282
Train data pruned intelligently at 60.00000000000001% :




{'Logistic Regression': {'Accuracy': 0.841, 'Precision': 0.4295774647887324, 'Recall': 0.43884892086330934, 'F1 Score': 0.4341637010676156, 'ROC AUC': 0.8047777805630061, 'Confusion Matrix': array([[780,  81],
       [ 78,  61]])}, 'SVM': {'Accuracy': 0.905, 'Precision': 0.6549295774647887, 'Recall': 0.6690647482014388, 'F1 Score': 0.6619217081850532, 'ROC AUC': 0.8898637187810726, 'Confusion Matrix': array([[812,  49],
       [ 46,  93]])}, 'Decision Tree': {'Accuracy': 0.885, 'Precision': 0.5612244897959183, 'Recall': 0.7913669064748201, 'F1 Score': 0.6567164179104478, 'ROC AUC': 0.8457415252467017, 'Confusion Matrix': array([[775,  86],
       [ 29, 110]])}}
_______________________________________________________________________________




1985
Train data pruned intelligently at 80.0% :




{'Logistic Regression': {'Accuracy': 0.836, 'Precision': 0.4251497005988024, 'Recall': 0.5107913669064749, 'F1 Score': 0.46405228758169936, 'ROC AUC': 0.8335798260346426, 'Confusion Matrix': array([[765,  96],
       [ 68,  71]])}, 'SVM': {'Accuracy': 0.909, 'Precision': 0.6538461538461539, 'Recall': 0.7338129496402878, 'F1 Score': 0.6915254237288135, 'ROC AUC': 0.8988043015065299, 'Confusion Matrix': array([[807,  54],
       [ 37, 102]])}, 'Decision Tree': {'Accuracy': 0.892, 'Precision': 0.5828877005347594, 'Recall': 0.7841726618705036, 'F1 Score': 0.6687116564417177, 'ROC AUC': 0.846790163687865, 'Confusion Matrix': array([[783,  78],
       [ 30, 109]])}}
_______________________________________________________________________________




1985
Train data pruned intelligently at 100.0% :




{'Logistic Regression': {'Accuracy': 0.836, 'Precision': 0.4251497005988024, 'Recall': 0.5107913669064749, 'F1 Score': 0.46405228758169936, 'ROC AUC': 0.8335798260346426, 'Confusion Matrix': array([[765,  96],
       [ 68,  71]])}, 'SVM': {'Accuracy': 0.909, 'Precision': 0.6538461538461539, 'Recall': 0.7338129496402878, 'F1 Score': 0.6915254237288135, 'ROC AUC': 0.8988043015065299, 'Confusion Matrix': array([[807,  54],
       [ 37, 102]])}, 'Decision Tree': {'Accuracy': 0.892, 'Precision': 0.5828877005347594, 'Recall': 0.7841726618705036, 'F1 Score': 0.6687116564417177, 'ROC AUC': 0.846790163687865, 'Confusion Matrix': array([[783,  78],
       [ 30, 109]])}}
_______________________________________________________________________________
For per-cluster pruning ratio 1




3428
Train data pruned intelligently at 20.0% :




{'Logistic Regression': {'Accuracy': 0.868, 'Precision': 0.5411764705882353, 'Recall': 0.33093525179856115, 'F1 Score': 0.4107142857142857, 'ROC AUC': 0.8231853541556998, 'Confusion Matrix': array([[822,  39],
       [ 93,  46]])}, 'SVM': {'Accuracy': 0.93, 'Precision': 0.8791208791208791, 'Recall': 0.5755395683453237, 'F1 Score': 0.6956521739130435, 'ROC AUC': 0.909892295223055, 'Confusion Matrix': array([[850,  11],
       [ 59,  80]])}, 'Decision Tree': {'Accuracy': 0.923, 'Precision': 0.6962025316455697, 'Recall': 0.7913669064748201, 'F1 Score': 0.7407407407407407, 'ROC AUC': 0.8678088887774797, 'Confusion Matrix': array([[813,  48],
       [ 29, 110]])}}
_______________________________________________________________________________




2855
Train data pruned intelligently at 40.0% :




{'Logistic Regression': {'Accuracy': 0.839, 'Precision': 0.42567567567567566, 'Recall': 0.45323741007194246, 'F1 Score': 0.4390243902439024, 'ROC AUC': 0.7963803173489084, 'Confusion Matrix': array([[776,  85],
       [ 76,  63]])}, 'SVM': {'Accuracy': 0.909, 'Precision': 0.6846153846153846, 'Recall': 0.6402877697841727, 'F1 Score': 0.6617100371747212, 'ROC AUC': 0.8955038060144219, 'Confusion Matrix': array([[820,  41],
       [ 50,  89]])}, 'Decision Tree': {'Accuracy': 0.882, 'Precision': 0.551219512195122, 'Recall': 0.8129496402877698, 'F1 Score': 0.6569767441860466, 'ROC AUC': 0.8530485715956851, 'Confusion Matrix': array([[769,  92],
       [ 26, 113]])}}
_______________________________________________________________________________




2282
Train data pruned intelligently at 60.00000000000001% :




{'Logistic Regression': {'Accuracy': 0.836, 'Precision': 0.41830065359477125, 'Recall': 0.460431654676259, 'F1 Score': 0.4383561643835616, 'ROC AUC': 0.7909491222353128, 'Confusion Matrix': array([[772,  89],
       [ 75,  64]])}, 'SVM': {'Accuracy': 0.906, 'Precision': 0.6551724137931034, 'Recall': 0.6834532374100719, 'F1 Score': 0.6690140845070423, 'ROC AUC': 0.8850007102332073, 'Confusion Matrix': array([[811,  50],
       [ 44,  95]])}, 'Decision Tree': {'Accuracy': 0.885, 'Precision': 0.5588235294117647, 'Recall': 0.8201438848920863, 'F1 Score': 0.6647230320699709, 'ROC AUC': 0.8578071340836738, 'Confusion Matrix': array([[771,  90],
       [ 25, 114]])}}
_______________________________________________________________________________




1985
Train data pruned intelligently at 80.0% :




{'Logistic Regression': {'Accuracy': 0.836, 'Precision': 0.4251497005988024, 'Recall': 0.5107913669064749, 'F1 Score': 0.46405228758169936, 'ROC AUC': 0.8335798260346426, 'Confusion Matrix': array([[765,  96],
       [ 68,  71]])}, 'SVM': {'Accuracy': 0.909, 'Precision': 0.6538461538461539, 'Recall': 0.7338129496402878, 'F1 Score': 0.6915254237288135, 'ROC AUC': 0.8988043015065299, 'Confusion Matrix': array([[807,  54],
       [ 37, 102]])}, 'Decision Tree': {'Accuracy': 0.892, 'Precision': 0.5828877005347594, 'Recall': 0.7841726618705036, 'F1 Score': 0.6687116564417177, 'ROC AUC': 0.846790163687865, 'Confusion Matrix': array([[783,  78],
       [ 30, 109]])}}
_______________________________________________________________________________




1985
Train data pruned intelligently at 100.0% :




{'Logistic Regression': {'Accuracy': 0.836, 'Precision': 0.4251497005988024, 'Recall': 0.5107913669064749, 'F1 Score': 0.46405228758169936, 'ROC AUC': 0.8335798260346426, 'Confusion Matrix': array([[765,  96],
       [ 68,  71]])}, 'SVM': {'Accuracy': 0.909, 'Precision': 0.6538461538461539, 'Recall': 0.7338129496402878, 'F1 Score': 0.6915254237288135, 'ROC AUC': 0.8988043015065299, 'Confusion Matrix': array([[807,  54],
       [ 37, 102]])}, 'Decision Tree': {'Accuracy': 0.892, 'Precision': 0.5828877005347594, 'Recall': 0.7841726618705036, 'F1 Score': 0.6687116564417177, 'ROC AUC': 0.846790163687865, 'Confusion Matrix': array([[783,  78],
       [ 30, 109]])}}
_______________________________________________________________________________


#Calling Random Pruning

In [19]:
results_random_pruning = dict()
for ratio in ratios:
  random_pruned_X_train, random_pruned_y_train = random_prune_data(X_train.to_numpy(), y_train.to_numpy(), ratio)
  preprocessed_random_pruned_X_train, scaler = preprocess_data_train(random_pruned_X_train)
  preprocessed_X_test = preprocess_data_test(X_test, scaler)

  random_pruned_X_train, random_pruned_y_train = preprocessed_random_pruned_X_train, random_pruned_y_train
  random_pruned_X_test, random_pruned_y_test = preprocessed_X_test, y_test.to_numpy()

  print(f"Train data pruned randomly at {ratio * 100}% :")
  results = evaluate_models(random_pruned_X_train, random_pruned_X_test, random_pruned_y_train, random_pruned_y_test)
  print(results)
  results_random_pruning[ratio] = results
  print("_______________________________________________________________________________")

Train data pruned randomly at 20.0% :




{'Logistic Regression': {'Accuracy': 0.865, 'Precision': 0.5277777777777778, 'Recall': 0.2733812949640288, 'F1 Score': 0.3601895734597157, 'ROC AUC': 0.8284160128343319, 'Confusion Matrix': array([[827,  34],
       [101,  38]])}, 'SVM': {'Accuracy': 0.931, 'Precision': 0.8888888888888888, 'Recall': 0.5755395683453237, 'F1 Score': 0.6986899563318778, 'ROC AUC': 0.9096416246793507, 'Confusion Matrix': array([[851,  10],
       [ 59,  80]])}, 'Decision Tree': {'Accuracy': 0.904, 'Precision': 0.6335403726708074, 'Recall': 0.7338129496402878, 'F1 Score': 0.68, 'ROC AUC': 0.8326439893381461, 'Confusion Matrix': array([[802,  59],
       [ 37, 102]])}}
_______________________________________________________________________________
Train data pruned randomly at 40.0% :




{'Logistic Regression': {'Accuracy': 0.866, 'Precision': 0.5257731958762887, 'Recall': 0.3669064748201439, 'F1 Score': 0.43220338983050843, 'ROC AUC': 0.8291346017262845, 'Confusion Matrix': array([[815,  46],
       [ 88,  51]])}, 'SVM': {'Accuracy': 0.931, 'Precision': 0.8301886792452831, 'Recall': 0.6330935251798561, 'F1 Score': 0.7183673469387756, 'ROC AUC': 0.9074691466339123, 'Confusion Matrix': array([[843,  18],
       [ 51,  88]])}, 'Decision Tree': {'Accuracy': 0.887, 'Precision': 0.5691489361702128, 'Recall': 0.7697841726618705, 'F1 Score': 0.654434250764526, 'ROC AUC': 0.837853758804803, 'Confusion Matrix': array([[780,  81],
       [ 32, 107]])}}
_______________________________________________________________________________
Train data pruned randomly at 60.00000000000001% :




{'Logistic Regression': {'Accuracy': 0.856, 'Precision': 0.4796747967479675, 'Recall': 0.4244604316546763, 'F1 Score': 0.45038167938931295, 'ROC AUC': 0.8296526541832736, 'Confusion Matrix': array([[797,  64],
       [ 80,  59]])}, 'SVM': {'Accuracy': 0.921, 'Precision': 0.7307692307692307, 'Recall': 0.6834532374100719, 'F1 Score': 0.7063197026022305, 'ROC AUC': 0.9009099340736472, 'Confusion Matrix': array([[826,  35],
       [ 44,  95]])}, 'Decision Tree': {'Accuracy': 0.883, 'Precision': 0.5555555555555556, 'Recall': 0.7913669064748201, 'F1 Score': 0.6528189910979229, 'ROC AUC': 0.844580085060871, 'Confusion Matrix': array([[773,  88],
       [ 29, 110]])}}
_______________________________________________________________________________
Train data pruned randomly at 80.0% :




{'Logistic Regression': {'Accuracy': 0.835, 'Precision': 0.42696629213483145, 'Recall': 0.5467625899280576, 'F1 Score': 0.4794952681388012, 'ROC AUC': 0.8316830855872793, 'Confusion Matrix': array([[759, 102],
       [ 63,  76]])}, 'SVM': {'Accuracy': 0.902, 'Precision': 0.6227544910179641, 'Recall': 0.7482014388489209, 'F1 Score': 0.6797385620915033, 'ROC AUC': 0.8946682375354073, 'Confusion Matrix': array([[798,  63],
       [ 35, 104]])}, 'Decision Tree': {'Accuracy': 0.864, 'Precision': 0.5071090047393365, 'Recall': 0.7697841726618705, 'F1 Score': 0.6114285714285714, 'ROC AUC': 0.8244971966677529, 'Confusion Matrix': array([[757, 104],
       [ 32, 107]])}}
_______________________________________________________________________________
Train data pruned randomly at 100.0% :




{'Logistic Regression': {'Accuracy': 0.766, 'Precision': 0.34824281150159747, 'Recall': 0.7841726618705036, 'F1 Score': 0.4823008849557523, 'ROC AUC': 0.8308224500538942, 'Confusion Matrix': array([[657, 204],
       [ 30, 109]])}, 'SVM': {'Accuracy': 0.846, 'Precision': 0.4701195219123506, 'Recall': 0.8489208633093526, 'F1 Score': 0.6051282051282052, 'ROC AUC': 0.8875241270398315, 'Confusion Matrix': array([[728, 133],
       [ 21, 118]])}, 'Decision Tree': {'Accuracy': 0.83, 'Precision': 0.44106463878326996, 'Recall': 0.8345323741007195, 'F1 Score': 0.5771144278606966, 'ROC AUC': 0.8319003333918231, 'Confusion Matrix': array([[714, 147],
       [ 23, 116]])}}
_______________________________________________________________________________


# Calling SDV-Oversampling

In [30]:
sd1, train_df = do_sdv(X_train, y_train)
results_syn_sdv = dict()

# Add synthetic data at different percentages to the main DataFrame
for ratio in ratios:
    combined_df = add_synthetic_data(train_df, sd1, ratio)
    y_train_sdv = combined_df['target']
    X_train_sdv = combined_df.drop('target', axis=1)

    X_train_sdv.to_csv("sdv.csv")
    preprocessed_X_train_sdv, scaler = preprocess_data_train(X_train_sdv)
    preprocessed_X_test_sdv = preprocess_data_test(X_test, scaler)

    X_train_sdv, y_train_sdv = preprocessed_X_train_sdv, y_train_sdv.to_numpy()
    X_test_sdv, y_test_sdv = preprocessed_X_test_sdv, y_test.to_numpy()

    print(f"Train data combined with {ratio * 100}% synthetic data of minority class:")
    results = evaluate_models(X_train_sdv, X_test_sdv, y_train_sdv, y_test_sdv)
    results_syn_sdv[ratio] = results
    print(results)
    print("_______________________________________________________________________________")



Train data combined with 20.0% synthetic data of minority class:
{'Logistic Regression': {'Accuracy': 0.861, 'Precision': 0.5, 'Recall': 0.3237410071942446, 'F1 Score': 0.3930131004366812, 'ROC AUC': 0.7564067213128451, 'Confusion Matrix': array([[816,  45],
       [ 94,  45]])}, 'SVM': {'Accuracy': 0.913, 'Precision': 0.7131147540983607, 'Recall': 0.6258992805755396, 'F1 Score': 0.6666666666666666, 'ROC AUC': 0.9010185579759189, 'Confusion Matrix': array([[826,  35],
       [ 52,  87]])}, 'Decision Tree': {'Accuracy': 0.916, 'Precision': 0.697841726618705, 'Recall': 0.697841726618705, 'F1 Score': 0.697841726618705, 'ROC AUC': 0.8245306194069135, 'Confusion Matrix': array([[819,  42],
       [ 42,  97]])}}
_______________________________________________________________________________
Train data combined with 40.0% synthetic data of minority class:
{'Logistic Regression': {'Accuracy': 0.847, 'Precision': 0.43137254901960786, 'Recall': 0.31654676258992803, 'F1 Score': 0.3651452282157676

# Calling SMOTE-Oversampling

In [31]:
results_smote = dict()

for ratio in ratios:

    X_train_smote, y_train_smote = smote_oversampling(X_train.to_numpy(), y_train.to_numpy(), [ratio])
    preprocessed_X_train_smote, scaler = preprocess_data_train((np.array(X_train_smote))[0])
    preprocessed_X_test_smote = preprocess_data_test(X_test, scaler)

    X_train_smote, y_train_smote = preprocessed_X_train_smote, (np.array(y_train_smote))[0]
    X_test_smote, y_test_smote = preprocessed_X_test_smote, y_test.to_numpy()

    print(f"Train data combined with {ratio * 100}% synthetic data of minority class:")
    print(len(X_train_smote), len(y_train_smote))
    results = evaluate_models(X_train_smote, X_test_smote, y_train_smote, y_test_smote)
    results_smote[ratio] = results
    print(results)
    print("_______________________________________________________________________________")



Train data combined with 20.0% synthetic data of minority class:
4572 4572
{'Logistic Regression': {'Accuracy': 0.85, 'Precision': 0.45985401459854014, 'Recall': 0.45323741007194246, 'F1 Score': 0.45652173913043476, 'ROC AUC': 0.8246058205700249, 'Confusion Matrix': array([[787,  74],
       [ 76,  63]])}, 'SVM': {'Accuracy': 0.924, 'Precision': 0.7647058823529411, 'Recall': 0.6546762589928058, 'F1 Score': 0.7054263565891473, 'ROC AUC': 0.8963727972325972, 'Confusion Matrix': array([[833,  28],
       [ 48,  91]])}, 'Decision Tree': {'Accuracy': 0.904, 'Precision': 0.6187845303867403, 'Recall': 0.8057553956834532, 'F1 Score': 0.7, 'ROC AUC': 0.8628080114305767, 'Confusion Matrix': array([[792,  69],
       [ 27, 112]])}}
_______________________________________________________________________________
Train data combined with 40.0% synthetic data of minority class:
5145 5145




{'Logistic Regression': {'Accuracy': 0.822, 'Precision': 0.39790575916230364, 'Recall': 0.5467625899280576, 'F1 Score': 0.46060606060606063, 'ROC AUC': 0.8254246776794592, 'Confusion Matrix': array([[746, 115],
       [ 63,  76]])}, 'SVM': {'Accuracy': 0.913, 'Precision': 0.6884057971014492, 'Recall': 0.6834532374100719, 'F1 Score': 0.6859205776173285, 'ROC AUC': 0.8857861446034809, 'Confusion Matrix': array([[818,  43],
       [ 44,  95]])}, 'Decision Tree': {'Accuracy': 0.921, 'Precision': 0.6875, 'Recall': 0.7913669064748201, 'F1 Score': 0.7357859531772575, 'ROC AUC': 0.8666474485916493, 'Confusion Matrix': array([[811,  50],
       [ 29, 110]])}}
_______________________________________________________________________________
Train data combined with 60.00000000000001% synthetic data of minority class:
5718 5718




{'Logistic Regression': {'Accuracy': 0.8, 'Precision': 0.37130801687763715, 'Recall': 0.6330935251798561, 'F1 Score': 0.46808510638297873, 'ROC AUC': 0.8273799079203537, 'Confusion Matrix': array([[712, 149],
       [ 51,  88]])}, 'SVM': {'Accuracy': 0.908, 'Precision': 0.6666666666666666, 'Recall': 0.6762589928057554, 'F1 Score': 0.6714285714285714, 'ROC AUC': 0.8791266638257338, 'Confusion Matrix': array([[814,  47],
       [ 45,  94]])}, 'Decision Tree': {'Accuracy': 0.911, 'Precision': 0.6470588235294118, 'Recall': 0.7913669064748201, 'F1 Score': 0.7119741100323624, 'ROC AUC': 0.8608402476624971, 'Confusion Matrix': array([[801,  60],
       [ 29, 110]])}}
_______________________________________________________________________________
Train data combined with 80.0% synthetic data of minority class:
6291 6291




{'Logistic Regression': {'Accuracy': 0.78, 'Precision': 0.3568904593639576, 'Recall': 0.7266187050359713, 'F1 Score': 0.47867298578199047, 'ROC AUC': 0.8273799079203535, 'Confusion Matrix': array([[679, 182],
       [ 38, 101]])}, 'SVM': {'Accuracy': 0.903, 'Precision': 0.6381578947368421, 'Recall': 0.697841726618705, 'F1 Score': 0.6666666666666667, 'ROC AUC': 0.8747399293109067, 'Confusion Matrix': array([[806,  55],
       [ 42,  97]])}, 'Decision Tree': {'Accuracy': 0.899, 'Precision': 0.6055555555555555, 'Recall': 0.7841726618705036, 'F1 Score': 0.683385579937304, 'ROC AUC': 0.8508552043382716, 'Confusion Matrix': array([[790,  71],
       [ 30, 109]])}}
_______________________________________________________________________________
Train data combined with 100.0% synthetic data of minority class:
6864 6864




{'Logistic Regression': {'Accuracy': 0.765, 'Precision': 0.3431372549019608, 'Recall': 0.7553956834532374, 'F1 Score': 0.47191011235955055, 'ROC AUC': 0.8273214181268226, 'Confusion Matrix': array([[660, 201],
       [ 34, 105]])}, 'SVM': {'Accuracy': 0.897, 'Precision': 0.6168831168831169, 'Recall': 0.6834532374100719, 'F1 Score': 0.6484641638225257, 'ROC AUC': 0.8710968507424025, 'Confusion Matrix': array([[802,  59],
       [ 44,  95]])}, 'Decision Tree': {'Accuracy': 0.913, 'Precision': 0.6460674157303371, 'Recall': 0.8273381294964028, 'F1 Score': 0.725552050473186, 'ROC AUC': 0.877083698894543, 'Confusion Matrix': array([[798,  63],
       [ 24, 115]])}}
_______________________________________________________________________________


# Calling Random-Oversampling

In [32]:
results_random = dict()

for ratio in ratios:

    X_train_random, y_train_random = random_oversampling(X_train.to_numpy(), y_train.to_numpy(), [ratio])

    preprocessed_X_train_random, scaler = preprocess_data_train((np.array(X_train_random)[0]))
    preprocessed_X_test_random = preprocess_data_test(X_test, scaler)

    X_train_random, y_train_random = preprocessed_X_train_random, (np.array(y_train_random))[0]
    X_test_random, y_test_random = preprocessed_X_test_random, y_test.to_numpy()

    print(f"Train data combined with {ratio * 100}% synthetic data of minority class:")
    print(len(X_train_random), len(y_train_random))
    results = evaluate_models(X_train_random, X_test_random, y_train_random, y_test_random)
    results_random[ratio] = results
    print(results)
    print("_______________________________________________________________________________")

Train data combined with 20.0% synthetic data of minority class:
4572 4572




{'Logistic Regression': {'Accuracy': 0.852, 'Precision': 0.4645669291338583, 'Recall': 0.4244604316546763, 'F1 Score': 0.443609022556391, 'ROC AUC': 0.8278311148990216, 'Confusion Matrix': array([[793,  68],
       [ 80,  59]])}, 'SVM': {'Accuracy': 0.934, 'Precision': 0.8067226890756303, 'Recall': 0.6906474820143885, 'F1 Score': 0.7441860465116279, 'ROC AUC': 0.9044861671638299, 'Confusion Matrix': array([[838,  23],
       [ 43,  96]])}, 'Decision Tree': {'Accuracy': 0.92, 'Precision': 0.7092198581560284, 'Recall': 0.7194244604316546, 'F1 Score': 0.7142857142857142, 'ROC AUC': 0.8359027064063035, 'Confusion Matrix': array([[820,  41],
       [ 39, 100]])}}
_______________________________________________________________________________
Train data combined with 40.0% synthetic data of minority class:
5145 5145




{'Logistic Regression': {'Accuracy': 0.826, 'Precision': 0.40540540540540543, 'Recall': 0.539568345323741, 'F1 Score': 0.46296296296296297, 'ROC AUC': 0.8274467533986748, 'Confusion Matrix': array([[751, 110],
       [ 64,  75]])}, 'SVM': {'Accuracy': 0.919, 'Precision': 0.704225352112676, 'Recall': 0.7194244604316546, 'F1 Score': 0.7117437722419928, 'ROC AUC': 0.8997150711486559, 'Confusion Matrix': array([[819,  42],
       [ 39, 100]])}, 'Decision Tree': {'Accuracy': 0.917, 'Precision': 0.6707317073170732, 'Recall': 0.7913669064748201, 'F1 Score': 0.7260726072607261, 'ROC AUC': 0.8643245682199885, 'Confusion Matrix': array([[807,  54],
       [ 29, 110]])}}
_______________________________________________________________________________
Train data combined with 60.00000000000001% synthetic data of minority class:
5718 5718




{'Logistic Regression': {'Accuracy': 0.807, 'Precision': 0.3855932203389831, 'Recall': 0.6546762589928058, 'F1 Score': 0.48533333333333345, 'ROC AUC': 0.8276723568880089, 'Confusion Matrix': array([[716, 145],
       [ 48,  91]])}, 'SVM': {'Accuracy': 0.914, 'Precision': 0.6778523489932886, 'Recall': 0.7266187050359713, 'F1 Score': 0.701388888888889, 'ROC AUC': 0.894542902263555, 'Confusion Matrix': array([[813,  48],
       [ 38, 101]])}, 'Decision Tree': {'Accuracy': 0.904, 'Precision': 0.6303030303030303, 'Recall': 0.7482014388489209, 'F1 Score': 0.6842105263157895, 'ROC AUC': 0.8386767937566323, 'Confusion Matrix': array([[800,  61],
       [ 35, 104]])}}
_______________________________________________________________________________
Train data combined with 80.0% synthetic data of minority class:
6291 6291




{'Logistic Regression': {'Accuracy': 0.785, 'Precision': 0.36231884057971014, 'Recall': 0.7194244604316546, 'F1 Score': 0.48192771084337344, 'ROC AUC': 0.8278144035294412, 'Confusion Matrix': array([[685, 176],
       [ 39, 100]])}, 'SVM': {'Accuracy': 0.902, 'Precision': 0.6289308176100629, 'Recall': 0.7194244604316546, 'F1 Score': 0.6711409395973155, 'ROC AUC': 0.8910836487604342, 'Confusion Matrix': array([[802,  59],
       [ 39, 100]])}, 'Decision Tree': {'Accuracy': 0.924, 'Precision': 0.7114093959731543, 'Recall': 0.762589928057554, 'F1 Score': 0.736111111111111, 'ROC AUC': 0.8563240000334228, 'Confusion Matrix': array([[818,  43],
       [ 33, 106]])}}
_______________________________________________________________________________
Train data combined with 100.0% synthetic data of minority class:
6864 6864




{'Logistic Regression': {'Accuracy': 0.764, 'Precision': 0.3450479233226837, 'Recall': 0.7769784172661871, 'F1 Score': 0.47787610619469023, 'ROC AUC': 0.8283324559864303, 'Confusion Matrix': array([[656, 205],
       [ 31, 108]])}, 'SVM': {'Accuracy': 0.894, 'Precision': 0.5976331360946746, 'Recall': 0.7266187050359713, 'F1 Score': 0.6558441558441559, 'ROC AUC': 0.8857694332339007, 'Confusion Matrix': array([[793,  68],
       [ 38, 101]])}, 'Decision Tree': {'Accuracy': 0.909, 'Precision': 0.6481481481481481, 'Recall': 0.7553956834532374, 'F1 Score': 0.6976744186046512, 'ROC AUC': 0.8445967964304514, 'Confusion Matrix': array([[804,  57],
       [ 34, 105]])}}
_______________________________________________________________________________


# Calling SVM-SMOTE Over-Sampling

In [33]:
results_svm_smote = dict()

for ratio in ratios:

    X_train_svm_smote, y_train_svm_smote = svm_smote_oversampling(X_train.to_numpy(), y_train.to_numpy(), [ratio])

    preprocessed_X_train_svm_smote, scaler = preprocess_data_train((np.array(X_train_svm_smote))[0])
    preprocessed_X_test_svm_smote = preprocess_data_test(X_test, scaler)

    X_train_svm_smote, y_train_svm_smote = preprocessed_X_train_svm_smote, (np.array(y_train_svm_smote))[0]
    X_test_svm_smote, y_test_svm_smote = preprocessed_X_test_svm_smote, y_test.to_numpy()

    print(f"Train data combined with {ratio * 100}% synthetic data of minority class:")
    print(len(X_train_svm_smote), len(y_train_svm_smote))
    results = evaluate_models(X_train_svm_smote, X_test_svm_smote, y_train_svm_smote, y_test_svm_smote)
    results_svm_smote[ratio] = results
    print(results)
    print("_______________________________________________________________________________")



Train data combined with 20.0% synthetic data of minority class:
4572 4572
{'Logistic Regression': {'Accuracy': 0.853, 'Precision': 0.46875, 'Recall': 0.4316546762589928, 'F1 Score': 0.449438202247191, 'ROC AUC': 0.8184560365644765, 'Confusion Matrix': array([[793,  68],
       [ 79,  60]])}, 'SVM': {'Accuracy': 0.926, 'Precision': 0.7927927927927928, 'Recall': 0.6330935251798561, 'F1 Score': 0.704, 'ROC AUC': 0.9014864763241671, 'Confusion Matrix': array([[838,  23],
       [ 51,  88]])}, 'Decision Tree': {'Accuracy': 0.908, 'Precision': 0.6496815286624203, 'Recall': 0.7338129496402878, 'F1 Score': 0.6891891891891891, 'ROC AUC': 0.8349668697098069, 'Confusion Matrix': array([[806,  55],
       [ 37, 102]])}}
_______________________________________________________________________________




Train data combined with 40.0% synthetic data of minority class:
5145 5145
{'Logistic Regression': {'Accuracy': 0.841, 'Precision': 0.4444444444444444, 'Recall': 0.5755395683453237, 'F1 Score': 0.5015673981191223, 'ROC AUC': 0.8186983514233909, 'Confusion Matrix': array([[761, 100],
       [ 59,  80]])}, 'SVM': {'Accuracy': 0.91, 'Precision': 0.6870229007633588, 'Recall': 0.6474820143884892, 'F1 Score': 0.6666666666666666, 'ROC AUC': 0.8955455844383726, 'Confusion Matrix': array([[820,  41],
       [ 49,  90]])}, 'Decision Tree': {'Accuracy': 0.911, 'Precision': 0.6506024096385542, 'Recall': 0.7769784172661871, 'F1 Score': 0.7081967213114754, 'ROC AUC': 0.8548074432440111, 'Confusion Matrix': array([[803,  58],
       [ 31, 108]])}}
_______________________________________________________________________________




Train data combined with 60.00000000000001% synthetic data of minority class:
5718 5718
{'Logistic Regression': {'Accuracy': 0.82, 'Precision': 0.4080717488789238, 'Recall': 0.6546762589928058, 'F1 Score': 0.5027624309392266, 'ROC AUC': 0.8194921414784547, 'Confusion Matrix': array([[729, 132],
       [ 48,  91]])}, 'SVM': {'Accuracy': 0.902, 'Precision': 0.6394557823129252, 'Recall': 0.6762589928057554, 'F1 Score': 0.6573426573426574, 'ROC AUC': 0.8933480393385639, 'Confusion Matrix': array([[808,  53],
       [ 45,  94]])}, 'Decision Tree': {'Accuracy': 0.927, 'Precision': 0.7171052631578947, 'Recall': 0.7841726618705036, 'F1 Score': 0.7491408934707904, 'ROC AUC': 0.8671153669398975, 'Confusion Matrix': array([[818,  43],
       [ 30, 109]])}}
_______________________________________________________________________________




Train data combined with 80.0% synthetic data of minority class:
6291 6291
{'Logistic Regression': {'Accuracy': 0.799, 'Precision': 0.3798449612403101, 'Recall': 0.7050359712230215, 'F1 Score': 0.4937027707808565, 'ROC AUC': 0.8214975058280901, 'Confusion Matrix': array([[701, 160],
       [ 41,  98]])}, 'SVM': {'Accuracy': 0.904, 'Precision': 0.6369426751592356, 'Recall': 0.7194244604316546, 'F1 Score': 0.6756756756756758, 'ROC AUC': 0.889487712965516, 'Confusion Matrix': array([[804,  57],
       [ 39, 100]])}, 'Decision Tree': {'Accuracy': 0.911, 'Precision': 0.65625, 'Recall': 0.7553956834532374, 'F1 Score': 0.7023411371237458, 'ROC AUC': 0.8457582366162818, 'Confusion Matrix': array([[806,  55],
       [ 34, 105]])}}
_______________________________________________________________________________




Train data combined with 100.0% synthetic data of minority class:
6864 6864
{'Logistic Regression': {'Accuracy': 0.781, 'Precision': 0.3591549295774648, 'Recall': 0.7338129496402878, 'F1 Score': 0.48226950354609927, 'ROC AUC': 0.8227007244378713, 'Confusion Matrix': array([[679, 182],
       [ 37, 102]])}, 'SVM': {'Accuracy': 0.899, 'Precision': 0.6217948717948718, 'Recall': 0.697841726618705, 'F1 Score': 0.6576271186440678, 'ROC AUC': 0.8874739929310907, 'Confusion Matrix': array([[802,  59],
       [ 42,  97]])}, 'Decision Tree': {'Accuracy': 0.917, 'Precision': 0.6707317073170732, 'Recall': 0.7913669064748201, 'F1 Score': 0.7260726072607261, 'ROC AUC': 0.8643245682199885, 'Confusion Matrix': array([[807,  54],
       [ 29, 110]])}}
_______________________________________________________________________________
