<a href="https://colab.research.google.com/github/adipai/data-decent/blob/main/Eclipse_PDE_sampling_results.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pmlb

Collecting pmlb
  Downloading pmlb-1.0.1.post3-py3-none-any.whl (19 kB)
Installing collected packages: pmlb
Successfully installed pmlb-1.0.1.post3


In [2]:
!pip install sdv

Collecting sdv
  Downloading sdv-1.11.0-py3-none-any.whl (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.6/125.6 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting boto3<2,>=1.15.0 (from sdv)
  Downloading boto3-1.34.81-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting botocore<2,>=1.18 (from sdv)
  Downloading botocore-1.34.81-py3-none-any.whl (12.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
Collecting copulas<0.10,>=0.9.0 (from sdv)
  Downloading copulas-0.9.2-py2.py3-none-any.whl (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.6/54.6 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ctgan<0.10,>=0.9.0 (from sdv)
  Downloading ctgan-0.9.1-py3-none-any.whl (24 kB)
Collecting deepecho<0.6,>=0.5 (from sdv)
  Downl

In [3]:
!pip install DataSynthesizer

Collecting DataSynthesizer
  Downloading DataSynthesizer-0.1.13-py2.py3-none-any.whl (24 kB)
Installing collected packages: DataSynthesizer
Successfully installed DataSynthesizer-0.1.13


In [4]:
!unzip data.zip

Archive:  data.zip
   creating: data/
   creating: data/imbalance_defects_prediction/
   creating: data/project_health/
  inflating: data/README.md          
   creating: data/JavaScript_Vulnerability/
   creating: data/Bug_Reports/
   creating: data/Vulnerable_Files/
   creating: data/defects_prediction/
   creating: data/imbalance_defects_prediction/7_CK_NET_PROC/
   creating: data/imbalance_defects_prediction/2_NET/
   creating: data/imbalance_defects_prediction/4_CK_NET/
   creating: data/imbalance_defects_prediction/3_PROC/
   creating: data/imbalance_defects_prediction/6_NET_PROC/
   creating: data/imbalance_defects_prediction/1_CK/
   creating: data/imbalance_defects_prediction/5_CK_PROC/
   creating: data/project_health/monthly_closed_PRs_2mo/
   creating: data/project_health/monthly_commits_2mo/
   creating: data/project_health/monthly_open_PRs_2mo/
   creating: data/project_health/monthly_closed_issues_2mo/
   creating: data/project_health/monthly_commits_12mo/
   creating: d

In [5]:
# All imports here
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from pmlb import fetch_data
import matplotlib.pyplot as plt
import seaborn as sns
import random
import time

from scipy.io import arff
from sdv.datasets.local import load_csvs
from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer
from DataSynthesizer.DataDescriber import DataDescriber
from DataSynthesizer.DataGenerator import DataGenerator
from DataSynthesizer.lib.utils import display_bayesian_network
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SVMSMOTE
from imblearn.over_sampling import RandomOverSampler
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix

## Data preprocessing

In [6]:
def preprocess_data_train(X_train):
    # Count missing values before handling missing data
    missing_before = np.isnan(X_train).sum()

    # Handle missing data
    imputer = SimpleImputer(strategy='mean')
    X_train = imputer.fit_transform(X_train)

    # Count missing values after handling missing data
    missing_after = np.isnan(X_train).sum()

    # Normalize numeric columns
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)

    return X_train, scaler, imputer

def preprocess_data_test(X_test, scaler, imputer):
    # Count missing values before handling missing data
    missing_before = np.isnan(X_test).sum()

    # Handle missing data
    X_test = imputer.transform(X_test)

    # Count missing values after handling missing data
    missing_after = np.isnan(X_test).sum()

    # Normalize numeric columns
    X_test = scaler.transform(X_test)

    return X_test

## Experiments

### Dataset: Eclipse PDE

In [7]:
project = "Defect_Eclipse_PDE_UI"
fname = "_".join(project.split("_")[1:])
data_path = f"data/imbalance_defects_prediction/7_CK_NET_PROC/input/{fname}--CK_NET_PROC.arff"
data = arff.loadarff(data_path)
df = pd.DataFrame(data[0])
df['isBug'] = df['isBug'].astype('str')
d = {'YES': 1, 'NO': 0}  # Remove the byte string prefix 'b'
df['isBug'] = df['isBug'].map(d).fillna(df['isBug'])
print(df['isBug'])
print("before drop duplicates", df.shape[0])
df = df.drop_duplicates()
df.reset_index(inplace=True, drop=True)
print("after drop duplicates", df.shape[0])

df.describe()

0       1
1       0
2       0
3       1
4       0
       ..
1492    0
1493    0
1494    0
1495    0
1496    0
Name: isBug, Length: 1497, dtype: int64
before drop duplicates 1497
after drop duplicates 1497


Unnamed: 0,wmc,dit,rfc,noc,cbo,lcom,loc,revision_num,author_num,linesadd_sum,...,InFreeClo,OutValClo,InValClo,OutRecipClo,InRecipClo,OutdwReach,IndwReach,nOutdwReach,nIndwReach,isBug
count,1497.0,1497.0,1497.0,1497.0,1497.0,1497.0,1497.0,1497.0,1497.0,1497.0,...,1497.0,1497.0,1497.0,1497.0,1497.0,1497.0,1497.0,1497.0,1497.0,1497.0
mean,23.748831,2.280561,47.502338,0.595858,10.208417,82.175685,98.164329,13.512358,3.971276,211.704075,...,0.001132,0.257043,0.257043,0.054766,0.054766,82.188384,82.188371,0.054902,0.054902,0.139613
std,31.414402,1.565026,63.113652,2.434228,14.831382,210.815685,128.634872,18.884925,2.177879,384.246071,...,0.000635,0.194478,0.293171,0.040618,0.067921,60.948191,101.878414,0.040714,0.068055,0.3467
min,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000668,0.0,0.0,0.000668,0.000668,1.0,1.0,0.000668,0.000668,0.0
25%,6.0,1.0,11.0,0.0,3.0,6.0,24.0,4.0,2.0,21.0,...,0.000669,0.016684,0.001337,0.007902,0.002004,11.833334,3.0,0.007905,0.002004,0.0
50%,13.0,2.0,25.0,0.0,7.0,21.0,52.0,8.0,3.0,78.0,...,0.000678,0.436592,0.015344,0.062817,0.006151,94.413429,9.219047,0.063068,0.006158,0.0
75%,28.0,3.0,57.0,0.0,13.0,66.0,116.0,16.0,6.0,221.0,...,0.001361,0.438191,0.509428,0.088306,0.097933,132.544647,147.026443,0.08854,0.098214,0.0
max,286.0,9.0,599.0,46.0,362.0,3321.0,1326.0,410.0,10.0,3644.0,...,0.003036,0.498919,0.780527,0.151869,0.443954,227.633392,664.717651,0.15206,0.444033,1.0


## Training and testing using ML models

In [8]:
# Generic function to test synthetic data using LR, SVM, DT

def evaluate_models(X_train, X_test, y_train, y_test, random_state=42):

    # Initialize classifiers
    classifiers = {
        "Logistic Regression": LogisticRegression(random_state=random_state),
        "SVM": SVC(random_state=random_state),
        "Decision Tree": DecisionTreeClassifier(random_state=random_state)
    }

    # Results dictionary to store evaluation metrics
    results = {}

    # Iterate over classifiers
    for name, clf in classifiers.items():
        # Fit classifier
        clf.fit(X_train, y_train)

        # Predictions
        y_pred = clf.predict(X_test)

        # Evaluation metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        # AUC-ROC
        if hasattr(clf, "predict_proba"):
            y_prob = clf.predict_proba(X_test)[:,1]
        else:
            y_prob = clf.decision_function(X_test)
        fpr, tpr, thresholds = roc_curve(y_test, y_prob)
        roc_auc = auc(fpr, tpr)

        # Confusion matrix
        cm = confusion_matrix(y_test, y_pred)

        # Store results
        results[name] = {
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1,
            "ROC AUC": roc_auc,
            "Confusion Matrix": cm
        }

        # Plot AUC-ROC curve
        plt.figure(figsize=(8, 6))
        plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.2f})')
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'{name} - AUC-ROC Curve')
        plt.legend(loc='lower right')
        plt.savefig(f'{name}_auc_roc_curve.png', dpi=300)
        plt.close()

        # Plot confusion matrix
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.title(f'{name} - Confusion Matrix')
        plt.savefig(f'{name}_confusion_matrix.png', dpi=300)
        plt.close()

    return results

In [9]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)


In [10]:
results = evaluate_models(X_train, X_test, y_train, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
print(results)

{'Logistic Regression': {'Accuracy': 0.87, 'Precision': 0.5714285714285714, 'Recall': 0.1951219512195122, 'F1 Score': 0.29090909090909095, 'ROC AUC': 0.6914492890102647, 'Confusion Matrix': array([[253,   6],
       [ 33,   8]])}, 'SVM': {'Accuracy': 0.8633333333333333, 'Precision': 0.0, 'Recall': 0.0, 'F1 Score': 0.0, 'ROC AUC': 0.6644693473961766, 'Confusion Matrix': array([[259,   0],
       [ 41,   0]])}, 'Decision Tree': {'Accuracy': 0.8466666666666667, 'Precision': 0.43902439024390244, 'Recall': 0.43902439024390244, 'F1 Score': 0.43902439024390244, 'ROC AUC': 0.6751106507204068, 'Confusion Matrix': array([[236,  23],
       [ 23,  18]])}}


# SDV - Oversampling

In [12]:
def do_sdv(X_train, y_train):
  train_df = pd.concat([X_train, y_train], axis=1)
  class_counts = y_train.value_counts()

  # Find minority class label
  minority_class_label = class_counts.idxmin()

  # Filter rows with minority class label
  minority_df = train_df[train_df.iloc[:, -1] == minority_class_label]

  # Calculate counts of majority and minority classes
  majority_count = class_counts.max()
  minority_count = class_counts.min()

  metadata_data = SingleTableMetadata()
  metadata_data.detect_from_dataframe(minority_df)
  # print(metadata_data)
  # Generate synthetic data using GaussianCopulaSynthesizer
  synthesizer_breast_data = GaussianCopulaSynthesizer(metadata_data)
  synthesizer_breast_data.fit(minority_df)

  # Print sample synthetic data
  synthesizer_breast_data.reset_sampling()
  sd1 = synthesizer_breast_data.sample(num_rows=majority_count-minority_count)
  return sd1, train_df

# Function to add synthetic data to the main DataFrame based on percentage
def add_synthetic_data(main_df, synthetic_df, percentage, seed=42):
    # Calculate number of rows to sample
    num_rows = int(len(synthetic_df) * percentage)

    # Sample the specified percentage of synthetic data
    sampled_synthetic_data = synthetic_df.sample(n=num_rows, replace=False, random_state=seed)
    # print(sampled_synthetic_data)

    # Concatenate sampled synthetic data with main DataFrame
    combined_df = pd.concat([main_df, sampled_synthetic_data], ignore_index=True)
    # print(combined_df)
    return combined_df

# Random Over-Sampling

In [13]:
def find_minority_data(X, y):
    labels, counts = np.unique(y, return_counts=True)
    min_label = min(zip(counts, labels))[1]
    indices_with_min_label = np.where(y == min_label)[0]
    X_min, y_min = X[indices_with_min_label], y[indices_with_min_label]

    # Other class samples
    indices_without_min_label = np.where(y != min_label)[0]
    X_remaining, y_remaining = X[indices_without_min_label], y[indices_without_min_label]

    return X_min, y_min, X_remaining, y_remaining, min_label

def random_oversampling(X_train, y_train, oversampling_ratios, seed=42):

  oversampled_X_train_ratios = dict()
  oversampled_y_train_ratios = dict()
  X_minority, y_minority, X_remaining, y_remaining, min_label = find_minority_data(X_train, y_train)
  ideal_samps = len(X_remaining) - len(X_minority)

  oversampling_samps = [int(ideal_samps * (oversampling_ratio)) for oversampling_ratio in oversampling_ratios]
  for oversampling_samp, oversampling_ratio in zip(oversampling_samps, oversampling_ratios):

    sampling_strategy = {min_label: len(X_minority) + oversampling_samp}
    X_train_upsampled, y_train_upsampled = RandomOverSampler(sampling_strategy=sampling_strategy, random_state = seed).fit_resample(X_train, y_train)

    oversampled_X_train_ratios[oversampling_ratio] = X_train_upsampled
    oversampled_y_train_ratios[oversampling_ratio] = y_train_upsampled

  return list(oversampled_X_train_ratios.values()), list(oversampled_y_train_ratios.values())


# SMOTE Over-Sampling

In [14]:
def find_minority_data(X, y):
    labels, counts = np.unique(y, return_counts=True)
    min_label = min(zip(counts, labels))[1]
    indices_with_min_label = np.where(y == min_label)[0]
    X_min, y_min = X[indices_with_min_label], y[indices_with_min_label]

    # Other class samples
    indices_without_min_label = np.where(y != min_label)[0]
    X_remaining, y_remaining = X[indices_without_min_label], y[indices_without_min_label]

    return X_min, y_min, X_remaining, y_remaining, min_label

def smote_oversampling(X_train, y_train, oversampling_ratios, seed=42):

  oversampled_X_train_ratios = dict()
  oversampled_y_train_ratios = dict()
  X_minority, y_minority, X_remaining, y_remaining, min_label = find_minority_data(X_train, y_train)
  ideal_samps = len(X_remaining) - len(X_minority)

  oversampling_samps = [int(ideal_samps * (oversampling_ratio)) for oversampling_ratio in oversampling_ratios]
  for oversampling_samp, oversampling_ratio in zip(oversampling_samps, oversampling_ratios):

    sampling_strategy = {min_label: len(X_minority) + oversampling_samp}
    X_train_upsampled, y_train_upsampled = SMOTE(sampling_strategy=sampling_strategy, random_state = seed).fit_resample(X_train, y_train)

    oversampled_X_train_ratios[oversampling_ratio] = X_train_upsampled
    oversampled_y_train_ratios[oversampling_ratio] = y_train_upsampled

  return list(oversampled_X_train_ratios.values()), list(oversampled_y_train_ratios.values())


# SVM-SMOTE Over-Sampling

In [15]:
def find_minority_data(X, y):
    labels, counts = np.unique(y, return_counts=True)
    min_label = min(zip(counts, labels))[1]
    indices_with_min_label = np.where(y == min_label)[0]
    X_min, y_min = X[indices_with_min_label], y[indices_with_min_label]

    # Other class samples
    indices_without_min_label = np.where(y != min_label)[0]
    X_remaining, y_remaining = X[indices_without_min_label], y[indices_without_min_label]

    return X_min, y_min, X_remaining, y_remaining, min_label

def svm_smote_oversampling(X_train, y_train, oversampling_ratios, seed=42):

  oversampled_X_train_ratios = dict()
  oversampled_y_train_ratios = dict()
  X_minority, y_minority, X_remaining, y_remaining, min_label = find_minority_data(X_train, y_train)
  ideal_samps = len(X_remaining) - len(X_minority)

  oversampling_samps = [int(ideal_samps * (oversampling_ratio)) for oversampling_ratio in oversampling_ratios]
  for oversampling_samp, oversampling_ratio in zip(oversampling_samps, oversampling_ratios):

    sampling_strategy = {min_label: len(X_minority) + oversampling_samp}
    X_train_upsampled, y_train_upsampled = SVMSMOTE(sampling_strategy=sampling_strategy, random_state = seed).fit_resample(X_train, y_train)

    oversampled_X_train_ratios[oversampling_ratio] = X_train_upsampled
    oversampled_y_train_ratios[oversampling_ratio] = y_train_upsampled

  return list(oversampled_X_train_ratios.values()), list(oversampled_y_train_ratios.values())


# Intelligent Pruning

In [16]:
def find_majority_data(X, y):
    labels, counts = np.unique(y, return_counts=True)
    max_label = max(zip(counts, labels))[1]
    indices_with_max_label = np.where(y == max_label)[0]
    X_maj, y_maj = X[indices_with_max_label], y[indices_with_max_label]

    # Exclude majority class samples
    indices_without_max_label = np.where(y != max_label)[0]
    X_remaining, y_remaining = X[indices_without_max_label], y[indices_without_max_label]

    return X_maj, y_maj, X_remaining, y_remaining, min(counts)

def do_clustering(X, y, labels):
  clustered_X = defaultdict(list)
  clustered_y = defaultdict(list)

  for i, label in enumerate(labels):
      clustered_X[label].append(X[i])
      clustered_y[label].append(y[i])

  # Sort clustered_X and clustered_y in descending order based on the length of values in each dictionary
  sorted_clustered_X = dict(sorted(clustered_X.items(), key=lambda x: -len(x[1])))
  sorted_clustered_y = dict(sorted(clustered_y.items(), key=lambda x: -len(x[1])))

  return sorted_clustered_X, sorted_clustered_y


def intelligent_prune_data(pruning_samps, pruning_ratios, clustered_X, clustered_y, per_cluster_pruning_ratio=0.7, seed=42):
  random.seed(seed)
  pruning_ratios_X_maj, pruning_ratios_y_maj = defaultdict(list), defaultdict(list)
  for pruning_samp, pruning_ratio in zip(pruning_samps, pruning_ratios):
    samps = 0
    # print("For Pruning samps: ", pruning_samp)
    prune_samps = pruning_samp
    # print(prune_samps)
    clustered_X_new = defaultdict(list)
    clustered_y_new = defaultdict(list)
    # Iterate over the sorted dictionaries
    for label, values_X in clustered_X.items():
        # Calculate the number of samples to prune
        num_samples_to_prune = int(prune_samps * per_cluster_pruning_ratio)
        if(num_samples_to_prune > len(values_X)):
          num_samples_to_prune = len(values_X)//2
          prune_samps -= num_samples_to_prune
        else:
          prune_samps -= num_samples_to_prune

        # Randomly choose samples to prune
        indices_to_prune = random.sample(range(len(values_X)), num_samples_to_prune)

        # Prune the samples from clustered_X and clustered_y
        clustered_X_new[label] = [values_X[i] for i in range(len(values_X)) if i not in indices_to_prune]
        clustered_y_new[label] = [clustered_y[label][i] for i in range(len(clustered_y[label])) if i not in indices_to_prune]

    iter = 0
    while(prune_samps > 0):
        if(iter>=100):
          break
        for label, values_X in clustered_X_new.items():
          if(prune_samps <=0 or len(values_X) <= 0):
            break
          # print(len(values_X))
          index_to_prune = random.sample(range(len(values_X)), 1)
          clustered_X_new[label] = [values_X[i] for i in range(len(values_X)) if i not in index_to_prune]
          clustered_y_new[label] = [clustered_y_new[label][i] for i in range(len(clustered_y_new[label])) if i not in index_to_prune]

          prune_samps -= 1
        iter += 1

    for label in clustered_X_new:
        pruning_ratios_X_maj[pruning_ratio].extend(clustered_X_new[label])
        pruning_ratios_y_maj[pruning_ratio].extend(clustered_y_new[label])

  return pruning_ratios_X_maj, pruning_ratios_y_maj

def combine_data(pruning_ratios, pruning_ratios_X_maj, pruning_ratios_y_maj, X_remaining, y_remaining):

  pruning_ratios_X, pruning_ratios_y = defaultdict(list), defaultdict(list)
  for pruning_ratio in pruning_ratios:
    pruning_ratios_X[pruning_ratio].extend(pruning_ratios_X_maj[pruning_ratio])
    pruning_ratios_X[pruning_ratio].extend(X_remaining)

    pruning_ratios_y[pruning_ratio].extend(pruning_ratios_y_maj[pruning_ratio])
    pruning_ratios_y[pruning_ratio].extend(y_remaining)

  return pruning_ratios_X, pruning_ratios_y

def do_intelligent_pruning(X, y, ratio, per_cluster_pruning_ratio=0.7, seed=42):

  X_maj, y_maj, X_remaining, y_remaining, min_class_samples = find_majority_data(X, y)
  kmeans = KMeans(n_clusters=3, random_state = 42)
  kmeans.fit(X_maj)
  labels = kmeans.labels_
  clustered_X, clustered_y = do_clustering(X_maj, y_maj, labels)

  pruning_best = len(X_maj)-min_class_samples
  pruning_samps = [int(pruning_best * ratio)]
  pruning_ratios = [ratio]

  pruning_ratios_X_maj, pruning_ratios_y_maj = intelligent_prune_data(pruning_samps, pruning_ratios, clustered_X, clustered_y, \
                                                                      per_cluster_pruning_ratio=per_cluster_pruning_ratio, seed=seed)

  pruning_ratios_X, pruning_ratios_y = combine_data(pruning_ratios, pruning_ratios_X_maj, pruning_ratios_y_maj, X_remaining, y_remaining)

  return list(pruning_ratios_X.values()), list(pruning_ratios_y.values())

# Random Pruning

In [17]:
"""
inputs:
X: np.array
y: np.array
percentage: from 0% upto 100%, enter int value
"""
def random_prune_data(X, y, ratio, seed = 42):
  # preprocessed_X, scaler, imputer = preprocess_data_train(X)
  # preprocessed_X_test = preprocess_data_test(X_test, scaler, imputer)

  # X_train, y_train = preprocessed_X_train.to_numpy(), y_train.to_numpy()
  # X_test, y_test = preprocessed_X_test.to_numpy(), y_test.to_numpy()
  np.random.seed(seed)
  labels_count = {}
  labels = np.unique(y)
  for label in labels:
    labels_count[label] = np.count_nonzero(y == label)
  max_label = min_label = labels[0]
  for label in labels_count:
    if labels_count[label] > labels_count[max_label]:
      max_label = label
    if labels_count[label] < labels_count[min_label]:
      min_label = label

  # print("Max", max_label, labels_count[max_label])
  # print("Min", min_label, labels_count[min_label])

  prune_counts = {}
  prune_indexes = {}
  for label in labels_count:
    prune_counts[label] = labels_count[label] - labels_count[min_label]
    prune_indexes[label] = np.where(y == label)[0]

  prune_amount = int(ratio * sum(map(lambda x: x[1], prune_counts.items())))
  prune_it = {}

  while prune_amount > 0:
    for label in labels:
      if (len(prune_indexes[label]) - labels_count[min_label]) > 0 and prune_amount > 0:
        random_index = np.random.choice(len(prune_indexes[label]))
        random_item = prune_indexes[label][random_index]
        prune_indexes[label] = np.delete(prune_indexes[label], random_index)
        if prune_it.get(label, None) is None:
          prune_it[label] = np.array([])
        prune_it[label] = np.append(prune_it[label], [random_item])
        prune_amount -= 1



  formatted_indexes = np.array([])
  for label in prune_indexes:
    formatted_indexes = np.append(formatted_indexes, prune_indexes[label])
  formatted_indexes = np.sort(formatted_indexes)
  new_arr = np.array([np.int64(i) for i in formatted_indexes])

  return X[new_arr], y[new_arr]

In [18]:
ratios = [ratio for ratio in np.arange(0.2, 1.1, 0.2)]

# Calling Intelligent Pruning

In [19]:
results_intelligent_pruning = dict()
per_cluster_pruning_ratios = [0.5, 0.7, 0.9, 1]

for per_cluster_pruning_ratio in per_cluster_pruning_ratios:
  print(f'For per-cluster pruning ratio {per_cluster_pruning_ratio}')
  for ratio in ratios:
    X_train_copy, y_train_copy = X_train.copy(), y_train.copy()

    intelligent_pruned_X_train, intelligent_pruned_y_train = do_intelligent_pruning(X_train_copy.to_numpy(), y_train_copy.to_numpy(), ratio, per_cluster_pruning_ratio=per_cluster_pruning_ratio)

    preprocessed_intelligent_pruned_X_train, scaler, imputer = preprocess_data_train((np.array(intelligent_pruned_X_train))[0])
    preprocessed_X_test = preprocess_data_test(X_test, scaler, imputer)

    intelligent_pruned_X_train, intelligent_pruned_y_train = preprocessed_intelligent_pruned_X_train, (np.array(intelligent_pruned_y_train))[0]
    intelligent_pruned_X_test, intelligent_pruned_y_test = preprocessed_X_test, y_test.to_numpy()
    print(f"Train data pruned intelligently at {ratio * 100}% :")
    results = evaluate_models(intelligent_pruned_X_train, intelligent_pruned_X_test, intelligent_pruned_y_train, intelligent_pruned_y_test)
    print(results)
    results_intelligent_pruning[ratio] = results
    print("_______________________________________________________________________________")

For per-cluster pruning ratio 0.5




Train data pruned intelligently at 20.0% :


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.8566666666666667, 'Precision': 0.4583333333333333, 'Recall': 0.2682926829268293, 'F1 Score': 0.3384615384615385, 'ROC AUC': 0.7673509746680477, 'Confusion Matrix': array([[246,  13],
       [ 30,  11]])}, 'SVM': {'Accuracy': 0.8666666666666667, 'Precision': 0.5714285714285714, 'Recall': 0.0975609756097561, 'F1 Score': 0.16666666666666669, 'ROC AUC': 0.7903286561823146, 'Confusion Matrix': array([[256,   3],
       [ 37,   4]])}, 'Decision Tree': {'Accuracy': 0.8033333333333333, 'Precision': 0.32, 'Recall': 0.3902439024390244, 'F1 Score': 0.35164835164835173, 'ROC AUC': 0.6294848855824465, 'Confusion Matrix': array([[225,  34],
       [ 25,  16]])}}
_______________________________________________________________________________


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train data pruned intelligently at 40.0% :
{'Logistic Regression': {'Accuracy': 0.85, 'Precision': 0.4444444444444444, 'Recall': 0.3902439024390244, 'F1 Score': 0.4155844155844156, 'ROC AUC': 0.7724832846784067, 'Confusion Matrix': array([[239,  20],
       [ 25,  16]])}, 'SVM': {'Accuracy': 0.8766666666666667, 'Precision': 0.625, 'Recall': 0.24390243902439024, 'F1 Score': 0.3508771929824561, 'ROC AUC': 0.8107637253978718, 'Confusion Matrix': array([[253,   6],
       [ 31,  10]])}, 'Decision Tree': {'Accuracy': 0.7566666666666667, 'Precision': 0.25757575757575757, 'Recall': 0.4146341463414634, 'F1 Score': 0.31775700934579443, 'ROC AUC': 0.612722478576137, 'Confusion Matrix': array([[210,  49],
       [ 24,  17]])}}
_______________________________________________________________________________




Train data pruned intelligently at 60.00000000000001% :
{'Logistic Regression': {'Accuracy': 0.8033333333333333, 'Precision': 0.3269230769230769, 'Recall': 0.4146341463414634, 'F1 Score': 0.3655913978494624, 'ROC AUC': 0.7524719841793013, 'Confusion Matrix': array([[224,  35],
       [ 24,  17]])}, 'SVM': {'Accuracy': 0.8366666666666667, 'Precision': 0.4, 'Recall': 0.3902439024390244, 'F1 Score': 0.39506172839506176, 'ROC AUC': 0.7787456445993031, 'Confusion Matrix': array([[235,  24],
       [ 25,  16]])}, 'Decision Tree': {'Accuracy': 0.8066666666666666, 'Precision': 0.373134328358209, 'Recall': 0.6097560975609756, 'F1 Score': 0.4629629629629629, 'ROC AUC': 0.7237969676994067, 'Confusion Matrix': array([[217,  42],
       [ 16,  25]])}}
_______________________________________________________________________________




Train data pruned intelligently at 80.0% :
{'Logistic Regression': {'Accuracy': 0.7466666666666667, 'Precision': 0.2891566265060241, 'Recall': 0.5853658536585366, 'F1 Score': 0.3870967741935484, 'ROC AUC': 0.7443262077408418, 'Confusion Matrix': array([[200,  59],
       [ 17,  24]])}, 'SVM': {'Accuracy': 0.79, 'Precision': 0.3382352941176471, 'Recall': 0.5609756097560976, 'F1 Score': 0.4220183486238533, 'ROC AUC': 0.7422544495715228, 'Confusion Matrix': array([[214,  45],
       [ 18,  23]])}, 'Decision Tree': {'Accuracy': 0.67, 'Precision': 0.24561403508771928, 'Recall': 0.6829268292682927, 'F1 Score': 0.36129032258064514, 'ROC AUC': 0.6754402486109803, 'Confusion Matrix': array([[173,  86],
       [ 13,  28]])}}
_______________________________________________________________________________




Train data pruned intelligently at 100.0% :
{'Logistic Regression': {'Accuracy': 0.7, 'Precision': 0.25742574257425743, 'Recall': 0.6341463414634146, 'F1 Score': 0.36619718309859156, 'ROC AUC': 0.7300122422073643, 'Confusion Matrix': array([[184,  75],
       [ 15,  26]])}, 'SVM': {'Accuracy': 0.7533333333333333, 'Precision': 0.30120481927710846, 'Recall': 0.6097560975609756, 'F1 Score': 0.40322580645161293, 'ROC AUC': 0.7143798851115924, 'Confusion Matrix': array([[201,  58],
       [ 16,  25]])}, 'Decision Tree': {'Accuracy': 0.72, 'Precision': 0.2828282828282828, 'Recall': 0.6829268292682927, 'F1 Score': 0.4, 'ROC AUC': 0.7043977775685093, 'Confusion Matrix': array([[188,  71],
       [ 13,  28]])}}
_______________________________________________________________________________
For per-cluster pruning ratio 0.7




Train data pruned intelligently at 20.0% :
{'Logistic Regression': {'Accuracy': 0.8633333333333333, 'Precision': 0.5, 'Recall': 0.34146341463414637, 'F1 Score': 0.40579710144927533, 'ROC AUC': 0.7577455504284774, 'Confusion Matrix': array([[245,  14],
       [ 27,  14]])}, 'SVM': {'Accuracy': 0.8666666666666667, 'Precision': 0.5714285714285714, 'Recall': 0.0975609756097561, 'F1 Score': 0.16666666666666669, 'ROC AUC': 0.7861851398436764, 'Confusion Matrix': array([[256,   3],
       [ 37,   4]])}, 'Decision Tree': {'Accuracy': 0.78, 'Precision': 0.2549019607843137, 'Recall': 0.3170731707317073, 'F1 Score': 0.28260869565217384, 'ROC AUC': 0.5851775120067803, 'Confusion Matrix': array([[221,  38],
       [ 28,  13]])}}
_______________________________________________________________________________


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train data pruned intelligently at 40.0% :
{'Logistic Regression': {'Accuracy': 0.85, 'Precision': 0.4375, 'Recall': 0.34146341463414637, 'F1 Score': 0.3835616438356165, 'ROC AUC': 0.7678218287974385, 'Confusion Matrix': array([[241,  18],
       [ 27,  14]])}, 'SVM': {'Accuracy': 0.8666666666666667, 'Precision': 0.5454545454545454, 'Recall': 0.14634146341463414, 'F1 Score': 0.23076923076923073, 'ROC AUC': 0.7951313683021, 'Confusion Matrix': array([[254,   5],
       [ 35,   6]])}, 'Decision Tree': {'Accuracy': 0.81, 'Precision': 0.32608695652173914, 'Recall': 0.36585365853658536, 'F1 Score': 0.3448275862068966, 'ROC AUC': 0.6230812694227328, 'Confusion Matrix': array([[228,  31],
       [ 26,  15]])}}
_______________________________________________________________________________


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train data pruned intelligently at 60.00000000000001% :
{'Logistic Regression': {'Accuracy': 0.8433333333333334, 'Precision': 0.425, 'Recall': 0.4146341463414634, 'F1 Score': 0.41975308641975306, 'ROC AUC': 0.7682455975138903, 'Confusion Matrix': array([[236,  23],
       [ 24,  17]])}, 'SVM': {'Accuracy': 0.8733333333333333, 'Precision': 0.5882352941176471, 'Recall': 0.24390243902439024, 'F1 Score': 0.3448275862068965, 'ROC AUC': 0.8134005085224597, 'Confusion Matrix': array([[252,   7],
       [ 31,  10]])}, 'Decision Tree': {'Accuracy': 0.79, 'Precision': 0.31666666666666665, 'Recall': 0.4634146341463415, 'F1 Score': 0.3762376237623763, 'ROC AUC': 0.6525567379225916, 'Confusion Matrix': array([[218,  41],
       [ 22,  19]])}}
_______________________________________________________________________________




Train data pruned intelligently at 80.0% :
{'Logistic Regression': {'Accuracy': 0.8033333333333333, 'Precision': 0.3392857142857143, 'Recall': 0.4634146341463415, 'F1 Score': 0.3917525773195876, 'ROC AUC': 0.7799227799227799, 'Confusion Matrix': array([[222,  37],
       [ 22,  19]])}, 'SVM': {'Accuracy': 0.8333333333333334, 'Precision': 0.3902439024390244, 'Recall': 0.3902439024390244, 'F1 Score': 0.3902439024390244, 'ROC AUC': 0.7781335342310953, 'Confusion Matrix': array([[234,  25],
       [ 25,  16]])}, 'Decision Tree': {'Accuracy': 0.71, 'Precision': 0.23863636363636365, 'Recall': 0.5121951219512195, 'F1 Score': 0.32558139534883723, 'ROC AUC': 0.6267539316319805, 'Confusion Matrix': array([[192,  67],
       [ 20,  21]])}}
_______________________________________________________________________________




Train data pruned intelligently at 100.0% :
{'Logistic Regression': {'Accuracy': 0.67, 'Precision': 0.25, 'Recall': 0.7073170731707317, 'F1 Score': 0.36942675159235666, 'ROC AUC': 0.7583576607966852, 'Confusion Matrix': array([[172,  87],
       [ 12,  29]])}, 'SVM': {'Accuracy': 0.7166666666666667, 'Precision': 0.28, 'Recall': 0.6829268292682927, 'F1 Score': 0.3971631205673759, 'ROC AUC': 0.7394293247951783, 'Confusion Matrix': array([[187,  72],
       [ 13,  28]])}, 'Decision Tree': {'Accuracy': 0.6033333333333334, 'Precision': 0.21739130434782608, 'Recall': 0.7317073170731707, 'F1 Score': 0.335195530726257, 'ROC AUC': 0.6573594500423768, 'Confusion Matrix': array([[151, 108],
       [ 11,  30]])}}
_______________________________________________________________________________
For per-cluster pruning ratio 0.9


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train data pruned intelligently at 20.0% :
{'Logistic Regression': {'Accuracy': 0.8733333333333333, 'Precision': 0.5652173913043478, 'Recall': 0.3170731707317073, 'F1 Score': 0.40625, 'ROC AUC': 0.7580751483190508, 'Confusion Matrix': array([[249,  10],
       [ 28,  13]])}, 'SVM': {'Accuracy': 0.87, 'Precision': 0.75, 'Recall': 0.07317073170731707, 'F1 Score': 0.13333333333333333, 'ROC AUC': 0.7794048403804501, 'Confusion Matrix': array([[258,   1],
       [ 38,   3]])}, 'Decision Tree': {'Accuracy': 0.7766666666666666, 'Precision': 0.25925925925925924, 'Recall': 0.34146341463414637, 'F1 Score': 0.29473684210526313, 'ROC AUC': 0.593511630096996, 'Confusion Matrix': array([[219,  40],
       [ 27,  14]])}}
_______________________________________________________________________________


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train data pruned intelligently at 40.0% :
{'Logistic Regression': {'Accuracy': 0.8666666666666667, 'Precision': 0.5185185185185185, 'Recall': 0.34146341463414637, 'F1 Score': 0.411764705882353, 'ROC AUC': 0.7554854506074019, 'Confusion Matrix': array([[246,  13],
       [ 27,  14]])}, 'SVM': {'Accuracy': 0.8766666666666667, 'Precision': 0.8333333333333334, 'Recall': 0.12195121951219512, 'F1 Score': 0.21276595744680848, 'ROC AUC': 0.7883510688388737, 'Confusion Matrix': array([[258,   1],
       [ 36,   5]])}, 'Decision Tree': {'Accuracy': 0.8066666666666666, 'Precision': 0.3111111111111111, 'Recall': 0.34146341463414637, 'F1 Score': 0.3255813953488372, 'ROC AUC': 0.6108861474715134, 'Confusion Matrix': array([[228,  31],
       [ 27,  14]])}}
_______________________________________________________________________________


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train data pruned intelligently at 60.00000000000001% :
{'Logistic Regression': {'Accuracy': 0.8466666666666667, 'Precision': 0.42424242424242425, 'Recall': 0.34146341463414637, 'F1 Score': 0.37837837837837834, 'ROC AUC': 0.7697523307279405, 'Confusion Matrix': array([[240,  19],
       [ 27,  14]])}, 'SVM': {'Accuracy': 0.87, 'Precision': 0.6, 'Recall': 0.14634146341463414, 'F1 Score': 0.23529411764705882, 'ROC AUC': 0.8034184009793766, 'Confusion Matrix': array([[255,   4],
       [ 35,   6]])}, 'Decision Tree': {'Accuracy': 0.7366666666666667, 'Precision': 0.2361111111111111, 'Recall': 0.4146341463414634, 'F1 Score': 0.30088495575221236, 'ROC AUC': 0.6011394669931255, 'Confusion Matrix': array([[204,  55],
       [ 24,  17]])}}
_______________________________________________________________________________




Train data pruned intelligently at 80.0% :
{'Logistic Regression': {'Accuracy': 0.7933333333333333, 'Precision': 0.3333333333333333, 'Recall': 0.5121951219512195, 'F1 Score': 0.40384615384615385, 'ROC AUC': 0.7381109332328846, 'Confusion Matrix': array([[217,  42],
       [ 20,  21]])}, 'SVM': {'Accuracy': 0.8633333333333333, 'Precision': 0.5, 'Recall': 0.2926829268292683, 'F1 Score': 0.3692307692307692, 'ROC AUC': 0.7676805725586214, 'Confusion Matrix': array([[247,  12],
       [ 29,  12]])}, 'Decision Tree': {'Accuracy': 0.7433333333333333, 'Precision': 0.25, 'Recall': 0.43902439024390244, 'F1 Score': 0.31858407079646023, 'ROC AUC': 0.6152650908748469, 'Confusion Matrix': array([[205,  54],
       [ 23,  18]])}}
_______________________________________________________________________________




Train data pruned intelligently at 100.0% :
{'Logistic Regression': {'Accuracy': 0.49, 'Precision': 0.18181818181818182, 'Recall': 0.7804878048780488, 'F1 Score': 0.29493087557603687, 'ROC AUC': 0.700254261229871, 'Confusion Matrix': array([[115, 144],
       [  9,  32]])}, 'SVM': {'Accuracy': 0.38, 'Precision': 0.1497584541062802, 'Recall': 0.7560975609756098, 'F1 Score': 0.25, 'ROC AUC': 0.6133345889443451, 'Confusion Matrix': array([[ 83, 176],
       [ 10,  31]])}, 'Decision Tree': {'Accuracy': 0.46, 'Precision': 0.15819209039548024, 'Recall': 0.6829268292682927, 'F1 Score': 0.25688073394495414, 'ROC AUC': 0.5538186269893588, 'Confusion Matrix': array([[110, 149],
       [ 13,  28]])}}
_______________________________________________________________________________
For per-cluster pruning ratio 1


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train data pruned intelligently at 20.0% :
{'Logistic Regression': {'Accuracy': 0.8766666666666667, 'Precision': 0.5909090909090909, 'Recall': 0.3170731707317073, 'F1 Score': 0.41269841269841273, 'ROC AUC': 0.7546849985874376, 'Confusion Matrix': array([[250,   9],
       [ 28,  13]])}, 'SVM': {'Accuracy': 0.87, 'Precision': 0.75, 'Recall': 0.07317073170731707, 'F1 Score': 0.13333333333333333, 'ROC AUC': 0.7804407194651096, 'Confusion Matrix': array([[258,   1],
       [ 38,   3]])}, 'Decision Tree': {'Accuracy': 0.7933333333333333, 'Precision': 0.3090909090909091, 'Recall': 0.4146341463414634, 'F1 Score': 0.3541666666666667, 'ROC AUC': 0.6339579998116583, 'Confusion Matrix': array([[221,  38],
       [ 24,  17]])}}
_______________________________________________________________________________


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train data pruned intelligently at 40.0% :
{'Logistic Regression': {'Accuracy': 0.8566666666666667, 'Precision': 0.4666666666666667, 'Recall': 0.34146341463414637, 'F1 Score': 0.3943661971830986, 'ROC AUC': 0.7550145964780112, 'Confusion Matrix': array([[243,  16],
       [ 27,  14]])}, 'SVM': {'Accuracy': 0.8766666666666667, 'Precision': 0.8333333333333334, 'Recall': 0.12195121951219512, 'F1 Score': 0.21276595744680848, 'ROC AUC': 0.7912703644410961, 'Confusion Matrix': array([[258,   1],
       [ 36,   5]])}, 'Decision Tree': {'Accuracy': 0.7833333333333333, 'Precision': 0.29310344827586204, 'Recall': 0.4146341463414634, 'F1 Score': 0.34343434343434337, 'ROC AUC': 0.6281664940201526, 'Confusion Matrix': array([[218,  41],
       [ 24,  17]])}}
_______________________________________________________________________________


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train data pruned intelligently at 60.00000000000001% :
{'Logistic Regression': {'Accuracy': 0.85, 'Precision': 0.4411764705882353, 'Recall': 0.36585365853658536, 'F1 Score': 0.4, 'ROC AUC': 0.7495056031641398, 'Confusion Matrix': array([[240,  19],
       [ 26,  15]])}, 'SVM': {'Accuracy': 0.87, 'Precision': 0.6, 'Recall': 0.14634146341463414, 'F1 Score': 0.23529411764705882, 'ROC AUC': 0.7735191637630662, 'Confusion Matrix': array([[255,   4],
       [ 35,   6]])}, 'Decision Tree': {'Accuracy': 0.72, 'Precision': 0.1791044776119403, 'Recall': 0.2926829268292683, 'F1 Score': 0.2222222222222222, 'ROC AUC': 0.5401638572370279, 'Confusion Matrix': array([[204,  55],
       [ 29,  12]])}}
_______________________________________________________________________________




Train data pruned intelligently at 80.0% :
{'Logistic Regression': {'Accuracy': 0.7433333333333333, 'Precision': 0.29069767441860467, 'Recall': 0.6097560975609756, 'F1 Score': 0.3937007874015748, 'ROC AUC': 0.7159807891515209, 'Confusion Matrix': array([[198,  61],
       [ 16,  25]])}, 'SVM': {'Accuracy': 0.8033333333333333, 'Precision': 0.3125, 'Recall': 0.36585365853658536, 'F1 Score': 0.33707865168539325, 'ROC AUC': 0.7124964685940296, 'Confusion Matrix': array([[226,  33],
       [ 26,  15]])}, 'Decision Tree': {'Accuracy': 0.64, 'Precision': 0.18691588785046728, 'Recall': 0.4878048780487805, 'F1 Score': 0.27027027027027023, 'ROC AUC': 0.5759487710707223, 'Confusion Matrix': array([[172,  87],
       [ 21,  20]])}}
_______________________________________________________________________________




Train data pruned intelligently at 100.0% :
{'Logistic Regression': {'Accuracy': 0.7833333333333333, 'Precision': 0.3333333333333333, 'Recall': 0.5853658536585366, 'F1 Score': 0.42477876106194684, 'ROC AUC': 0.7613711272247857, 'Confusion Matrix': array([[211,  48],
       [ 17,  24]])}, 'SVM': {'Accuracy': 0.8, 'Precision': 0.3582089552238806, 'Recall': 0.5853658536585366, 'F1 Score': 0.4444444444444444, 'ROC AUC': 0.7506356530746775, 'Confusion Matrix': array([[216,  43],
       [ 17,  24]])}, 'Decision Tree': {'Accuracy': 0.7733333333333333, 'Precision': 0.3246753246753247, 'Recall': 0.6097560975609756, 'F1 Score': 0.423728813559322, 'ROC AUC': 0.7044919483943873, 'Confusion Matrix': array([[207,  52],
       [ 16,  25]])}}
_______________________________________________________________________________


#Calling Random Pruning

In [20]:
results_random_pruning = dict()
for ratio in ratios:
  random_pruned_X_train, random_pruned_y_train = random_prune_data(X_train.to_numpy(), y_train.to_numpy(), ratio)
  preprocessed_random_pruned_X_train, scaler, imputer = preprocess_data_train(random_pruned_X_train)
  preprocessed_X_test = preprocess_data_test(X_test, scaler, imputer)

  random_pruned_X_train, random_pruned_y_train = preprocessed_random_pruned_X_train, random_pruned_y_train
  random_pruned_X_test, random_pruned_y_test = preprocessed_X_test, y_test.to_numpy()

  print(f"Train data pruned randomly at {ratio * 100}% :")
  results = evaluate_models(random_pruned_X_train, random_pruned_X_test, random_pruned_y_train, random_pruned_y_test)
  print(results)
  results_random_pruning[ratio] = results
  print("_______________________________________________________________________________")

Train data pruned randomly at 20.0% :


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.8766666666666667, 'Precision': 0.5833333333333334, 'Recall': 0.34146341463414637, 'F1 Score': 0.43076923076923085, 'ROC AUC': 0.7719182597231379, 'Confusion Matrix': array([[249,  10],
       [ 27,  14]])}, 'SVM': {'Accuracy': 0.8666666666666667, 'Precision': 0.6666666666666666, 'Recall': 0.04878048780487805, 'F1 Score': 0.0909090909090909, 'ROC AUC': 0.8062435257557209, 'Confusion Matrix': array([[258,   1],
       [ 39,   2]])}, 'Decision Tree': {'Accuracy': 0.8066666666666666, 'Precision': 0.3111111111111111, 'Recall': 0.34146341463414637, 'F1 Score': 0.3255813953488372, 'ROC AUC': 0.6108861474715134, 'Confusion Matrix': array([[228,  31],
       [ 27,  14]])}}
_______________________________________________________________________________
Train data pruned randomly at 40.0% :


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.8633333333333333, 'Precision': 0.5, 'Recall': 0.34146341463414637, 'F1 Score': 0.40579710144927533, 'ROC AUC': 0.7725774555042849, 'Confusion Matrix': array([[245,  14],
       [ 27,  14]])}, 'SVM': {'Accuracy': 0.8733333333333333, 'Precision': 0.7142857142857143, 'Recall': 0.12195121951219512, 'F1 Score': 0.20833333333333334, 'ROC AUC': 0.8053018174969394, 'Confusion Matrix': array([[257,   2],
       [ 36,   5]])}, 'Decision Tree': {'Accuracy': 0.7866666666666666, 'Precision': 0.2909090909090909, 'Recall': 0.3902439024390244, 'F1 Score': 0.3333333333333333, 'ROC AUC': 0.6198323759299368, 'Confusion Matrix': array([[220,  39],
       [ 25,  16]])}}
_______________________________________________________________________________
Train data pruned randomly at 60.00000000000001% :




{'Logistic Regression': {'Accuracy': 0.85, 'Precision': 0.4444444444444444, 'Recall': 0.3902439024390244, 'F1 Score': 0.4155844155844156, 'ROC AUC': 0.76673886429984, 'Confusion Matrix': array([[239,  20],
       [ 25,  16]])}, 'SVM': {'Accuracy': 0.8666666666666667, 'Precision': 0.5294117647058824, 'Recall': 0.21951219512195122, 'F1 Score': 0.3103448275862069, 'ROC AUC': 0.7999811658348244, 'Confusion Matrix': array([[251,   8],
       [ 32,   9]])}, 'Decision Tree': {'Accuracy': 0.7366666666666667, 'Precision': 0.2625, 'Recall': 0.5121951219512195, 'F1 Score': 0.34710743801652894, 'ROC AUC': 0.6421979470759959, 'Confusion Matrix': array([[200,  59],
       [ 20,  21]])}}
_______________________________________________________________________________
Train data pruned randomly at 80.0% :




{'Logistic Regression': {'Accuracy': 0.8033333333333333, 'Precision': 0.35, 'Recall': 0.5121951219512195, 'F1 Score': 0.4158415841584158, 'ROC AUC': 0.7747433844994821, 'Confusion Matrix': array([[220,  39],
       [ 20,  21]])}, 'SVM': {'Accuracy': 0.8566666666666667, 'Precision': 0.4772727272727273, 'Recall': 0.5121951219512195, 'F1 Score': 0.49411764705882355, 'ROC AUC': 0.8174027686222808, 'Confusion Matrix': array([[236,  23],
       [ 20,  21]])}, 'Decision Tree': {'Accuracy': 0.7266666666666667, 'Precision': 0.26436781609195403, 'Recall': 0.5609756097560976, 'F1 Score': 0.359375, 'ROC AUC': 0.6569356813259253, 'Confusion Matrix': array([[195,  64],
       [ 18,  23]])}}
_______________________________________________________________________________
Train data pruned randomly at 100.0% :




{'Logistic Regression': {'Accuracy': 0.6933333333333334, 'Precision': 0.2571428571428571, 'Recall': 0.6585365853658537, 'F1 Score': 0.3698630136986301, 'ROC AUC': 0.7779451925793388, 'Confusion Matrix': array([[181,  78],
       [ 14,  27]])}, 'SVM': {'Accuracy': 0.7033333333333334, 'Precision': 0.2777777777777778, 'Recall': 0.7317073170731707, 'F1 Score': 0.40268456375838924, 'ROC AUC': 0.7900932291176194, 'Confusion Matrix': array([[181,  78],
       [ 11,  30]])}, 'Decision Tree': {'Accuracy': 0.6266666666666667, 'Precision': 0.2248062015503876, 'Recall': 0.7073170731707317, 'F1 Score': 0.34117647058823536, 'ROC AUC': 0.6606083435351728, 'Confusion Matrix': array([[159, 100],
       [ 12,  29]])}}
_______________________________________________________________________________


# Calling SDV-Oversampling

In [21]:
sd1, train_df = do_sdv(X_train, y_train)
results_syn_sdv = dict()

# Add synthetic data at different percentages to the main DataFrame
for ratio in ratios:
    combined_df = add_synthetic_data(train_df, sd1, ratio)
    y_train_sdv = combined_df.iloc[:, -1]
    X_train_sdv = combined_df.iloc[:, :-1]

    preprocessed_X_train_sdv, scaler, imputer = preprocess_data_train(X_train_sdv)
    preprocessed_X_test_sdv = preprocess_data_test(X_test, scaler, imputer)

    X_train_sdv, y_train_sdv = preprocessed_X_train_sdv, y_train_sdv.to_numpy()
    X_test_sdv, y_test_sdv = preprocessed_X_test_sdv, y_test.to_numpy()

    print(f"Train data combined with {ratio * 100}% synthetic data of minority class:")
    results = evaluate_models(X_train_sdv, X_test_sdv, y_train_sdv, y_test_sdv)
    results_syn_sdv[ratio] = results
    print(results)
    print("_______________________________________________________________________________")



Train data combined with 20.0% synthetic data of minority class:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.8666666666666667, 'Precision': 0.5217391304347826, 'Recall': 0.2926829268292683, 'F1 Score': 0.375, 'ROC AUC': 0.7310952067049628, 'Confusion Matrix': array([[248,  11],
       [ 29,  12]])}, 'SVM': {'Accuracy': 0.8733333333333333, 'Precision': 0.7142857142857143, 'Recall': 0.12195121951219512, 'F1 Score': 0.20833333333333334, 'ROC AUC': 0.7441378660890856, 'Confusion Matrix': array([[257,   2],
       [ 36,   5]])}, 'Decision Tree': {'Accuracy': 0.7933333333333333, 'Precision': 0.29411764705882354, 'Recall': 0.36585365853658536, 'F1 Score': 0.32608695652173914, 'ROC AUC': 0.6134287597702232, 'Confusion Matrix': array([[223,  36],
       [ 26,  15]])}}
_______________________________________________________________________________
Train data combined with 40.0% synthetic data of minority class:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.87, 'Precision': 0.5454545454545454, 'Recall': 0.2926829268292683, 'F1 Score': 0.3809523809523809, 'ROC AUC': 0.725350786326396, 'Confusion Matrix': array([[249,  10],
       [ 29,  12]])}, 'SVM': {'Accuracy': 0.8733333333333333, 'Precision': 0.6666666666666666, 'Recall': 0.14634146341463414, 'F1 Score': 0.24, 'ROC AUC': 0.7296355589038516, 'Confusion Matrix': array([[256,   3],
       [ 35,   6]])}, 'Decision Tree': {'Accuracy': 0.7933333333333333, 'Precision': 0.29411764705882354, 'Recall': 0.36585365853658536, 'F1 Score': 0.32608695652173914, 'ROC AUC': 0.6134287597702232, 'Confusion Matrix': array([[223,  36],
       [ 26,  15]])}}
_______________________________________________________________________________
Train data combined with 60.00000000000001% synthetic data of minority class:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.8666666666666667, 'Precision': 0.5217391304347826, 'Recall': 0.2926829268292683, 'F1 Score': 0.375, 'ROC AUC': 0.7274225444957152, 'Confusion Matrix': array([[248,  11],
       [ 29,  12]])}, 'SVM': {'Accuracy': 0.87, 'Precision': 0.6, 'Recall': 0.14634146341463414, 'F1 Score': 0.23529411764705882, 'ROC AUC': 0.7287880214709482, 'Confusion Matrix': array([[255,   4],
       [ 35,   6]])}, 'Decision Tree': {'Accuracy': 0.7933333333333333, 'Precision': 0.29411764705882354, 'Recall': 0.36585365853658536, 'F1 Score': 0.32608695652173914, 'ROC AUC': 0.6134287597702232, 'Confusion Matrix': array([[223,  36],
       [ 26,  15]])}}
_______________________________________________________________________________
Train data combined with 80.0% synthetic data of minority class:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.87, 'Precision': 0.5454545454545454, 'Recall': 0.2926829268292683, 'F1 Score': 0.3809523809523809, 'ROC AUC': 0.724691590545249, 'Confusion Matrix': array([[249,  10],
       [ 29,  12]])}, 'SVM': {'Accuracy': 0.87, 'Precision': 0.6, 'Recall': 0.14634146341463414, 'F1 Score': 0.23529411764705882, 'ROC AUC': 0.7269046049533854, 'Confusion Matrix': array([[255,   4],
       [ 35,   6]])}, 'Decision Tree': {'Accuracy': 0.79, 'Precision': 0.28846153846153844, 'Recall': 0.36585365853658536, 'F1 Score': 0.3225806451612903, 'ROC AUC': 0.6114982578397212, 'Confusion Matrix': array([[222,  37],
       [ 26,  15]])}}
_______________________________________________________________________________
Train data combined with 100.0% synthetic data of minority class:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.8666666666666667, 'Precision': 0.5217391304347826, 'Recall': 0.2926829268292683, 'F1 Score': 0.375, 'ROC AUC': 0.7199359638384029, 'Confusion Matrix': array([[248,  11],
       [ 29,  12]])}, 'SVM': {'Accuracy': 0.87, 'Precision': 0.6, 'Recall': 0.14634146341463414, 'F1 Score': 0.23529411764705882, 'ROC AUC': 0.7260570675204822, 'Confusion Matrix': array([[255,   4],
       [ 35,   6]])}, 'Decision Tree': {'Accuracy': 0.7933333333333333, 'Precision': 0.29411764705882354, 'Recall': 0.36585365853658536, 'F1 Score': 0.32608695652173914, 'ROC AUC': 0.6134287597702232, 'Confusion Matrix': array([[223,  36],
       [ 26,  15]])}}
_______________________________________________________________________________


# Calling SMOTE-Oversampling

In [22]:
results_smote = dict()

for ratio in ratios:

    X_train_smote, y_train_smote = smote_oversampling(X_train.to_numpy(), y_train.to_numpy(), [ratio])
    preprocessed_X_train_smote, scaler, imputer = preprocess_data_train((np.array(X_train_smote))[0])
    preprocessed_X_test_smote = preprocess_data_test(X_test, scaler, imputer)

    X_train_smote, y_train_smote = preprocessed_X_train_smote, (np.array(y_train_smote))[0]
    X_test_smote, y_test_smote = preprocessed_X_test_smote, y_test.to_numpy()

    print(f"Train data combined with {ratio * 100}% synthetic data of minority class:")
    print(len(X_train_smote), len(y_train_smote))
    results = evaluate_models(X_train_smote, X_test_smote, y_train_smote, y_test_smote)
    results_smote[ratio] = results
    print(results)
    print("_______________________________________________________________________________")

Train data combined with 20.0% synthetic data of minority class:
1369 1369


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.8466666666666667, 'Precision': 0.42857142857142855, 'Recall': 0.36585365853658536, 'F1 Score': 0.39473684210526316, 'ROC AUC': 0.7496468594029569, 'Confusion Matrix': array([[239,  20],
       [ 26,  15]])}, 'SVM': {'Accuracy': 0.8666666666666667, 'Precision': 0.5294117647058824, 'Recall': 0.21951219512195122, 'F1 Score': 0.3103448275862069, 'ROC AUC': 0.7928712684810245, 'Confusion Matrix': array([[251,   8],
       [ 32,   9]])}, 'Decision Tree': {'Accuracy': 0.7933333333333333, 'Precision': 0.2558139534883721, 'Recall': 0.2682926829268293, 'F1 Score': 0.2619047619047619, 'ROC AUC': 0.5723702796873529, 'Confusion Matrix': array([[227,  32],
       [ 30,  11]])}}
_______________________________________________________________________________
Train data combined with 40.0% synthetic data of minority class:
1541 1541


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.8166666666666667, 'Precision': 0.36, 'Recall': 0.43902439024390244, 'F1 Score': 0.3956043956043956, 'ROC AUC': 0.7506827384876166, 'Confusion Matrix': array([[227,  32],
       [ 23,  18]])}, 'SVM': {'Accuracy': 0.8433333333333334, 'Precision': 0.4, 'Recall': 0.2926829268292683, 'F1 Score': 0.3380281690140845, 'ROC AUC': 0.7847725774555043, 'Confusion Matrix': array([[241,  18],
       [ 29,  12]])}, 'Decision Tree': {'Accuracy': 0.7333333333333333, 'Precision': 0.2, 'Recall': 0.3170731707317073, 'F1 Score': 0.24528301886792453, 'ROC AUC': 0.5581504849797533, 'Confusion Matrix': array([[207,  52],
       [ 28,  13]])}}
_______________________________________________________________________________
Train data combined with 60.00000000000001% synthetic data of minority class:
1713 1713


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.79, 'Precision': 0.31666666666666665, 'Recall': 0.4634146341463415, 'F1 Score': 0.3762376237623763, 'ROC AUC': 0.756238817214427, 'Confusion Matrix': array([[218,  41],
       [ 22,  19]])}, 'SVM': {'Accuracy': 0.83, 'Precision': 0.391304347826087, 'Recall': 0.43902439024390244, 'F1 Score': 0.4137931034482759, 'ROC AUC': 0.7752142386288727, 'Confusion Matrix': array([[231,  28],
       [ 23,  18]])}, 'Decision Tree': {'Accuracy': 0.8066666666666666, 'Precision': 0.3508771929824561, 'Recall': 0.4878048780487805, 'F1 Score': 0.4081632653061224, 'ROC AUC': 0.672473867595819, 'Confusion Matrix': array([[222,  37],
       [ 21,  20]])}}
_______________________________________________________________________________
Train data combined with 80.0% synthetic data of minority class:
1885 1885


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.7566666666666667, 'Precision': 0.2894736842105263, 'Recall': 0.5365853658536586, 'F1 Score': 0.37606837606837606, 'ROC AUC': 0.750965250965251, 'Confusion Matrix': array([[205,  54],
       [ 19,  22]])}, 'SVM': {'Accuracy': 0.82, 'Precision': 0.38596491228070173, 'Recall': 0.5365853658536586, 'F1 Score': 0.4489795918367347, 'ROC AUC': 0.7738487616536398, 'Confusion Matrix': array([[224,  35],
       [ 19,  22]])}, 'Decision Tree': {'Accuracy': 0.7966666666666666, 'Precision': 0.3148148148148148, 'Recall': 0.4146341463414634, 'F1 Score': 0.35789473684210527, 'ROC AUC': 0.6358885017421603, 'Confusion Matrix': array([[222,  37],
       [ 24,  17]])}}
_______________________________________________________________________________
Train data combined with 100.0% synthetic data of minority class:
2058 2058


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.7433333333333333, 'Precision': 0.2804878048780488, 'Recall': 0.5609756097560976, 'F1 Score': 0.3739837398373984, 'ROC AUC': 0.7440436952632076, 'Confusion Matrix': array([[200,  59],
       [ 18,  23]])}, 'SVM': {'Accuracy': 0.8, 'Precision': 0.3492063492063492, 'Recall': 0.5365853658536586, 'F1 Score': 0.4230769230769231, 'ROC AUC': 0.7628778604388361, 'Confusion Matrix': array([[218,  41],
       [ 19,  22]])}, 'Decision Tree': {'Accuracy': 0.7633333333333333, 'Precision': 0.2222222222222222, 'Recall': 0.2926829268292683, 'F1 Score': 0.25263157894736843, 'ROC AUC': 0.565260382333553, 'Confusion Matrix': array([[217,  42],
       [ 29,  12]])}}
_______________________________________________________________________________


# Calling Random-Oversampling

In [23]:
results_random = dict()

for ratio in ratios:

    X_train_random, y_train_random = random_oversampling(X_train.to_numpy(), y_train.to_numpy(), [ratio])

    preprocessed_X_train_random, scaler, imputer = preprocess_data_train((np.array(X_train_random)[0]))
    preprocessed_X_test_random = preprocess_data_test(X_test, scaler, imputer)

    X_train_random, y_train_random = preprocessed_X_train_random, (np.array(y_train_random))[0]
    X_test_random, y_test_random = preprocessed_X_test_random, y_test.to_numpy()

    print(f"Train data combined with {ratio * 100}% synthetic data of minority class:")
    print(len(X_train_random), len(y_train_random))
    results = evaluate_models(X_train_random, X_test_random, y_train_random, y_test_random)
    results_random[ratio] = results
    print(results)
    print("_______________________________________________________________________________")

Train data combined with 20.0% synthetic data of minority class:
1369 1369


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.8533333333333334, 'Precision': 0.45714285714285713, 'Recall': 0.3902439024390244, 'F1 Score': 0.42105263157894735, 'ROC AUC': 0.7561917318014879, 'Confusion Matrix': array([[240,  19],
       [ 25,  16]])}, 'SVM': {'Accuracy': 0.8733333333333333, 'Precision': 0.6363636363636364, 'Recall': 0.17073170731707318, 'F1 Score': 0.2692307692307692, 'ROC AUC': 0.8068085507109898, 'Confusion Matrix': array([[255,   4],
       [ 34,   7]])}, 'Decision Tree': {'Accuracy': 0.8033333333333333, 'Precision': 0.32, 'Recall': 0.3902439024390244, 'F1 Score': 0.35164835164835173, 'ROC AUC': 0.6294848855824465, 'Confusion Matrix': array([[225,  34],
       [ 25,  16]])}}
_______________________________________________________________________________
Train data combined with 40.0% synthetic data of minority class:
1541 1541


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.8333333333333334, 'Precision': 0.3953488372093023, 'Recall': 0.4146341463414634, 'F1 Score': 0.40476190476190477, 'ROC AUC': 0.7737075054148226, 'Confusion Matrix': array([[233,  26],
       [ 24,  17]])}, 'SVM': {'Accuracy': 0.85, 'Precision': 0.42857142857142855, 'Recall': 0.2926829268292683, 'F1 Score': 0.34782608695652173, 'ROC AUC': 0.8200866371598079, 'Confusion Matrix': array([[243,  16],
       [ 29,  12]])}, 'Decision Tree': {'Accuracy': 0.8233333333333334, 'Precision': 0.35, 'Recall': 0.34146341463414637, 'F1 Score': 0.34567901234567905, 'ROC AUC': 0.6205386571240229, 'Confusion Matrix': array([[233,  26],
       [ 27,  14]])}}
_______________________________________________________________________________
Train data combined with 60.00000000000001% synthetic data of minority class:
1713 1713


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.7933333333333333, 'Precision': 0.32786885245901637, 'Recall': 0.4878048780487805, 'F1 Score': 0.39215686274509803, 'ROC AUC': 0.7676334871456822, 'Confusion Matrix': array([[218,  41],
       [ 21,  20]])}, 'SVM': {'Accuracy': 0.84, 'Precision': 0.4186046511627907, 'Recall': 0.43902439024390244, 'F1 Score': 0.4285714285714286, 'ROC AUC': 0.816225633298804, 'Confusion Matrix': array([[234,  25],
       [ 23,  18]])}, 'Decision Tree': {'Accuracy': 0.83, 'Precision': 0.3333333333333333, 'Recall': 0.24390243902439024, 'F1 Score': 0.28169014084507044, 'ROC AUC': 0.5833411809021565, 'Confusion Matrix': array([[239,  20],
       [ 31,  10]])}}
_______________________________________________________________________________
Train data combined with 80.0% synthetic data of minority class:
1885 1885


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.7566666666666667, 'Precision': 0.3, 'Recall': 0.5853658536585366, 'F1 Score': 0.39669421487603307, 'ROC AUC': 0.7600056502495527, 'Confusion Matrix': array([[203,  56],
       [ 17,  24]])}, 'SVM': {'Accuracy': 0.8133333333333334, 'Precision': 0.3728813559322034, 'Recall': 0.5365853658536586, 'F1 Score': 0.44, 'ROC AUC': 0.8034654863923155, 'Confusion Matrix': array([[222,  37],
       [ 19,  22]])}, 'Decision Tree': {'Accuracy': 0.82, 'Precision': 0.30303030303030304, 'Recall': 0.24390243902439024, 'F1 Score': 0.2702702702702703, 'ROC AUC': 0.5775496751106507, 'Confusion Matrix': array([[236,  23],
       [ 31,  10]])}}
_______________________________________________________________________________
Train data combined with 100.0% synthetic data of minority class:
2058 2058


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.7266666666666667, 'Precision': 0.27472527472527475, 'Recall': 0.6097560975609756, 'F1 Score': 0.3787878787878788, 'ROC AUC': 0.7630191166776533, 'Confusion Matrix': array([[193,  66],
       [ 16,  25]])}, 'SVM': {'Accuracy': 0.8, 'Precision': 0.37333333333333335, 'Recall': 0.6829268292682927, 'F1 Score': 0.4827586206896552, 'ROC AUC': 0.8079856860344666, 'Confusion Matrix': array([[212,  47],
       [ 13,  28]])}, 'Decision Tree': {'Accuracy': 0.8, 'Precision': 0.24324324324324326, 'Recall': 0.21951219512195122, 'F1 Score': 0.23076923076923075, 'ROC AUC': 0.5557020435069215, 'Confusion Matrix': array([[231,  28],
       [ 32,   9]])}}
_______________________________________________________________________________


# Calling SVM-SMOTE Over-Sampling

In [24]:
results_svm_smote = dict()

for ratio in ratios:

    X_train_svm_smote, y_train_svm_smote = svm_smote_oversampling(X_train.to_numpy(), y_train.to_numpy(), [ratio])

    preprocessed_X_train_svm_smote, scaler, imputer = preprocess_data_train((np.array(X_train_svm_smote))[0])
    preprocessed_X_test_svm_smote = preprocess_data_test(X_test, scaler, imputer)

    X_train_svm_smote, y_train_svm_smote = preprocessed_X_train_svm_smote, (np.array(y_train_svm_smote))[0]
    X_test_svm_smote, y_test_svm_smote = preprocessed_X_test_svm_smote, y_test.to_numpy()

    print(f"Train data combined with {ratio * 100}% synthetic data of minority class:")
    print(len(X_train_svm_smote), len(y_train_svm_smote))
    results = evaluate_models(X_train_svm_smote, X_test_svm_smote, y_train_svm_smote, y_test_svm_smote)
    results_svm_smote[ratio] = results
    print(results)
    print("_______________________________________________________________________________")

Train data combined with 20.0% synthetic data of minority class:
1369 1369


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.8533333333333334, 'Precision': 0.45161290322580644, 'Recall': 0.34146341463414637, 'F1 Score': 0.3888888888888889, 'ROC AUC': 0.7759676052358979, 'Confusion Matrix': array([[242,  17],
       [ 27,  14]])}, 'SVM': {'Accuracy': 0.8733333333333333, 'Precision': 0.6363636363636364, 'Recall': 0.17073170731707318, 'F1 Score': 0.2692307692307692, 'ROC AUC': 0.776014690648837, 'Confusion Matrix': array([[255,   4],
       [ 34,   7]])}, 'Decision Tree': {'Accuracy': 0.78, 'Precision': 0.2807017543859649, 'Recall': 0.3902439024390244, 'F1 Score': 0.326530612244898, 'ROC AUC': 0.615971372068933, 'Confusion Matrix': array([[218,  41],
       [ 25,  16]])}}
_______________________________________________________________________________
Train data combined with 40.0% synthetic data of minority class:
1541 1541


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.8333333333333334, 'Precision': 0.3902439024390244, 'Recall': 0.3902439024390244, 'F1 Score': 0.3902439024390244, 'ROC AUC': 0.7569921838214522, 'Confusion Matrix': array([[234,  25],
       [ 25,  16]])}, 'SVM': {'Accuracy': 0.8633333333333333, 'Precision': 0.5, 'Recall': 0.2682926829268293, 'F1 Score': 0.34920634920634924, 'ROC AUC': 0.7706469535737829, 'Confusion Matrix': array([[248,  11],
       [ 30,  11]])}, 'Decision Tree': {'Accuracy': 0.8, 'Precision': 0.30612244897959184, 'Recall': 0.36585365853658536, 'F1 Score': 0.3333333333333333, 'ROC AUC': 0.617289763631227, 'Confusion Matrix': array([[225,  34],
       [ 26,  15]])}}
_______________________________________________________________________________
Train data combined with 60.00000000000001% synthetic data of minority class:
1713 1713


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.8233333333333334, 'Precision': 0.375, 'Recall': 0.43902439024390244, 'F1 Score': 0.40449438202247195, 'ROC AUC': 0.7593464544684057, 'Confusion Matrix': array([[229,  30],
       [ 23,  18]])}, 'SVM': {'Accuracy': 0.8566666666666667, 'Precision': 0.46875, 'Recall': 0.36585365853658536, 'F1 Score': 0.410958904109589, 'ROC AUC': 0.7731895658724928, 'Confusion Matrix': array([[242,  17],
       [ 26,  15]])}, 'Decision Tree': {'Accuracy': 0.8166666666666667, 'Precision': 0.34782608695652173, 'Recall': 0.3902439024390244, 'F1 Score': 0.36781609195402304, 'ROC AUC': 0.6372068933044542, 'Confusion Matrix': array([[229,  30],
       [ 25,  16]])}}
_______________________________________________________________________________
Train data combined with 80.0% synthetic data of minority class:
1885 1885


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.81, 'Precision': 0.34, 'Recall': 0.4146341463414634, 'F1 Score': 0.3736263736263737, 'ROC AUC': 0.7561917318014878, 'Confusion Matrix': array([[226,  33],
       [ 24,  17]])}, 'SVM': {'Accuracy': 0.84, 'Precision': 0.41025641025641024, 'Recall': 0.3902439024390244, 'F1 Score': 0.4, 'ROC AUC': 0.7738487616536397, 'Confusion Matrix': array([[236,  23],
       [ 25,  16]])}, 'Decision Tree': {'Accuracy': 0.82, 'Precision': 0.3617021276595745, 'Recall': 0.4146341463414634, 'F1 Score': 0.38636363636363635, 'ROC AUC': 0.6494020152556738, 'Confusion Matrix': array([[229,  30],
       [ 24,  17]])}}
_______________________________________________________________________________
Train data combined with 100.0% synthetic data of minority class:
2058 2058


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.8, 'Precision': 0.3333333333333333, 'Recall': 0.4634146341463415, 'F1 Score': 0.38775510204081637, 'ROC AUC': 0.7581222337319898, 'Confusion Matrix': array([[221,  38],
       [ 22,  19]])}, 'SVM': {'Accuracy': 0.8366666666666667, 'Precision': 0.4090909090909091, 'Recall': 0.43902439024390244, 'F1 Score': 0.42352941176470593, 'ROC AUC': 0.7710707222902344, 'Confusion Matrix': array([[233,  26],
       [ 23,  18]])}, 'Decision Tree': {'Accuracy': 0.7966666666666666, 'Precision': 0.3, 'Recall': 0.36585365853658536, 'F1 Score': 0.3296703296703297, 'ROC AUC': 0.6153592617007252, 'Confusion Matrix': array([[224,  35],
       [ 26,  15]])}}
_______________________________________________________________________________


# No-Sampling Results

In [25]:
results_no_sampling = dict()

for ratio in ratios:

    X_train_no_sampling, y_train_no_sampling = X_train.to_numpy(), y_train.to_numpy()

    preprocessed_X_train_no_sampling, scaler, imputer = preprocess_data_train(X_train_no_sampling)
    preprocessed_X_test_no_sampling = preprocess_data_test(X_test, scaler, imputer)

    X_train_no_sampling, y_train_no_sampling = preprocessed_X_train_no_sampling, y_train_no_sampling
    X_test_no_sampling, y_test_no_sampling = preprocessed_X_test_no_sampling, y_test.to_numpy()

    print(f"Train data combined with {ratio * 100}% synthetic data of minority class:")
    print(len(X_train_no_sampling), len(y_train_no_sampling))
    results = evaluate_models(X_train_no_sampling, X_test_no_sampling, y_train_no_sampling, y_test_no_sampling)
    results_no_sampling[ratio] = results
    print(results)
    print("_______________________________________________________________________________")

Train data combined with 20.0% synthetic data of minority class:
1197 1197


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.8833333333333333, 'Precision': 0.6666666666666666, 'Recall': 0.2926829268292683, 'F1 Score': 0.4067796610169492, 'ROC AUC': 0.7583576607966852, 'Confusion Matrix': array([[253,   6],
       [ 29,  12]])}, 'SVM': {'Accuracy': 0.8633333333333333, 'Precision': 0.5, 'Recall': 0.024390243902439025, 'F1 Score': 0.046511627906976744, 'ROC AUC': 0.7842075525002354, 'Confusion Matrix': array([[258,   1],
       [ 40,   1]])}, 'Decision Tree': {'Accuracy': 0.8166666666666667, 'Precision': 0.325, 'Recall': 0.3170731707317073, 'F1 Score': 0.3209876543209877, 'ROC AUC': 0.6064130332423016, 'Confusion Matrix': array([[232,  27],
       [ 28,  13]])}}
_______________________________________________________________________________
Train data combined with 40.0% synthetic data of minority class:
1197 1197


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.8833333333333333, 'Precision': 0.6666666666666666, 'Recall': 0.2926829268292683, 'F1 Score': 0.4067796610169492, 'ROC AUC': 0.7583576607966852, 'Confusion Matrix': array([[253,   6],
       [ 29,  12]])}, 'SVM': {'Accuracy': 0.8633333333333333, 'Precision': 0.5, 'Recall': 0.024390243902439025, 'F1 Score': 0.046511627906976744, 'ROC AUC': 0.7842075525002354, 'Confusion Matrix': array([[258,   1],
       [ 40,   1]])}, 'Decision Tree': {'Accuracy': 0.8166666666666667, 'Precision': 0.325, 'Recall': 0.3170731707317073, 'F1 Score': 0.3209876543209877, 'ROC AUC': 0.6064130332423016, 'Confusion Matrix': array([[232,  27],
       [ 28,  13]])}}
_______________________________________________________________________________
Train data combined with 60.00000000000001% synthetic data of minority class:
1197 1197


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.8833333333333333, 'Precision': 0.6666666666666666, 'Recall': 0.2926829268292683, 'F1 Score': 0.4067796610169492, 'ROC AUC': 0.7583576607966852, 'Confusion Matrix': array([[253,   6],
       [ 29,  12]])}, 'SVM': {'Accuracy': 0.8633333333333333, 'Precision': 0.5, 'Recall': 0.024390243902439025, 'F1 Score': 0.046511627906976744, 'ROC AUC': 0.7842075525002354, 'Confusion Matrix': array([[258,   1],
       [ 40,   1]])}, 'Decision Tree': {'Accuracy': 0.8166666666666667, 'Precision': 0.325, 'Recall': 0.3170731707317073, 'F1 Score': 0.3209876543209877, 'ROC AUC': 0.6064130332423016, 'Confusion Matrix': array([[232,  27],
       [ 28,  13]])}}
_______________________________________________________________________________
Train data combined with 80.0% synthetic data of minority class:
1197 1197


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.8833333333333333, 'Precision': 0.6666666666666666, 'Recall': 0.2926829268292683, 'F1 Score': 0.4067796610169492, 'ROC AUC': 0.7583576607966852, 'Confusion Matrix': array([[253,   6],
       [ 29,  12]])}, 'SVM': {'Accuracy': 0.8633333333333333, 'Precision': 0.5, 'Recall': 0.024390243902439025, 'F1 Score': 0.046511627906976744, 'ROC AUC': 0.7842075525002354, 'Confusion Matrix': array([[258,   1],
       [ 40,   1]])}, 'Decision Tree': {'Accuracy': 0.8166666666666667, 'Precision': 0.325, 'Recall': 0.3170731707317073, 'F1 Score': 0.3209876543209877, 'ROC AUC': 0.6064130332423016, 'Confusion Matrix': array([[232,  27],
       [ 28,  13]])}}
_______________________________________________________________________________
Train data combined with 100.0% synthetic data of minority class:
1197 1197


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.8833333333333333, 'Precision': 0.6666666666666666, 'Recall': 0.2926829268292683, 'F1 Score': 0.4067796610169492, 'ROC AUC': 0.7583576607966852, 'Confusion Matrix': array([[253,   6],
       [ 29,  12]])}, 'SVM': {'Accuracy': 0.8633333333333333, 'Precision': 0.5, 'Recall': 0.024390243902439025, 'F1 Score': 0.046511627906976744, 'ROC AUC': 0.7842075525002354, 'Confusion Matrix': array([[258,   1],
       [ 40,   1]])}, 'Decision Tree': {'Accuracy': 0.8166666666666667, 'Precision': 0.325, 'Recall': 0.3170731707317073, 'F1 Score': 0.3209876543209877, 'ROC AUC': 0.6064130332423016, 'Confusion Matrix': array([[232,  27],
       [ 28,  13]])}}
_______________________________________________________________________________
