<a href="https://colab.research.google.com/github/adipai/data-decent/blob/main/js_vuln_sampling_results.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!unzip data.zip

In [None]:
!pip install sdv

In [3]:
!pip install DataSynthesizer

Collecting DataSynthesizer
  Downloading DataSynthesizer-0.1.13-py2.py3-none-any.whl (24 kB)
Installing collected packages: DataSynthesizer
Successfully installed DataSynthesizer-0.1.13


In [5]:
# All imports here
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns
import random
import time
from scipy.io import arff
from sdv.datasets.local import load_csvs
from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer
from DataSynthesizer.DataDescriber import DataDescriber
from DataSynthesizer.DataGenerator import DataGenerator
from DataSynthesizer.lib.utils import display_bayesian_network
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SVMSMOTE
from imblearn.over_sampling import RandomOverSampler
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix

## Data preprocessing

In [6]:
def preprocess_data_train(X_train):
    # Count missing values before handling missing data
    missing_before = np.isnan(X_train).sum()

    # Handle missing data
    imputer = SimpleImputer(strategy='mean')
    X_train = imputer.fit_transform(X_train)

    # Count missing values after handling missing data
    missing_after = np.isnan(X_train).sum()

    # Normalize numeric columns
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)

    return X_train, scaler, imputer

def preprocess_data_test(X_test, scaler, imputer):
    # Count missing values before handling missing data
    missing_before = np.isnan(X_test).sum()

    # Handle missing data
    X_test = imputer.transform(X_test)

    # Count missing values after handling missing data
    missing_after = np.isnan(X_test).sum()

    # Normalize numeric columns
    X_test = scaler.transform(X_test)

    return X_test

## Experiments

### Dataset : JavaScript Vulnerbalities

In [14]:
data_path = f"data/JavaScript_Vulnerability/JSVulnerabilityDataSet-1.0.csv"
df = pd.read_csv(data_path)
drop_columns = ["name", "longname", "path", "full_repo_path", "line", "column", "endline", "endcolumn"]
df = df.drop(drop_columns, axis=1)
print("before drop duplicates", df.shape[0])
df = df.drop_duplicates()
df.reset_index(inplace=True, drop=True)
print("after drop duplicates", df.shape[0])
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


before drop duplicates 12125
after drop duplicates 6271


## Training and testing using ML models

In [15]:
# Generic function to test synthetic data using LR, SVM, DT

def evaluate_models(X_train, X_test, y_train, y_test, random_state=42):

    # Initialize classifiers
    classifiers = {
        "Logistic Regression": LogisticRegression(random_state=random_state),
        "SVM": SVC(random_state=random_state),
        "Decision Tree": DecisionTreeClassifier(random_state=random_state)
    }

    # Results dictionary to store evaluation metrics
    results = {}

    # Iterate over classifiers
    for name, clf in classifiers.items():
        # Fit classifier
        clf.fit(X_train, y_train)

        # Predictions
        y_pred = clf.predict(X_test)

        # Evaluation metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        # AUC-ROC
        if hasattr(clf, "predict_proba"):
            y_prob = clf.predict_proba(X_test)[:,1]
        else:
            y_prob = clf.decision_function(X_test)
        fpr, tpr, thresholds = roc_curve(y_test, y_prob)
        roc_auc = auc(fpr, tpr)

        # Confusion matrix
        cm = confusion_matrix(y_test, y_pred)

        # Store results
        results[name] = {
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1,
            "ROC AUC": roc_auc,
            "Confusion Matrix": cm
        }

        # Plot AUC-ROC curve
        plt.figure(figsize=(8, 6))
        plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.2f})')
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'{name} - AUC-ROC Curve')
        plt.legend(loc='lower right')
        plt.savefig(f'{name}_auc_roc_curve.png', dpi=300)
        plt.close()

        # Plot confusion matrix
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.title(f'{name} - Confusion Matrix')
        plt.savefig(f'{name}_confusion_matrix.png', dpi=300)
        plt.close()

    return results

In [16]:
results = evaluate_models(X_train, X_test, y_train, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
print(results)

{'Logistic Regression': {'Accuracy': 0.8940239043824701, 'Precision': 0.8428571428571429, 'Recall': 0.3259668508287293, 'F1 Score': 0.47011952191235057, 'ROC AUC': 0.6766129613053902, 'Confusion Matrix': array([[1063,   11],
       [ 122,   59]])}, 'SVM': {'Accuracy': 0.8549800796812749, 'Precision': 0.48484848484848486, 'Recall': 0.08839779005524862, 'F1 Score': 0.14953271028037382, 'ROC AUC': 0.6180000411535336, 'Confusion Matrix': array([[1057,   17],
       [ 165,   16]])}, 'Decision Tree': {'Accuracy': 0.850199203187251, 'Precision': 0.4816753926701571, 'Recall': 0.5082872928176796, 'F1 Score': 0.49462365591397844, 'ROC AUC': 0.7100939329403171, 'Confusion Matrix': array([[975,  99],
       [ 89,  92]])}}


# SDV - Oversampling

In [18]:
def do_sdv(X_train, y_train):
  train_df = pd.concat([X_train, y_train], axis=1)
  class_counts = y_train.value_counts()

  # Find minority class label
  minority_class_label = class_counts.idxmin()

  # Filter rows with minority class label
  minority_df = train_df[train_df.iloc[:, -1] == minority_class_label]

  # Calculate counts of majority and minority classes
  majority_count = class_counts.max()
  minority_count = class_counts.min()

  metadata_data = SingleTableMetadata()
  metadata_data.detect_from_dataframe(minority_df)
  print(metadata_data)
  # Generate synthetic data using GaussianCopulaSynthesizer
  synthesizer_breast_data = GaussianCopulaSynthesizer(metadata_data)
  synthesizer_breast_data.fit(minority_df)

  # Print sample synthetic data
  synthesizer_breast_data.reset_sampling()
  sd1 = synthesizer_breast_data.sample(num_rows=majority_count-minority_count)
  return sd1, train_df

# Function to add synthetic data to the main DataFrame based on percentage
def add_synthetic_data(main_df, synthetic_df, percentage, seed=42):
    # Calculate number of rows to sample
    num_rows = int(len(synthetic_df) * percentage)

    # Sample the specified percentage of synthetic data
    sampled_synthetic_data = synthetic_df.sample(n=num_rows, replace=False, random_state=seed)
    # print(sampled_synthetic_data)

    # Concatenate sampled synthetic data with main DataFrame
    combined_df = pd.concat([main_df, sampled_synthetic_data], ignore_index=True)
    # print(combined_df)
    return combined_df

# Random Over-Sampling

In [19]:
def find_minority_data(X, y):
    labels, counts = np.unique(y, return_counts=True)
    min_label = min(zip(counts, labels))[1]
    indices_with_min_label = np.where(y == min_label)[0]
    X_min, y_min = X[indices_with_min_label], y[indices_with_min_label]

    # Other class samples
    indices_without_min_label = np.where(y != min_label)[0]
    X_remaining, y_remaining = X[indices_without_min_label], y[indices_without_min_label]

    return X_min, y_min, X_remaining, y_remaining, min_label

def random_oversampling(X_train, y_train, oversampling_ratios, seed=42):

  oversampled_X_train_ratios = dict()
  oversampled_y_train_ratios = dict()
  X_minority, y_minority, X_remaining, y_remaining, min_label = find_minority_data(X_train, y_train)
  ideal_samps = len(X_remaining) - len(X_minority)

  oversampling_samps = [int(ideal_samps * (oversampling_ratio)) for oversampling_ratio in oversampling_ratios]
  for oversampling_samp, oversampling_ratio in zip(oversampling_samps, oversampling_ratios):

    sampling_strategy = {min_label: len(X_minority) + oversampling_samp}
    X_train_upsampled, y_train_upsampled = RandomOverSampler(sampling_strategy=sampling_strategy, random_state = seed).fit_resample(X_train, y_train)

    oversampled_X_train_ratios[oversampling_ratio] = X_train_upsampled
    oversampled_y_train_ratios[oversampling_ratio] = y_train_upsampled

  return list(oversampled_X_train_ratios.values()), list(oversampled_y_train_ratios.values())


# SMOTE Over-Sampling

In [20]:
def find_minority_data(X, y):
    labels, counts = np.unique(y, return_counts=True)
    min_label = min(zip(counts, labels))[1]
    indices_with_min_label = np.where(y == min_label)[0]
    X_min, y_min = X[indices_with_min_label], y[indices_with_min_label]

    # Other class samples
    indices_without_min_label = np.where(y != min_label)[0]
    X_remaining, y_remaining = X[indices_without_min_label], y[indices_without_min_label]

    return X_min, y_min, X_remaining, y_remaining, min_label

def smote_oversampling(X_train, y_train, oversampling_ratios, seed=42):

  oversampled_X_train_ratios = dict()
  oversampled_y_train_ratios = dict()
  X_minority, y_minority, X_remaining, y_remaining, min_label = find_minority_data(X_train, y_train)
  ideal_samps = len(X_remaining) - len(X_minority)

  oversampling_samps = [int(ideal_samps * (oversampling_ratio)) for oversampling_ratio in oversampling_ratios]
  for oversampling_samp, oversampling_ratio in zip(oversampling_samps, oversampling_ratios):

    sampling_strategy = {min_label: len(X_minority) + oversampling_samp}
    X_train_upsampled, y_train_upsampled = SMOTE(sampling_strategy=sampling_strategy, random_state = seed).fit_resample(X_train, y_train)

    oversampled_X_train_ratios[oversampling_ratio] = X_train_upsampled
    oversampled_y_train_ratios[oversampling_ratio] = y_train_upsampled

  return list(oversampled_X_train_ratios.values()), list(oversampled_y_train_ratios.values())


# SVM-SMOTE Over-Sampling

In [21]:
def find_minority_data(X, y):
    labels, counts = np.unique(y, return_counts=True)
    min_label = min(zip(counts, labels))[1]
    indices_with_min_label = np.where(y == min_label)[0]
    X_min, y_min = X[indices_with_min_label], y[indices_with_min_label]

    # Other class samples
    indices_without_min_label = np.where(y != min_label)[0]
    X_remaining, y_remaining = X[indices_without_min_label], y[indices_without_min_label]

    return X_min, y_min, X_remaining, y_remaining, min_label

def svm_smote_oversampling(X_train, y_train, oversampling_ratios, seed=42):

  oversampled_X_train_ratios = dict()
  oversampled_y_train_ratios = dict()
  X_minority, y_minority, X_remaining, y_remaining, min_label = find_minority_data(X_train, y_train)
  ideal_samps = len(X_remaining) - len(X_minority)

  oversampling_samps = [int(ideal_samps * (oversampling_ratio)) for oversampling_ratio in oversampling_ratios]
  for oversampling_samp, oversampling_ratio in zip(oversampling_samps, oversampling_ratios):

    sampling_strategy = {min_label: len(X_minority) + oversampling_samp}
    X_train_upsampled, y_train_upsampled = SVMSMOTE(sampling_strategy=sampling_strategy, random_state = seed).fit_resample(X_train, y_train)

    oversampled_X_train_ratios[oversampling_ratio] = X_train_upsampled
    oversampled_y_train_ratios[oversampling_ratio] = y_train_upsampled

  return list(oversampled_X_train_ratios.values()), list(oversampled_y_train_ratios.values())


# Intelligent Pruning

In [22]:
def find_majority_data(X, y):
    labels, counts = np.unique(y, return_counts=True)
    max_label = max(zip(counts, labels))[1]
    indices_with_max_label = np.where(y == max_label)[0]
    X_maj, y_maj = X[indices_with_max_label], y[indices_with_max_label]

    # Exclude majority class samples
    indices_without_max_label = np.where(y != max_label)[0]
    X_remaining, y_remaining = X[indices_without_max_label], y[indices_without_max_label]

    return X_maj, y_maj, X_remaining, y_remaining, min(counts)

def do_clustering(X, y, labels):
  clustered_X = defaultdict(list)
  clustered_y = defaultdict(list)

  for i, label in enumerate(labels):
      clustered_X[label].append(X[i])
      clustered_y[label].append(y[i])

  # Sort clustered_X and clustered_y in descending order based on the length of values in each dictionary
  sorted_clustered_X = dict(sorted(clustered_X.items(), key=lambda x: -len(x[1])))
  sorted_clustered_y = dict(sorted(clustered_y.items(), key=lambda x: -len(x[1])))

  return sorted_clustered_X, sorted_clustered_y


def intelligent_prune_data(pruning_samps, pruning_ratios, clustered_X, clustered_y, per_cluster_pruning_ratio=0.7, seed=42):
  random.seed(seed)
  pruning_ratios_X_maj, pruning_ratios_y_maj = defaultdict(list), defaultdict(list)
  for pruning_samp, pruning_ratio in zip(pruning_samps, pruning_ratios):
    samps = 0
    # print("For Pruning samps: ", pruning_samp)
    prune_samps = pruning_samp
    # print(prune_samps)
    clustered_X_new = defaultdict(list)
    clustered_y_new = defaultdict(list)
    # Iterate over the sorted dictionaries
    for label, values_X in clustered_X.items():
        # Calculate the number of samples to prune
        num_samples_to_prune = int(prune_samps * per_cluster_pruning_ratio)
        if(num_samples_to_prune > len(values_X)):
          num_samples_to_prune = len(values_X)//2
          prune_samps -= num_samples_to_prune
        else:
          prune_samps -= num_samples_to_prune

        # Randomly choose samples to prune
        indices_to_prune = random.sample(range(len(values_X)), num_samples_to_prune)

        # Prune the samples from clustered_X and clustered_y
        clustered_X_new[label] = [values_X[i] for i in range(len(values_X)) if i not in indices_to_prune]
        clustered_y_new[label] = [clustered_y[label][i] for i in range(len(clustered_y[label])) if i not in indices_to_prune]

    iter = 0
    while(prune_samps > 0):
        if(iter>=100):
          break
        for label, values_X in clustered_X_new.items():
          if(prune_samps <=0 or len(values_X) <= 0):
            break
          # print(len(values_X))
          index_to_prune = random.sample(range(len(values_X)), 1)
          clustered_X_new[label] = [values_X[i] for i in range(len(values_X)) if i not in index_to_prune]
          clustered_y_new[label] = [clustered_y_new[label][i] for i in range(len(clustered_y_new[label])) if i not in index_to_prune]

          prune_samps -= 1
        iter += 1

    for label in clustered_X_new:
        pruning_ratios_X_maj[pruning_ratio].extend(clustered_X_new[label])
        pruning_ratios_y_maj[pruning_ratio].extend(clustered_y_new[label])

  return pruning_ratios_X_maj, pruning_ratios_y_maj

def combine_data(pruning_ratios, pruning_ratios_X_maj, pruning_ratios_y_maj, X_remaining, y_remaining):

  pruning_ratios_X, pruning_ratios_y = defaultdict(list), defaultdict(list)
  for pruning_ratio in pruning_ratios:
    pruning_ratios_X[pruning_ratio].extend(pruning_ratios_X_maj[pruning_ratio])
    pruning_ratios_X[pruning_ratio].extend(X_remaining)

    pruning_ratios_y[pruning_ratio].extend(pruning_ratios_y_maj[pruning_ratio])
    pruning_ratios_y[pruning_ratio].extend(y_remaining)

  return pruning_ratios_X, pruning_ratios_y

def do_intelligent_pruning(X, y, ratio, per_cluster_pruning_ratio=0.7, seed=42):

  X_maj, y_maj, X_remaining, y_remaining, min_class_samples = find_majority_data(X, y)
  kmeans = KMeans(n_clusters=3, random_state = 42)
  kmeans.fit(X_maj)
  labels = kmeans.labels_
  clustered_X, clustered_y = do_clustering(X_maj, y_maj, labels)

  pruning_best = len(X_maj)-min_class_samples
  pruning_samps = [int(pruning_best * ratio)]
  pruning_ratios = [ratio]

  pruning_ratios_X_maj, pruning_ratios_y_maj = intelligent_prune_data(pruning_samps, pruning_ratios, clustered_X, clustered_y, \
                                                                      per_cluster_pruning_ratio=per_cluster_pruning_ratio, seed=seed)

  pruning_ratios_X, pruning_ratios_y = combine_data(pruning_ratios, pruning_ratios_X_maj, pruning_ratios_y_maj, X_remaining, y_remaining)

  return list(pruning_ratios_X.values()), list(pruning_ratios_y.values())

# Random Pruning

In [23]:
"""
inputs:
X: np.array
y: np.array
percentage: from 0% upto 100%, enter int value
"""
def random_prune_data(X, y, ratio, seed = 42):
  # preprocessed_X, scaler, imputer = preprocess_data_train(X)
  # preprocessed_X_test = preprocess_data_test(X_test, scaler, imputer)

  # X_train, y_train = preprocessed_X_train.to_numpy(), y_train.to_numpy()
  # X_test, y_test = preprocessed_X_test.to_numpy(), y_test.to_numpy()
  np.random.seed(seed)
  labels_count = {}
  labels = np.unique(y)
  for label in labels:
    labels_count[label] = np.count_nonzero(y == label)
  max_label = min_label = labels[0]
  for label in labels_count:
    if labels_count[label] > labels_count[max_label]:
      max_label = label
    if labels_count[label] < labels_count[min_label]:
      min_label = label

  # print("Max", max_label, labels_count[max_label])
  # print("Min", min_label, labels_count[min_label])

  prune_counts = {}
  prune_indexes = {}
  for label in labels_count:
    prune_counts[label] = labels_count[label] - labels_count[min_label]
    prune_indexes[label] = np.where(y == label)[0]

  prune_amount = int(ratio * sum(map(lambda x: x[1], prune_counts.items())))
  prune_it = {}

  while prune_amount > 0:
    for label in labels:
      if (len(prune_indexes[label]) - labels_count[min_label]) > 0 and prune_amount > 0:
        random_index = np.random.choice(len(prune_indexes[label]))
        random_item = prune_indexes[label][random_index]
        prune_indexes[label] = np.delete(prune_indexes[label], random_index)
        if prune_it.get(label, None) is None:
          prune_it[label] = np.array([])
        prune_it[label] = np.append(prune_it[label], [random_item])
        prune_amount -= 1



  formatted_indexes = np.array([])
  for label in prune_indexes:
    formatted_indexes = np.append(formatted_indexes, prune_indexes[label])
  formatted_indexes = np.sort(formatted_indexes)
  new_arr = np.array([np.int64(i) for i in formatted_indexes])

  return X[new_arr], y[new_arr]

In [24]:
ratios = [ratio for ratio in np.arange(0.2, 1.1, 0.2)]

# Calling Intelligent Pruning

In [25]:
results_intelligent_pruning = dict()
per_cluster_pruning_ratios = [0.5, 0.7, 0.9, 1]

for per_cluster_pruning_ratio in per_cluster_pruning_ratios:
  print(f'For per-cluster pruning ratio {per_cluster_pruning_ratio}')
  for ratio in ratios:
    X_train_copy, y_train_copy = X_train.copy(), y_train.copy()

    intelligent_pruned_X_train, intelligent_pruned_y_train = do_intelligent_pruning(X_train_copy.to_numpy(), y_train_copy.to_numpy(), ratio, per_cluster_pruning_ratio=per_cluster_pruning_ratio)

    preprocessed_intelligent_pruned_X_train, scaler, imputer = preprocess_data_train((np.array(intelligent_pruned_X_train))[0])
    preprocessed_X_test = preprocess_data_test(X_test, scaler, imputer)

    intelligent_pruned_X_train, intelligent_pruned_y_train = preprocessed_intelligent_pruned_X_train, (np.array(intelligent_pruned_y_train))[0]
    intelligent_pruned_X_test, intelligent_pruned_y_test = preprocessed_X_test, y_test.to_numpy()
    print(f"Train data pruned intelligently at {ratio * 100}% :")
    results = evaluate_models(intelligent_pruned_X_train, intelligent_pruned_X_test, intelligent_pruned_y_train, intelligent_pruned_y_test)
    print(results)
    results_intelligent_pruning[ratio] = results
    print("_______________________________________________________________________________")

For per-cluster pruning ratio 0.5




Train data pruned intelligently at 20.0% :
{'Logistic Regression': {'Accuracy': 0.8741035856573706, 'Precision': 0.6493506493506493, 'Recall': 0.27624309392265195, 'F1 Score': 0.3875968992248062, 'ROC AUC': 0.7310745187608672, 'Confusion Matrix': array([[1047,   27],
       [ 131,   50]])}, 'SVM': {'Accuracy': 0.8916334661354581, 'Precision': 0.7472527472527473, 'Recall': 0.3756906077348066, 'F1 Score': 0.5, 'ROC AUC': 0.7663585295842464, 'Confusion Matrix': array([[1051,   23],
       [ 113,   68]])}, 'Decision Tree': {'Accuracy': 0.8302788844621514, 'Precision': 0.42592592592592593, 'Recall': 0.5082872928176796, 'F1 Score': 0.4634760705289673, 'ROC AUC': 0.6992422605635977, 'Confusion Matrix': array([[950, 124],
       [ 89,  92]])}}
_______________________________________________________________________________


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train data pruned intelligently at 40.0% :
{'Logistic Regression': {'Accuracy': 0.8741035856573706, 'Precision': 0.6385542168674698, 'Recall': 0.292817679558011, 'F1 Score': 0.4015151515151515, 'ROC AUC': 0.7421088099426937, 'Confusion Matrix': array([[1044,   30],
       [ 128,   53]])}, 'SVM': {'Accuracy': 0.8900398406374502, 'Precision': 0.7311827956989247, 'Recall': 0.3756906077348066, 'F1 Score': 0.49635036496350365, 'ROC AUC': 0.7660498780826569, 'Confusion Matrix': array([[1049,   25],
       [ 113,   68]])}, 'Decision Tree': {'Accuracy': 0.8294820717131474, 'Precision': 0.4225352112676056, 'Recall': 0.4972375690607735, 'F1 Score': 0.4568527918781726, 'ROC AUC': 0.6897177896437132, 'Confusion Matrix': array([[951, 123],
       [ 91,  90]])}}
_______________________________________________________________________________


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train data pruned intelligently at 60.00000000000001% :
{'Logistic Regression': {'Accuracy': 0.8733067729083666, 'Precision': 0.6170212765957447, 'Recall': 0.32044198895027626, 'F1 Score': 0.4218181818181818, 'ROC AUC': 0.748966017469675, 'Confusion Matrix': array([[1038,   36],
       [ 123,   58]])}, 'SVM': {'Accuracy': 0.8900398406374502, 'Precision': 0.7311827956989247, 'Recall': 0.3756906077348066, 'F1 Score': 0.49635036496350365, 'ROC AUC': 0.7667289113861538, 'Confusion Matrix': array([[1049,   25],
       [ 113,   68]])}, 'Decision Tree': {'Accuracy': 0.8199203187250996, 'Precision': 0.40425531914893614, 'Recall': 0.5248618784530387, 'F1 Score': 0.4567307692307693, 'ROC AUC': 0.6992139675092852, 'Confusion Matrix': array([[934, 140],
       [ 86,  95]])}}
_______________________________________________________________________________


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train data pruned intelligently at 80.0% :
{'Logistic Regression': {'Accuracy': 0.8733067729083666, 'Precision': 0.6195652173913043, 'Recall': 0.3149171270718232, 'F1 Score': 0.4175824175824176, 'ROC AUC': 0.7354470817000525, 'Confusion Matrix': array([[1039,   35],
       [ 124,   57]])}, 'SVM': {'Accuracy': 0.8908366533864542, 'Precision': 0.7340425531914894, 'Recall': 0.3812154696132597, 'F1 Score': 0.5018181818181818, 'ROC AUC': 0.769445044600142, 'Confusion Matrix': array([[1049,   25],
       [ 112,   69]])}, 'Decision Tree': {'Accuracy': 0.8310756972111554, 'Precision': 0.4351464435146444, 'Recall': 0.574585635359116, 'F1 Score': 0.4952380952380952, 'ROC AUC': 0.7248783398664568, 'Confusion Matrix': array([[939, 135],
       [ 77, 104]])}}
_______________________________________________________________________________


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train data pruned intelligently at 100.0% :
{'Logistic Regression': {'Accuracy': 0.8709163346613545, 'Precision': 0.5922330097087378, 'Recall': 0.3370165745856354, 'F1 Score': 0.42957746478873243, 'ROC AUC': 0.7410388180705165, 'Confusion Matrix': array([[1032,   42],
       [ 120,   61]])}, 'SVM': {'Accuracy': 0.8892430278884462, 'Precision': 0.7142857142857143, 'Recall': 0.3867403314917127, 'F1 Score': 0.5017921146953406, 'ROC AUC': 0.7693730259164379, 'Confusion Matrix': array([[1046,   28],
       [ 111,   70]])}, 'Decision Tree': {'Accuracy': 0.8175298804780876, 'Precision': 0.40977443609022557, 'Recall': 0.6022099447513812, 'F1 Score': 0.4876957494407159, 'ROC AUC': 0.7263470065948539, 'Confusion Matrix': array([[917, 157],
       [ 72, 109]])}}
_______________________________________________________________________________
For per-cluster pruning ratio 0.7


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train data pruned intelligently at 20.0% :
{'Logistic Regression': {'Accuracy': 0.8725099601593626, 'Precision': 0.6363636363636364, 'Recall': 0.27071823204419887, 'F1 Score': 0.37984496124031003, 'ROC AUC': 0.742407173060897, 'Confusion Matrix': array([[1046,   28],
       [ 132,   49]])}, 'SVM': {'Accuracy': 0.8916334661354581, 'Precision': 0.7472527472527473, 'Recall': 0.3756906077348066, 'F1 Score': 0.5, 'ROC AUC': 0.7654068541210122, 'Confusion Matrix': array([[1051,   23],
       [ 113,   68]])}, 'Decision Tree': {'Accuracy': 0.8398406374501992, 'Precision': 0.4523809523809524, 'Recall': 0.5248618784530387, 'F1 Score': 0.4859335038363171, 'ROC AUC': 0.706680761751906, 'Confusion Matrix': array([[959, 115],
       [ 86,  95]])}}
_______________________________________________________________________________


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train data pruned intelligently at 40.0% :
{'Logistic Regression': {'Accuracy': 0.8749003984063745, 'Precision': 0.6363636363636364, 'Recall': 0.30939226519337015, 'F1 Score': 0.4163568773234201, 'ROC AUC': 0.7434462997829151, 'Confusion Matrix': array([[1042,   32],
       [ 125,   56]])}, 'SVM': {'Accuracy': 0.8900398406374502, 'Precision': 0.7311827956989247, 'Recall': 0.3756906077348066, 'F1 Score': 0.49635036496350365, 'ROC AUC': 0.7684830807535212, 'Confusion Matrix': array([[1049,   25],
       [ 113,   68]])}, 'Decision Tree': {'Accuracy': 0.8231075697211155, 'Precision': 0.41350210970464135, 'Recall': 0.5414364640883977, 'F1 Score': 0.4688995215311004, 'ROC AUC': 0.7045124849532393, 'Confusion Matrix': array([[935, 139],
       [ 83,  98]])}}
_______________________________________________________________________________


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train data pruned intelligently at 60.00000000000001% :
{'Logistic Regression': {'Accuracy': 0.8741035856573706, 'Precision': 0.6263736263736264, 'Recall': 0.3149171270718232, 'F1 Score': 0.4191176470588235, 'ROC AUC': 0.736496496805457, 'Confusion Matrix': array([[1040,   34],
       [ 124,   57]])}, 'SVM': {'Accuracy': 0.8892430278884462, 'Precision': 0.71875, 'Recall': 0.3812154696132597, 'F1 Score': 0.4981949458483755, 'ROC AUC': 0.7685911087790777, 'Confusion Matrix': array([[1047,   27],
       [ 112,   69]])}, 'Decision Tree': {'Accuracy': 0.8143426294820717, 'Precision': 0.3992248062015504, 'Recall': 0.569060773480663, 'F1 Score': 0.46924829157175396, 'ROC AUC': 0.7103640030042079, 'Confusion Matrix': array([[919, 155],
       [ 78, 103]])}}
_______________________________________________________________________________


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train data pruned intelligently at 80.0% :
{'Logistic Regression': {'Accuracy': 0.8741035856573706, 'Precision': 0.6017699115044248, 'Recall': 0.3756906077348066, 'F1 Score': 0.46258503401360546, 'ROC AUC': 0.7442076401535028, 'Confusion Matrix': array([[1029,   45],
       [ 113,   68]])}, 'SVM': {'Accuracy': 0.8868525896414342, 'Precision': 0.693069306930693, 'Recall': 0.3867403314917127, 'F1 Score': 0.4964539007092199, 'ROC AUC': 0.7709420043828513, 'Confusion Matrix': array([[1043,   31],
       [ 111,   70]])}, 'Decision Tree': {'Accuracy': 0.80398406374502, 'Precision': 0.379182156133829, 'Recall': 0.56353591160221, 'F1 Score': 0.45333333333333337, 'ROC AUC': 0.705842258505921, 'Confusion Matrix': array([[907, 167],
       [ 79, 102]])}}
_______________________________________________________________________________


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train data pruned intelligently at 100.0% :
{'Logistic Regression': {'Accuracy': 0.8685258964143426, 'Precision': 0.5579710144927537, 'Recall': 0.425414364640884, 'F1 Score': 0.4827586206896552, 'ROC AUC': 0.7493878411885141, 'Confusion Matrix': array([[1013,   61],
       [ 104,   77]])}, 'SVM': {'Accuracy': 0.8868525896414342, 'Precision': 0.6857142857142857, 'Recall': 0.39779005524861877, 'F1 Score': 0.5034965034965034, 'ROC AUC': 0.7784525242548639, 'Confusion Matrix': array([[1041,   33],
       [ 109,   72]])}, 'Decision Tree': {'Accuracy': 0.7768924302788844, 'Precision': 0.33554817275747506, 'Recall': 0.5580110497237569, 'F1 Score': 0.4190871369294606, 'ROC AUC': 0.6838379785384323, 'Confusion Matrix': array([[874, 200],
       [ 80, 101]])}}
_______________________________________________________________________________
For per-cluster pruning ratio 0.9


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train data pruned intelligently at 20.0% :
{'Logistic Regression': {'Accuracy': 0.8764940239043825, 'Precision': 0.7407407407407407, 'Recall': 0.22099447513812154, 'F1 Score': 0.3404255319148936, 'ROC AUC': 0.7319335987736246, 'Confusion Matrix': array([[1060,   14],
       [ 141,   40]])}, 'SVM': {'Accuracy': 0.902788844621514, 'Precision': 0.927536231884058, 'Recall': 0.35359116022099446, 'F1 Score': 0.5119999999999999, 'ROC AUC': 0.7688225974052696, 'Confusion Matrix': array([[1069,    5],
       [ 117,   64]])}, 'Decision Tree': {'Accuracy': 0.8390438247011952, 'Precision': 0.4467005076142132, 'Recall': 0.4861878453038674, 'F1 Score': 0.4656084656084656, 'ROC AUC': 0.6921998621356626, 'Confusion Matrix': array([[965, 109],
       [ 93,  88]])}}
_______________________________________________________________________________




Train data pruned intelligently at 40.0% :
{'Logistic Regression': {'Accuracy': 0.8725099601593626, 'Precision': 0.6153846153846154, 'Recall': 0.30939226519337015, 'F1 Score': 0.4117647058823529, 'ROC AUC': 0.7436366348755619, 'Confusion Matrix': array([[1039,   35],
       [ 125,   56]])}, 'SVM': {'Accuracy': 0.8908366533864542, 'Precision': 0.7340425531914894, 'Recall': 0.3812154696132597, 'F1 Score': 0.5018181818181818, 'ROC AUC': 0.7681229873350002, 'Confusion Matrix': array([[1049,   25],
       [ 112,   69]])}, 'Decision Tree': {'Accuracy': 0.8302788844621514, 'Precision': 0.42857142857142855, 'Recall': 0.5303867403314917, 'F1 Score': 0.47407407407407404, 'ROC AUC': 0.7091576900521621, 'Confusion Matrix': array([[946, 128],
       [ 85,  96]])}}
_______________________________________________________________________________


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train data pruned intelligently at 60.00000000000001% :
{'Logistic Regression': {'Accuracy': 0.8693227091633466, 'Precision': 0.5714285714285714, 'Recall': 0.3756906077348066, 'F1 Score': 0.4533333333333333, 'ROC AUC': 0.7452621994506003, 'Confusion Matrix': array([[1023,   51],
       [ 113,   68]])}, 'SVM': {'Accuracy': 0.8884462151394422, 'Precision': 0.7070707070707071, 'Recall': 0.3867403314917127, 'F1 Score': 0.5, 'ROC AUC': 0.7708082553988291, 'Confusion Matrix': array([[1045,   29],
       [ 111,   70]])}, 'Decision Tree': {'Accuracy': 0.8103585657370518, 'Precision': 0.38735177865612647, 'Recall': 0.5414364640883977, 'F1 Score': 0.4516129032258064, 'ROC AUC': 0.6966367274710125, 'Confusion Matrix': array([[919, 155],
       [ 83,  98]])}}
_______________________________________________________________________________


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train data pruned intelligently at 80.0% :
{'Logistic Regression': {'Accuracy': 0.8701195219123506, 'Precision': 0.5652173913043478, 'Recall': 0.430939226519337, 'F1 Score': 0.4890282131661442, 'ROC AUC': 0.7513272014568352, 'Confusion Matrix': array([[1014,   60],
       [ 103,   78]])}, 'SVM': {'Accuracy': 0.8876494023904382, 'Precision': 0.6886792452830188, 'Recall': 0.40331491712707185, 'F1 Score': 0.5087108013937283, 'ROC AUC': 0.779815735053551, 'Confusion Matrix': array([[1041,   33],
       [ 108,   73]])}, 'Decision Tree': {'Accuracy': 0.7752988047808765, 'Precision': 0.3465045592705167, 'Recall': 0.6298342541436464, 'F1 Score': 0.44705882352941173, 'ROC AUC': 0.7153101433171806, 'Confusion Matrix': array([[859, 215],
       [ 67, 114]])}}
_______________________________________________________________________________




Train data pruned intelligently at 100.0% :
{'Logistic Regression': {'Accuracy': 0.848605577689243, 'Precision': 0.4756756756756757, 'Recall': 0.4861878453038674, 'F1 Score': 0.4808743169398907, 'ROC AUC': 0.7437086535592661, 'Confusion Matrix': array([[977,  97],
       [ 93,  88]])}, 'SVM': {'Accuracy': 0.8581673306772908, 'Precision': 0.5087719298245614, 'Recall': 0.48066298342541436, 'F1 Score': 0.4943181818181818, 'ROC AUC': 0.768575676203998, 'Confusion Matrix': array([[990,  84],
       [ 94,  87]])}, 'Decision Tree': {'Accuracy': 0.69800796812749, 'Precision': 0.2708333333333333, 'Recall': 0.6464088397790055, 'F1 Score': 0.3817292006525286, 'ROC AUC': 0.6749102338549544, 'Confusion Matrix': array([[759, 315],
       [ 64, 117]])}}
_______________________________________________________________________________
For per-cluster pruning ratio 1


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train data pruned intelligently at 20.0% :
{'Logistic Regression': {'Accuracy': 0.8749003984063745, 'Precision': 0.7727272727272727, 'Recall': 0.1878453038674033, 'F1 Score': 0.3022222222222222, 'ROC AUC': 0.7232141938537197, 'Confusion Matrix': array([[1064,   10],
       [ 147,   34]])}, 'SVM': {'Accuracy': 0.9035856573705179, 'Precision': 0.9411764705882353, 'Recall': 0.35359116022099446, 'F1 Score': 0.5140562248995983, 'ROC AUC': 0.7572378777122752, 'Confusion Matrix': array([[1070,    4],
       [ 117,   64]])}, 'Decision Tree': {'Accuracy': 0.8541832669322709, 'Precision': 0.49444444444444446, 'Recall': 0.49171270718232046, 'F1 Score': 0.4930747922437673, 'ROC AUC': 0.7064184079755549, 'Confusion Matrix': array([[983,  91],
       [ 92,  89]])}}
_______________________________________________________________________________


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train data pruned intelligently at 40.0% :
{'Logistic Regression': {'Accuracy': 0.8709163346613545, 'Precision': 0.6610169491525424, 'Recall': 0.2154696132596685, 'F1 Score': 0.325, 'ROC AUC': 0.7252410053808245, 'Confusion Matrix': array([[1054,   20],
       [ 142,   39]])}, 'SVM': {'Accuracy': 0.9035856573705179, 'Precision': 0.9411764705882353, 'Recall': 0.35359116022099446, 'F1 Score': 0.5140562248995983, 'ROC AUC': 0.7597122339166846, 'Confusion Matrix': array([[1070,    4],
       [ 117,   64]])}, 'Decision Tree': {'Accuracy': 0.8199203187250996, 'Precision': 0.4066390041493776, 'Recall': 0.5414364640883977, 'F1 Score': 0.4644549763033175, 'ROC AUC': 0.7072260460713807, 'Confusion Matrix': array([[931, 143],
       [ 83,  98]])}}
_______________________________________________________________________________


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train data pruned intelligently at 60.00000000000001% :
{'Logistic Regression': {'Accuracy': 0.8756972111553785, 'Precision': 0.6666666666666666, 'Recall': 0.27624309392265195, 'F1 Score': 0.390625, 'ROC AUC': 0.725904606109242, 'Confusion Matrix': array([[1049,   25],
       [ 131,   50]])}, 'SVM': {'Accuracy': 0.9051792828685259, 'Precision': 0.9078947368421053, 'Recall': 0.3812154696132597, 'F1 Score': 0.5369649805447472, 'ROC AUC': 0.7678812103254218, 'Confusion Matrix': array([[1067,    7],
       [ 112,   69]])}, 'Decision Tree': {'Accuracy': 0.8167330677290837, 'Precision': 0.4075471698113208, 'Recall': 0.5966850828729282, 'F1 Score': 0.48430493273542596, 'ROC AUC': 0.7228103748058068, 'Confusion Matrix': array([[917, 157],
       [ 73, 108]])}}
_______________________________________________________________________________




Train data pruned intelligently at 80.0% :
{'Logistic Regression': {'Accuracy': 0.8788844621513944, 'Precision': 0.6494845360824743, 'Recall': 0.34806629834254144, 'F1 Score': 0.45323741007194246, 'ROC AUC': 0.731136249061185, 'Confusion Matrix': array([[1040,   34],
       [ 118,   63]])}, 'SVM': {'Accuracy': 0.9043824701195219, 'Precision': 0.8674698795180723, 'Recall': 0.39779005524861877, 'F1 Score': 0.5454545454545454, 'ROC AUC': 0.7779792586190931, 'Confusion Matrix': array([[1063,   11],
       [ 109,   72]])}, 'Decision Tree': {'Accuracy': 0.7928286852589641, 'Precision': 0.3713355048859935, 'Recall': 0.6298342541436464, 'F1 Score': 0.4672131147540983, 'ROC AUC': 0.7236874594894903, 'Confusion Matrix': array([[881, 193],
       [ 67, 114]])}}
_______________________________________________________________________________




Train data pruned intelligently at 100.0% :
{'Logistic Regression': {'Accuracy': 0.6932270916334662, 'Precision': 0.2671232876712329, 'Recall': 0.6464088397790055, 'F1 Score': 0.37802907915993544, 'ROC AUC': 0.7359769334444478, 'Confusion Matrix': array([[753, 321],
       [ 64, 117]])}, 'SVM': {'Accuracy': 0.8215139442231075, 'Precision': 0.40772532188841204, 'Recall': 0.5248618784530387, 'F1 Score': 0.45893719806763283, 'ROC AUC': 0.7753351440888094, 'Confusion Matrix': array([[936, 138],
       [ 86,  95]])}, 'Decision Tree': {'Accuracy': 0.69800796812749, 'Precision': 0.28, 'Recall': 0.6961325966850829, 'F1 Score': 0.3993660855784469, 'ROC AUC': 0.6963795178863544, 'Confusion Matrix': array([[750, 324],
       [ 55, 126]])}}
_______________________________________________________________________________


#Calling Random Pruning

In [26]:
results_random_pruning = dict()
for ratio in ratios:
  random_pruned_X_train, random_pruned_y_train = random_prune_data(X_train.to_numpy(), y_train.to_numpy(), ratio)
  preprocessed_random_pruned_X_train, scaler, imputer = preprocess_data_train(random_pruned_X_train)
  preprocessed_X_test = preprocess_data_test(X_test, scaler, imputer)

  random_pruned_X_train, random_pruned_y_train = preprocessed_random_pruned_X_train, random_pruned_y_train
  random_pruned_X_test, random_pruned_y_test = preprocessed_X_test, y_test.to_numpy()

  print(f"Train data pruned randomly at {ratio * 100}% :")
  results = evaluate_models(random_pruned_X_train, random_pruned_X_test, random_pruned_y_train, random_pruned_y_test)
  print(results)
  results_random_pruning[ratio] = results
  print("_______________________________________________________________________________")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train data pruned randomly at 20.0% :
{'Logistic Regression': {'Accuracy': 0.8764940239043825, 'Precision': 0.782608695652174, 'Recall': 0.19889502762430938, 'F1 Score': 0.3171806167400881, 'ROC AUC': 0.7281989156043911, 'Confusion Matrix': array([[1064,   10],
       [ 145,   36]])}, 'SVM': {'Accuracy': 0.9035856573705179, 'Precision': 0.9411764705882353, 'Recall': 0.35359116022099446, 'F1 Score': 0.5140562248995983, 'ROC AUC': 0.752649258721977, 'Confusion Matrix': array([[1070,    4],
       [ 117,   64]])}, 'Decision Tree': {'Accuracy': 0.8350597609561753, 'Precision': 0.4386792452830189, 'Recall': 0.5138121546961326, 'F1 Score': 0.4732824427480916, 'ROC AUC': 0.7041549636305647, 'Confusion Matrix': array([[955, 119],
       [ 88,  93]])}}
_______________________________________________________________________________




Train data pruned randomly at 40.0% :


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.8772908366533865, 'Precision': 0.7647058823529411, 'Recall': 0.2154696132596685, 'F1 Score': 0.33620689655172414, 'ROC AUC': 0.7285024229142875, 'Confusion Matrix': array([[1062,   12],
       [ 142,   39]])}, 'SVM': {'Accuracy': 0.902788844621514, 'Precision': 0.927536231884058, 'Recall': 0.35359116022099446, 'F1 Score': 0.5119999999999999, 'ROC AUC': 0.7539198740701873, 'Confusion Matrix': array([[1069,    5],
       [ 117,   64]])}, 'Decision Tree': {'Accuracy': 0.8159362549800797, 'Precision': 0.38839285714285715, 'Recall': 0.48066298342541436, 'F1 Score': 0.4296296296296296, 'ROC AUC': 0.6840154531518463, 'Confusion Matrix': array([[937, 137],
       [ 94,  87]])}}
_______________________________________________________________________________
Train data pruned randomly at 60.00000000000001% :


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.8749003984063745, 'Precision': 0.6764705882352942, 'Recall': 0.2541436464088398, 'F1 Score': 0.3694779116465864, 'ROC AUC': 0.7332865211889255, 'Confusion Matrix': array([[1052,   22],
       [ 135,   46]])}, 'SVM': {'Accuracy': 0.9043824701195219, 'Precision': 0.9178082191780822, 'Recall': 0.3701657458563536, 'F1 Score': 0.5275590551181102, 'ROC AUC': 0.763344033252055, 'Confusion Matrix': array([[1068,    6],
       [ 114,   67]])}, 'Decision Tree': {'Accuracy': 0.8063745019920319, 'Precision': 0.376984126984127, 'Recall': 0.5248618784530387, 'F1 Score': 0.43879907621247116, 'ROC AUC': 0.6950883257713715, 'Confusion Matrix': array([[917, 157],
       [ 86,  95]])}}
_______________________________________________________________________________
Train data pruned randomly at 80.0% :


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.8749003984063745, 'Precision': 0.6176470588235294, 'Recall': 0.34806629834254144, 'F1 Score': 0.44522968197879864, 'ROC AUC': 0.7312854306202867, 'Confusion Matrix': array([[1035,   39],
       [ 118,   63]])}, 'SVM': {'Accuracy': 0.899601593625498, 'Precision': 0.8160919540229885, 'Recall': 0.39226519337016574, 'F1 Score': 0.5298507462686567, 'ROC AUC': 0.7762405218268054, 'Confusion Matrix': array([[1058,   16],
       [ 110,   71]])}, 'Decision Tree': {'Accuracy': 0.7952191235059761, 'Precision': 0.37662337662337664, 'Recall': 0.6408839779005525, 'F1 Score': 0.474437627811861, 'ROC AUC': 0.7308173091762092, 'Confusion Matrix': array([[882, 192],
       [ 65, 116]])}}
_______________________________________________________________________________
Train data pruned randomly at 100.0% :


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.7952191235059761, 'Precision': 0.36524822695035464, 'Recall': 0.569060773480663, 'F1 Score': 0.4449244060475162, 'ROC AUC': 0.752129695360968, 'Confusion Matrix': array([[895, 179],
       [ 78, 103]])}, 'SVM': {'Accuracy': 0.8589641434262948, 'Precision': 0.5106382978723404, 'Recall': 0.5303867403314917, 'F1 Score': 0.5203252032520325, 'ROC AUC': 0.7779278167021615, 'Confusion Matrix': array([[982,  92],
       [ 85,  96]])}, 'Decision Tree': {'Accuracy': 0.7043824701195219, 'Precision': 0.2850678733031674, 'Recall': 0.6961325966850829, 'F1 Score': 0.4044943820224719, 'ROC AUC': 0.6995380515859543, 'Confusion Matrix': array([[758, 316],
       [ 55, 126]])}}
_______________________________________________________________________________


# Calling SDV-Oversampling

In [28]:
sd1, train_df = do_sdv(X_train, y_train)
results_syn_sdv = dict()

# Add synthetic data at different percentages to the main DataFrame
for ratio in ratios:
    combined_df = add_synthetic_data(train_df, sd1, ratio)
    y_train_sdv = combined_df.iloc[:, -1]
    X_train_sdv = combined_df.iloc[:, :-1]

    preprocessed_X_train_sdv, scaler, imputer = preprocess_data_train(X_train_sdv)
    preprocessed_X_test_sdv = preprocess_data_test(X_test, scaler, imputer)

    X_train_sdv, y_train_sdv = preprocessed_X_train_sdv, y_train_sdv.to_numpy()
    X_test_sdv, y_test_sdv = preprocessed_X_test_sdv, y_test.to_numpy()

    print(f"Train data combined with {ratio * 100}% synthetic data of minority class:")
    results = evaluate_models(X_train_sdv, X_test_sdv, y_train_sdv, y_test_sdv)
    results_syn_sdv[ratio] = results
    print(results)
    print("_______________________________________________________________________________")

{
    "columns": {
        "CC": {
            "sdtype": "numerical"
        },
        "CCL": {
            "sdtype": "categorical"
        },
        "CCO": {
            "sdtype": "numerical"
        },
        "CI": {
            "sdtype": "numerical"
        },
        "CLC": {
            "sdtype": "numerical"
        },
        "CLLC": {
            "sdtype": "categorical"
        },
        "McCC": {
            "sdtype": "numerical"
        },
        "NL": {
            "sdtype": "numerical"
        },
        "NLE": {
            "sdtype": "categorical"
        },
        "CD": {
            "sdtype": "numerical"
        },
        "CLOC": {
            "sdtype": "numerical"
        },
        "DLOC": {
            "sdtype": "numerical"
        },
        "TCD": {
            "sdtype": "numerical"
        },
        "TCLOC": {
            "sdtype": "numerical"
        },
        "LLOC": {
            "sdtype": "numerical"
        },
        "LOC": {
            "sdtype": "nu



Train data combined with 20.0% synthetic data of minority class:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.8677290836653386, 'Precision': 0.6415094339622641, 'Recall': 0.1878453038674033, 'F1 Score': 0.2905982905982906, 'ROC AUC': 0.721156517176456, 'Confusion Matrix': array([[1055,   19],
       [ 147,   34]])}, 'SVM': {'Accuracy': 0.9043824701195219, 'Precision': 0.9178082191780822, 'Recall': 0.3701657458563536, 'F1 Score': 0.5275590551181102, 'ROC AUC': 0.7535494922682799, 'Confusion Matrix': array([[1068,    6],
       [ 114,   67]])}, 'Decision Tree': {'Accuracy': 0.853386454183267, 'Precision': 0.4918032786885246, 'Recall': 0.4972375690607735, 'F1 Score': 0.4945054945054945, 'ROC AUC': 0.7130749920265028, 'Confusion Matrix': array([[981,  93],
       [ 91,  90]])}}
_______________________________________________________________________________
Train data combined with 40.0% synthetic data of minority class:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.8693227091633466, 'Precision': 0.6491228070175439, 'Recall': 0.20441988950276244, 'F1 Score': 0.3109243697478991, 'ROC AUC': 0.7186049980966491, 'Confusion Matrix': array([[1054,   20],
       [ 144,   37]])}, 'SVM': {'Accuracy': 0.9043824701195219, 'Precision': 0.9178082191780822, 'Recall': 0.3701657458563536, 'F1 Score': 0.5275590551181102, 'ROC AUC': 0.7516975832587427, 'Confusion Matrix': array([[1068,    6],
       [ 114,   67]])}, 'Decision Tree': {'Accuracy': 0.8605577689243028, 'Precision': 0.5176470588235295, 'Recall': 0.4861878453038674, 'F1 Score': 0.5014245014245013, 'ROC AUC': 0.712254493451444, 'Confusion Matrix': array([[992,  82],
       [ 93,  88]])}}
_______________________________________________________________________________
Train data combined with 60.00000000000001% synthetic data of minority class:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.8701195219123506, 'Precision': 0.65, 'Recall': 0.2154696132596685, 'F1 Score': 0.3236514522821577, 'ROC AUC': 0.7183117791701391, 'Confusion Matrix': array([[1053,   21],
       [ 142,   39]])}, 'SVM': {'Accuracy': 0.9043824701195219, 'Precision': 0.9178082191780822, 'Recall': 0.3701657458563536, 'F1 Score': 0.5275590551181102, 'ROC AUC': 0.7559775507474511, 'Confusion Matrix': array([[1068,    6],
       [ 114,   67]])}, 'Decision Tree': {'Accuracy': 0.8581673306772908, 'Precision': 0.5084745762711864, 'Recall': 0.4972375690607735, 'F1 Score': 0.5027932960893854, 'ROC AUC': 0.7108527012150581, 'Confusion Matrix': array([[987,  87],
       [ 91,  90]])}}
_______________________________________________________________________________
Train data combined with 80.0% synthetic data of minority class:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.8701195219123506, 'Precision': 0.65, 'Recall': 0.2154696132596685, 'F1 Score': 0.3236514522821577, 'ROC AUC': 0.7185947097132628, 'Confusion Matrix': array([[1053,   21],
       [ 142,   39]])}, 'SVM': {'Accuracy': 0.9043824701195219, 'Precision': 0.9178082191780822, 'Recall': 0.3701657458563536, 'F1 Score': 0.5275590551181102, 'ROC AUC': 0.7566771608177207, 'Confusion Matrix': array([[1068,    6],
       [ 114,   67]])}, 'Decision Tree': {'Accuracy': 0.8645418326693227, 'Precision': 0.5321637426900585, 'Recall': 0.5027624309392266, 'F1 Score': 0.5170454545454546, 'ROC AUC': 0.7232167659495663, 'Confusion Matrix': array([[994,  80],
       [ 90,  91]])}}
_______________________________________________________________________________
Train data combined with 100.0% synthetic data of minority class:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.8709163346613545, 'Precision': 0.6557377049180327, 'Recall': 0.22099447513812154, 'F1 Score': 0.3305785123966942, 'ROC AUC': 0.7184661049209338, 'Confusion Matrix': array([[1053,   21],
       [ 141,   40]])}, 'SVM': {'Accuracy': 0.9043824701195219, 'Precision': 0.9178082191780822, 'Recall': 0.3701657458563536, 'F1 Score': 0.5275590551181102, 'ROC AUC': 0.7569240820189924, 'Confusion Matrix': array([[1068,    6],
       [ 114,   67]])}, 'Decision Tree': {'Accuracy': 0.8430278884462151, 'Precision': 0.4583333333333333, 'Recall': 0.4861878453038674, 'F1 Score': 0.4718498659517426, 'ROC AUC': 0.6982314268958919, 'Confusion Matrix': array([[970, 104],
       [ 93,  88]])}}
_______________________________________________________________________________


# Calling SMOTE-Oversampling

In [29]:
results_smote = dict()

for ratio in ratios:

    X_train_smote, y_train_smote = smote_oversampling(X_train.to_numpy(), y_train.to_numpy(), [ratio])
    preprocessed_X_train_smote, scaler, imputer = preprocess_data_train((np.array(X_train_smote))[0])
    preprocessed_X_test_smote = preprocess_data_test(X_test, scaler, imputer)

    X_train_smote, y_train_smote = preprocessed_X_train_smote, (np.array(y_train_smote))[0]
    X_test_smote, y_test_smote = preprocessed_X_test_smote, y_test.to_numpy()

    print(f"Train data combined with {ratio * 100}% synthetic data of minority class:")
    print(len(X_train_smote), len(y_train_smote))
    results = evaluate_models(X_train_smote, X_test_smote, y_train_smote, y_test_smote)
    results_smote[ratio] = results
    print(results)
    print("_______________________________________________________________________________")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train data combined with 20.0% synthetic data of minority class:
5730 5730
{'Logistic Regression': {'Accuracy': 0.8788844621513944, 'Precision': 0.704225352112676, 'Recall': 0.27624309392265195, 'F1 Score': 0.39682539682539686, 'ROC AUC': 0.7321702315915101, 'Confusion Matrix': array([[1053,   21],
       [ 131,   50]])}, 'SVM': {'Accuracy': 0.9035856573705179, 'Precision': 0.9285714285714286, 'Recall': 0.35911602209944754, 'F1 Score': 0.5179282868525897, 'ROC AUC': 0.7707671018652837, 'Confusion Matrix': array([[1069,    5],
       [ 116,   65]])}, 'Decision Tree': {'Accuracy': 0.8653386454183267, 'Precision': 0.532967032967033, 'Recall': 0.5359116022099447, 'F1 Score': 0.5344352617079889, 'ROC AUC': 0.7319593197320905, 'Confusion Matrix': array([[989,  85],
       [ 84,  97]])}}
_______________________________________________________________________________
Train data combined with 40.0% synthetic data of minority class:
6444 6444


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.8812749003984064, 'Precision': 0.6538461538461539, 'Recall': 0.3756906077348066, 'F1 Score': 0.47719298245614034, 'ROC AUC': 0.735303044332644, 'Confusion Matrix': array([[1038,   36],
       [ 113,   68]])}, 'SVM': {'Accuracy': 0.9035856573705179, 'Precision': 0.8191489361702128, 'Recall': 0.425414364640884, 'F1 Score': 0.56, 'ROC AUC': 0.7798105908618579, 'Confusion Matrix': array([[1057,   17],
       [ 104,   77]])}, 'Decision Tree': {'Accuracy': 0.8454183266932271, 'Precision': 0.4688995215311005, 'Recall': 0.5414364640883977, 'F1 Score': 0.5025641025641026, 'ROC AUC': 0.7204517629144932, 'Confusion Matrix': array([[963, 111],
       [ 83,  98]])}}
_______________________________________________________________________________
Train data combined with 60.00000000000001% synthetic data of minority class:
7158 7158


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.8772908366533865, 'Precision': 0.5957446808510638, 'Recall': 0.46408839779005523, 'F1 Score': 0.5217391304347826, 'ROC AUC': 0.7391406113357407, 'Confusion Matrix': array([[1017,   57],
       [  97,   84]])}, 'SVM': {'Accuracy': 0.8868525896414342, 'Precision': 0.6444444444444445, 'Recall': 0.48066298342541436, 'F1 Score': 0.550632911392405, 'ROC AUC': 0.787243947858473, 'Confusion Matrix': array([[1026,   48],
       [  94,   87]])}, 'Decision Tree': {'Accuracy': 0.846215139442231, 'Precision': 0.47115384615384615, 'Recall': 0.5414364640883977, 'F1 Score': 0.5038560411311054, 'ROC AUC': 0.7270774818152824, 'Confusion Matrix': array([[964, 110],
       [ 83,  98]])}}
_______________________________________________________________________________
Train data combined with 80.0% synthetic data of minority class:
7872 7872


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.8366533864541833, 'Precision': 0.4405940594059406, 'Recall': 0.49171270718232046, 'F1 Score': 0.4647519582245431, 'ROC AUC': 0.7410902599874483, 'Confusion Matrix': array([[961, 113],
       [ 92,  89]])}, 'SVM': {'Accuracy': 0.8709163346613545, 'Precision': 0.5549132947976878, 'Recall': 0.5303867403314917, 'F1 Score': 0.5423728813559321, 'ROC AUC': 0.7866112122802145, 'Confusion Matrix': array([[997,  77],
       [ 85,  96]])}, 'Decision Tree': {'Accuracy': 0.851792828685259, 'Precision': 0.4873096446700508, 'Recall': 0.5303867403314917, 'F1 Score': 0.5079365079365079, 'ROC AUC': 0.7230572960070785, 'Confusion Matrix': array([[973, 101],
       [ 85,  96]])}}
_______________________________________________________________________________
Train data combined with 100.0% synthetic data of minority class:
8586 8586


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.7816733067729084, 'Precision': 0.33797909407665505, 'Recall': 0.5359116022099447, 'F1 Score': 0.4145299145299145, 'ROC AUC': 0.7461984423387553, 'Confusion Matrix': array([[884, 190],
       [ 84,  97]])}, 'SVM': {'Accuracy': 0.8406374501992032, 'Precision': 0.45622119815668205, 'Recall': 0.5469613259668509, 'F1 Score': 0.49748743718592975, 'ROC AUC': 0.7847541590789837, 'Confusion Matrix': array([[956, 118],
       [ 82,  99]])}, 'Decision Tree': {'Accuracy': 0.847808764940239, 'Precision': 0.4752475247524752, 'Recall': 0.5303867403314917, 'F1 Score': 0.5013054830287206, 'ROC AUC': 0.7206909678282252, 'Confusion Matrix': array([[968, 106],
       [ 85,  96]])}}
_______________________________________________________________________________


# Calling Random-Oversampling

In [30]:
results_random = dict()

for ratio in ratios:

    X_train_random, y_train_random = random_oversampling(X_train.to_numpy(), y_train.to_numpy(), [ratio])

    preprocessed_X_train_random, scaler, imputer = preprocess_data_train((np.array(X_train_random)[0]))
    preprocessed_X_test_random = preprocess_data_test(X_test, scaler, imputer)

    X_train_random, y_train_random = preprocessed_X_train_random, (np.array(y_train_random))[0]
    X_test_random, y_test_random = preprocessed_X_test_random, y_test.to_numpy()

    print(f"Train data combined with {ratio * 100}% synthetic data of minority class:")
    print(len(X_train_random), len(y_train_random))
    results = evaluate_models(X_train_random, X_test_random, y_train_random, y_test_random)
    results_random[ratio] = results
    print(results)
    print("_______________________________________________________________________________")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train data combined with 20.0% synthetic data of minority class:
5730 5730
{'Logistic Regression': {'Accuracy': 0.8749003984063745, 'Precision': 0.6621621621621622, 'Recall': 0.27071823204419887, 'F1 Score': 0.38431372549019605, 'ROC AUC': 0.7376076422111794, 'Confusion Matrix': array([[1049,   25],
       [ 132,   49]])}, 'SVM': {'Accuracy': 0.902788844621514, 'Precision': 0.9154929577464789, 'Recall': 0.35911602209944754, 'F1 Score': 0.5158730158730159, 'ROC AUC': 0.7693524491496652, 'Confusion Matrix': array([[1068,    6],
       [ 116,   65]])}, 'Decision Tree': {'Accuracy': 0.850199203187251, 'Precision': 0.48148148148148145, 'Recall': 0.5027624309392266, 'F1 Score': 0.49189189189189186, 'ROC AUC': 0.7054873092790931, 'Confusion Matrix': array([[976,  98],
       [ 90,  91]])}}
_______________________________________________________________________________
Train data combined with 40.0% synthetic data of minority class:
6444 6444


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.8796812749003984, 'Precision': 0.6442307692307693, 'Recall': 0.3701657458563536, 'F1 Score': 0.4701754385964913, 'ROC AUC': 0.7468774756422523, 'Confusion Matrix': array([[1037,   37],
       [ 114,   67]])}, 'SVM': {'Accuracy': 0.9059760956175299, 'Precision': 0.8387096774193549, 'Recall': 0.430939226519337, 'F1 Score': 0.5693430656934306, 'ROC AUC': 0.7860299186188874, 'Confusion Matrix': array([[1059,   15],
       [ 103,   78]])}, 'Decision Tree': {'Accuracy': 0.850199203187251, 'Precision': 0.48258706467661694, 'Recall': 0.5359116022099447, 'F1 Score': 0.5078534031413613, 'ROC AUC': 0.7203128697387778, 'Confusion Matrix': array([[970, 104],
       [ 84,  97]])}}
_______________________________________________________________________________
Train data combined with 60.00000000000001% synthetic data of minority class:
7158 7158


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.8725099601593626, 'Precision': 0.574468085106383, 'Recall': 0.44751381215469616, 'F1 Score': 0.5031055900621119, 'ROC AUC': 0.7520885418274226, 'Confusion Matrix': array([[1014,   60],
       [ 100,   81]])}, 'SVM': {'Accuracy': 0.8972111553784861, 'Precision': 0.7131147540983607, 'Recall': 0.48066298342541436, 'F1 Score': 0.5742574257425743, 'ROC AUC': 0.7916628085228968, 'Confusion Matrix': array([[1039,   35],
       [  94,   87]])}, 'Decision Tree': {'Accuracy': 0.847808764940239, 'Precision': 0.47572815533980584, 'Recall': 0.5414364640883977, 'F1 Score': 0.5064599483204134, 'ROC AUC': 0.7231498914575553, 'Confusion Matrix': array([[966, 108],
       [ 83,  98]])}}
_______________________________________________________________________________
Train data combined with 80.0% synthetic data of minority class:
7872 7872


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.8382470119521912, 'Precision': 0.445, 'Recall': 0.49171270718232046, 'F1 Score': 0.4671916010498688, 'ROC AUC': 0.7530402172906572, 'Confusion Matrix': array([[963, 111],
       [ 92,  89]])}, 'SVM': {'Accuracy': 0.8852589641434263, 'Precision': 0.6178343949044586, 'Recall': 0.5359116022099447, 'F1 Score': 0.57396449704142, 'ROC AUC': 0.790603105034106, 'Confusion Matrix': array([[1014,   60],
       [  84,   97]])}, 'Decision Tree': {'Accuracy': 0.8454183266932271, 'Precision': 0.4688995215311005, 'Recall': 0.5414364640883977, 'F1 Score': 0.5025641025641026, 'ROC AUC': 0.7206909678282251, 'Confusion Matrix': array([[963, 111],
       [ 83,  98]])}}
_______________________________________________________________________________
Train data combined with 100.0% synthetic data of minority class:
8586 8586


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.7840637450199203, 'Precision': 0.3469387755102041, 'Recall': 0.56353591160221, 'F1 Score': 0.42947368421052634, 'ROC AUC': 0.7552059219934772, 'Confusion Matrix': array([[882, 192],
       [ 79, 102]])}, 'SVM': {'Accuracy': 0.8701195219123506, 'Precision': 0.5489130434782609, 'Recall': 0.5580110497237569, 'F1 Score': 0.5534246575342465, 'ROC AUC': 0.7931494799222198, 'Confusion Matrix': array([[991,  83],
       [ 80, 101]])}, 'Decision Tree': {'Accuracy': 0.8430278884462151, 'Precision': 0.46, 'Recall': 0.5082872928176796, 'F1 Score': 0.4829396325459318, 'ROC AUC': 0.7061431937199708, 'Confusion Matrix': array([[966, 108],
       [ 89,  92]])}}
_______________________________________________________________________________


# Calling SVM-SMOTE Over-Sampling

In [31]:
results_svm_smote = dict()

for ratio in ratios:

    X_train_svm_smote, y_train_svm_smote = svm_smote_oversampling(X_train.to_numpy(), y_train.to_numpy(), [ratio])

    preprocessed_X_train_svm_smote, scaler, imputer = preprocess_data_train((np.array(X_train_svm_smote))[0])
    preprocessed_X_test_svm_smote = preprocess_data_test(X_test, scaler, imputer)

    X_train_svm_smote, y_train_svm_smote = preprocessed_X_train_svm_smote, (np.array(y_train_svm_smote))[0]
    X_test_svm_smote, y_test_svm_smote = preprocessed_X_test_svm_smote, y_test.to_numpy()

    print(f"Train data combined with {ratio * 100}% synthetic data of minority class:")
    print(len(X_train_svm_smote), len(y_train_svm_smote))
    results = evaluate_models(X_train_svm_smote, X_test_svm_smote, y_train_svm_smote, y_test_svm_smote)
    results_svm_smote[ratio] = results
    print(results)
    print("_______________________________________________________________________________")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train data combined with 20.0% synthetic data of minority class:
5730 5730
{'Logistic Regression': {'Accuracy': 0.8749003984063745, 'Precision': 0.6578947368421053, 'Recall': 0.27624309392265195, 'F1 Score': 0.3891050583657588, 'ROC AUC': 0.7371909626840334, 'Confusion Matrix': array([[1048,   26],
       [ 131,   50]])}, 'SVM': {'Accuracy': 0.9067729083665339, 'Precision': 0.9, 'Recall': 0.39779005524861877, 'F1 Score': 0.5517241379310345, 'ROC AUC': 0.7779844028107864, 'Confusion Matrix': array([[1066,    8],
       [ 109,   72]])}, 'Decision Tree': {'Accuracy': 0.8430278884462151, 'Precision': 0.4587628865979381, 'Recall': 0.49171270718232046, 'F1 Score': 0.4746666666666667, 'ROC AUC': 0.7000653312345032, 'Confusion Matrix': array([[969, 105],
       [ 92,  89]])}}
_______________________________________________________________________________


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train data combined with 40.0% synthetic data of minority class:
6444 6444
{'Logistic Regression': {'Accuracy': 0.8796812749003984, 'Precision': 0.6442307692307693, 'Recall': 0.3701657458563536, 'F1 Score': 0.4701754385964913, 'ROC AUC': 0.7419081864666606, 'Confusion Matrix': array([[1037,   37],
       [ 114,   67]])}, 'SVM': {'Accuracy': 0.8988047808764941, 'Precision': 0.75, 'Recall': 0.44751381215469616, 'F1 Score': 0.560553633217993, 'ROC AUC': 0.7862562630533864, 'Confusion Matrix': array([[1047,   27],
       [ 100,   81]])}, 'Decision Tree': {'Accuracy': 0.8549800796812749, 'Precision': 0.4973821989528796, 'Recall': 0.5248618784530387, 'F1 Score': 0.510752688172043, 'ROC AUC': 0.7191477103202774, 'Confusion Matrix': array([[978,  96],
       [ 86,  95]])}}
_______________________________________________________________________________


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train data combined with 60.00000000000001% synthetic data of minority class:
7158 7158
{'Logistic Regression': {'Accuracy': 0.8741035856573706, 'Precision': 0.5815602836879432, 'Recall': 0.4530386740331492, 'F1 Score': 0.5093167701863354, 'ROC AUC': 0.7502726421597374, 'Confusion Matrix': array([[1015,   59],
       [  99,   82]])}, 'SVM': {'Accuracy': 0.8916334661354581, 'Precision': 0.6691729323308271, 'Recall': 0.49171270718232046, 'F1 Score': 0.5668789808917198, 'ROC AUC': 0.7897183040628826, 'Confusion Matrix': array([[1030,   44],
       [  92,   89]])}, 'Decision Tree': {'Accuracy': 0.8597609561752988, 'Precision': 0.5128205128205128, 'Recall': 0.5524861878453039, 'F1 Score': 0.5319148936170214, 'ROC AUC': 0.7351770116361617, 'Confusion Matrix': array([[979,  95],
       [ 81, 100]])}}
_______________________________________________________________________________


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train data combined with 80.0% synthetic data of minority class:
7872 7872
{'Logistic Regression': {'Accuracy': 0.8358565737051793, 'Precision': 0.43781094527363185, 'Recall': 0.4861878453038674, 'F1 Score': 0.4607329842931937, 'ROC AUC': 0.7496193298147062, 'Confusion Matrix': array([[961, 113],
       [ 93,  88]])}, 'SVM': {'Accuracy': 0.8780876494023905, 'Precision': 0.5813953488372093, 'Recall': 0.5524861878453039, 'F1 Score': 0.56657223796034, 'ROC AUC': 0.7939159644845006, 'Confusion Matrix': array([[1002,   72],
       [  81,  100]])}, 'Decision Tree': {'Accuracy': 0.8382470119521912, 'Precision': 0.44554455445544555, 'Recall': 0.4972375690607735, 'F1 Score': 0.4699738903394256, 'ROC AUC': 0.6993194234389951, 'Confusion Matrix': array([[962, 112],
       [ 91,  90]])}}
_______________________________________________________________________________




Train data combined with 100.0% synthetic data of minority class:
8586 8586


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.7936254980079681, 'Precision': 0.3617021276595745, 'Recall': 0.56353591160221, 'F1 Score': 0.44060475161987045, 'ROC AUC': 0.754007325328971, 'Confusion Matrix': array([[894, 180],
       [ 79, 102]])}, 'SVM': {'Accuracy': 0.850996015936255, 'Precision': 0.4857142857142857, 'Recall': 0.56353591160221, 'F1 Score': 0.5217391304347826, 'ROC AUC': 0.79176569235676, 'Confusion Matrix': array([[966, 108],
       [ 79, 102]])}, 'Decision Tree': {'Accuracy': 0.8438247011952191, 'Precision': 0.4634146341463415, 'Recall': 0.5248618784530387, 'F1 Score': 0.49222797927461137, 'ROC AUC': 0.7135251087996543, 'Confusion Matrix': array([[964, 110],
       [ 86,  95]])}}
_______________________________________________________________________________


# No-Sampling Results

In [33]:
results_no_sampling = dict()

for ratio in ratios:

    X_train_no_sampling, y_train_no_sampling = X_train.to_numpy(), y_train.to_numpy()

    preprocessed_X_train_no_sampling, scaler, imputer = preprocess_data_train(X_train_no_sampling)
    preprocessed_X_test_no_sampling = preprocess_data_test(X_test, scaler, imputer)

    X_train_no_sampling, y_train_no_sampling = preprocessed_X_train_no_sampling, y_train_no_sampling
    X_test_no_sampling, y_test_no_sampling = preprocessed_X_test_no_sampling, y_test.to_numpy()

    print(f"Train data combined with {ratio * 100}% synthetic data of minority class:")
    print(len(X_train_no_sampling), len(y_train_no_sampling))
    results = evaluate_models(X_train_no_sampling, X_test_no_sampling, y_train_no_sampling, y_test_no_sampling)
    results_no_sampling[ratio] = results
    print(results)
    print("_______________________________________________________________________________")



Train data combined with 20.0% synthetic data of minority class:
5016 5016


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.8756972111553785, 'Precision': 0.8205128205128205, 'Recall': 0.17679558011049723, 'F1 Score': 0.2909090909090909, 'ROC AUC': 0.7235742872722409, 'Confusion Matrix': array([[1067,    7],
       [ 149,   32]])}, 'SVM': {'Accuracy': 0.902788844621514, 'Precision': 0.9538461538461539, 'Recall': 0.3425414364640884, 'F1 Score': 0.5040650406504066, 'ROC AUC': 0.7590023354630289, 'Confusion Matrix': array([[1071,    3],
       [ 119,   62]])}, 'Decision Tree': {'Accuracy': 0.8494023904382471, 'Precision': 0.4789473684210526, 'Recall': 0.5027624309392266, 'F1 Score': 0.49056603773584906, 'ROC AUC': 0.7075295533812771, 'Confusion Matrix': array([[975,  99],
       [ 90,  91]])}}
_______________________________________________________________________________
Train data combined with 40.0% synthetic data of minority class:
5016 5016


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.8756972111553785, 'Precision': 0.8205128205128205, 'Recall': 0.17679558011049723, 'F1 Score': 0.2909090909090909, 'ROC AUC': 0.7235742872722409, 'Confusion Matrix': array([[1067,    7],
       [ 149,   32]])}, 'SVM': {'Accuracy': 0.902788844621514, 'Precision': 0.9538461538461539, 'Recall': 0.3425414364640884, 'F1 Score': 0.5040650406504066, 'ROC AUC': 0.7590023354630289, 'Confusion Matrix': array([[1071,    3],
       [ 119,   62]])}, 'Decision Tree': {'Accuracy': 0.8494023904382471, 'Precision': 0.4789473684210526, 'Recall': 0.5027624309392266, 'F1 Score': 0.49056603773584906, 'ROC AUC': 0.7075295533812771, 'Confusion Matrix': array([[975,  99],
       [ 90,  91]])}}
_______________________________________________________________________________
Train data combined with 60.00000000000001% synthetic data of minority class:
5016 5016


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.8756972111553785, 'Precision': 0.8205128205128205, 'Recall': 0.17679558011049723, 'F1 Score': 0.2909090909090909, 'ROC AUC': 0.7235742872722409, 'Confusion Matrix': array([[1067,    7],
       [ 149,   32]])}, 'SVM': {'Accuracy': 0.902788844621514, 'Precision': 0.9538461538461539, 'Recall': 0.3425414364640884, 'F1 Score': 0.5040650406504066, 'ROC AUC': 0.7590023354630289, 'Confusion Matrix': array([[1071,    3],
       [ 119,   62]])}, 'Decision Tree': {'Accuracy': 0.8494023904382471, 'Precision': 0.4789473684210526, 'Recall': 0.5027624309392266, 'F1 Score': 0.49056603773584906, 'ROC AUC': 0.7075295533812771, 'Confusion Matrix': array([[975,  99],
       [ 90,  91]])}}
_______________________________________________________________________________
Train data combined with 80.0% synthetic data of minority class:
5016 5016


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.8756972111553785, 'Precision': 0.8205128205128205, 'Recall': 0.17679558011049723, 'F1 Score': 0.2909090909090909, 'ROC AUC': 0.7235742872722409, 'Confusion Matrix': array([[1067,    7],
       [ 149,   32]])}, 'SVM': {'Accuracy': 0.902788844621514, 'Precision': 0.9538461538461539, 'Recall': 0.3425414364640884, 'F1 Score': 0.5040650406504066, 'ROC AUC': 0.7590023354630289, 'Confusion Matrix': array([[1071,    3],
       [ 119,   62]])}, 'Decision Tree': {'Accuracy': 0.8494023904382471, 'Precision': 0.4789473684210526, 'Recall': 0.5027624309392266, 'F1 Score': 0.49056603773584906, 'ROC AUC': 0.7075295533812771, 'Confusion Matrix': array([[975,  99],
       [ 90,  91]])}}
_______________________________________________________________________________
Train data combined with 100.0% synthetic data of minority class:
5016 5016


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.8756972111553785, 'Precision': 0.8205128205128205, 'Recall': 0.17679558011049723, 'F1 Score': 0.2909090909090909, 'ROC AUC': 0.7235742872722409, 'Confusion Matrix': array([[1067,    7],
       [ 149,   32]])}, 'SVM': {'Accuracy': 0.902788844621514, 'Precision': 0.9538461538461539, 'Recall': 0.3425414364640884, 'F1 Score': 0.5040650406504066, 'ROC AUC': 0.7590023354630289, 'Confusion Matrix': array([[1071,    3],
       [ 119,   62]])}, 'Decision Tree': {'Accuracy': 0.8494023904382471, 'Precision': 0.4789473684210526, 'Recall': 0.5027624309392266, 'F1 Score': 0.49056603773584906, 'ROC AUC': 0.7075295533812771, 'Confusion Matrix': array([[975,  99],
       [ 90,  91]])}}
_______________________________________________________________________________
