<a href="https://colab.research.google.com/github/adipai/data-decent/blob/main/Eclipse_JDT_sampling_results.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pmlb

Collecting pmlb
  Downloading pmlb-1.0.1.post3-py3-none-any.whl (19 kB)
Installing collected packages: pmlb
Successfully installed pmlb-1.0.1.post3


In [2]:
!pip install sdv

Collecting sdv
  Downloading sdv-1.11.0-py3-none-any.whl (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.6/125.6 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting boto3<2,>=1.15.0 (from sdv)
  Downloading boto3-1.34.81-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting botocore<2,>=1.18 (from sdv)
  Downloading botocore-1.34.81-py3-none-any.whl (12.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m44.6 MB/s[0m eta [36m0:00:00[0m
Collecting copulas<0.10,>=0.9.0 (from sdv)
  Downloading copulas-0.9.2-py2.py3-none-any.whl (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.6/54.6 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ctgan<0.10,>=0.9.0 (from sdv)
  Downloading ctgan-0.9.1-py3-none-any.whl (24 kB)
Collecting deepecho<0.6,>=0.5 (from sdv)
  Downl

In [3]:
!pip install DataSynthesizer

Collecting DataSynthesizer
  Downloading DataSynthesizer-0.1.13-py2.py3-none-any.whl (24 kB)
Installing collected packages: DataSynthesizer
Successfully installed DataSynthesizer-0.1.13


In [4]:
!unzip data.zip

Archive:  data.zip
   creating: data/
   creating: data/imbalance_defects_prediction/
   creating: data/project_health/
  inflating: data/README.md          
   creating: data/JavaScript_Vulnerability/
   creating: data/Bug_Reports/
   creating: data/Vulnerable_Files/
   creating: data/defects_prediction/
   creating: data/imbalance_defects_prediction/7_CK_NET_PROC/
   creating: data/imbalance_defects_prediction/2_NET/
   creating: data/imbalance_defects_prediction/4_CK_NET/
   creating: data/imbalance_defects_prediction/3_PROC/
   creating: data/imbalance_defects_prediction/6_NET_PROC/
   creating: data/imbalance_defects_prediction/1_CK/
   creating: data/imbalance_defects_prediction/5_CK_PROC/
   creating: data/project_health/monthly_closed_PRs_2mo/
   creating: data/project_health/monthly_commits_2mo/
   creating: data/project_health/monthly_open_PRs_2mo/
   creating: data/project_health/monthly_closed_issues_2mo/
   creating: data/project_health/monthly_commits_12mo/
   creating: d

In [5]:
# All imports here
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from pmlb import fetch_data
import matplotlib.pyplot as plt
import seaborn as sns
import random
import time

from scipy.io import arff
from sdv.datasets.local import load_csvs
from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer
from DataSynthesizer.DataDescriber import DataDescriber
from DataSynthesizer.DataGenerator import DataGenerator
from DataSynthesizer.lib.utils import display_bayesian_network
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SVMSMOTE
from imblearn.over_sampling import RandomOverSampler
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix

## Data preprocessing

In [6]:
def preprocess_data_train(X_train):
    # Count missing values before handling missing data
    missing_before = np.isnan(X_train).sum()

    # Handle missing data
    imputer = SimpleImputer(strategy='mean')
    X_train = imputer.fit_transform(X_train)

    # Count missing values after handling missing data
    missing_after = np.isnan(X_train).sum()

    # Normalize numeric columns
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)

    return X_train, scaler, imputer

def preprocess_data_test(X_test, scaler, imputer):
    # Count missing values before handling missing data
    missing_before = np.isnan(X_test).sum()

    # Handle missing data
    X_test = imputer.transform(X_test)

    # Count missing values after handling missing data
    missing_after = np.isnan(X_test).sum()

    # Normalize numeric columns
    X_test = scaler.transform(X_test)

    return X_test

## Experiments

### Dataset: Eclipse JDT

In [13]:
project = "Defect_Eclipse_JDT_Core"
fname = "_".join(project.split("_")[1:])
data_path = f"data/imbalance_defects_prediction/7_CK_NET_PROC/input/{fname}--CK_NET_PROC.arff"
data = arff.loadarff(data_path)
df = pd.DataFrame(data[0])
df['isBug'] = df['isBug'].astype('str')
d = {'YES': 1, 'NO': 0}  # Remove the byte string prefix 'b'
df['isBug'] = df['isBug'].map(d).fillna(df['isBug'])
print(df['isBug'])
print("before drop duplicates", df.shape[0])
df = df.drop_duplicates()
df.reset_index(inplace=True, drop=True)
print("after drop duplicates", df.shape[0])

df.describe()

0      0
1      0
2      0
3      0
4      0
      ..
992    0
993    0
994    0
995    0
996    0
Name: isBug, Length: 997, dtype: int64
before drop duplicates 997
after drop duplicates 997


Unnamed: 0,wmc,dit,rfc,noc,cbo,lcom,loc,revision_num,author_num,linesadd_sum,...,InFreeClo,OutValClo,InValClo,OutRecipClo,InRecipClo,OutdwReach,IndwReach,nOutdwReach,nIndwReach,isBug
count,997.0,997.0,997.0,997.0,997.0,997.0,997.0,997.0,997.0,997.0,...,997.0,997.0,997.0,997.0,997.0,997.0,997.0,997.0,997.0,997.0
mean,58.384152,2.727182,76.874624,0.712136,12.21665,364.727182,224.729188,45.618857,5.79338,1209.460381,...,0.003507,0.57792,0.57792,0.173663,0.173663,173.548682,173.548714,0.174071,0.174071,0.20662
std,135.72266,1.721525,180.978591,2.154752,17.815915,3230.074059,555.70053,60.995862,2.570187,3921.650184,...,0.002629,0.388013,0.202109,0.10694,0.089645,106.878591,89.46382,0.1072,0.089733,0.405084
min,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.001003,0.0,0.0,0.001003,0.001003,1.0,1.0,0.001003,0.001003,0.0
25%,8.0,1.0,12.0,0.0,3.0,6.0,28.0,13.0,4.0,81.0,...,0.002014,0.257659,0.502486,0.092025,0.118853,91.916748,118.883202,0.092193,0.119241,0.0
50%,20.0,2.0,30.0,0.0,7.0,28.0,75.0,30.0,6.0,276.0,...,0.002016,0.951735,0.502911,0.162254,0.150396,162.562225,150.299881,0.163051,0.150752,0.0
75%,50.0,4.0,70.0,0.0,14.0,91.0,192.0,52.0,7.0,833.0,...,0.007888,0.953893,0.873717,0.267023,0.225866,266.911713,225.466293,0.267715,0.226145,0.0
max,1680.0,8.0,2603.0,26.0,156.0,81003.0,7341.0,709.0,15.0,65571.0,...,0.008033,0.970087,0.876018,0.438462,0.534813,437.66452,533.550598,0.438981,0.535156,1.0


## Training and testing using ML models

In [8]:
# Generic function to test synthetic data using LR, SVM, DT

def evaluate_models(X_train, X_test, y_train, y_test, random_state=42):

    # Initialize classifiers
    classifiers = {
        "Logistic Regression": LogisticRegression(random_state=random_state),
        "SVM": SVC(random_state=random_state),
        "Decision Tree": DecisionTreeClassifier(random_state=random_state)
    }

    # Results dictionary to store evaluation metrics
    results = {}

    # Iterate over classifiers
    for name, clf in classifiers.items():
        # Fit classifier
        clf.fit(X_train, y_train)

        # Predictions
        y_pred = clf.predict(X_test)

        # Evaluation metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        # AUC-ROC
        if hasattr(clf, "predict_proba"):
            y_prob = clf.predict_proba(X_test)[:,1]
        else:
            y_prob = clf.decision_function(X_test)
        fpr, tpr, thresholds = roc_curve(y_test, y_prob)
        roc_auc = auc(fpr, tpr)

        # Confusion matrix
        cm = confusion_matrix(y_test, y_pred)

        # Store results
        results[name] = {
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1,
            "ROC AUC": roc_auc,
            "Confusion Matrix": cm
        }

        # Plot AUC-ROC curve
        plt.figure(figsize=(8, 6))
        plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.2f})')
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'{name} - AUC-ROC Curve')
        plt.legend(loc='lower right')
        plt.savefig(f'{name}_auc_roc_curve.png', dpi=300)
        plt.close()

        # Plot confusion matrix
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.title(f'{name} - Confusion Matrix')
        plt.savefig(f'{name}_confusion_matrix.png', dpi=300)
        plt.close()

    return results

In [14]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)


In [15]:
results = evaluate_models(X_train, X_test, y_train, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [16]:
print(results)

{'Logistic Regression': {'Accuracy': 0.84, 'Precision': 0.7777777777777778, 'Recall': 0.44680851063829785, 'F1 Score': 0.5675675675675675, 'ROC AUC': 0.8149075232930052, 'Confusion Matrix': array([[147,   6],
       [ 26,  21]])}, 'SVM': {'Accuracy': 0.82, 'Precision': 0.7894736842105263, 'Recall': 0.3191489361702128, 'F1 Score': 0.4545454545454545, 'ROC AUC': 0.7744402725629259, 'Confusion Matrix': array([[149,   4],
       [ 32,  15]])}, 'Decision Tree': {'Accuracy': 0.78, 'Precision': 0.5306122448979592, 'Recall': 0.5531914893617021, 'F1 Score': 0.5416666666666667, 'ROC AUC': 0.7014323459880406, 'Confusion Matrix': array([[130,  23],
       [ 21,  26]])}}


# SDV - Oversampling

In [27]:
def do_sdv(X_train, y_train):
  train_df = pd.concat([X_train, y_train], axis=1)
  class_counts = y_train.value_counts()

  # Find minority class label
  minority_class_label = class_counts.idxmin()

  # Filter rows with minority class label
  minority_df = train_df[train_df.iloc[:, -1] == minority_class_label]

  # Calculate counts of majority and minority classes
  majority_count = class_counts.max()
  minority_count = class_counts.min()

  metadata_data = SingleTableMetadata()
  metadata_data.detect_from_dataframe(minority_df)
  # print(metadata_data)
  # Generate synthetic data using GaussianCopulaSynthesizer
  synthesizer_breast_data = GaussianCopulaSynthesizer(metadata_data)
  synthesizer_breast_data.fit(minority_df)

  # Print sample synthetic data
  synthesizer_breast_data.reset_sampling()
  sd1 = synthesizer_breast_data.sample(num_rows=majority_count-minority_count)
  return sd1, train_df

# Function to add synthetic data to the main DataFrame based on percentage
def add_synthetic_data(main_df, synthetic_df, percentage, seed=42):
    # Calculate number of rows to sample
    num_rows = int(len(synthetic_df) * percentage)

    # Sample the specified percentage of synthetic data
    sampled_synthetic_data = synthetic_df.sample(n=num_rows, replace=False, random_state=seed)
    # print(sampled_synthetic_data)

    # Concatenate sampled synthetic data with main DataFrame
    combined_df = pd.concat([main_df, sampled_synthetic_data], ignore_index=True)
    # print(combined_df)
    return combined_df

# Random Over-Sampling

In [18]:
def find_minority_data(X, y):
    labels, counts = np.unique(y, return_counts=True)
    min_label = min(zip(counts, labels))[1]
    indices_with_min_label = np.where(y == min_label)[0]
    X_min, y_min = X[indices_with_min_label], y[indices_with_min_label]

    # Other class samples
    indices_without_min_label = np.where(y != min_label)[0]
    X_remaining, y_remaining = X[indices_without_min_label], y[indices_without_min_label]

    return X_min, y_min, X_remaining, y_remaining, min_label

def random_oversampling(X_train, y_train, oversampling_ratios, seed=42):

  oversampled_X_train_ratios = dict()
  oversampled_y_train_ratios = dict()
  X_minority, y_minority, X_remaining, y_remaining, min_label = find_minority_data(X_train, y_train)
  ideal_samps = len(X_remaining) - len(X_minority)

  oversampling_samps = [int(ideal_samps * (oversampling_ratio)) for oversampling_ratio in oversampling_ratios]
  for oversampling_samp, oversampling_ratio in zip(oversampling_samps, oversampling_ratios):

    sampling_strategy = {min_label: len(X_minority) + oversampling_samp}
    X_train_upsampled, y_train_upsampled = RandomOverSampler(sampling_strategy=sampling_strategy, random_state = seed).fit_resample(X_train, y_train)

    oversampled_X_train_ratios[oversampling_ratio] = X_train_upsampled
    oversampled_y_train_ratios[oversampling_ratio] = y_train_upsampled

  return list(oversampled_X_train_ratios.values()), list(oversampled_y_train_ratios.values())


# SMOTE Over-Sampling

In [19]:
def find_minority_data(X, y):
    labels, counts = np.unique(y, return_counts=True)
    min_label = min(zip(counts, labels))[1]
    indices_with_min_label = np.where(y == min_label)[0]
    X_min, y_min = X[indices_with_min_label], y[indices_with_min_label]

    # Other class samples
    indices_without_min_label = np.where(y != min_label)[0]
    X_remaining, y_remaining = X[indices_without_min_label], y[indices_without_min_label]

    return X_min, y_min, X_remaining, y_remaining, min_label

def smote_oversampling(X_train, y_train, oversampling_ratios, seed=42):

  oversampled_X_train_ratios = dict()
  oversampled_y_train_ratios = dict()
  X_minority, y_minority, X_remaining, y_remaining, min_label = find_minority_data(X_train, y_train)
  ideal_samps = len(X_remaining) - len(X_minority)

  oversampling_samps = [int(ideal_samps * (oversampling_ratio)) for oversampling_ratio in oversampling_ratios]
  for oversampling_samp, oversampling_ratio in zip(oversampling_samps, oversampling_ratios):

    sampling_strategy = {min_label: len(X_minority) + oversampling_samp}
    X_train_upsampled, y_train_upsampled = SMOTE(sampling_strategy=sampling_strategy, random_state = seed).fit_resample(X_train, y_train)

    oversampled_X_train_ratios[oversampling_ratio] = X_train_upsampled
    oversampled_y_train_ratios[oversampling_ratio] = y_train_upsampled

  return list(oversampled_X_train_ratios.values()), list(oversampled_y_train_ratios.values())


# SVM-SMOTE Over-Sampling

In [20]:
def find_minority_data(X, y):
    labels, counts = np.unique(y, return_counts=True)
    min_label = min(zip(counts, labels))[1]
    indices_with_min_label = np.where(y == min_label)[0]
    X_min, y_min = X[indices_with_min_label], y[indices_with_min_label]

    # Other class samples
    indices_without_min_label = np.where(y != min_label)[0]
    X_remaining, y_remaining = X[indices_without_min_label], y[indices_without_min_label]

    return X_min, y_min, X_remaining, y_remaining, min_label

def svm_smote_oversampling(X_train, y_train, oversampling_ratios, seed=42):

  oversampled_X_train_ratios = dict()
  oversampled_y_train_ratios = dict()
  X_minority, y_minority, X_remaining, y_remaining, min_label = find_minority_data(X_train, y_train)
  ideal_samps = len(X_remaining) - len(X_minority)

  oversampling_samps = [int(ideal_samps * (oversampling_ratio)) for oversampling_ratio in oversampling_ratios]
  for oversampling_samp, oversampling_ratio in zip(oversampling_samps, oversampling_ratios):

    sampling_strategy = {min_label: len(X_minority) + oversampling_samp}
    X_train_upsampled, y_train_upsampled = SVMSMOTE(sampling_strategy=sampling_strategy, random_state = seed).fit_resample(X_train, y_train)

    oversampled_X_train_ratios[oversampling_ratio] = X_train_upsampled
    oversampled_y_train_ratios[oversampling_ratio] = y_train_upsampled

  return list(oversampled_X_train_ratios.values()), list(oversampled_y_train_ratios.values())


# Intelligent Pruning

In [21]:
def find_majority_data(X, y):
    labels, counts = np.unique(y, return_counts=True)
    max_label = max(zip(counts, labels))[1]
    indices_with_max_label = np.where(y == max_label)[0]
    X_maj, y_maj = X[indices_with_max_label], y[indices_with_max_label]

    # Exclude majority class samples
    indices_without_max_label = np.where(y != max_label)[0]
    X_remaining, y_remaining = X[indices_without_max_label], y[indices_without_max_label]

    return X_maj, y_maj, X_remaining, y_remaining, min(counts)

def do_clustering(X, y, labels):
  clustered_X = defaultdict(list)
  clustered_y = defaultdict(list)

  for i, label in enumerate(labels):
      clustered_X[label].append(X[i])
      clustered_y[label].append(y[i])

  # Sort clustered_X and clustered_y in descending order based on the length of values in each dictionary
  sorted_clustered_X = dict(sorted(clustered_X.items(), key=lambda x: -len(x[1])))
  sorted_clustered_y = dict(sorted(clustered_y.items(), key=lambda x: -len(x[1])))

  return sorted_clustered_X, sorted_clustered_y


def intelligent_prune_data(pruning_samps, pruning_ratios, clustered_X, clustered_y, per_cluster_pruning_ratio=0.7, seed=42):
  random.seed(seed)
  pruning_ratios_X_maj, pruning_ratios_y_maj = defaultdict(list), defaultdict(list)
  for pruning_samp, pruning_ratio in zip(pruning_samps, pruning_ratios):
    samps = 0
    # print("For Pruning samps: ", pruning_samp)
    prune_samps = pruning_samp
    # print(prune_samps)
    clustered_X_new = defaultdict(list)
    clustered_y_new = defaultdict(list)
    # Iterate over the sorted dictionaries
    for label, values_X in clustered_X.items():
        # Calculate the number of samples to prune
        num_samples_to_prune = int(prune_samps * per_cluster_pruning_ratio)
        if(num_samples_to_prune > len(values_X)):
          num_samples_to_prune = len(values_X)//2
          prune_samps -= num_samples_to_prune
        else:
          prune_samps -= num_samples_to_prune

        # Randomly choose samples to prune
        indices_to_prune = random.sample(range(len(values_X)), num_samples_to_prune)

        # Prune the samples from clustered_X and clustered_y
        clustered_X_new[label] = [values_X[i] for i in range(len(values_X)) if i not in indices_to_prune]
        clustered_y_new[label] = [clustered_y[label][i] for i in range(len(clustered_y[label])) if i not in indices_to_prune]

    iter = 0
    while(prune_samps > 0):
        if(iter>=100):
          break
        for label, values_X in clustered_X_new.items():
          if(prune_samps <=0 or len(values_X) <= 0):
            break
          # print(len(values_X))
          index_to_prune = random.sample(range(len(values_X)), 1)
          clustered_X_new[label] = [values_X[i] for i in range(len(values_X)) if i not in index_to_prune]
          clustered_y_new[label] = [clustered_y_new[label][i] for i in range(len(clustered_y_new[label])) if i not in index_to_prune]

          prune_samps -= 1
        iter += 1

    for label in clustered_X_new:
        pruning_ratios_X_maj[pruning_ratio].extend(clustered_X_new[label])
        pruning_ratios_y_maj[pruning_ratio].extend(clustered_y_new[label])

  return pruning_ratios_X_maj, pruning_ratios_y_maj

def combine_data(pruning_ratios, pruning_ratios_X_maj, pruning_ratios_y_maj, X_remaining, y_remaining):

  pruning_ratios_X, pruning_ratios_y = defaultdict(list), defaultdict(list)
  for pruning_ratio in pruning_ratios:
    pruning_ratios_X[pruning_ratio].extend(pruning_ratios_X_maj[pruning_ratio])
    pruning_ratios_X[pruning_ratio].extend(X_remaining)

    pruning_ratios_y[pruning_ratio].extend(pruning_ratios_y_maj[pruning_ratio])
    pruning_ratios_y[pruning_ratio].extend(y_remaining)

  return pruning_ratios_X, pruning_ratios_y

def do_intelligent_pruning(X, y, ratio, per_cluster_pruning_ratio=0.7, seed=42):

  X_maj, y_maj, X_remaining, y_remaining, min_class_samples = find_majority_data(X, y)
  kmeans = KMeans(n_clusters=3, random_state = 42)
  kmeans.fit(X_maj)
  labels = kmeans.labels_
  clustered_X, clustered_y = do_clustering(X_maj, y_maj, labels)

  pruning_best = len(X_maj)-min_class_samples
  pruning_samps = [int(pruning_best * ratio)]
  pruning_ratios = [ratio]

  pruning_ratios_X_maj, pruning_ratios_y_maj = intelligent_prune_data(pruning_samps, pruning_ratios, clustered_X, clustered_y, \
                                                                      per_cluster_pruning_ratio=per_cluster_pruning_ratio, seed=seed)

  pruning_ratios_X, pruning_ratios_y = combine_data(pruning_ratios, pruning_ratios_X_maj, pruning_ratios_y_maj, X_remaining, y_remaining)

  return list(pruning_ratios_X.values()), list(pruning_ratios_y.values())

# Random Pruning

In [22]:
"""
inputs:
X: np.array
y: np.array
percentage: from 0% upto 100%, enter int value
"""
def random_prune_data(X, y, ratio, seed = 42):
  # preprocessed_X, scaler, imputer = preprocess_data_train(X)
  # preprocessed_X_test = preprocess_data_test(X_test, scaler, imputer)

  # X_train, y_train = preprocessed_X_train.to_numpy(), y_train.to_numpy()
  # X_test, y_test = preprocessed_X_test.to_numpy(), y_test.to_numpy()
  np.random.seed(seed)
  labels_count = {}
  labels = np.unique(y)
  for label in labels:
    labels_count[label] = np.count_nonzero(y == label)
  max_label = min_label = labels[0]
  for label in labels_count:
    if labels_count[label] > labels_count[max_label]:
      max_label = label
    if labels_count[label] < labels_count[min_label]:
      min_label = label

  # print("Max", max_label, labels_count[max_label])
  # print("Min", min_label, labels_count[min_label])

  prune_counts = {}
  prune_indexes = {}
  for label in labels_count:
    prune_counts[label] = labels_count[label] - labels_count[min_label]
    prune_indexes[label] = np.where(y == label)[0]

  prune_amount = int(ratio * sum(map(lambda x: x[1], prune_counts.items())))
  prune_it = {}

  while prune_amount > 0:
    for label in labels:
      if (len(prune_indexes[label]) - labels_count[min_label]) > 0 and prune_amount > 0:
        random_index = np.random.choice(len(prune_indexes[label]))
        random_item = prune_indexes[label][random_index]
        prune_indexes[label] = np.delete(prune_indexes[label], random_index)
        if prune_it.get(label, None) is None:
          prune_it[label] = np.array([])
        prune_it[label] = np.append(prune_it[label], [random_item])
        prune_amount -= 1



  formatted_indexes = np.array([])
  for label in prune_indexes:
    formatted_indexes = np.append(formatted_indexes, prune_indexes[label])
  formatted_indexes = np.sort(formatted_indexes)
  new_arr = np.array([np.int64(i) for i in formatted_indexes])

  return X[new_arr], y[new_arr]

In [23]:
ratios = [ratio for ratio in np.arange(0.2, 1.1, 0.2)]

# Calling Intelligent Pruning

In [24]:
results_intelligent_pruning = dict()
per_cluster_pruning_ratios = [0.5, 0.7, 0.9, 1]

for per_cluster_pruning_ratio in per_cluster_pruning_ratios:
  print(f'For per-cluster pruning ratio {per_cluster_pruning_ratio}')
  for ratio in ratios:
    X_train_copy, y_train_copy = X_train.copy(), y_train.copy()

    intelligent_pruned_X_train, intelligent_pruned_y_train = do_intelligent_pruning(X_train_copy.to_numpy(), y_train_copy.to_numpy(), ratio, per_cluster_pruning_ratio=per_cluster_pruning_ratio)

    preprocessed_intelligent_pruned_X_train, scaler, imputer = preprocess_data_train((np.array(intelligent_pruned_X_train))[0])
    preprocessed_X_test = preprocess_data_test(X_test, scaler, imputer)

    intelligent_pruned_X_train, intelligent_pruned_y_train = preprocessed_intelligent_pruned_X_train, (np.array(intelligent_pruned_y_train))[0]
    intelligent_pruned_X_test, intelligent_pruned_y_test = preprocessed_X_test, y_test.to_numpy()
    print(f"Train data pruned intelligently at {ratio * 100}% :")
    results = evaluate_models(intelligent_pruned_X_train, intelligent_pruned_X_test, intelligent_pruned_y_train, intelligent_pruned_y_test)
    print(results)
    results_intelligent_pruning[ratio] = results
    print("_______________________________________________________________________________")

For per-cluster pruning ratio 0.5




Train data pruned intelligently at 20.0% :
{'Logistic Regression': {'Accuracy': 0.825, 'Precision': 0.6428571428571429, 'Recall': 0.574468085106383, 'F1 Score': 0.6067415730337079, 'ROC AUC': 0.8327075511055486, 'Confusion Matrix': array([[138,  15],
       [ 20,  27]])}, 'SVM': {'Accuracy': 0.83, 'Precision': 0.7407407407407407, 'Recall': 0.425531914893617, 'F1 Score': 0.5405405405405406, 'ROC AUC': 0.8210262828535669, 'Confusion Matrix': array([[146,   7],
       [ 27,  20]])}, 'Decision Tree': {'Accuracy': 0.815, 'Precision': 0.5961538461538461, 'Recall': 0.6595744680851063, 'F1 Score': 0.6262626262626262, 'ROC AUC': 0.761159783062161, 'Confusion Matrix': array([[132,  21],
       [ 16,  31]])}}
_______________________________________________________________________________




Train data pruned intelligently at 40.0% :
{'Logistic Regression': {'Accuracy': 0.805, 'Precision': 0.58, 'Recall': 0.6170212765957447, 'F1 Score': 0.5979381443298969, 'ROC AUC': 0.8236684744819914, 'Confusion Matrix': array([[132,  21],
       [ 18,  29]])}, 'SVM': {'Accuracy': 0.84, 'Precision': 0.7142857142857143, 'Recall': 0.5319148936170213, 'F1 Score': 0.6097560975609756, 'ROC AUC': 0.8100403281880127, 'Confusion Matrix': array([[143,  10],
       [ 22,  25]])}, 'Decision Tree': {'Accuracy': 0.785, 'Precision': 0.5370370370370371, 'Recall': 0.6170212765957447, 'F1 Score': 0.5742574257425743, 'ROC AUC': 0.7268112918926436, 'Confusion Matrix': array([[128,  25],
       [ 18,  29]])}}
_______________________________________________________________________________




Train data pruned intelligently at 60.00000000000001% :
{'Logistic Regression': {'Accuracy': 0.77, 'Precision': 0.5081967213114754, 'Recall': 0.6595744680851063, 'F1 Score': 0.5740740740740742, 'ROC AUC': 0.7940481157001809, 'Confusion Matrix': array([[123,  30],
       [ 16,  31]])}, 'SVM': {'Accuracy': 0.82, 'Precision': 0.6170212765957447, 'Recall': 0.6170212765957447, 'F1 Score': 0.6170212765957447, 'ROC AUC': 0.8094840773188707, 'Confusion Matrix': array([[135,  18],
       [ 18,  29]])}, 'Decision Tree': {'Accuracy': 0.75, 'Precision': 0.4805194805194805, 'Recall': 0.7872340425531915, 'F1 Score': 0.5967741935483871, 'ROC AUC': 0.7628980670282297, 'Confusion Matrix': array([[113,  40],
       [ 10,  37]])}}
_______________________________________________________________________________




Train data pruned intelligently at 80.0% :
{'Logistic Regression': {'Accuracy': 0.755, 'Precision': 0.48484848484848486, 'Recall': 0.6808510638297872, 'F1 Score': 0.5663716814159292, 'ROC AUC': 0.7978028090668892, 'Confusion Matrix': array([[119,  34],
       [ 15,  32]])}, 'SVM': {'Accuracy': 0.81, 'Precision': 0.5918367346938775, 'Recall': 0.6170212765957447, 'F1 Score': 0.6041666666666666, 'ROC AUC': 0.8192184675288555, 'Confusion Matrix': array([[133,  20],
       [ 18,  29]])}, 'Decision Tree': {'Accuracy': 0.715, 'Precision': 0.44047619047619047, 'Recall': 0.7872340425531915, 'F1 Score': 0.564885496183206, 'ROC AUC': 0.7400222500347656, 'Confusion Matrix': array([[106,  47],
       [ 10,  37]])}}
_______________________________________________________________________________




Train data pruned intelligently at 100.0% :
{'Logistic Regression': {'Accuracy': 0.745, 'Precision': 0.4722222222222222, 'Recall': 0.723404255319149, 'F1 Score': 0.5714285714285714, 'ROC AUC': 0.80086218884717, 'Confusion Matrix': array([[115,  38],
       [ 13,  34]])}, 'SVM': {'Accuracy': 0.8, 'Precision': 0.5660377358490566, 'Recall': 0.6382978723404256, 'F1 Score': 0.6, 'ROC AUC': 0.8158809623140036, 'Confusion Matrix': array([[130,  23],
       [ 17,  30]])}, 'Decision Tree': {'Accuracy': 0.65, 'Precision': 0.38144329896907214, 'Recall': 0.7872340425531915, 'F1 Score': 0.513888888888889, 'ROC AUC': 0.6975385899040467, 'Confusion Matrix': array([[93, 60],
       [10, 37]])}}
_______________________________________________________________________________
For per-cluster pruning ratio 0.7


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train data pruned intelligently at 20.0% :
{'Logistic Regression': {'Accuracy': 0.82, 'Precision': 0.6341463414634146, 'Recall': 0.5531914893617021, 'F1 Score': 0.5909090909090909, 'ROC AUC': 0.8239465999165624, 'Confusion Matrix': array([[138,  15],
       [ 21,  26]])}, 'SVM': {'Accuracy': 0.835, 'Precision': 0.7916666666666666, 'Recall': 0.40425531914893614, 'F1 Score': 0.5352112676056338, 'ROC AUC': 0.8190794048115699, 'Confusion Matrix': array([[148,   5],
       [ 28,  19]])}, 'Decision Tree': {'Accuracy': 0.77, 'Precision': 0.5111111111111111, 'Recall': 0.48936170212765956, 'F1 Score': 0.5, 'ROC AUC': 0.6727854262272285, 'Confusion Matrix': array([[131,  22],
       [ 24,  23]])}}
_______________________________________________________________________________




Train data pruned intelligently at 40.0% :
{'Logistic Regression': {'Accuracy': 0.82, 'Precision': 0.627906976744186, 'Recall': 0.574468085106383, 'F1 Score': 0.6, 'ROC AUC': 0.799193436239744, 'Confusion Matrix': array([[137,  16],
       [ 20,  27]])}, 'SVM': {'Accuracy': 0.84, 'Precision': 0.7586206896551724, 'Recall': 0.46808510638297873, 'F1 Score': 0.5789473684210527, 'ROC AUC': 0.8099012654707274, 'Confusion Matrix': array([[146,   7],
       [ 25,  22]])}, 'Decision Tree': {'Accuracy': 0.79, 'Precision': 0.5471698113207547, 'Recall': 0.6170212765957447, 'F1 Score': 0.58, 'ROC AUC': 0.7300792657488528, 'Confusion Matrix': array([[129,  24],
       [ 18,  29]])}}
_______________________________________________________________________________




Train data pruned intelligently at 60.00000000000001% :
{'Logistic Regression': {'Accuracy': 0.78, 'Precision': 0.5283018867924528, 'Recall': 0.5957446808510638, 'F1 Score': 0.56, 'ROC AUC': 0.8078153247114448, 'Confusion Matrix': array([[128,  25],
       [ 19,  28]])}, 'SVM': {'Accuracy': 0.84, 'Precision': 0.7027027027027027, 'Recall': 0.5531914893617021, 'F1 Score': 0.6190476190476191, 'ROC AUC': 0.8306216103462662, 'Confusion Matrix': array([[142,  11],
       [ 21,  26]])}, 'Decision Tree': {'Accuracy': 0.76, 'Precision': 0.49295774647887325, 'Recall': 0.7446808510638298, 'F1 Score': 0.5932203389830508, 'ROC AUC': 0.7546933667083855, 'Confusion Matrix': array([[117,  36],
       [ 12,  35]])}}
_______________________________________________________________________________




Train data pruned intelligently at 80.0% :
{'Logistic Regression': {'Accuracy': 0.76, 'Precision': 0.49230769230769234, 'Recall': 0.6808510638297872, 'F1 Score': 0.5714285714285714, 'ROC AUC': 0.804199694062022, 'Confusion Matrix': array([[120,  33],
       [ 15,  32]])}, 'SVM': {'Accuracy': 0.82, 'Precision': 0.6122448979591837, 'Recall': 0.6382978723404256, 'F1 Score': 0.625, 'ROC AUC': 0.8082325128633013, 'Confusion Matrix': array([[134,  19],
       [ 17,  30]])}, 'Decision Tree': {'Accuracy': 0.67, 'Precision': 0.38823529411764707, 'Recall': 0.7021276595744681, 'F1 Score': 0.5, 'ROC AUC': 0.6811291892643583, 'Confusion Matrix': array([[101,  52],
       [ 14,  33]])}}
_______________________________________________________________________________




Train data pruned intelligently at 100.0% :
{'Logistic Regression': {'Accuracy': 0.73, 'Precision': 0.45569620253164556, 'Recall': 0.7659574468085106, 'F1 Score': 0.5714285714285715, 'ROC AUC': 0.7755527743012098, 'Confusion Matrix': array([[110,  43],
       [ 11,  36]])}, 'SVM': {'Accuracy': 0.73, 'Precision': 0.4533333333333333, 'Recall': 0.723404255319149, 'F1 Score': 0.5573770491803279, 'ROC AUC': 0.8135168961201502, 'Confusion Matrix': array([[112,  41],
       [ 13,  34]])}, 'Decision Tree': {'Accuracy': 0.68, 'Precision': 0.4, 'Recall': 0.723404255319149, 'F1 Score': 0.5151515151515151, 'ROC AUC': 0.6950354609929079, 'Confusion Matrix': array([[102,  51],
       [ 13,  34]])}}
_______________________________________________________________________________
For per-cluster pruning ratio 0.9


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train data pruned intelligently at 20.0% :
{'Logistic Regression': {'Accuracy': 0.81, 'Precision': 0.6153846153846154, 'Recall': 0.5106382978723404, 'F1 Score': 0.558139534883721, 'ROC AUC': 0.7925184258100403, 'Confusion Matrix': array([[138,  15],
       [ 23,  24]])}, 'SVM': {'Accuracy': 0.825, 'Precision': 1.0, 'Recall': 0.2553191489361702, 'F1 Score': 0.4067796610169491, 'ROC AUC': 0.8161590877485747, 'Confusion Matrix': array([[153,   0],
       [ 35,  12]])}, 'Decision Tree': {'Accuracy': 0.775, 'Precision': 0.5217391304347826, 'Recall': 0.5106382978723404, 'F1 Score': 0.5161290322580645, 'ROC AUC': 0.6834237240995689, 'Confusion Matrix': array([[131,  22],
       [ 23,  24]])}}
_______________________________________________________________________________


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train data pruned intelligently at 40.0% :
{'Logistic Regression': {'Accuracy': 0.805, 'Precision': 0.5952380952380952, 'Recall': 0.5319148936170213, 'F1 Score': 0.5617977528089887, 'ROC AUC': 0.7794465303852038, 'Confusion Matrix': array([[136,  17],
       [ 22,  25]])}, 'SVM': {'Accuracy': 0.83, 'Precision': 0.782608695652174, 'Recall': 0.3829787234042553, 'F1 Score': 0.5142857142857143, 'ROC AUC': 0.8239465999165624, 'Confusion Matrix': array([[148,   5],
       [ 29,  18]])}, 'Decision Tree': {'Accuracy': 0.775, 'Precision': 0.5208333333333334, 'Recall': 0.5319148936170213, 'F1 Score': 0.5263157894736842, 'ROC AUC': 0.6907940481157001, 'Confusion Matrix': array([[130,  23],
       [ 22,  25]])}}
_______________________________________________________________________________




Train data pruned intelligently at 60.00000000000001% :
{'Logistic Regression': {'Accuracy': 0.81, 'Precision': 0.6046511627906976, 'Recall': 0.5531914893617021, 'F1 Score': 0.5777777777777778, 'ROC AUC': 0.7813934084272007, 'Confusion Matrix': array([[136,  17],
       [ 21,  26]])}, 'SVM': {'Accuracy': 0.835, 'Precision': 0.75, 'Recall': 0.44680851063829785, 'F1 Score': 0.56, 'ROC AUC': 0.8421638158809625, 'Confusion Matrix': array([[146,   7],
       [ 26,  21]])}, 'Decision Tree': {'Accuracy': 0.75, 'Precision': 0.4727272727272727, 'Recall': 0.5531914893617021, 'F1 Score': 0.5098039215686275, 'ROC AUC': 0.6818245028507858, 'Confusion Matrix': array([[124,  29],
       [ 21,  26]])}}
_______________________________________________________________________________




Train data pruned intelligently at 80.0% :
{'Logistic Regression': {'Accuracy': 0.785, 'Precision': 0.5370370370370371, 'Recall': 0.6170212765957447, 'F1 Score': 0.5742574257425743, 'ROC AUC': 0.783757474621054, 'Confusion Matrix': array([[128,  25],
       [ 18,  29]])}, 'SVM': {'Accuracy': 0.835, 'Precision': 0.6521739130434783, 'Recall': 0.6382978723404256, 'F1 Score': 0.6451612903225806, 'ROC AUC': 0.85551383674037, 'Confusion Matrix': array([[137,  16],
       [ 17,  30]])}, 'Decision Tree': {'Accuracy': 0.73, 'Precision': 0.4507042253521127, 'Recall': 0.6808510638297872, 'F1 Score': 0.5423728813559323, 'ROC AUC': 0.7129745515227368, 'Confusion Matrix': array([[114,  39],
       [ 15,  32]])}}
_______________________________________________________________________________




Train data pruned intelligently at 100.0% :
{'Logistic Regression': {'Accuracy': 0.685, 'Precision': 0.4, 'Recall': 0.6808510638297872, 'F1 Score': 0.5039370078740157, 'ROC AUC': 0.7900152968989014, 'Confusion Matrix': array([[105,  48],
       [ 15,  32]])}, 'SVM': {'Accuracy': 0.76, 'Precision': 0.4931506849315068, 'Recall': 0.7659574468085106, 'F1 Score': 0.6, 'ROC AUC': 0.7990543735224587, 'Confusion Matrix': array([[116,  37],
       [ 11,  36]])}, 'Decision Tree': {'Accuracy': 0.685, 'Precision': 0.4024390243902439, 'Recall': 0.7021276595744681, 'F1 Score': 0.5116279069767442, 'ROC AUC': 0.6909331108329857, 'Confusion Matrix': array([[104,  49],
       [ 14,  33]])}}
_______________________________________________________________________________
For per-cluster pruning ratio 1


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train data pruned intelligently at 20.0% :
{'Logistic Regression': {'Accuracy': 0.815, 'Precision': 0.625, 'Recall': 0.5319148936170213, 'F1 Score': 0.5747126436781609, 'ROC AUC': 0.7922403003754693, 'Confusion Matrix': array([[138,  15],
       [ 22,  25]])}, 'SVM': {'Accuracy': 0.825, 'Precision': 1.0, 'Recall': 0.2553191489361702, 'F1 Score': 0.4067796610169491, 'ROC AUC': 0.8178278403560005, 'Confusion Matrix': array([[153,   0],
       [ 35,  12]])}, 'Decision Tree': {'Accuracy': 0.74, 'Precision': 0.44680851063829785, 'Recall': 0.44680851063829785, 'F1 Score': 0.44680851063829785, 'ROC AUC': 0.6384369350577109, 'Confusion Matrix': array([[127,  26],
       [ 26,  21]])}}
_______________________________________________________________________________


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train data pruned intelligently at 40.0% :
{'Logistic Regression': {'Accuracy': 0.815, 'Precision': 0.625, 'Recall': 0.5319148936170213, 'F1 Score': 0.5747126436781609, 'ROC AUC': 0.7729105826727853, 'Confusion Matrix': array([[138,  15],
       [ 22,  25]])}, 'SVM': {'Accuracy': 0.835, 'Precision': 0.85, 'Recall': 0.3617021276595745, 'F1 Score': 0.5074626865671642, 'ROC AUC': 0.819635655680712, 'Confusion Matrix': array([[150,   3],
       [ 30,  17]])}, 'Decision Tree': {'Accuracy': 0.75, 'Precision': 0.4716981132075472, 'Recall': 0.5319148936170213, 'F1 Score': 0.4999999999999999, 'ROC AUC': 0.6744541788346544, 'Confusion Matrix': array([[125,  28],
       [ 22,  25]])}}
_______________________________________________________________________________


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train data pruned intelligently at 60.00000000000001% :
{'Logistic Regression': {'Accuracy': 0.81, 'Precision': 0.6046511627906976, 'Recall': 0.5531914893617021, 'F1 Score': 0.5777777777777778, 'ROC AUC': 0.7638715060492282, 'Confusion Matrix': array([[136,  17],
       [ 21,  26]])}, 'SVM': {'Accuracy': 0.825, 'Precision': 0.7727272727272727, 'Recall': 0.3617021276595745, 'F1 Score': 0.4927536231884059, 'ROC AUC': 0.8324294256709777, 'Confusion Matrix': array([[148,   5],
       [ 30,  17]])}, 'Decision Tree': {'Accuracy': 0.72, 'Precision': 0.42857142857142855, 'Recall': 0.574468085106383, 'F1 Score': 0.4909090909090909, 'ROC AUC': 0.6695869837296621, 'Confusion Matrix': array([[117,  36],
       [ 20,  27]])}}
_______________________________________________________________________________




Train data pruned intelligently at 80.0% :
{'Logistic Regression': {'Accuracy': 0.765, 'Precision': 0.5, 'Recall': 0.6382978723404256, 'F1 Score': 0.5607476635514019, 'ROC AUC': 0.784869976359338, 'Confusion Matrix': array([[123,  30],
       [ 17,  30]])}, 'SVM': {'Accuracy': 0.8, 'Precision': 0.5853658536585366, 'Recall': 0.5106382978723404, 'F1 Score': 0.5454545454545454, 'ROC AUC': 0.8404950632735364, 'Confusion Matrix': array([[136,  17],
       [ 23,  24]])}, 'Decision Tree': {'Accuracy': 0.72, 'Precision': 0.4262295081967213, 'Recall': 0.5531914893617021, 'F1 Score': 0.48148148148148145, 'ROC AUC': 0.6622166597135308, 'Confusion Matrix': array([[118,  35],
       [ 21,  26]])}}
_______________________________________________________________________________




Train data pruned intelligently at 100.0% :
{'Logistic Regression': {'Accuracy': 0.62, 'Precision': 0.34065934065934067, 'Recall': 0.6595744680851063, 'F1 Score': 0.4492753623188405, 'ROC AUC': 0.6907245167570575, 'Confusion Matrix': array([[93, 60],
       [16, 31]])}, 'SVM': {'Accuracy': 0.625, 'Precision': 0.36792452830188677, 'Recall': 0.8297872340425532, 'F1 Score': 0.5098039215686274, 'ROC AUC': 0.7195104992351551, 'Confusion Matrix': array([[86, 67],
       [ 8, 39]])}, 'Decision Tree': {'Accuracy': 0.645, 'Precision': 0.3695652173913043, 'Recall': 0.723404255319149, 'F1 Score': 0.4892086330935252, 'ROC AUC': 0.6721596439994437, 'Confusion Matrix': array([[95, 58],
       [13, 34]])}}
_______________________________________________________________________________


#Calling Random Pruning

In [25]:
results_random_pruning = dict()
for ratio in ratios:
  random_pruned_X_train, random_pruned_y_train = random_prune_data(X_train.to_numpy(), y_train.to_numpy(), ratio)
  preprocessed_random_pruned_X_train, scaler, imputer = preprocess_data_train(random_pruned_X_train)
  preprocessed_X_test = preprocess_data_test(X_test, scaler, imputer)

  random_pruned_X_train, random_pruned_y_train = preprocessed_random_pruned_X_train, random_pruned_y_train
  random_pruned_X_test, random_pruned_y_test = preprocessed_X_test, y_test.to_numpy()

  print(f"Train data pruned randomly at {ratio * 100}% :")
  results = evaluate_models(random_pruned_X_train, random_pruned_X_test, random_pruned_y_train, random_pruned_y_test)
  print(results)
  results_random_pruning[ratio] = results
  print("_______________________________________________________________________________")

Train data pruned randomly at 20.0% :


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.81, 'Precision': 0.6216216216216216, 'Recall': 0.48936170212765956, 'F1 Score': 0.5476190476190476, 'ROC AUC': 0.7880684188569045, 'Confusion Matrix': array([[139,  14],
       [ 24,  23]])}, 'SVM': {'Accuracy': 0.82, 'Precision': 0.7391304347826086, 'Recall': 0.3617021276595745, 'F1 Score': 0.4857142857142858, 'ROC AUC': 0.8142122097065776, 'Confusion Matrix': array([[147,   6],
       [ 30,  17]])}, 'Decision Tree': {'Accuracy': 0.755, 'Precision': 0.4807692307692308, 'Recall': 0.5319148936170213, 'F1 Score': 0.505050505050505, 'ROC AUC': 0.6777221526908636, 'Confusion Matrix': array([[126,  27],
       [ 22,  25]])}}
_______________________________________________________________________________
Train data pruned randomly at 40.0% :


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.82, 'Precision': 0.6341463414634146, 'Recall': 0.5531914893617021, 'F1 Score': 0.5909090909090909, 'ROC AUC': 0.7683215130023641, 'Confusion Matrix': array([[138,  15],
       [ 21,  26]])}, 'SVM': {'Accuracy': 0.825, 'Precision': 0.75, 'Recall': 0.3829787234042553, 'F1 Score': 0.5070422535211269, 'ROC AUC': 0.8308997357808372, 'Confusion Matrix': array([[147,   6],
       [ 29,  18]])}, 'Decision Tree': {'Accuracy': 0.73, 'Precision': 0.4426229508196721, 'Recall': 0.574468085106383, 'F1 Score': 0.5, 'ROC AUC': 0.6761229314420804, 'Confusion Matrix': array([[119,  34],
       [ 20,  27]])}}
_______________________________________________________________________________
Train data pruned randomly at 60.00000000000001% :




{'Logistic Regression': {'Accuracy': 0.79, 'Precision': 0.5510204081632653, 'Recall': 0.574468085106383, 'F1 Score': 0.5625, 'ROC AUC': 0.789598108747045, 'Confusion Matrix': array([[131,  22],
       [ 20,  27]])}, 'SVM': {'Accuracy': 0.855, 'Precision': 0.78125, 'Recall': 0.5319148936170213, 'F1 Score': 0.6329113924050632, 'ROC AUC': 0.8374356834932555, 'Confusion Matrix': array([[146,   7],
       [ 22,  25]])}, 'Decision Tree': {'Accuracy': 0.745, 'Precision': 0.4714285714285714, 'Recall': 0.7021276595744681, 'F1 Score': 0.5641025641025642, 'ROC AUC': 0.7301487971074955, 'Confusion Matrix': array([[116,  37],
       [ 14,  33]])}}
_______________________________________________________________________________
Train data pruned randomly at 80.0% :




{'Logistic Regression': {'Accuracy': 0.78, 'Precision': 0.5263157894736842, 'Recall': 0.6382978723404256, 'F1 Score': 0.5769230769230769, 'ROC AUC': 0.8008621888471701, 'Confusion Matrix': array([[126,  27],
       [ 17,  30]])}, 'SVM': {'Accuracy': 0.82, 'Precision': 0.627906976744186, 'Recall': 0.574468085106383, 'F1 Score': 0.6, 'ROC AUC': 0.8553747740230844, 'Confusion Matrix': array([[137,  16],
       [ 20,  27]])}, 'Decision Tree': {'Accuracy': 0.735, 'Precision': 0.4605263157894737, 'Recall': 0.7446808510638298, 'F1 Score': 0.5691056910569106, 'ROC AUC': 0.7383534974273398, 'Confusion Matrix': array([[112,  41],
       [ 12,  35]])}}
_______________________________________________________________________________
Train data pruned randomly at 100.0% :




{'Logistic Regression': {'Accuracy': 0.715, 'Precision': 0.43243243243243246, 'Recall': 0.6808510638297872, 'F1 Score': 0.5289256198347106, 'ROC AUC': 0.756223056598526, 'Confusion Matrix': array([[111,  42],
       [ 15,  32]])}, 'SVM': {'Accuracy': 0.785, 'Precision': 0.5303030303030303, 'Recall': 0.7446808510638298, 'F1 Score': 0.6194690265486724, 'ROC AUC': 0.867890418578779, 'Confusion Matrix': array([[122,  31],
       [ 12,  35]])}, 'Decision Tree': {'Accuracy': 0.665, 'Precision': 0.38095238095238093, 'Recall': 0.6808510638297872, 'F1 Score': 0.48854961832061056, 'ROC AUC': 0.6704908913920178, 'Confusion Matrix': array([[101,  52],
       [ 15,  32]])}}
_______________________________________________________________________________


# Calling SDV-Oversampling

In [28]:
sd1, train_df = do_sdv(X_train, y_train)
results_syn_sdv = dict()

# Add synthetic data at different percentages to the main DataFrame
for ratio in ratios:
    combined_df = add_synthetic_data(train_df, sd1, ratio)
    y_train_sdv = combined_df.iloc[:, -1]
    X_train_sdv = combined_df.iloc[:, :-1]

    preprocessed_X_train_sdv, scaler, imputer = preprocess_data_train(X_train_sdv)
    preprocessed_X_test_sdv = preprocess_data_test(X_test, scaler, imputer)

    X_train_sdv, y_train_sdv = preprocessed_X_train_sdv, y_train_sdv.to_numpy()
    X_test_sdv, y_test_sdv = preprocessed_X_test_sdv, y_test.to_numpy()

    print(f"Train data combined with {ratio * 100}% synthetic data of minority class:")
    results = evaluate_models(X_train_sdv, X_test_sdv, y_train_sdv, y_test_sdv)
    results_syn_sdv[ratio] = results
    print(results)
    print("_______________________________________________________________________________")



Train data combined with 20.0% synthetic data of minority class:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.825, 'Precision': 0.6764705882352942, 'Recall': 0.48936170212765956, 'F1 Score': 0.5679012345679013, 'ROC AUC': 0.8003059379780281, 'Confusion Matrix': array([[142,  11],
       [ 24,  23]])}, 'SVM': {'Accuracy': 0.835, 'Precision': 0.8888888888888888, 'Recall': 0.3404255319148936, 'F1 Score': 0.4923076923076923, 'ROC AUC': 0.822138784591851, 'Confusion Matrix': array([[151,   2],
       [ 31,  16]])}, 'Decision Tree': {'Accuracy': 0.78, 'Precision': 0.5294117647058824, 'Recall': 0.574468085106383, 'F1 Score': 0.5510204081632654, 'ROC AUC': 0.708802670004172, 'Confusion Matrix': array([[129,  24],
       [ 20,  27]])}}
_______________________________________________________________________________
Train data combined with 40.0% synthetic data of minority class:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.825, 'Precision': 0.6764705882352942, 'Recall': 0.48936170212765956, 'F1 Score': 0.5679012345679013, 'ROC AUC': 0.8037825059101655, 'Confusion Matrix': array([[142,  11],
       [ 24,  23]])}, 'SVM': {'Accuracy': 0.835, 'Precision': 0.8888888888888888, 'Recall': 0.3404255319148936, 'F1 Score': 0.4923076923076923, 'ROC AUC': 0.816854401335002, 'Confusion Matrix': array([[151,   2],
       [ 31,  16]])}, 'Decision Tree': {'Accuracy': 0.74, 'Precision': 0.45614035087719296, 'Recall': 0.5531914893617021, 'F1 Score': 0.5, 'ROC AUC': 0.6752885551383675, 'Confusion Matrix': array([[122,  31],
       [ 21,  26]])}}
_______________________________________________________________________________
Train data combined with 60.00000000000001% synthetic data of minority class:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.83, 'Precision': 0.696969696969697, 'Recall': 0.48936170212765956, 'F1 Score': 0.575, 'ROC AUC': 0.8032262550410236, 'Confusion Matrix': array([[143,  10],
       [ 24,  23]])}, 'SVM': {'Accuracy': 0.835, 'Precision': 0.8888888888888888, 'Recall': 0.3404255319148936, 'F1 Score': 0.4923076923076923, 'ROC AUC': 0.817410652204144, 'Confusion Matrix': array([[151,   2],
       [ 31,  16]])}, 'Decision Tree': {'Accuracy': 0.74, 'Precision': 0.45614035087719296, 'Recall': 0.5531914893617021, 'F1 Score': 0.5, 'ROC AUC': 0.6752885551383675, 'Confusion Matrix': array([[122,  31],
       [ 21,  26]])}}
_______________________________________________________________________________
Train data combined with 80.0% synthetic data of minority class:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.825, 'Precision': 0.6764705882352942, 'Recall': 0.48936170212765956, 'F1 Score': 0.5679012345679013, 'ROC AUC': 0.8029481296064525, 'Confusion Matrix': array([[142,  11],
       [ 24,  23]])}, 'SVM': {'Accuracy': 0.835, 'Precision': 0.8888888888888888, 'Recall': 0.3404255319148936, 'F1 Score': 0.4923076923076923, 'ROC AUC': 0.8144903351411485, 'Confusion Matrix': array([[151,   2],
       [ 31,  16]])}, 'Decision Tree': {'Accuracy': 0.74, 'Precision': 0.45614035087719296, 'Recall': 0.5531914893617021, 'F1 Score': 0.5, 'ROC AUC': 0.6752885551383675, 'Confusion Matrix': array([[122,  31],
       [ 21,  26]])}}
_______________________________________________________________________________
Train data combined with 100.0% synthetic data of minority class:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.83, 'Precision': 0.696969696969697, 'Recall': 0.48936170212765956, 'F1 Score': 0.575, 'ROC AUC': 0.80364344319288, 'Confusion Matrix': array([[143,  10],
       [ 24,  23]])}, 'SVM': {'Accuracy': 0.835, 'Precision': 0.8888888888888888, 'Recall': 0.3404255319148936, 'F1 Score': 0.4923076923076923, 'ROC AUC': 0.8124043943818662, 'Confusion Matrix': array([[151,   2],
       [ 31,  16]])}, 'Decision Tree': {'Accuracy': 0.74, 'Precision': 0.45614035087719296, 'Recall': 0.5531914893617021, 'F1 Score': 0.5, 'ROC AUC': 0.6752885551383675, 'Confusion Matrix': array([[122,  31],
       [ 21,  26]])}}
_______________________________________________________________________________


# Calling SMOTE-Oversampling

In [29]:
results_smote = dict()

for ratio in ratios:

    X_train_smote, y_train_smote = smote_oversampling(X_train.to_numpy(), y_train.to_numpy(), [ratio])
    preprocessed_X_train_smote, scaler, imputer = preprocess_data_train((np.array(X_train_smote))[0])
    preprocessed_X_test_smote = preprocess_data_test(X_test, scaler, imputer)

    X_train_smote, y_train_smote = preprocessed_X_train_smote, (np.array(y_train_smote))[0]
    X_test_smote, y_test_smote = preprocessed_X_test_smote, y_test.to_numpy()

    print(f"Train data combined with {ratio * 100}% synthetic data of minority class:")
    print(len(X_train_smote), len(y_train_smote))
    results = evaluate_models(X_train_smote, X_test_smote, y_train_smote, y_test_smote)
    results_smote[ratio] = results
    print(results)
    print("_______________________________________________________________________________")

Train data combined with 20.0% synthetic data of minority class:
892 892


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.81, 'Precision': 0.6097560975609756, 'Recall': 0.5319148936170213, 'F1 Score': 0.5681818181818181, 'ROC AUC': 0.7844527882074815, 'Confusion Matrix': array([[137,  16],
       [ 22,  25]])}, 'SVM': {'Accuracy': 0.825, 'Precision': 0.7727272727272727, 'Recall': 0.3617021276595745, 'F1 Score': 0.4927536231884059, 'ROC AUC': 0.8468919482686692, 'Confusion Matrix': array([[148,   5],
       [ 30,  17]])}, 'Decision Tree': {'Accuracy': 0.79, 'Precision': 0.5510204081632653, 'Recall': 0.574468085106383, 'F1 Score': 0.5625, 'ROC AUC': 0.7153386177165902, 'Confusion Matrix': array([[131,  22],
       [ 20,  27]])}}
_______________________________________________________________________________
Train data combined with 40.0% synthetic data of minority class:
988 988


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.78, 'Precision': 0.5283018867924528, 'Recall': 0.5957446808510638, 'F1 Score': 0.56, 'ROC AUC': 0.7832012237519121, 'Confusion Matrix': array([[128,  25],
       [ 19,  28]])}, 'SVM': {'Accuracy': 0.835, 'Precision': 0.71875, 'Recall': 0.48936170212765956, 'F1 Score': 0.5822784810126582, 'ROC AUC': 0.84995132804895, 'Confusion Matrix': array([[144,   9],
       [ 24,  23]])}, 'Decision Tree': {'Accuracy': 0.8, 'Precision': 0.574468085106383, 'Recall': 0.574468085106383, 'F1 Score': 0.574468085106383, 'ROC AUC': 0.7218745654290085, 'Confusion Matrix': array([[133,  20],
       [ 20,  27]])}}
_______________________________________________________________________________
Train data combined with 60.00000000000001% synthetic data of minority class:
1084 1084


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.755, 'Precision': 0.4827586206896552, 'Recall': 0.5957446808510638, 'F1 Score': 0.5333333333333333, 'ROC AUC': 0.7805590321234877, 'Confusion Matrix': array([[123,  30],
       [ 19,  28]])}, 'SVM': {'Accuracy': 0.81, 'Precision': 0.6097560975609756, 'Recall': 0.5319148936170213, 'F1 Score': 0.5681818181818181, 'ROC AUC': 0.8448060075093868, 'Confusion Matrix': array([[137,  16],
       [ 22,  25]])}, 'Decision Tree': {'Accuracy': 0.79, 'Precision': 0.5555555555555556, 'Recall': 0.5319148936170213, 'F1 Score': 0.5434782608695652, 'ROC AUC': 0.7005979696843276, 'Confusion Matrix': array([[133,  20],
       [ 22,  25]])}}
_______________________________________________________________________________
Train data combined with 80.0% synthetic data of minority class:
1180 1180


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.75, 'Precision': 0.47540983606557374, 'Recall': 0.6170212765957447, 'F1 Score': 0.537037037037037, 'ROC AUC': 0.7820887220136282, 'Confusion Matrix': array([[121,  32],
       [ 18,  29]])}, 'SVM': {'Accuracy': 0.805, 'Precision': 0.5769230769230769, 'Recall': 0.6382978723404256, 'F1 Score': 0.6060606060606061, 'ROC AUC': 0.8381309970796829, 'Confusion Matrix': array([[131,  22],
       [ 17,  30]])}, 'Decision Tree': {'Accuracy': 0.76, 'Precision': 0.49019607843137253, 'Recall': 0.5319148936170213, 'F1 Score': 0.5102040816326531, 'ROC AUC': 0.6809901265470727, 'Confusion Matrix': array([[127,  26],
       [ 22,  25]])}}
_______________________________________________________________________________
Train data combined with 100.0% synthetic data of minority class:
1276 1276


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.73, 'Precision': 0.4492753623188406, 'Recall': 0.6595744680851063, 'F1 Score': 0.5344827586206896, 'ROC AUC': 0.7854262272284801, 'Confusion Matrix': array([[115,  38],
       [ 16,  31]])}, 'SVM': {'Accuracy': 0.745, 'Precision': 0.4696969696969697, 'Recall': 0.6595744680851063, 'F1 Score': 0.5486725663716814, 'ROC AUC': 0.8174106522041441, 'Confusion Matrix': array([[118,  35],
       [ 16,  31]])}, 'Decision Tree': {'Accuracy': 0.735, 'Precision': 0.44642857142857145, 'Recall': 0.5319148936170213, 'F1 Score': 0.48543689320388356, 'ROC AUC': 0.664650257266027, 'Confusion Matrix': array([[122,  31],
       [ 22,  25]])}}
_______________________________________________________________________________


# Calling Random-Oversampling

In [30]:
results_random = dict()

for ratio in ratios:

    X_train_random, y_train_random = random_oversampling(X_train.to_numpy(), y_train.to_numpy(), [ratio])

    preprocessed_X_train_random, scaler, imputer = preprocess_data_train((np.array(X_train_random)[0]))
    preprocessed_X_test_random = preprocess_data_test(X_test, scaler, imputer)

    X_train_random, y_train_random = preprocessed_X_train_random, (np.array(y_train_random))[0]
    X_test_random, y_test_random = preprocessed_X_test_random, y_test.to_numpy()

    print(f"Train data combined with {ratio * 100}% synthetic data of minority class:")
    print(len(X_train_random), len(y_train_random))
    results = evaluate_models(X_train_random, X_test_random, y_train_random, y_test_random)
    results_random[ratio] = results
    print(results)
    print("_______________________________________________________________________________")

Train data combined with 20.0% synthetic data of minority class:
892 892


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.81, 'Precision': 0.6046511627906976, 'Recall': 0.5531914893617021, 'F1 Score': 0.5777777777777778, 'ROC AUC': 0.8073981365595885, 'Confusion Matrix': array([[136,  17],
       [ 21,  26]])}, 'SVM': {'Accuracy': 0.835, 'Precision': 0.8181818181818182, 'Recall': 0.3829787234042553, 'F1 Score': 0.5217391304347826, 'ROC AUC': 0.8428591294673897, 'Confusion Matrix': array([[149,   4],
       [ 29,  18]])}, 'Decision Tree': {'Accuracy': 0.785, 'Precision': 0.5476190476190477, 'Recall': 0.48936170212765956, 'F1 Score': 0.5168539325842697, 'ROC AUC': 0.6825893477958559, 'Confusion Matrix': array([[134,  19],
       [ 24,  23]])}}
_______________________________________________________________________________
Train data combined with 40.0% synthetic data of minority class:
988 988


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.8, 'Precision': 0.574468085106383, 'Recall': 0.574468085106383, 'F1 Score': 0.574468085106383, 'ROC AUC': 0.7978028090668892, 'Confusion Matrix': array([[133,  20],
       [ 20,  27]])}, 'SVM': {'Accuracy': 0.825, 'Precision': 0.6578947368421053, 'Recall': 0.5319148936170213, 'F1 Score': 0.5882352941176471, 'ROC AUC': 0.8573216520650813, 'Confusion Matrix': array([[140,  13],
       [ 22,  25]])}, 'Decision Tree': {'Accuracy': 0.76, 'Precision': 0.4883720930232558, 'Recall': 0.44680851063829785, 'F1 Score': 0.4666666666666666, 'ROC AUC': 0.6515088304825476, 'Confusion Matrix': array([[131,  22],
       [ 26,  21]])}}
_______________________________________________________________________________
Train data combined with 60.00000000000001% synthetic data of minority class:
1084 1084


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.765, 'Precision': 0.5, 'Recall': 0.574468085106383, 'F1 Score': 0.5346534653465347, 'ROC AUC': 0.7848699763593382, 'Confusion Matrix': array([[126,  27],
       [ 20,  27]])}, 'SVM': {'Accuracy': 0.805, 'Precision': 0.5833333333333334, 'Recall': 0.5957446808510638, 'F1 Score': 0.5894736842105263, 'ROC AUC': 0.8450841329439577, 'Confusion Matrix': array([[133,  20],
       [ 19,  28]])}, 'Decision Tree': {'Accuracy': 0.715, 'Precision': 0.391304347826087, 'Recall': 0.3829787234042553, 'F1 Score': 0.3870967741935484, 'ROC AUC': 0.5999860937282714, 'Confusion Matrix': array([[125,  28],
       [ 29,  18]])}}
_______________________________________________________________________________
Train data combined with 80.0% synthetic data of minority class:
1180 1180


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.75, 'Precision': 0.47619047619047616, 'Recall': 0.6382978723404256, 'F1 Score': 0.5454545454545455, 'ROC AUC': 0.7900152968989014, 'Confusion Matrix': array([[120,  33],
       [ 17,  30]])}, 'SVM': {'Accuracy': 0.8, 'Precision': 0.5614035087719298, 'Recall': 0.6808510638297872, 'F1 Score': 0.6153846153846153, 'ROC AUC': 0.8453622583785287, 'Confusion Matrix': array([[128,  25],
       [ 15,  32]])}, 'Decision Tree': {'Accuracy': 0.745, 'Precision': 0.45652173913043476, 'Recall': 0.44680851063829785, 'F1 Score': 0.45161290322580644, 'ROC AUC': 0.6417049089139202, 'Confusion Matrix': array([[128,  25],
       [ 26,  21]])}}
_______________________________________________________________________________
Train data combined with 100.0% synthetic data of minority class:
1276 1276


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.73, 'Precision': 0.4492753623188406, 'Recall': 0.6595744680851063, 'F1 Score': 0.5344827586206896, 'ROC AUC': 0.7825059101654848, 'Confusion Matrix': array([[115,  38],
       [ 16,  31]])}, 'SVM': {'Accuracy': 0.78, 'Precision': 0.5238095238095238, 'Recall': 0.7021276595744681, 'F1 Score': 0.6, 'ROC AUC': 0.838687247948825, 'Confusion Matrix': array([[123,  30],
       [ 14,  33]])}, 'Decision Tree': {'Accuracy': 0.775, 'Precision': 0.5238095238095238, 'Recall': 0.46808510638297873, 'F1 Score': 0.49438202247191015, 'ROC AUC': 0.6686830760673064, 'Confusion Matrix': array([[133,  20],
       [ 25,  22]])}}
_______________________________________________________________________________


# Calling SVM-SMOTE Over-Sampling

In [31]:
results_svm_smote = dict()

for ratio in ratios:

    X_train_svm_smote, y_train_svm_smote = svm_smote_oversampling(X_train.to_numpy(), y_train.to_numpy(), [ratio])

    preprocessed_X_train_svm_smote, scaler, imputer = preprocess_data_train((np.array(X_train_svm_smote))[0])
    preprocessed_X_test_svm_smote = preprocess_data_test(X_test, scaler, imputer)

    X_train_svm_smote, y_train_svm_smote = preprocessed_X_train_svm_smote, (np.array(y_train_svm_smote))[0]
    X_test_svm_smote, y_test_svm_smote = preprocessed_X_test_svm_smote, y_test.to_numpy()

    print(f"Train data combined with {ratio * 100}% synthetic data of minority class:")
    print(len(X_train_svm_smote), len(y_train_svm_smote))
    results = evaluate_models(X_train_svm_smote, X_test_svm_smote, y_train_svm_smote, y_test_svm_smote)
    results_svm_smote[ratio] = results
    print(results)
    print("_______________________________________________________________________________")

Train data combined with 20.0% synthetic data of minority class:
892 892


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.81, 'Precision': 0.6097560975609756, 'Recall': 0.5319148936170213, 'F1 Score': 0.5681818181818181, 'ROC AUC': 0.7955778055903213, 'Confusion Matrix': array([[137,  16],
       [ 22,  25]])}, 'SVM': {'Accuracy': 0.85, 'Precision': 0.8148148148148148, 'Recall': 0.46808510638297873, 'F1 Score': 0.5945945945945945, 'ROC AUC': 0.8438325684883883, 'Confusion Matrix': array([[148,   5],
       [ 25,  22]])}, 'Decision Tree': {'Accuracy': 0.765, 'Precision': 0.5, 'Recall': 0.574468085106383, 'F1 Score': 0.5346534653465347, 'ROC AUC': 0.6989987484355444, 'Confusion Matrix': array([[126,  27],
       [ 20,  27]])}}
_______________________________________________________________________________
Train data combined with 40.0% synthetic data of minority class:
988 988


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.81, 'Precision': 0.6, 'Recall': 0.574468085106383, 'F1 Score': 0.5869565217391305, 'ROC AUC': 0.8117090807954388, 'Confusion Matrix': array([[135,  18],
       [ 20,  27]])}, 'SVM': {'Accuracy': 0.855, 'Precision': 0.8, 'Recall': 0.5106382978723404, 'F1 Score': 0.6233766233766233, 'ROC AUC': 0.8584341538033653, 'Confusion Matrix': array([[147,   6],
       [ 23,  24]])}, 'Decision Tree': {'Accuracy': 0.79, 'Precision': 0.5531914893617021, 'Recall': 0.5531914893617021, 'F1 Score': 0.5531914893617021, 'ROC AUC': 0.707968293700459, 'Confusion Matrix': array([[132,  21],
       [ 21,  26]])}}
_______________________________________________________________________________
Train data combined with 60.00000000000001% synthetic data of minority class:
1084 1084


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.79, 'Precision': 0.5510204081632653, 'Recall': 0.574468085106383, 'F1 Score': 0.5625, 'ROC AUC': 0.8032262550410235, 'Confusion Matrix': array([[131,  22],
       [ 20,  27]])}, 'SVM': {'Accuracy': 0.845, 'Precision': 0.7222222222222222, 'Recall': 0.5531914893617021, 'F1 Score': 0.6265060240963856, 'ROC AUC': 0.852454456960089, 'Confusion Matrix': array([[143,  10],
       [ 21,  26]])}, 'Decision Tree': {'Accuracy': 0.77, 'Precision': 0.5121951219512195, 'Recall': 0.44680851063829785, 'F1 Score': 0.4772727272727273, 'ROC AUC': 0.6580447781949659, 'Confusion Matrix': array([[133,  20],
       [ 26,  21]])}}
_______________________________________________________________________________
Train data combined with 80.0% synthetic data of minority class:
1180 1180


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.78, 'Precision': 0.5306122448979592, 'Recall': 0.5531914893617021, 'F1 Score': 0.5416666666666667, 'ROC AUC': 0.7902934223334724, 'Confusion Matrix': array([[130,  23],
       [ 21,  26]])}, 'SVM': {'Accuracy': 0.82, 'Precision': 0.6341463414634146, 'Recall': 0.5531914893617021, 'F1 Score': 0.5909090909090909, 'ROC AUC': 0.8514810179390906, 'Confusion Matrix': array([[138,  15],
       [ 21,  26]])}, 'Decision Tree': {'Accuracy': 0.765, 'Precision': 0.5, 'Recall': 0.5957446808510638, 'F1 Score': 0.5436893203883495, 'ROC AUC': 0.7063690724516757, 'Confusion Matrix': array([[125,  28],
       [ 19,  28]])}}
_______________________________________________________________________________
Train data combined with 100.0% synthetic data of minority class:
1276 1276


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.755, 'Precision': 0.48214285714285715, 'Recall': 0.574468085106383, 'F1 Score': 0.5242718446601942, 'ROC AUC': 0.7900152968989015, 'Confusion Matrix': array([[124,  29],
       [ 20,  27]])}, 'SVM': {'Accuracy': 0.81, 'Precision': 0.5918367346938775, 'Recall': 0.6170212765957447, 'F1 Score': 0.6041666666666666, 'ROC AUC': 0.8475872618550967, 'Confusion Matrix': array([[133,  20],
       [ 18,  29]])}, 'Decision Tree': {'Accuracy': 0.715, 'Precision': 0.41935483870967744, 'Recall': 0.5531914893617021, 'F1 Score': 0.47706422018348627, 'ROC AUC': 0.6589486858573217, 'Confusion Matrix': array([[117,  36],
       [ 21,  26]])}}
_______________________________________________________________________________


# No-Sampling Results

In [39]:
results_no_sampling = dict()

for ratio in ratios:

    X_train_no_sampling, y_train_no_sampling = X_train.to_numpy(), y_train.to_numpy()

    preprocessed_X_train_no_sampling, scaler, imputer = preprocess_data_train(X_train_no_sampling)
    preprocessed_X_test_no_sampling = preprocess_data_test(X_test, scaler, imputer)

    X_train_no_sampling, y_train_no_sampling = preprocessed_X_train_no_sampling, y_train_no_sampling
    X_test_no_sampling, y_test_no_sampling = preprocessed_X_test_no_sampling, y_test.to_numpy()

    print(f"Train data combined with {ratio * 100}% synthetic data of minority class:")
    print(len(X_train_no_sampling), len(y_train_no_sampling))
    results = evaluate_models(X_train_no_sampling, X_test_no_sampling, y_train_no_sampling, y_test_no_sampling)
    results_no_sampling[ratio] = results
    print(results)
    print("_______________________________________________________________________________")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train data combined with 20.0% synthetic data of minority class:
797 797
{'Logistic Regression': {'Accuracy': 0.81, 'Precision': 0.6285714285714286, 'Recall': 0.46808510638297873, 'F1 Score': 0.5365853658536586, 'ROC AUC': 0.7968293700458907, 'Confusion Matrix': array([[140,  13],
       [ 25,  22]])}, 'SVM': {'Accuracy': 0.82, 'Precision': 1.0, 'Recall': 0.23404255319148937, 'F1 Score': 0.3793103448275862, 'ROC AUC': 0.8115700180781532, 'Confusion Matrix': array([[153,   0],
       [ 36,  11]])}, 'Decision Tree': {'Accuracy': 0.78, 'Precision': 0.5306122448979592, 'Recall': 0.5531914893617021, 'F1 Score': 0.5416666666666667, 'ROC AUC': 0.7014323459880406, 'Confusion Matrix': array([[130,  23],
       [ 21,  26]])}}
_______________________________________________________________________________
Train data combined with 40.0% synthetic data of minority class:
797 797


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.81, 'Precision': 0.6285714285714286, 'Recall': 0.46808510638297873, 'F1 Score': 0.5365853658536586, 'ROC AUC': 0.7968293700458907, 'Confusion Matrix': array([[140,  13],
       [ 25,  22]])}, 'SVM': {'Accuracy': 0.82, 'Precision': 1.0, 'Recall': 0.23404255319148937, 'F1 Score': 0.3793103448275862, 'ROC AUC': 0.8115700180781532, 'Confusion Matrix': array([[153,   0],
       [ 36,  11]])}, 'Decision Tree': {'Accuracy': 0.78, 'Precision': 0.5306122448979592, 'Recall': 0.5531914893617021, 'F1 Score': 0.5416666666666667, 'ROC AUC': 0.7014323459880406, 'Confusion Matrix': array([[130,  23],
       [ 21,  26]])}}
_______________________________________________________________________________
Train data combined with 60.00000000000001% synthetic data of minority class:
797 797


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.81, 'Precision': 0.6285714285714286, 'Recall': 0.46808510638297873, 'F1 Score': 0.5365853658536586, 'ROC AUC': 0.7968293700458907, 'Confusion Matrix': array([[140,  13],
       [ 25,  22]])}, 'SVM': {'Accuracy': 0.82, 'Precision': 1.0, 'Recall': 0.23404255319148937, 'F1 Score': 0.3793103448275862, 'ROC AUC': 0.8115700180781532, 'Confusion Matrix': array([[153,   0],
       [ 36,  11]])}, 'Decision Tree': {'Accuracy': 0.78, 'Precision': 0.5306122448979592, 'Recall': 0.5531914893617021, 'F1 Score': 0.5416666666666667, 'ROC AUC': 0.7014323459880406, 'Confusion Matrix': array([[130,  23],
       [ 21,  26]])}}
_______________________________________________________________________________
Train data combined with 80.0% synthetic data of minority class:
797 797


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.81, 'Precision': 0.6285714285714286, 'Recall': 0.46808510638297873, 'F1 Score': 0.5365853658536586, 'ROC AUC': 0.7968293700458907, 'Confusion Matrix': array([[140,  13],
       [ 25,  22]])}, 'SVM': {'Accuracy': 0.82, 'Precision': 1.0, 'Recall': 0.23404255319148937, 'F1 Score': 0.3793103448275862, 'ROC AUC': 0.8115700180781532, 'Confusion Matrix': array([[153,   0],
       [ 36,  11]])}, 'Decision Tree': {'Accuracy': 0.78, 'Precision': 0.5306122448979592, 'Recall': 0.5531914893617021, 'F1 Score': 0.5416666666666667, 'ROC AUC': 0.7014323459880406, 'Confusion Matrix': array([[130,  23],
       [ 21,  26]])}}
_______________________________________________________________________________
Train data combined with 100.0% synthetic data of minority class:
797 797


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': {'Accuracy': 0.81, 'Precision': 0.6285714285714286, 'Recall': 0.46808510638297873, 'F1 Score': 0.5365853658536586, 'ROC AUC': 0.7968293700458907, 'Confusion Matrix': array([[140,  13],
       [ 25,  22]])}, 'SVM': {'Accuracy': 0.82, 'Precision': 1.0, 'Recall': 0.23404255319148937, 'F1 Score': 0.3793103448275862, 'ROC AUC': 0.8115700180781532, 'Confusion Matrix': array([[153,   0],
       [ 36,  11]])}, 'Decision Tree': {'Accuracy': 0.78, 'Precision': 0.5306122448979592, 'Recall': 0.5531914893617021, 'F1 Score': 0.5416666666666667, 'ROC AUC': 0.7014323459880406, 'Confusion Matrix': array([[130,  23],
       [ 21,  26]])}}
_______________________________________________________________________________
