<a href="https://colab.research.google.com/github/adipai/statistical-data-pruning-analysis/blob/main/rrp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install sdv

Collecting sdv
  Downloading sdv-1.11.0-py3-none-any.whl (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.6/125.6 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting boto3<2,>=1.15.0 (from sdv)
  Downloading boto3-1.34.79-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting botocore<2,>=1.18 (from sdv)
  Downloading botocore-1.34.79-py3-none-any.whl (12.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
Collecting copulas<0.10,>=0.9.0 (from sdv)
  Downloading copulas-0.9.2-py2.py3-none-any.whl (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.6/54.6 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ctgan<0.10,>=0.9.0 (from sdv)
  Downloading ctgan-0.9.1-py3-none-any.whl (24 kB)
Collecting deepecho<0.6,>=0.5 (from sdv)
  Downl

In [2]:
!pip install pmlb

Collecting pmlb
  Downloading pmlb-1.0.1.post3-py3-none-any.whl (19 kB)
Installing collected packages: pmlb
Successfully installed pmlb-1.0.1.post3


In [10]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from pmlb import fetch_data
import matplotlib.pyplot as plt
import seaborn as sns
import random
import time

from sdv.datasets.local import load_csvs
from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SVMSMOTE
from imblearn.over_sampling import RandomOverSampler
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix
import csv
import os
import sys
import time
import math
from scipy.io import arff


In [4]:
def preprocess_data_train(X_train):
    """Fit mean imputation and standard scaling on the training features.

    Parameters
    ----------
    X_train : array-like
        Training features; missing entries are expected to be NaN.

    Returns
    -------
    X_train : array
        The imputed, standardized training features.
    scaler : StandardScaler
        Scaler fitted on the imputed training features.
    imputer : SimpleImputer
        Mean imputer fitted on the raw training features.

    Both fitted objects are returned so the test split can be transformed
    with training statistics only.
    """
    # Replace missing values with the per-column training mean.
    imputer = SimpleImputer(strategy='mean')
    X_imputed = imputer.fit_transform(X_train)

    # Standardize using statistics learned from the imputed training data.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_imputed)

    return X_scaled, scaler, imputer

def preprocess_data_test(X_test, scaler, imputer):
    """Apply already-fitted imputation and scaling to held-out features.

    Parameters
    ----------
    X_test : array-like
        Features to transform.
    scaler : object with ``transform``
        Fitted scaler (e.g. the one returned by ``preprocess_data_train``).
    imputer : object with ``transform``
        Fitted imputer (e.g. the one returned by ``preprocess_data_train``).

    Returns
    -------
    The result of ``scaler.transform(imputer.transform(X_test))``.
    """
    # Only ``transform`` is used here: nothing is re-fitted on the test data.
    X_imputed = imputer.transform(X_test)
    return scaler.transform(X_imputed)

In [5]:
# Fetch the PMLB 'breast_cancer' dataset; later cells read its 'target' column as the label.
breast_cancer = fetch_data('breast_cancer')
# Last expression of the cell: renders per-column summary statistics.
breast_cancer.describe()

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,target
count,286.0,286.0,286.0,286.0,286.0,286.0,286.0,286.0,286.0,286.0
mean,2.664336,1.073427,4.062937,1.073427,1.167832,2.048951,0.468531,2.772727,0.237762,0.297203
std,1.011818,0.98668,2.151187,1.935321,0.443052,0.738217,0.499883,1.099006,0.426459,0.457828
min,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,3.0,0.0,1.0,2.0,0.0,2.0,0.0,0.0
50%,3.0,2.0,4.0,0.0,1.0,2.0,0.0,3.0,0.0,0.0
75%,3.0,2.0,5.0,1.0,1.0,3.0,1.0,3.0,0.0,1.0
max,5.0,2.0,10.0,6.0,2.0,3.0,1.0,5.0,1.0,1.0


In [6]:
def evaluate_models(X_train, X_test, y_train, y_test):
    """Fit three default-configured classifiers and score each on the test split.

    The models are logistic regression, an SVM and a decision tree. For every
    model two PNG files are written to the current working directory:
    ``<name>_auc_roc_curve.png`` and ``<name>_confusion_matrix.png``.

    Parameters
    ----------
    X_train, y_train : training features and labels used to fit each model.
    X_test, y_test : held-out features and labels used for scoring.

    Returns
    -------
    dict
        Maps the model name to a dict with the keys "Accuracy", "Precision",
        "Recall", "F1 Score", "ROC AUC" and "Confusion Matrix".
    """

    def label_save_close(x_label, y_label, title, path):
        # Annotate the current pyplot figure, write it to disk and release it.
        plt.xlabel(x_label)
        plt.ylabel(y_label)
        plt.title(title)
        plt.savefig(path, dpi=300)
        plt.close()

    models = {
        "Logistic Regression": LogisticRegression(),
        "SVM": SVC(),
        "Decision Tree": DecisionTreeClassifier()
    }

    results = {}

    for name, clf in models.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # Threshold-based metrics computed from the hard predictions.
        metrics = {
            "Accuracy": accuracy_score(y_test, y_pred),
            "Precision": precision_score(y_test, y_pred),
            "Recall": recall_score(y_test, y_pred),
            "F1 Score": f1_score(y_test, y_pred),
        }

        # ROC needs a continuous score: use the second probability column when
        # the model exposes predict_proba, otherwise its decision_function.
        if hasattr(clf, "predict_proba"):
            y_score = clf.predict_proba(X_test)[:, 1]
        else:
            y_score = clf.decision_function(X_test)
        fpr, tpr, thresholds = roc_curve(y_test, y_score)
        roc_auc = auc(fpr, tpr)

        cm = confusion_matrix(y_test, y_pred)

        metrics["ROC AUC"] = roc_auc
        metrics["Confusion Matrix"] = cm
        results[name] = metrics

        # ROC curve with the chance diagonal for reference.
        plt.figure(figsize=(8, 6))
        plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.2f})')
        plt.plot([0, 1], [0, 1], 'k--')
        plt.legend(loc='lower right')
        label_save_close('False Positive Rate', 'True Positive Rate',
                         f'{name} - AUC-ROC Curve', f'{name}_auc_roc_curve.png')

        # Confusion matrix heatmap with integer cell annotations.
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        label_save_close('Predicted', 'True',
                         f'{name} - Confusion Matrix', f'{name}_confusion_matrix.png')

    return results

In [7]:
# Separate the label column from the feature columns.
y = breast_cancer['target']
X = breast_cancer.drop('target', axis=1)

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [47]:
def cluster(candidates, enough, res):
    """Recursively bisect ``candidates`` into small leaf groups.

    Parameters
    ----------
    candidates : sequence of rows
        Rows to partition; each group is halved by ``split``.
    enough : int
        A group with fewer than ``enough`` rows is not split any further.
    res : list
        Accumulator that leaf groups are appended to (mutated in place).

    Returns
    -------
    list
        ``res``, with one entry appended per leaf group.
    """
    # A group of 0 or 1 rows cannot be bisected: splitting a single row yields
    # an empty half plus the same row again, so without this guard any
    # ``enough <= 1`` would recurse until the interpreter's recursion limit.
    if len(candidates) < enough or len(candidates) <= 1:
        res.append(candidates)
        return res

    # The pivots returned by split are not needed here, only the two halves.
    _, _, east_items, west_items = split(candidates)
    res = cluster(east_items, enough, res)
    res = cluster(west_items, enough, res)

    return res


def split(candidates):
    """Bisect ``candidates`` along the line between two far-apart pivot rows.

    A random row is picked, ``east_pivot`` is the row farthest from it and
    ``west_pivot`` the row farthest from ``east_pivot``. Every row is then
    projected onto the east-west line with the cosine rule and the rows are
    cut at the median of that projection.

    Returns
    -------
    tuple
        ``(east_pivot, west_pivot, east_items, west_items)`` where
        ``east_items`` is the half whose projections lie nearest
        ``east_pivot``. When the length is odd, ``west_items`` gets the
        extra row.
    """
    pivot = random.choice(candidates)
    east_pivot = find_farest(pivot, candidates)
    west_pivot = find_farest(east_pivot, candidates)
    c = cal_distance(east_pivot, west_pivot)
    half = len(candidates) // 2

    if c == 0:
        # Both pivots coincide, so no projection axis exists: cut by position.
        return east_pivot, west_pivot, candidates[:half], candidates[half:]

    all_distance = []
    for candidate in candidates:
        # Cosine rule: distance of the candidate's projection from east_pivot.
        # Measuring from the east pivot keeps the "east" half actually east.
        a = cal_distance(candidate, east_pivot)
        b = cal_distance(candidate, west_pivot)
        d = (a ** 2 + c ** 2 - b ** 2) / (2 * c)
        all_distance.append((d, candidate))

    # Sort on the projection only; rows themselves are never compared.
    all_distance.sort(key=lambda pair: pair[0])
    sorted_candidates = [candidate for _, candidate in all_distance]

    return east_pivot, west_pivot, sorted_candidates[:half], sorted_candidates[half:]


def find_farest(pivot, candidates):
    """Return the candidate with the largest ``cal_distance`` to ``pivot``.

    Ties keep the earliest candidate. If no candidate has a distance
    greater than zero, ``pivot`` itself is returned.
    """
    distances = [cal_distance(pivot, point) for point in candidates]

    farthest, record = pivot, 0
    for point, dist in zip(candidates, distances):
        # Strict comparison: only a strictly larger distance replaces the winner.
        if dist > record:
            farthest, record = point, dist

    return farthest


def cal_distance(p1, p2):
    """Euclidean distance between two rows, ignoring each row's last entry.

    The last entry is skipped via ``[:-1]``; elsewhere in this notebook that
    position holds the class label.
    """
    squared_total = 0
    for left, right in zip(p1[:-1], p2[:-1]):
        squared_total += (left - right) ** 2
    return math.sqrt(squared_total)


def process_mixed_cluster(cluster):
    """
    Generate synthetic positive rows with a DE "current-to-best" mutation:
    v_i = x_i + F * (x_b - x_i) + F_min * (x_r1 - x_r2)

    Each row's last entry is its class label (1 = positive, 0 = negative).
    x_b is a positive row and x_i the row being moved towards it. The F_min
    term is only applied when at least one support row is not labelled 0, and
    it is oriented so that a positive x_r1 is the minuend.

    Returns a list of numpy arrays, each a mutated feature vector with the
    label 1 appended. A cluster without positive rows yields an empty list.
    """
    F, F_MIN, CR = 0.8, 0.1, 1.0

    def mutate(current, best, forced_dim, plus=None, minus=None):
        # Build one synthetic row. One uniform draw is taken per feature; a
        # feature is mutated when the draw is below CR or it is forced_dim.
        features = []
        for dim in range(len(current) - 1):
            draw = np.random.uniform(low=0.0, high=1.0, size=1)[0]
            if draw < CR or dim == forced_dim:
                value = current[dim] + F * (best[dim] - current[dim])
                if plus is not None:
                    value = value + F_MIN * (plus[dim] - minus[dim])
            else:
                value = current[dim]
            features.append(value)
        features.append(1)
        return np.array(features)

    positives = [(pos, row) for pos, row in enumerate(cluster) if row[-1] == 1]
    negatives = [(pos, row) for pos, row in enumerate(cluster) if row[-1] == 0]

    if len(positives) == 1:
        # Exactly one positive row: pull every negative row towards it.
        best = positives[0][1]
        forced_dim = random.choice(range(len(best) - 1))
        return [mutate(row, best, forced_dim) for _, row in negatives]

    # Zero or several positive rows: each positive row acts as x_b in turn and
    # every other row of the cluster is moved towards it, with two randomly
    # drawn support rows taken from the remaining rows.
    generated = []
    for best_idx, best in positives:
        forced_dim = random.choice(range(len(best) - 1))
        for cur_idx, current in enumerate(cluster):
            if cur_idx == best_idx:
                continue

            pool = [row for pos, row in enumerate(cluster)
                    if pos != cur_idx and pos != best_idx]
            first, second = random.sample(pool, 2)

            if first[-1] == 0 and second[-1] == 0:
                # Both support rows are negative: no difference term.
                plus = minus = None
            elif first[-1] == 1:
                plus, minus = first, second
            else:
                plus, minus = second, first

            generated.append(mutate(current, best, forced_dim, plus, minus))

    return generated


def process_positive_cluster(cluster):
    """
    Generate synthetic positive rows with a DE "best" mutation:
    v_i = x_b + F * (x_r1 - x_r2)

    Only rows whose last entry equals 1 are used. Every unordered triple of
    those rows produces exactly one synthetic row; which member of the triple
    plays x_b, x_r1 and x_r2 is decided by a random shuffle.

    Returns a list of numpy arrays, each a mutated feature vector with the
    label 1 appended. Fewer than three positive rows yield an empty list.
    """
    F, CR = 0.8, 1.0
    positives = [row for row in cluster if row[-1] == 1]
    n_pos = len(positives)

    # All index triples a < b < c, in lexicographic order.
    triples = ((a, b, c)
               for a in range(n_pos - 2)
               for b in range(a + 1, n_pos - 1)
               for c in range(b + 1, n_pos))

    generated = []
    for triple in triples:
        base_idx, first_idx, second_idx = random.sample(list(triple), 3)
        base = positives[base_idx]
        first = positives[first_idx]
        second = positives[second_idx]

        n_features = len(base) - 1
        forced_dim = random.choice(range(n_features))

        mutated = []
        for dim in range(n_features):
            draw = np.random.uniform(low=0.0, high=1.0, size=1)[0]
            if draw < CR or dim == forced_dim:
                value = base[dim] + F * (first[dim] - second[dim])
            else:
                value = base[dim]
            mutated.append(value)

        mutated.append(1)
        generated.append(np.array(mutated))

    return generated


def process_mixed_cluster_extra(cluster):
    """
    Extra DE pass: "current-to-best" mutation with two difference vectors
    v_i = x_i + F * (x_b - x_i) + F_xc * (x_r1 - x_r2) + F_xc * (x_r3 - x_r4)

    Every row whose last entry equals 1 acts as x_b in turn, and every row
    that differs from it by value is moved towards it 20 times, each time with
    four freshly sampled support rows. Support rows are the rows of the
    cluster that differ by value from both x_i and x_b.

    (x_b, x_i) pairs with fewer than four support rows are skipped, because
    four distinct rows cannot be drawn from them.

    Returns a list of plain Python lists, each a mutated feature vector with
    the label 1 appended.
    """
    DE_params = {"F": 0.8, "CR": 1.0, "F_xc": 0.1}
    n_rounds = 20    # synthetic rows produced per (x_b, x_i) pair
    n_support = 4    # support rows needed per mutation: x_r1 .. x_r4
    pos_point = [item for item in cluster if item[-1] == 1]

    candidate_l = []
    for xb in pos_point:
        R = random.choice(range(len(xb)-1))

        for xi in cluster:
            if np.array_equal(xb, xi):
                continue

            available_points = [p for p in cluster
                                if not np.array_equal(p, xi) and not np.array_equal(p, xb)]

            # random.sample raises ValueError when asked for more rows than
            # exist, which happens for small clusters or many duplicate rows.
            if len(available_points) < n_support:
                continue

            for _ in range(n_rounds):
                [xr1, xr2, xr3, xr4] = random.sample(available_points, n_support)

                new_candidate = []

                for i in range(len(xi)-1):
                    ri = np.random.uniform(low=0.0, high=1.0, size=1)[0]

                    # Mutate the feature when the draw is below CR, or always for index R.
                    if ri < DE_params["CR"] or i == R:
                        new_candidate.append(xi[i] + DE_params["F"] * (xb[i] - xi[i]) + DE_params["F_xc"] * (xr1[i] - xr2[i]) + DE_params["F_xc"] * (xr3[i] - xr4[i]))
                    else:
                        new_candidate.append(xi[i])

                new_candidate.append(1)
                candidate_l.append(new_candidate)

    return candidate_l


def RandomProjectionOversampling(X_train, y_train, threshold):
    """Generate synthetic positive rows via recursive bisection plus DE mutation.

    The training rows are clustered with ``cluster``. Clusters where positive
    rows are the majority are expanded with ``process_positive_cluster``, all
    others with ``process_mixed_cluster``. If that is still short of the class
    deficit, ``process_mixed_cluster_extra`` supplies the remainder.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features. Not modified.
    y_train : pandas.Series
        Training labels (0 = negative, 1 = positive); its ``name`` becomes the
        label column name.
    threshold : int
        Passed to ``cluster`` as the size below which a group is not split.

    Returns
    -------
    rt : float
        Seconds spent on clustering and the first generation pass (the extra
        pass is not timed); 0.0 when nothing had to be generated.
    new_data_df : pandas.DataFrame
        Synthetic rows with the same columns as ``train_df``. Holds at most
        ``count(0) - count(1)`` rows, and fewer if the generators cannot
        produce enough.
    train_df : pandas.DataFrame
        Features and label side by side with a fresh 0..n-1 index.
    """
    tar = y_train.name
    # Build the working frame from a fresh concat so the caller's X_train is
    # never given an extra label column or a reset index.
    train_df = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)
    col_names = train_df.columns

    # A class that is absent counts as 0 rows instead of raising KeyError.
    class_counts = train_df[tar].value_counts()
    n_data_to_generate = int(class_counts.get(0, 0) - class_counts.get(1, 0))

    if n_data_to_generate <= 0:
        # Already balanced (or positives dominate): nothing to synthesize.
        empty_df = pd.DataFrame(np.empty((0, len(col_names))), columns=col_names)
        return 0.0, empty_df, train_df

    start_time = time.time()
    res = cluster(train_df.to_numpy(), threshold, [])

    new_data_negative_cluster = []
    new_data_positive_cluster = []
    for c in res:
        # The label is the last entry of each row; majority-positive clusters
        # use the "best" mutation, everything else the "current-to-best" one.
        if sum([item[-1] for item in c]) > len(c)//2:
            new_data_positive_cluster += process_positive_cluster(c)
        else:
            new_data_negative_cluster += process_mixed_cluster(c)

    rt = time.time() - start_time

    n_positive = len(new_data_positive_cluster)
    n_negative = len(new_data_negative_cluster)

    if n_positive > n_data_to_generate:
        # Positive clusters alone overshoot the deficit; sampling the mixed
        # pool with a negative size would raise ValueError.
        new_data = random.sample(new_data_positive_cluster, n_data_to_generate)
    elif n_negative >= n_data_to_generate - n_positive:
        new_data = new_data_positive_cluster + random.sample(new_data_negative_cluster,
                                                            n_data_to_generate - n_positive)
    else:
        extra_data = []
        for c in res:
            extra_data += process_mixed_cluster_extra(c)

        rest_data_to_generate = n_data_to_generate - n_positive - n_negative
        # Never ask for more extra rows than were actually produced.
        n_extra = min(rest_data_to_generate, len(extra_data))
        new_data = new_data_negative_cluster + new_data_positive_cluster + random.sample(extra_data, n_extra)

    # reshape keeps the column count right even when no row was generated.
    new_data_df = pd.DataFrame(np.array(new_data).reshape(-1, len(col_names)), columns=col_names)

    return rt, new_data_df, train_df

def add_synthetic_data(main_df, synthetic_df, percentage, seed=42):
    """Append a random share of the synthetic rows to ``main_df``.

    ``int(len(synthetic_df) * percentage)`` rows are drawn from
    ``synthetic_df`` without replacement, using ``seed`` as the sampler's
    random state, and stacked below ``main_df`` with a fresh 0..n-1 index.

    Side effect: NumPy's global random generator is re-seeded with ``seed``.

    Returns the combined DataFrame; neither input frame is modified.
    """
    sample_size = int(len(synthetic_df) * percentage)

    np.random.seed(seed)
    chosen_rows = synthetic_df.sample(n=sample_size, replace=False, random_state=seed)

    return pd.concat([main_df, chosen_rows], ignore_index=True)


In [48]:
# The oversampler draws from both Python's `random` module and NumPy's global
# generator; seed both so the synthetic rows are the same on every re-run.
random.seed(42)
np.random.seed(42)

# Work on copies so the original train split stays exactly as produced by
# train_test_split, whatever the oversampler does with its inputs.
X_train_copy, y_train_copy = X_train.copy(), y_train.copy()

rt, new_data_df, train_df = RandomProjectionOversampling(X_train=X_train_copy,
                                                         y_train=y_train_copy,
                                                         threshold=10)

In [50]:
# Append 20% of the generated rows (sampled with seed 42) to the original training frame.
combined_df = add_synthetic_data(train_df, new_data_df, 0.2, seed=42)

In [51]:
# Show the augmented training frame: original rows first, sampled synthetic rows at the end.
combined_df

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,target
0,3.0,2.0,5.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
1,3.0,0.0,3.0,4.0,2.0,2.0,1.0,3.0,0.0,0.0
2,2.0,2.0,5.0,0.0,1.0,1.0,1.0,3.0,0.0,0.0
3,3.0,0.0,2.0,0.0,2.0,2.0,0.0,1.0,1.0,0.0
4,1.0,2.0,4.0,0.0,1.0,2.0,1.0,2.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
241,2.8,2.0,2.0,0.4,1.2,2.2,0.0,2.0,0.0,1.0
242,4.0,0.0,3.4,3.2,1.2,2.0,0.0,1.0,0.2,1.0
243,4.0,0.0,7.8,4.8,2.0,3.8,1.0,2.8,-0.8,1.0
244,1.0,2.0,5.0,4.0,1.8,3.0,0.2,4.6,1.0,1.0
