<a href="https://colab.research.google.com/github/adipai/statistical-data-pruning-analysis/blob/main/data_pruning_sdv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Baselines


### Docs for SDV

https://docs.sdv.dev/sdv

In [75]:
!pip install pmlb



In [76]:
!pip install sdv



In [77]:
# All imports here

from sdv.datasets.local import load_csvs
from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from pmlb import fetch_data
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix

## Data preprocessing

In [78]:
# Generic data pre-processing

def preprocess_data_train(df):

    # Count missing values before handling missing data
    missing_before = df.isnull().sum().sum()
    print("Number of missing values before handling:", missing_before)

    # Handle missing data
    imputer = SimpleImputer(strategy='mean')  # You can change the strategy as needed
    df[df.select_dtypes(include=['float64', 'int64']).columns] = imputer.fit_transform(df.select_dtypes(include=['float64', 'int64']))

    # Count missing values after handling missing data
    missing_after = df.isnull().sum().sum()
    print("Number of missing values after handling:", missing_after)

    # Normalize numeric columns
    scaler = StandardScaler()
    df[df.select_dtypes(include=['float64', 'int64']).columns] = scaler.fit_transform(df.select_dtypes(include=['float64', 'int64']))

    return df, scaler, imputer

def preprocess_data_test(df, scaler, imputer):
    # Count missing values before handling missing data
    missing_before = df.isnull().sum().sum()
    print("Number of missing values before handling in test_dataset:", missing_before)

    # Handle missing data
    df[df.select_dtypes(include=['float64', 'int64']).columns] = imputer.transform(df.select_dtypes(include=['float64', 'int64']))

    # Count missing values after handling missing data
    missing_after = df.isnull().sum().sum()
    print("Number of missing values after handling in test_dataset:", missing_after)

    # Normalize numeric columns
    df[df.select_dtypes(include=['float64', 'int64']).columns] = scaler.transform(df.select_dtypes(include=['float64', 'int64']))

    return df


In [79]:
breast_cancer = fetch_data('breast_cancer')
breast_cancer.describe()

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,target
count,286.0,286.0,286.0,286.0,286.0,286.0,286.0,286.0,286.0,286.0
mean,2.664336,1.073427,4.062937,1.073427,1.167832,2.048951,0.468531,2.772727,0.237762,0.297203
std,1.011818,0.98668,2.151187,1.935321,0.443052,0.738217,0.499883,1.099006,0.426459,0.457828
min,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,3.0,0.0,1.0,2.0,0.0,2.0,0.0,0.0
50%,3.0,2.0,4.0,0.0,1.0,2.0,0.0,3.0,0.0,0.0
75%,3.0,2.0,5.0,1.0,1.0,3.0,1.0,3.0,0.0,1.0
max,5.0,2.0,10.0,6.0,2.0,3.0,1.0,5.0,1.0,1.0


In [80]:
# Generic function to test synthetic data using LR, SVM, DT

def evaluate_models(X_train, X_test, y_train, y_test):

    # Initialize classifiers
    classifiers = {
        "Logistic Regression": LogisticRegression(),
        "SVM": SVC(),
        "Decision Tree": DecisionTreeClassifier()
    }

    # Results dictionary to store evaluation metrics
    results = {}

    # Iterate over classifiers
    for name, clf in classifiers.items():
        # Fit classifier
        clf.fit(X_train, y_train)

        # Predictions
        y_pred = clf.predict(X_test)

        # Evaluation metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        # AUC-ROC
        if hasattr(clf, "predict_proba"):
            y_prob = clf.predict_proba(X_test)[:,1]
        else:
            y_prob = clf.decision_function(X_test)
        fpr, tpr, thresholds = roc_curve(y_test, y_prob)
        roc_auc = auc(fpr, tpr)

        # Confusion matrix
        cm = confusion_matrix(y_test, y_pred)

        # Store results
        results[name] = {
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1,
            "ROC AUC": roc_auc,
            "Confusion Matrix": cm
        }

        # Plot AUC-ROC curve
        plt.figure(figsize=(8, 6))
        plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.2f})')
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'{name} - AUC-ROC Curve')
        plt.legend(loc='lower right')
        plt.savefig(f'{name}_auc_roc_curve.png', dpi=300)
        plt.close()

        # Plot confusion matrix
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.title(f'{name} - Confusion Matrix')
        plt.savefig(f'{name}_confusion_matrix.png', dpi=300)
        plt.close()

    return results

In [81]:

y = breast_cancer['target']
X = breast_cancer.drop('target', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

preprocessed_X_train, scaler, imputer = preprocess_data_train(X_train)
preprocessed_X_test = preprocess_data_test(X_test, scaler, imputer)

X_train, y_train = preprocessed_X_train.to_numpy(), y_train.to_numpy()
X_test, y_test = preprocessed_X_test.to_numpy(), y_test.to_numpy()

Number of missing values before handling: 0
Number of missing values after handling: 0
Number of missing values before handling in test_dataset: 0
Number of missing values after handling in test_dataset: 0


In [82]:
results = evaluate_models(X_train, X_test, y_train, y_test)
print(results)

{'Logistic Regression': {'Accuracy': 0.7931034482758621, 'Precision': 0.8, 'Recall': 0.4444444444444444, 'F1 Score': 0.5714285714285714, 'ROC AUC': 0.7291666666666667, 'Confusion Matrix': array([[38,  2],
       [10,  8]])}, 'SVM': {'Accuracy': 0.7758620689655172, 'Precision': 0.7777777777777778, 'Recall': 0.3888888888888889, 'F1 Score': 0.5185185185185185, 'ROC AUC': 0.7833333333333333, 'Confusion Matrix': array([[38,  2],
       [11,  7]])}, 'Decision Tree': {'Accuracy': 0.6724137931034483, 'Precision': 0.47619047619047616, 'Recall': 0.5555555555555556, 'F1 Score': 0.5128205128205129, 'ROC AUC': 0.6604166666666667, 'Confusion Matrix': array([[29, 11],
       [ 8, 10]])}}


## Gaussian Copula Synthesizer

In [83]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

train_df = pd.concat([X_train, y_train], axis=1)
class_counts = y_train.value_counts()

# Find minority class label
minority_class_label = class_counts.idxmin()

# Filter rows with minority class label
minority_df = train_df[train_df.iloc[:, -1] == minority_class_label]

# Calculate counts of majority and minority classes
majority_count = class_counts.max()
minority_count = class_counts.min()

# Print minority class DataFrame and counts
print("DataFrame consisting of rows of the minority class only:")
print(minority_df)

DataFrame consisting of rows of the minority class only:
     age  menopause  tumor-size  inv-nodes  node-caps  deg-malig  breast  \
90     4          0           3          0          1          1       1   
250    3          0           5          6          2          3       0   
68     3          2          10          6          2          2       1   
165    3          0           5          5          2          2       0   
255    3          2           4          0          1          3       1   
..   ...        ...         ...        ...        ...        ...     ...   
257    4          0           4          0          1          3       0   
99     2          2           6          0          1          1       1   
20     3          1           3          0          0          1       0   
106    4          0           3          4          1          2       0   
270    3          0           4          5          1          3       0   

     breast-quad  irradiat  ta

In [84]:
metadata_data = SingleTableMetadata()
metadata_data.detect_from_dataframe(minority_df)

In [85]:
# Generate synthetic data using GaussianCopulaSynthesizer

synthesizer_breast_data = GaussianCopulaSynthesizer(metadata_data)
synthesizer_breast_data.fit(minority_df)



In [86]:
# Print sample synthetic data
sd1 = synthesizer_breast_data.sample(num_rows=majority_count-minority_count)
sd1

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,target
0,2,2,3,0,2,3,0,5,0,1
1,2,2,3,0,1,2,1,5,0,1
2,3,2,6,0,2,1,1,4,0,1
3,3,0,4,4,1,2,1,5,0,1
4,2,0,6,4,2,3,0,4,1,1
...,...,...,...,...,...,...,...,...,...,...
89,2,0,5,0,1,3,1,3,1,1
90,1,0,3,4,2,3,0,2,0,1
91,3,0,6,0,1,3,1,2,0,1
92,1,2,2,4,2,3,0,1,0,1


In [87]:
# Function to add synthetic data to the main DataFrame based on percentage
def add_synthetic_data(main_df, synthetic_df, percentage):
    # Calculate number of rows to sample
    num_rows = int(len(synthetic_df) * percentage)

    # Sample the specified percentage of synthetic data
    sampled_synthetic_data = synthetic_df.sample(n=num_rows, replace=False)
    # print(sampled_synthetic_data)

    # Concatenate sampled synthetic data with main DataFrame
    combined_df = pd.concat([main_df, sampled_synthetic_data], ignore_index=True)
    # print(combined_df)
    return combined_df

# Define percentages
percentages = [0, 0.2, 0.4, 0.6, 0.8, 1.0]

# Add synthetic data at different percentages to the main DataFrame
for percentage in percentages:
    combined_df = add_synthetic_data(train_df, sd1, percentage)
    y_train = combined_df['target']
    X_train = combined_df.drop('target', axis=1)

    preprocessed_X_train, scaler, imputer = preprocess_data_train(X_train)
    preprocessed_X_test = preprocess_data_test(X_test, scaler, imputer)

    X_train, y_train = preprocessed_X_train.to_numpy(), y_train.to_numpy()

    print(f"Train data combined with {percentage * 100}% synthetic data of minority class:")
    print(len(X_train), len(y_train))
    results = evaluate_models(X_train, X_test, y_train, y_test)
    print(results)
    print("_______________________________________________________________________________")
    # break


Number of missing values before handling: 0
Number of missing values after handling: 0
Number of missing values before handling in test_dataset: 0
Number of missing values after handling in test_dataset: 0
Train data combined with 0% synthetic data of minority class:
228 228




{'Logistic Regression': {'Accuracy': 0.7931034482758621, 'Precision': 0.8, 'Recall': 0.4444444444444444, 'F1 Score': 0.5714285714285714, 'ROC AUC': 0.7291666666666667, 'Confusion Matrix': array([[38,  2],
       [10,  8]])}, 'SVM': {'Accuracy': 0.7758620689655172, 'Precision': 0.7777777777777778, 'Recall': 0.3888888888888889, 'F1 Score': 0.5185185185185185, 'ROC AUC': 0.7833333333333333, 'Confusion Matrix': array([[38,  2],
       [11,  7]])}, 'Decision Tree': {'Accuracy': 0.6724137931034483, 'Precision': 0.4782608695652174, 'Recall': 0.6111111111111112, 'F1 Score': 0.5365853658536586, 'ROC AUC': 0.6749999999999999, 'Confusion Matrix': array([[28, 12],
       [ 7, 11]])}}
_______________________________________________________________________________
Number of missing values before handling: 0
Number of missing values after handling: 0
Number of missing values before handling in test_dataset: 0
Number of missing values after handling in test_dataset: 0
Train data combined with 20.0% sy

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'Logistic Regression': {'Accuracy': 0.6896551724137931, 'Precision': 0.0, 'Recall': 0.0, 'F1 Score': 0.0, 'ROC AUC': 0.7347222222222223, 'Confusion Matrix': array([[40,  0],
       [18,  0]])}, 'SVM': {'Accuracy': 0.6896551724137931, 'Precision': 0.0, 'Recall': 0.0, 'F1 Score': 0.0, 'ROC AUC': 0.4111111111111111, 'Confusion Matrix': array([[40,  0],
       [18,  0]])}, 'Decision Tree': {'Accuracy': 0.3620689655172414, 'Precision': 0.27906976744186046, 'Recall': 0.6666666666666666, 'F1 Score': 0.3934426229508197, 'ROC AUC': 0.4458333333333333, 'Confusion Matrix': array([[ 9, 31],
       [ 6, 12]])}}
_______________________________________________________________________________
Number of missing values before handling: 0
Number of missing values after handling: 0
Number of missing values before handling in test_dataset: 0
Number of missing values after handling in test_dataset: 0
Train data combined with 40.0% synthetic data of minority class:
265 265


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'Logistic Regression': {'Accuracy': 0.6896551724137931, 'Precision': 0.0, 'Recall': 0.0, 'F1 Score': 0.0, 'ROC AUC': 0.6930555555555555, 'Confusion Matrix': array([[40,  0],
       [18,  0]])}, 'SVM': {'Accuracy': 0.6896551724137931, 'Precision': 0.0, 'Recall': 0.0, 'F1 Score': 0.0, 'ROC AUC': 0.40555555555555556, 'Confusion Matrix': array([[40,  0],
       [18,  0]])}, 'Decision Tree': {'Accuracy': 0.6896551724137931, 'Precision': 0.0, 'Recall': 0.0, 'F1 Score': 0.0, 'ROC AUC': 0.5, 'Confusion Matrix': array([[40,  0],
       [18,  0]])}}
_______________________________________________________________________________
Number of missing values before handling: 0
Number of missing values after handling: 0
Number of missing values before handling in test_dataset: 0
Number of missing values after handling in test_dataset: 0
Train data combined with 60.0% synthetic data of minority class:
284 284


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'Logistic Regression': {'Accuracy': 0.6896551724137931, 'Precision': 0.0, 'Recall': 0.0, 'F1 Score': 0.0, 'ROC AUC': 0.7083333333333333, 'Confusion Matrix': array([[40,  0],
       [18,  0]])}, 'SVM': {'Accuracy': 0.3103448275862069, 'Precision': 0.3103448275862069, 'Recall': 1.0, 'F1 Score': 0.4736842105263158, 'ROC AUC': 0.5555555555555556, 'Confusion Matrix': array([[ 0, 40],
       [ 0, 18]])}, 'Decision Tree': {'Accuracy': 0.6896551724137931, 'Precision': 0.0, 'Recall': 0.0, 'F1 Score': 0.0, 'ROC AUC': 0.5, 'Confusion Matrix': array([[40,  0],
       [18,  0]])}}
_______________________________________________________________________________
Number of missing values before handling: 0
Number of missing values after handling: 0
Number of missing values before handling in test_dataset: 0
Number of missing values after handling in test_dataset: 0
Train data combined with 80.0% synthetic data of minority class:
303 303


  _warn_prf(average, modifier, msg_start, len(result))


{'Logistic Regression': {'Accuracy': 0.7413793103448276, 'Precision': 1.0, 'Recall': 0.16666666666666666, 'F1 Score': 0.2857142857142857, 'ROC AUC': 0.6902777777777778, 'Confusion Matrix': array([[40,  0],
       [15,  3]])}, 'SVM': {'Accuracy': 0.3103448275862069, 'Precision': 0.3103448275862069, 'Recall': 1.0, 'F1 Score': 0.4736842105263158, 'ROC AUC': 0.5, 'Confusion Matrix': array([[ 0, 40],
       [ 0, 18]])}, 'Decision Tree': {'Accuracy': 0.6896551724137931, 'Precision': 0.0, 'Recall': 0.0, 'F1 Score': 0.0, 'ROC AUC': 0.5, 'Confusion Matrix': array([[40,  0],
       [18,  0]])}}
_______________________________________________________________________________
Number of missing values before handling: 0
Number of missing values after handling: 0
Number of missing values before handling in test_dataset: 0
Number of missing values after handling in test_dataset: 0
Train data combined with 100.0% synthetic data of minority class:
322 322


  _warn_prf(average, modifier, msg_start, len(result))


{'Logistic Regression': {'Accuracy': 0.6896551724137931, 'Precision': 0.5, 'Recall': 0.16666666666666666, 'F1 Score': 0.25, 'ROC AUC': 0.6875, 'Confusion Matrix': array([[37,  3],
       [15,  3]])}, 'SVM': {'Accuracy': 0.3103448275862069, 'Precision': 0.3103448275862069, 'Recall': 1.0, 'F1 Score': 0.4736842105263158, 'ROC AUC': 0.5, 'Confusion Matrix': array([[ 0, 40],
       [ 0, 18]])}, 'Decision Tree': {'Accuracy': 0.6896551724137931, 'Precision': 0.0, 'Recall': 0.0, 'F1 Score': 0.0, 'ROC AUC': 0.5, 'Confusion Matrix': array([[40,  0],
       [18,  0]])}}
_______________________________________________________________________________
