<a href="https://colab.research.google.com/github/adipai/statistical-data-pruning-analysis/blob/main/data-synth.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install sdv

Collecting sdv
  Downloading sdv-1.11.0-py3-none-any.whl (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.6/125.6 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting boto3<2,>=1.15.0 (from sdv)
  Downloading boto3-1.34.79-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting botocore<2,>=1.18 (from sdv)
  Downloading botocore-1.34.79-py3-none-any.whl (12.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
Collecting copulas<0.10,>=0.9.0 (from sdv)
  Downloading copulas-0.9.2-py2.py3-none-any.whl (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.6/54.6 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ctgan<0.10,>=0.9.0 (from sdv)
  Downloading ctgan-0.9.1-py3-none-any.whl (24 kB)
Collecting deepecho<0.6,>=0.5 (from sdv)
  Downl

In [1]:
!pip install pmlb

Collecting pmlb
  Downloading pmlb-1.0.1.post3-py3-none-any.whl (19 kB)
Installing collected packages: pmlb
Successfully installed pmlb-1.0.1.post3


In [2]:
!pip install DataSynthesizer

Collecting DataSynthesizer
  Downloading DataSynthesizer-0.1.13-py2.py3-none-any.whl (24 kB)
Installing collected packages: DataSynthesizer
Successfully installed DataSynthesizer-0.1.13


In [3]:
# All imports here
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from pmlb import fetch_data
import matplotlib.pyplot as plt
import seaborn as sns
import random

# from sdv.datasets.local import load_csvs
# from sdv.metadata import SingleTableMetadata
# from sdv.single_table import GaussianCopulaSynthesizer
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix

## Data preprocessing

In [91]:
# Generic data pre-processing

def preprocess_data_train(df):
    df = df.copy()
    # Count missing values before handling missing data
    missing_before = df.isnull().sum().sum()
    print("Number of missing values before handling:", missing_before)

    # Handle missing data
    imputer = SimpleImputer(strategy='mean')  # You can change the strategy as needed
    df[df.select_dtypes(include=['float64', 'int64']).columns] = imputer.fit_transform(df.select_dtypes(include=['float64', 'int64']))

    # Count missing values after handling missing data
    missing_after = df.isnull().sum().sum()
    print("Number of missing values after handling:", missing_after)

    # Normalize numeric columns
    scaler = StandardScaler()
    df[df.select_dtypes(include=['float64', 'int64']).columns] = scaler.fit_transform(df.select_dtypes(include=['float64', 'int64']))

    return df, scaler, imputer

def preprocess_data_test(df, scaler, imputer):
    df = df.copy()
    # Count missing values before handling missing data
    missing_before = df.isnull().sum().sum()
    print("Number of missing values before handling in test_dataset:", missing_before)

    # Handle missing data
    df[df.select_dtypes(include=['float64', 'int64']).columns] = imputer.transform(df.select_dtypes(include=['float64', 'int64']))

    # Count missing values after handling missing data
    missing_after = df.isnull().sum().sum()
    print("Number of missing values after handling in test_dataset:", missing_after)

    # Normalize numeric columns
    df[df.select_dtypes(include=['float64', 'int64']).columns] = scaler.transform(df.select_dtypes(include=['float64', 'int64']))

    return df

## Experiments

### Dataset 1: Breast cancer

In [5]:
breast_cancer = fetch_data('breast_cancer')
breast_cancer.describe()

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,target
count,286.0,286.0,286.0,286.0,286.0,286.0,286.0,286.0,286.0,286.0
mean,2.664336,1.073427,4.062937,1.073427,1.167832,2.048951,0.468531,2.772727,0.237762,0.297203
std,1.011818,0.98668,2.151187,1.935321,0.443052,0.738217,0.499883,1.099006,0.426459,0.457828
min,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,3.0,0.0,1.0,2.0,0.0,2.0,0.0,0.0
50%,3.0,2.0,4.0,0.0,1.0,2.0,0.0,3.0,0.0,0.0
75%,3.0,2.0,5.0,1.0,1.0,3.0,1.0,3.0,0.0,1.0
max,5.0,2.0,10.0,6.0,2.0,3.0,1.0,5.0,1.0,1.0


## Training and testing using ML models

In [93]:
# Generic function to test synthetic data using LR, SVM, DT

def evaluate_models(X_train, X_test, y_train, y_test):

    # Initialize classifiers
    classifiers = {
        "Logistic Regression": LogisticRegression(),
        "SVM": SVC(),
        "Decision Tree": DecisionTreeClassifier()
    }

    # Results dictionary to store evaluation metrics
    results = {}

    # Iterate over classifiers
    for name, clf in classifiers.items():
        print(name)
        # Fit classifier
        clf.fit(X_train, y_train)

        # Predictions
        y_pred = clf.predict(X_test)

        # Evaluation metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        # AUC-ROC
        if hasattr(clf, "predict_proba"):
            y_prob = clf.predict_proba(X_test)[:,1]
        else:
            y_prob = clf.decision_function(X_test)
        fpr, tpr, thresholds = roc_curve(y_test, y_prob)
        roc_auc = auc(fpr, tpr)

        # Confusion matrix
        cm = confusion_matrix(y_test, y_pred)

        # Store results
        results[name] = {
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1,
            "ROC AUC": roc_auc,
            "Confusion Matrix": cm
        }

        # Plot AUC-ROC curve
        plt.figure(figsize=(8, 6))
        plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.2f})')
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'{name} - AUC-ROC Curve')
        plt.legend(loc='lower right')
        plt.savefig(f'{name}_auc_roc_curve.png', dpi=300)
        plt.close()

        # Plot confusion matrix
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.title(f'{name} - Confusion Matrix')
        plt.savefig(f'{name}_confusion_matrix.png', dpi=300)
        plt.close()

    return results


In [115]:
y = breast_cancer['target']
X = breast_cancer.drop('target', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)


In [116]:
column_types = X_train.dtypes
y_column_type = y_train.dtypes

In [117]:
# for i, j in column_types.items():


In [118]:
# df['column_name'].astype(np.int64)

In [119]:
y_train.dtypes

dtype('int64')

In [120]:
train_df = pd.concat([X_train, y_train], axis=1)
class_counts = y_train.value_counts()

minority_class_label = class_counts.idxmin()

majority_df = train_df[train_df.iloc[:, -1] != minority_class_label]
minority_df = train_df[train_df.iloc[:, -1] == minority_class_label]

minority_df.reset_index(inplace=True)
majority_df.reset_index(inplace=True)

majority_count = class_counts.max()
minority_count = class_counts.min()

In [121]:
from DataSynthesizer.DataDescriber import DataDescriber
from DataSynthesizer.DataGenerator import DataGenerator
from DataSynthesizer.lib.utils import display_bayesian_network

def data_synthesizer(df, percentage, diff, seed=42):
  # print(df.head())
  np.random.seed(seed)

  temp_file = 'temp.csv'
  description_file = 'test_description.json'
  synthetic_data_file = 'synthetic_breast_data.csv'

  df.to_csv(temp_file, index=False)

  categorical_attributes = {
    'age': True,
    'menopause': True,
    'tumor-size': True,
    'inv-nodes': True,
    'node-caps': True,
    'deg-malig': True,
    'breast': True,
    'breast-quad': True,
    'irradiat': True,
    'target': True
    }

  epsilon = 0.1
  degree_of_bayesian_network = 2
  num_tuples_to_generate = int(diff * percentage)

  describer = DataDescriber(category_threshold=5)
  # Describe the dataset to create a Bayesian network
  describer.describe_dataset_in_correlated_attribute_mode(dataset_file= temp_file,
                                                        epsilon=epsilon,
                                                        k=degree_of_bayesian_network,
                                                        attribute_to_is_categorical=categorical_attributes
                                                        )
  # Save dataset description to a JSON file
  describer.save_dataset_description_to_file(description_file)
  # Display the Bayesian network
  display_bayesian_network(describer.bayesian_network)

  generator = DataGenerator()
  generator.generate_dataset_in_correlated_attribute_mode(num_tuples_to_generate, description_file)
  # Save synthetic data to a CSV file
  generator.save_synthetic_data(synthetic_data_file)

  df_syn = pd.read_csv(synthetic_data_file)
  return pd.concat([df, df_syn], ignore_index=True)

In [124]:
percentages = [0, 0.2, 0.4, 0.6, 0.8, 1.0]
for percentage in percentages:
  print("---------------------------------------------------------------")
  print("percentage: ", percentage)
  minority_syn = data_synthesizer(minority_df, percentage, majority_count - minority_count)
  syn_df = pd.concat([minority_syn, majority_df], ignore_index=True)
  X_train_1 = syn_df.drop(['target', 'index'], axis=1)
  y_train_1 = syn_df['target']


  for column, dtype in column_types.items():
    X_train_1[column] = X_train_1[column].astype(dtype)

  # for column, dtype in column_types.items():
  y_train_1 = y_train_1.astype(y_column_type)

  # print(type(y_train))
  # print(X_train.dtypes)

  preprocessed_X_train, scaler, imputer = preprocess_data_train(X_train_1)
  preprocessed_X_test = preprocess_data_test(X_test, scaler, imputer)

  X_train_1, y_train_1 = preprocessed_X_train.to_numpy(), y_train_1.to_numpy()
  X_test_1, y_test_1 = preprocessed_X_test.to_numpy(), y_test.to_numpy()

  evaluate_models(X_train_1, X_test_1, y_train_1, y_test_1)
  print("---------------------------------------------------------------")
  # break

---------------------------------------------------------------
percentage:  0
Adding ROOT breast
Adding attribute node-caps
Adding attribute breast-quad
Adding attribute deg-malig
Adding attribute inv-nodes
Adding attribute tumor-size
Adding attribute irradiat
Adding attribute menopause
Adding attribute target
Adding attribute age
Constructed Bayesian network:
    node-caps   has parents ['breast'].
    breast-quad has parents ['node-caps', 'breast'].
    deg-malig   has parents ['node-caps', 'breast'].
    inv-nodes   has parents ['breast-quad', 'breast'].
    tumor-size  has parents ['breast-quad', 'breast'].
    irradiat    has parents ['tumor-size', 'node-caps'].
    menopause   has parents ['breast-quad', 'node-caps'].
    target      has parents ['menopause', 'deg-malig'].
    age         has parents ['target', 'irradiat'].
Number of missing values before handling: 0
Number of missing values after handling: 0
Number of missing values before handling in test_dataset: 0
Number of 

In [69]:
print(type(y_train))
# print(y_test)

<class 'pandas.core.series.Series'>


In [32]:

X_train.dtypes


age            object
menopause      object
tumor-size     object
inv-nodes      object
node-caps      object
deg-malig      object
breast         object
breast-quad    object
irradiat       object
dtype: object

In [33]:
for column, dtype in column_types.items():
  print(column, dtype)
  X_train[column] = X_train[column].astype(dtype)

age object
menopause object
tumor-size object
inv-nodes object
node-caps object
deg-malig object
breast object
breast-quad object
irradiat object


In [14]:
preprocessed_X_train, scaler, imputer = preprocess_data_train(X_train)
preprocessed_X_test = preprocess_data_test(X_test, scaler, imputer)

X_train, y_train = preprocessed_X_train.to_numpy(), y_train.to_numpy()
X_test, y_test = preprocessed_X_test.to_numpy(), y_test.to_numpy()

Number of missing values before handling: 0
Number of missing values after handling: 0
Number of missing values before handling in test_dataset: 0
Number of missing values after handling in test_dataset: 0


In [15]:
# X_train_0, y_train_0 =  prune_data(X_train, y_train, 0)
evaluate_models(X_train, X_test, y_train, y_test)

{'Logistic Regression': {'Accuracy': 0.7068965517241379,
  'Precision': 0.5294117647058824,
  'Recall': 0.5,
  'F1 Score': 0.5142857142857143,
  'ROC AUC': 0.6625,
  'Confusion Matrix': array([[32,  8],
         [ 9,  9]])},
 'SVM': {'Accuracy': 0.7586206896551724,
  'Precision': 0.625,
  'Recall': 0.5555555555555556,
  'F1 Score': 0.5882352941176471,
  'ROC AUC': 0.7833333333333333,
  'Confusion Matrix': array([[34,  6],
         [ 8, 10]])},
 'Decision Tree': {'Accuracy': 0.7241379310344828,
  'Precision': 0.55,
  'Recall': 0.6111111111111112,
  'F1 Score': 0.5789473684210527,
  'ROC AUC': 0.7145833333333332,
  'Confusion Matrix': array([[31,  9],
         [ 7, 11]])}}

In [None]:
X_train_20, y_train_20 =  prune_data(X_train, y_train, 20)
evaluate_models(X_train_20, X_test, y_train_20, y_test)

{'Logistic Regression': {'Accuracy': 0.7931034482758621,
  'Precision': 0.8,
  'Recall': 0.4444444444444444,
  'F1 Score': 0.5714285714285714,
  'ROC AUC': 0.7236111111111111,
  'Confusion Matrix': array([[38,  2],
         [10,  8]])},
 'SVM': {'Accuracy': 0.7586206896551724,
  'Precision': 0.7,
  'Recall': 0.3888888888888889,
  'F1 Score': 0.5,
  'ROC AUC': 0.8027777777777778,
  'Confusion Matrix': array([[37,  3],
         [11,  7]])},
 'Decision Tree': {'Accuracy': 0.7413793103448276,
  'Precision': 0.5789473684210527,
  'Recall': 0.6111111111111112,
  'F1 Score': 0.5945945945945946,
  'ROC AUC': 0.7055555555555556,
  'Confusion Matrix': array([[32,  8],
         [ 7, 11]])}}

In [None]:
X_train_40, y_train_40 =  prune_data(X_train, y_train, 40)
evaluate_models(X_train_40, X_test, y_train_40, y_test)

{'Logistic Regression': {'Accuracy': 0.7758620689655172,
  'Precision': 0.7272727272727273,
  'Recall': 0.4444444444444444,
  'F1 Score': 0.5517241379310345,
  'ROC AUC': 0.7194444444444446,
  'Confusion Matrix': array([[37,  3],
         [10,  8]])},
 'SVM': {'Accuracy': 0.7758620689655172,
  'Precision': 0.7777777777777778,
  'Recall': 0.3888888888888889,
  'F1 Score': 0.5185185185185185,
  'ROC AUC': 0.8097222222222222,
  'Confusion Matrix': array([[38,  2],
         [11,  7]])},
 'Decision Tree': {'Accuracy': 0.7068965517241379,
  'Precision': 0.5263157894736842,
  'Recall': 0.5555555555555556,
  'F1 Score': 0.5405405405405405,
  'ROC AUC': 0.6597222222222222,
  'Confusion Matrix': array([[31,  9],
         [ 8, 10]])}}

In [None]:
X_train_60, y_train_60 =  prune_data(X_train, y_train, 60)
evaluate_models(X_train_60, X_test, y_train_60, y_test)

{'Logistic Regression': {'Accuracy': 0.7758620689655172,
  'Precision': 0.7272727272727273,
  'Recall': 0.4444444444444444,
  'F1 Score': 0.5517241379310345,
  'ROC AUC': 0.673611111111111,
  'Confusion Matrix': array([[37,  3],
         [10,  8]])},
 'SVM': {'Accuracy': 0.7586206896551724,
  'Precision': 0.6666666666666666,
  'Recall': 0.4444444444444444,
  'F1 Score': 0.5333333333333333,
  'ROC AUC': 0.8055555555555556,
  'Confusion Matrix': array([[36,  4],
         [10,  8]])},
 'Decision Tree': {'Accuracy': 0.6724137931034483,
  'Precision': 0.47619047619047616,
  'Recall': 0.5555555555555556,
  'F1 Score': 0.5128205128205129,
  'ROC AUC': 0.6402777777777777,
  'Confusion Matrix': array([[29, 11],
         [ 8, 10]])}}

In [None]:
X_train_80, y_train_80 =  prune_data(X_train, y_train, 80)
evaluate_models(X_train_80, X_test, y_train_80, y_test)

{'Logistic Regression': {'Accuracy': 0.7758620689655172,
  'Precision': 0.6666666666666666,
  'Recall': 0.5555555555555556,
  'F1 Score': 0.606060606060606,
  'ROC AUC': 0.6583333333333333,
  'Confusion Matrix': array([[35,  5],
         [ 8, 10]])},
 'SVM': {'Accuracy': 0.7413793103448276,
  'Precision': 0.6153846153846154,
  'Recall': 0.4444444444444444,
  'F1 Score': 0.5161290322580646,
  'ROC AUC': 0.7944444444444445,
  'Confusion Matrix': array([[35,  5],
         [10,  8]])},
 'Decision Tree': {'Accuracy': 0.7586206896551724,
  'Precision': 0.6,
  'Recall': 0.6666666666666666,
  'F1 Score': 0.631578947368421,
  'ROC AUC': 0.7333333333333333,
  'Confusion Matrix': array([[32,  8],
         [ 6, 12]])}}

In [None]:
X_train_100, y_train_100 =  prune_data(X_train, y_train, 100)
evaluate_models(X_train_100, X_test, y_train_100, y_test)

{'Logistic Regression': {'Accuracy': 0.6724137931034483,
  'Precision': 0.4782608695652174,
  'Recall': 0.6111111111111112,
  'F1 Score': 0.5365853658536586,
  'ROC AUC': 0.6722222222222223,
  'Confusion Matrix': array([[28, 12],
         [ 7, 11]])},
 'SVM': {'Accuracy': 0.7241379310344828,
  'Precision': 0.55,
  'Recall': 0.6111111111111112,
  'F1 Score': 0.5789473684210527,
  'ROC AUC': 0.788888888888889,
  'Confusion Matrix': array([[31,  9],
         [ 7, 11]])},
 'Decision Tree': {'Accuracy': 0.6379310344827587,
  'Precision': 0.4444444444444444,
  'Recall': 0.6666666666666666,
  'F1 Score': 0.5333333333333333,
  'ROC AUC': 0.6416666666666666,
  'Confusion Matrix': array([[25, 15],
         [ 6, 12]])}}

In [8]:
type(breast_cancer)

In [9]:
breast_cancer.head()

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,target
0,2,2,2,0,2,3,1,3,0,1
1,3,0,2,0,1,1,1,1,0,0
2,3,0,6,0,1,2,0,2,0,1
3,2,2,6,0,2,3,1,2,1,0
4,2,2,5,4,2,2,0,5,0,1


In [12]:
breast_cancer.columns

Index(['age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps', 'deg-malig',
       'breast', 'breast-quad', 'irradiat', 'target'],
      dtype='object')

In [21]:
breast_cancer['target'].unique()

array([1, 0])