##### Script to Test Synthetic Data Generation Accuracy by using RandomForestClassifier predictions as Ground Truth

In [6]:
import dataprofiler as dp
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from synthetic_data.synthetic_data import make_data_from_report
from dataprofiler import Data
from IPython.utils import io


In [7]:
# Iris: take the combined features + target DataFrame from the sklearn loader
iris_bunch = datasets.load_iris(as_frame=True)
iris = iris_bunch.frame

# Wisconsin Breast Cancer: same loader pattern
breast_cancer_bunch = datasets.load_breast_cancer(as_frame=True)
breast_cancer = breast_cancer_bunch.frame
# Z-score every column except the last one (which is later used as the label).
# Without this, the calculated correlation matrix is "ill conditioned" and raises errors.
feature_columns = breast_cancer.columns[:-1]
for feature_name in feature_columns:
    feature_values = breast_cancer[feature_name]
    centered_values = feature_values - feature_values.mean()
    breast_cancer[feature_name] = centered_values / feature_values.std()

In [8]:
# Generate the DataProfiler reports for both datasets.
# The same settings are shared by both profiles: data labeler off,
# correlation on, structured multiprocessing off.
report_settings = {
    "data_labeler.is_enabled": False,
    "correlation.is_enabled": True,
    "structured_options.multiprocess.is_enabled": False,
}
profile_options = dp.ProfilerOptions()
profile_options.set(report_settings)

# Suppress unneeded print function outputs while profiling
with io.capture_output() as captured:
    # Iris is profiled first, then breast cancer
    iris_profile = dp.Profiler(iris, options=profile_options)
    iris_report = iris_profile.report()

    breast_cancer_profile = dp.Profiler(breast_cancer, options=profile_options)
    breast_cancer_report = breast_cancer_profile.report()

INFO:DataProfiler.profilers.profile_builder: Finding the Null values in the columns... 
INFO:DataProfiler.profilers.profile_builder: Calculating the statistics... 
INFO:DataProfiler.profilers.profile_builder: Finding the Null values in the columns... 
INFO:DataProfiler.profilers.profile_builder: Calculating the statistics... 


In [9]:
# Accuracy test on Iris dataset (multiclass classification)

# Every column but the last is a feature; the last column is used as the label
iris_features = iris.iloc[:, :-1]
iris_labels = iris.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(
    iris_features, iris_labels, test_size=0.2, random_state=42
)

# The classifier is fit on the real training split only
clf = RandomForestClassifier(random_state=111)
clf.fit(X_train, y_train)

# Baseline: one-vs-rest ROC AUC on the held-out real rows
real_probabilities = clf.predict_proba(X_test)
data_roc_auc = roc_auc_score(y_test, real_probabilities, multi_class="ovr")
print(f"ROC AUC score of RandomForestClassifier on original Iris dataset: {data_roc_auc}")

# Score the same classifier on data generated from the Iris report;
# the generated frame is split into features/label the same way as the real one
synthetic_data = make_data_from_report(iris_report, seed=111)
X_synthetic = synthetic_data.iloc[:, :-1]
y_synthetic = synthetic_data.iloc[:, -1]
synthetic_probabilities = clf.predict_proba(X_synthetic)
synthetic_data_roc_auc = roc_auc_score(y_synthetic, synthetic_probabilities, multi_class="ovr")
print(f"ROC AUC score of RandomForestClassifier on synthetic Iris dataset: {synthetic_data_roc_auc}")

ROC AUC score of RandomForestClassifier on original Iris dataset: 1.0
ROC AUC score of RandomForestClassifier on synthetic Iris dataset: 0.8744240087640313


In [10]:
# Accuracy test on Wisconsin breast cancer dataset (binary classification)

# Every column but the last is a feature; the last column is used as the label
cancer_features = breast_cancer.iloc[:, :-1]
cancer_labels = breast_cancer.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(
    cancer_features, cancer_labels, test_size=0.2, random_state=42
)

# The classifier is fit on the real training split only
clf = RandomForestClassifier(random_state=111)
clf.fit(X_train, y_train)

# Baseline: ROC AUC on the held-out real rows, scored with the
# probability column at index 1 of predict_proba
real_scores = clf.predict_proba(X_test)[:, 1]
data_roc_auc = roc_auc_score(y_test, real_scores)
print(f"ROC AUC score of RandomForestClassifier on original Breast cancer dataset: {data_roc_auc}")

# Score the same classifier on data generated from the breast cancer report;
# the generated frame is split into features/label the same way as the real one
synthetic_data = make_data_from_report(breast_cancer_report, seed=42)
X_synthetic = synthetic_data.iloc[:, :-1]
y_synthetic = synthetic_data.iloc[:, -1]
synthetic_scores = clf.predict_proba(X_synthetic)[:, 1]
synthetic_data_roc_auc = roc_auc_score(y_synthetic, synthetic_scores)
print(f"ROC AUC score of RandomForestClassifier on synthetic Breast cancer dataset: {synthetic_data_roc_auc}")

ROC AUC score of RandomForestClassifier on original Breast cancer dataset: 0.996069439895185
ROC AUC score of RandomForestClassifier on synthetic Breast cancer dataset: 0.7642101621194148
