##### Script to Test Synthetic Data Generation Accuracy by using RandomForestClassifier predictions as Ground Truth

In [1]:
import dataprofiler as dp
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from synthetic_data.synthetic_data import make_data_from_report
from dataprofiler import Data

In [2]:
# Iris dataset as one DataFrame
iris = datasets.load_iris(as_frame=True).frame

# Wisconsin Breast Cancer dataset as one DataFrame
breast_cancer = datasets.load_breast_cancer(as_frame=True).frame

# Z-score every column except the last one. Without this rescaling the
# correlation matrix calculated downstream is "ill conditioned" (raises Errors).
feature_columns = breast_cancer.columns[:-1]
breast_cancer[feature_columns] = breast_cancer[feature_columns].apply(
    lambda values: (values - values.mean()) / values.std()
)

In [3]:
# Dataset under test for the rest of the notebook
# (presumably `breast_cancer` can be swapped in here -- TODO confirm)
data = iris

# Profiler settings, addressed by dotted option keys: the data labeler is
# switched off, correlation is switched on, and structured multiprocessing
# is switched off.
profiler_settings = {
    "data_labeler.is_enabled": False,
    "correlation.is_enabled": True,
    "structured_options.multiprocess.is_enabled": False
}
profile_options = dp.ProfilerOptions()
profile_options.set(profiler_settings)

# Profile the data and build the report that the synthetic-data generator
# consumes in the evaluation cell.
profile = dp.Profiler(data, options=profile_options)
report = profile.report()

# Last expression of the cell: display the report
report

INFO:DataProfiler.profilers.profile_builder: Finding the Null values in the columns... 


  df_series = df_series.loc[true_sample_list]
  df_series = df_series.loc[true_sample_list]
  df_series = df_series.loc[true_sample_list]
  df_series = df_series.loc[true_sample_list]
  df_series = df_series.loc[true_sample_list]
100%|██████████| 5/5 [00:00<00:00, 12.40it/s]

INFO:DataProfiler.profilers.profile_builder: Calculating the statistics... 



100%|██████████| 5/5 [00:00<00:00, 22.47it/s]


{'global_stats': {'samples_used': 150,
  'column_count': 5,
  'row_count': 150,
  'row_has_null_ratio': 0.0,
  'row_is_null_ratio': 0.0,
  'unique_row_ratio': 0.9933333333333333,
  'duplicate_row_count': 1,
  'file_type': "<class 'pandas.core.frame.DataFrame'>",
  'encoding': None,
  'correlation_matrix': array([[ 1.        , -0.11756978,  0.87175378,  0.81794113,  0.78256123],
         [-0.11756978,  1.        , -0.4284401 , -0.36612593, -0.42665756],
         [ 0.87175378, -0.4284401 ,  1.        ,  0.96286543,  0.9490347 ],
         [ 0.81794113, -0.36612593,  0.96286543,  1.        ,  0.95654733],
         [ 0.78256123, -0.42665756,  0.9490347 ,  0.95654733,  1.        ]]),
  'chi2_matrix': array([[nan, nan, nan, nan, nan],
         [nan,  1., nan,  0.,  0.],
         [nan, nan, nan, nan, nan],
         [nan,  0., nan,  1.,  0.],
         [nan,  0., nan,  0.,  1.]]),
  'profile_schema': defaultdict(list,
              {'sepal length (cm)': [0],
               'sepal width (cm)': [1

In [4]:
# Fit the reference classifier on the real data: every column except the last
# is a feature, the last column is the label.
X_train = data.iloc[:, :-1]
y_train = data.iloc[:, -1]
clf = RandomForestClassifier(random_state=111).fit(X_train, y_train)

# Score the classifier on ten freshly generated synthetic datasets and keep
# the per-run accuracy so the runs can be averaged.
# NOTE(review): make_data_from_report is called without a seed, so the averaged
# figure may differ between runs -- confirm whether a seed can be supplied.
accuracies = []
for _ in range(10):
    synthetic_data = make_data_from_report(report)
    X_test = synthetic_data.iloc[:, :-1]
    y_test = synthetic_data.iloc[:, -1]
    predictions = clf.predict(X_test)
    # Fraction of synthetic rows whose label matches the classifier's prediction
    n_correct = np.count_nonzero(predictions == y_test)
    accuracy = n_correct / len(y_test)
    accuracies.append(accuracy)

# Report the mean accuracy over all runs as a percentage
print(f"The model was {round(np.mean(accuracies) * 100, 2)}% accurate on the synthetic data")

The model was 69.93% accurate on the synthetic data
