##### Replicate null values in Synthetic Data Generation based on LDA classification
Datasets may contain null values that are not necessarily missing at random, i.e. values in other columns might influence whether a column value is null. `null_replication.py` replicates null values in the generated synthetic data to mimic the null values described in the original dataset's profile report. Whether a column value should be null is determined by a binary classifier based on Linear Discriminant Analysis (LDA).

In [24]:
import sys

import dataprofiler as dp
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

sys.path.append("..")
from synthetic_data.generator_builder import Generator

In [25]:
# Load the iris dataset as a single DataFrame (`.frame` of the as_frame bunch).
data = datasets.load_iris(as_frame=True).frame

# Inject nulls that follow a pattern rather than occurring at random:
# "petal length (cm)" is blanked on every row whose sepal length AND sepal
# width are both above their respective column means.
sepal_length_mean = data["sepal length (cm)"].mean()
sepal_width_mean = data["sepal width (cm)"].mean()
above_mean_length = data["sepal length (cm)"] > sepal_length_mean
above_mean_width = data["sepal width (cm)"] > sepal_width_mean
data.loc[above_mean_length & above_mean_width, "petal length (cm)"] = None

# Display the frame so the injected nulls are visible in the cell output.
data

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,,2.3,2


In [26]:
# Profiler settings for this demo, applied on top of the library defaults.
# The labeler and multiprocessing are switched off; correlation and the
# null-replication metrics are switched on (the latter is presumably what the
# synthetic-data generator reads to reproduce the null pattern -- confirm
# against the generator implementation).
option_overrides = {
    "data_labeler.is_enabled": False,
    "correlation.is_enabled": True,
    "structured_options.multiprocess.is_enabled": False,
    "null_replication_metrics.is_enabled": True,
}
profile_options = dp.ProfilerOptions()
profile_options.set(option_overrides)

# Profile the original (null-injected) data and show its report.
profile = dp.Profiler(data, options=profile_options)
report = profile.report()
report

INFO:DataProfiler.profilers.profile_builder: Finding the Null values in the columns... 
INFO:DataProfiler.profilers.profile_builder: Processing Column 1/5
INFO:DataProfiler.profilers.profile_builder: Processing Column 2/5
INFO:DataProfiler.profilers.profile_builder: Processing Column 3/5
INFO:DataProfiler.profilers.profile_builder: Processing Column 4/5
INFO:DataProfiler.profilers.profile_builder: Processing Column 5/5
INFO:DataProfiler.profilers.profile_builder: Calculating the statistics... 
INFO:DataProfiler.profilers.profile_builder: Processing Column 1/5
INFO:DataProfiler.profilers.profile_builder: Processing Column 2/5
INFO:DataProfiler.profilers.profile_builder: Processing Column 3/5
INFO:DataProfiler.profilers.profile_builder: Processing Column 4/5
INFO:DataProfiler.profilers.profile_builder: Processing Column 5/5


{'global_stats': {'samples_used': 150,
  'column_count': 5,
  'row_count': 150,
  'row_has_null_ratio': 0.16666666666666666,
  'row_is_null_ratio': 0.0,
  'unique_row_ratio': 0.9933333333333333,
  'duplicate_row_count': 1,
  'file_type': "<class 'pandas.core.frame.DataFrame'>",
  'encoding': None,
  'correlation_matrix': array([[ 1.        , -0.11756978,  0.73330783,  0.81794113,  0.78256123],
         [-0.11756978,  1.        , -0.59646644, -0.36612593, -0.42665756],
         [ 0.73330783, -0.59646644,  1.        ,  0.8305469 ,  0.85399762],
         [ 0.81794113, -0.36612593,  0.8305469 ,  1.        ,  0.95654733],
         [ 0.78256123, -0.42665756,  0.85399762,  0.95654733,  1.        ]]),
  'chi2_matrix': array([[nan, nan, nan, nan, nan],
         [nan,  1., nan,  0.,  0.],
         [nan, nan, nan, nan, nan],
         [nan,  0., nan,  1.,  0.],
         [nan,  0., nan,  0.,  1.]]),
  'profile_schema': defaultdict(list,
              {'sepal length (cm)': [0],
               'sepal

In [27]:
# Build a synthetic-data generator from the profile of the original data.
# NOTE(review): the fixed seed is presumably there to make the generated
# sample repeatable -- confirm against Generator's implementation.
generator = Generator(profile=profile, seed=111)
# Request 200 synthetic rows (the original frame has 150).
synthetic_data = generator.synthesize(num_samples=200)
# Display the generated frame.
synthetic_data

sepal length (cm) True
0 float
sepal width (cm) True
1 float
petal length (cm) True
2 float
petal width (cm) True
3 float
target False
4 int


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,6.0,2.6,5.2,2.26,2.0
1,6.1,3.1,4.4,1.65,1.0
2,6.3,3.3,1.6,0.62,0.0
3,5.6,2.6,3.8,1.22,1.0
4,6.3,3.1,4.4,1.27,1.0
...,...,...,...,...,...
195,5.2,3.3,1.2,0.27,0.0
196,4.4,3.0,1.1,0.18,0.0
197,4.4,2.7,1.1,0.19,0.0
198,6.9,2.1,6.2,2.06,2.0


In [28]:
# Profile the synthetic data with the same options used for the original
# data, so the two reports can be compared side by side.
synthetic_data_profile = dp.Profiler(synthetic_data, options=profile_options)
synthetic_data_report = synthetic_data_profile.report()
# Display the synthetic-data report.
synthetic_data_report

INFO:DataProfiler.profilers.profile_builder: Finding the Null values in the columns... 
INFO:DataProfiler.profilers.profile_builder: Processing Column 1/5
INFO:DataProfiler.profilers.profile_builder: Processing Column 2/5
INFO:DataProfiler.profilers.profile_builder: Processing Column 3/5
INFO:DataProfiler.profilers.profile_builder: Processing Column 4/5
INFO:DataProfiler.profilers.profile_builder: Processing Column 5/5
INFO:DataProfiler.profilers.profile_builder: Calculating the statistics... 
INFO:DataProfiler.profilers.profile_builder: Processing Column 1/5
INFO:DataProfiler.profilers.profile_builder: Processing Column 2/5
INFO:DataProfiler.profilers.profile_builder: Processing Column 3/5
INFO:DataProfiler.profilers.profile_builder: Processing Column 4/5
INFO:DataProfiler.profilers.profile_builder: Processing Column 5/5


{'global_stats': {'samples_used': 200,
  'column_count': 5,
  'row_count': 200,
  'row_has_null_ratio': 0.095,
  'row_is_null_ratio': 0.0,
  'unique_row_ratio': 1.0,
  'duplicate_row_count': 0,
  'file_type': "<class 'pandas.core.frame.DataFrame'>",
  'encoding': None,
  'correlation_matrix': array([[ 1.        , -0.1657682 ,  0.71687498,  0.76525359,  0.66633034],
         [-0.1657682 ,  1.        , -0.55787899, -0.37648254, -0.39968467],
         [ 0.71687498, -0.55787899,  1.        ,  0.74972302,  0.72197156],
         [ 0.76525359, -0.37648254,  0.74972302,  1.        ,  0.89864444],
         [ 0.66633034, -0.39968467,  0.72197156,  0.89864444,  1.        ]]),
  'chi2_matrix': array([[ 1.,  0., nan, nan,  0.],
         [ 0.,  1., nan, nan,  0.],
         [nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan],
         [ 0.,  0., nan, nan,  1.]]),
  'profile_schema': defaultdict(list,
              {'sepal length (cm)': [0],
               'sepal width (cm)': [1],
          

In [29]:
# Check whether the null pattern survived synthesis: train a classifier on the
# ORIGINAL data to predict "is petal length null?" from the remaining columns,
# then score it on both a held-out original split and the synthetic data.
null_column = "petal length (cm)"

X = data.drop(columns=null_column)
y = data[null_column].isnull()
# Stratify on the label: only about 17% of rows are null and the test split
# holds just 15 rows, so an unstratified split can end up with a single class,
# for which ROC AUC is undefined.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

# Seed the forest so the reported scores are reproducible across re-runs.
clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
# Column 1 of predict_proba is the probability of the positive (null) class.
original_roc_auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
print(f"ROC AUC score of RandomForestClassifier on predicting null values in original dataset: {original_roc_auc}")

# Score the same (original-trained) classifier on the synthetic rows; a high
# score means the synthetic nulls follow the same pattern as the original.
X_synthetic = synthetic_data.drop(columns=null_column)
y_synthetic = synthetic_data[null_column].isnull()
synthetic_roc_auc = roc_auc_score(y_synthetic, clf.predict_proba(X_synthetic)[:, 1])
print(f"ROC AUC score of RandomForestClassifier on predicting null values in synthetic dataset: {synthetic_roc_auc}")

ROC AUC score of RandomForestClassifier on predicting null values in original dataset: 1.0
ROC AUC score of RandomForestClassifier on predicting null values in synthetic dataset: 0.9321023553358534
