##### Replicate null values in Synthetic Data Generation based on LDA classification
Datasets may contain null values that may or may not occur at random, i.e., values in other columns might influence whether a given column value is null. `null_replication.py` replicates null values in the generated synthetic data to mimic the null values described in the original dataset's report. Whether a column value should be null is determined by a binary classifier based on Linear Discriminant Analysis.

In [None]:
import dataprofiler as dp
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from synthetic_data.synthetic_data import make_data_from_report

In [None]:
# Load the Iris dataset as a single pandas DataFrame.
data = datasets.load_iris(as_frame=True).frame

# Inject nulls into "petal length (cm)" with a non-random pattern: a row is
# nulled only when both of its sepal measurements exceed their column means,
# so the missingness depends on the values of other columns.
sepal_length_mean = data["sepal length (cm)"].mean()
sepal_width_mean = data["sepal width (cm)"].mean()
long_sepal = data["sepal length (cm)"] > sepal_length_mean
wide_sepal = data["sepal width (cm)"] > sepal_width_mean
data.loc[long_sepal & wide_sepal, "petal length (cm)"] = None

# Bare expression so the notebook displays the modified frame.
data

In [None]:
# Profiler settings for this demo: the data labeler and multiprocessing are
# switched off, while correlation and null replication metrics are switched on.
profiler_settings = {
    "data_labeler.is_enabled": False,
    "correlation.is_enabled": True,
    "structured_options.multiprocess.is_enabled": False,
    "null_replication_metrics.is_enabled": True,
}
profile_options = dp.ProfilerOptions()
profile_options.set(profiler_settings)

# Profile the original data and keep its report for synthetic data generation.
profile = dp.Profiler(data, options=profile_options)
report = profile.report()

# Bare expression so the notebook displays the report.
report

In [None]:
# Generate a synthetic dataset from the profile report of the original data.
synthetic_data = make_data_from_report(report)
# Bare expression so the notebook displays the generated data.
synthetic_data

In [None]:
# Profile the synthetic data with the same options used for the original data
# so that the two reports can be compared.
synthetic_data_profile = dp.Profiler(synthetic_data, options=profile_options)
synthetic_data_report = synthetic_data_profile.report()
# Bare expression so the notebook displays the synthetic data's report.
synthetic_data_report

In [None]:
# Check how well the null pattern carried over to the synthetic data: train a
# classifier on the original data to predict whether "petal length (cm)" is
# null from the remaining columns, then score it on both datasets.
X = data.drop(columns="petal length (cm)")
y = data['petal length (cm)'].isnull()

# Stratify on the null indicator so the small (10%) test split is guaranteed
# to contain both null and non-null rows; roc_auc_score raises a ValueError
# when only one class is present in y_true.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

# Seed the forest as well as the split so the printed scores are reproducible.
clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
original_roc_auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
print(f"ROC AUC score of RandomForestClassifier on predicting null values in original dataset: {original_roc_auc}")

# Select the synthetic feature columns in the same order as the training
# frame: scikit-learn validates that feature names (including their order)
# at predict time match those seen during fit.
X_synthetic = synthetic_data.drop(columns="petal length (cm)")[X.columns]
y_synthetic = synthetic_data['petal length (cm)'].isnull()
synthetic_roc_auc = roc_auc_score(y_synthetic, clf.predict_proba(X_synthetic)[:, 1])
print(f"ROC AUC score of RandomForestClassifier on predicting null values in synthetic dataset: {synthetic_roc_auc}")