
SyntheticDataOptions and metrics calculations for NaN replication #571

Merged: 8 commits, Aug 3, 2022

Conversation

@Ta7ar (Contributor) commented Jul 29, 2022

Added SyntheticDataOptions, which can be enabled via profile_options.set({"synthetic_data.is_enabled": True}). This enables calculation of the metrics specifically needed by capitalone/synthetic-data.

Added calculation of the metrics capitalone/synthetic-data needs to replicate NaN values in the generated dataset.

Comment on lines 1754 to 1756
report["synthetic_data"] = {
"nan_replication": self._nan_replication_metrics
}
Contributor:

would put this in report["data_stats"][<column_index>]["nan_replication_metrics"]

@@ -27,7 +27,7 @@ def test_default_profiler_options(self, *mocks):
self.assertTrue(profile.options.data_labeler.is_enabled)
for column in profile.options.properties:
# TODO: remove the check for correlation option once it's updated to True
if column == "correlation":
if column == "correlation" or column == "synthetic_data":
Contributor:

change option name ... nan_replication_metrics

data = pd.DataFrame(clean_samples).apply(pd.to_numeric, errors="coerce")

for col_id in range(len(self._profile)):
null_count = getattr(self._profile[col_id], "null_count")
Contributor:

add something like compiler = self._profile[col_id] so you don't have to keep repeating the self._profile[col_id] lookup


# Calculate class priors
# i.e probability that a value in target col is or is not NaN
sample_size = getattr(self._profile[col_id], "sample_size")
Contributor:

see 2164 comment
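For context on the snippet above, the class priors reduce to simple ratios of null count to sample size. A minimal sketch with made-up counts (the [not-null, null] ordering follows the array-index-as-class-label convention noted later in this thread):

```python
# Hypothetical sketch of the class-prior calculation: the probability that
# a value in the target column is, or is not, null. Counts are made up.
null_count, sample_size = 1, 4

prior_nan = null_count / sample_size  # P(value is null)     = 0.25
prior_not_nan = 1 - prior_nan         # P(value is not null) = 0.75

# Array index serves as class label: 0 -> not null, 1 -> null
class_prior = [prior_not_nan, prior_nan]
```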

prev_mean_nan, mean_nan, prev_null_count, added_null_count
)

col_name = getattr(self._profile[col_id], "name")
Contributor:

see 2164 comment


merged_properties = {}
for col_id in range(len(self._profile)):
self_null_count = getattr(self._profile[col_id], "null_count")
Contributor:

see 2164 comment

continue

merged_properties[col_id] = {}
self_sample_size = getattr(self._profile[col_id], "sample_size")
Contributor:

see 2164 comment


merged_properties[col_id]["class_prior"] = [prior_not_nan, prior_nan]

col_name = getattr(self._profile[col_id], "name")
Contributor:

see 2164 comment

Comment on lines 2139 to 2140
null_count = getattr(self._profile[col_id], "null_count", 0)
sample_size = getattr(self._profile[col_id], "sample_size", 0)
Contributor:

see 2164 comment


merged_properties[col_id] = {}
self_sample_size = getattr(self._profile[col_id], "sample_size")
other_sample_size = getattr(other._profile[col_id], "sample_size")
Contributor:

see 2164 comment

merged_properties[col_id]["class_mean"] = other_mean

elif col_id not in other._nan_replication_metrics:
self_mean = self._nan_replication_metrics[col_id]["class_mean"]
Contributor:

self._nan_replication_metrics[col_id]["class_mean"] is repeated...

assign class_mean = self._nan_replication_metrics[col_id]["class_mean"] once, then just use class_mean in the rest of the function

other_true_count = other_sample_size - other_null_count

self_mean = self._nan_replication_metrics[col_id]["class_mean"]
other_mean = other._nan_replication_metrics[col_id]["class_mean"]
Contributor:

see comment on 2270 ... same comment for other_mean

Comment on lines 935 to 962
class SyntheticDataOptions(BaseInspectorOptions):
"""For configuring options for synthetic data generation"""

def __init__(self):
"""
Initialize options for synthetic data generation
:ivar is_enabled: boolean option to enable/disable synthetic data generation options
:vartype is_enabled: BooleanOption
:ivar replicate_nan: boolean option to enable/disable calculating metrics needed for
replicating nan values in synthetic data generator
:vartype replicate_nan: BooleanOption
"""

self.replicate_nan = BooleanOption(is_enabled=True)
super().__init__(is_enabled=False)

def _validate_helper(self, variable_path="SyntheticDataOptions"):
"""
Validate the options do not conflict and cause errors.

:param variable_path: current path to variable set.
:type variable_path: str
:return: list of errors (if raise_error is false)
:rtype: list(str)
"""
return super()._validate_helper(variable_path)


Contributor:

as discussed, remove and put nan_replication_metrics as self.nan_replication_metrics in the class StructuredOptions(BaseOption) in profiler_options.py

@@ -1170,6 +1200,7 @@ def __init__(self, null_values=None):
self.data_labeler = DataLabelerOptions()
self.correlation = CorrelationOptions()
self.chi2_homogeneity = BooleanOption(is_enabled=True)
self.synthetic_data = SyntheticDataOptions()
Contributor:

update to StructuredOptions I think

@Ta7ar (Contributor Author) Jul 29, 2022:

This option is an attribute of the StructuredOptions class (the highlighted code is in the __init__ of StructuredOptions), so it wouldn't really make sense to use StructuredOptions here. I was thinking of a simple BooleanOption.

:return: Combined mean vector
:rtype: np.ndarray
"""
return (mean1 * n1 + mean2 * n2) / (n1 + n2)
Contributor:

why not return (mean1 + mean2) / 2?

Contributor Author:

That equation would only work if n1 == n2. If the sample sizes the means were calculated from differ, we need the weighted form (mean1 * n1 + mean2 * n2) / (n1 + n2).
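The point above can be checked numerically. A minimal sketch, assuming a standalone combine_means with the signature discussed (not the library's exact implementation):

```python
# Hypothetical sketch: combining two means computed over samples of
# different sizes. Names mirror the discussion above.
def combine_means(mean1, n1, mean2, n2):
    # Weight each mean by its sample size, then renormalize.
    return (mean1 * n1 + mean2 * n2) / (n1 + n2)

# mean of [1, 2, 3, 4] is 2.5 over n1 = 4; mean of [10, 20] is 15.0 over n2 = 2
combined = combine_means(2.5, 4, 15.0, 2)

# True mean of all six values is (1+2+3+4+10+20) / 6 = 40 / 6
assert combined == 40 / 6
# The unweighted (mean1 + mean2) / 2 would give 8.75, which is wrong here.
```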

Contributor:

(mean1 + mean2) / 2 is not weighted properly.

@taylorfturner added labels Jul 29, 2022: Work In Progress (solution is being developed), High Priority (dramatic improvement, inaccurate calculation(s), or bug/feature making the library unusable), New Feature (a feature addition not currently in the library)
@Ta7ar (Contributor Author) commented Jul 29, 2022

Updates in the recent commit include:

  • Reduced code redundancy
  • Removed the SyntheticDataOptions class and used the BooleanOption class instead
  • Renamed the option from synthetic_data to null_replication_metrics (null instead of the previous nan, since DataProfiler already reports null_count, null_types, and null_types_index, so null is more cohesive)
  • Changed the class_mean type from np.ndarray to list (serializable)
  • null_replication_metrics is now reported in the data_stats array
  • Updated test cases

Comment on lines 2237 to 2238
self_compiler = self._profile[col_id]
other_compiler = other._profile[col_id]
Contributor:

I may have done self_profile and other_profile but won't hold the PR up over this

@taylorfturner taylorfturner enabled auto-merge (squash) July 29, 2022 19:13
auto-merge was automatically disabled July 29, 2022 19:23

Head branch was pushed to by a user without write access

auto-merge was automatically disabled August 2, 2022 18:26

Head branch was pushed to by a user without write access

@Ta7ar Ta7ar dismissed stale reviews from ksneab7 and taylorfturner via d1eee80 August 2, 2022 18:26
@Ta7ar (Contributor Author) commented Aug 2, 2022

Changes made in latest commit:

  • General improvements in runtime and space complexity:
    • mean_not_null is no longer explicitly calculated by iterating over the dataset. Instead, sum_null is calculated by iterating only over rows where the target column value is null; subtracting it from the precomputed total_row_sum gives sum_not_null, from which mean_not_null is obtained by element-wise division.
    • Copies of the dataset are no longer created by partitioning the data and dropping columns. Instead, the required calculations use explicit DataFrame indexing.
  • Removed _get_null_and_sample_counts and utils.combine_means, as they are no longer needed.
  • Updated test cases to cover the newly added class_sum field in null_replication_metrics.
  • Indices of null values are now obtained from null_types_index instead of df[col_id].isnull().
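The first optimization can be sketched on a toy frame. Column names and values below are made up, and isna stands in for the null_types_index lookup the real code uses:

```python
import numpy as np
import pandas as pd

# Toy frame: "target" is the column whose nulls we condition on,
# "other" is a column whose conditional mean we want.
df = pd.DataFrame({
    "target": [3.0, 2.0, np.nan, 7.0, np.nan],
    "other":  [1.0, 2.0, 3.0, 4.0, 5.0],
})

total_row_sum = df["other"].sum()            # precomputed once: 15.0
null_rows = df["target"].isna()              # stand-in for null_types_index
sum_null = df.loc[null_rows, "other"].sum()  # 3.0 + 5.0 = 8.0

# Derive the not-null side by subtraction instead of partitioning the frame
sum_not_null = total_row_sum - sum_null            # 7.0
mean_not_null = sum_not_null / (~null_rows).sum()  # 7.0 / 3
```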

@taylorfturner taylorfturner enabled auto-merge (squash) August 2, 2022 19:07
dataprofiler/profilers/profile_builder.py (outdated comment, resolved)
@@ -1438,6 +1438,9 @@ def __init__(self, data, samples_per_update=None, min_true_samples=0, options=No
self.correlation_matrix = None
self.chi2_matrix = None

# capitalone/synthetic-data specific metrics
self._null_replication_metrics = dict()
Contributor:

Instantiating class variables with None is best practice. Interestingly, we do it on line 1435.

https://docs.python-guide.org/writing/gotchas/#mutable-default-arguments
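For reference, the linked gotcha concerns mutable defaults created once at function definition time (the snippet above initializes an instance attribute in __init__, which is safe, but the None convention avoids the related trap):

```python
# The mutable-default-argument gotcha from the linked guide: the list is
# created once when the function is defined and shared across all calls.
def append_to(element, to=[]):
    to.append(element)
    return to

first = append_to(1)
second = append_to(2)
assert second == [1, 2]   # the same list was reused, not a fresh one

# The conventional fix: default to None and create the list per call.
def append_to_fixed(element, to=None):
    if to is None:
        to = []
    to.append(element)
    return to

assert append_to_fixed(1) == [1]
assert append_to_fixed(2) == [2]  # fresh list each call
```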

Contributor:

we do it on lines 1438 and 1439

Comment on lines +2289 to +2290
# Array index serves as class label
# 0 indicates not null, 1 indicates null
Contributor:

nit picky and not going to hold the PR up over it... just move the comments to above the dict and not inside the dict.

auto-merge was automatically disabled August 2, 2022 19:49

Head branch was pushed to by a user without write access

@taylorfturner taylorfturner enabled auto-merge (squash) August 2, 2022 19:59
@taylorfturner taylorfturner self-requested a review August 2, 2022 20:08
taylorfturner
taylorfturner previously approved these changes Aug 2, 2022
auto-merge was automatically disabled August 2, 2022 20:13

Head branch was pushed to by a user without write access

@taylorfturner taylorfturner enabled auto-merge (squash) August 2, 2022 20:13
taylorfturner
taylorfturner previously approved these changes Aug 2, 2022
:param clean_samples: input cleaned dataset
:type clean_samples: dict
"""
data = pd.DataFrame(clean_samples).apply(pd.to_numeric, errors="coerce")
Collaborator:

is this call still needed? I assume yes, for the conversion to numeric? Do we need to apply it to the entire frame? Is this expensive?

Contributor Author:

Honestly, not too sure about that. I'm doing it because that's what the correlation calculation does, and null replication needs the same type of data as the correlation calculation uses.

def test_null_replication_metrics_calculation(self):
data = pd.DataFrame(
{
"a": [3, 2, np.nan, 7, np.nan],
Collaborator:

should we validate that the clean, etc. works for nulls that aren't just caught by np.nan. e.g. blank spaces, or None?

Contributor Author:

As long as DataProfiler successfully detects the null values, the calculation shouldn't be affected. Could definitely add that to the tests, though.
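A small sketch of such a test (hypothetical values; it relies only on pandas' to_numeric coercion, mirroring the cleaning step discussed earlier in this thread):

```python
import numpy as np
import pandas as pd

# Null-like inputs beyond np.nan: empty string, blank space, and None
# should all end up as NaN after coercion to numeric.
raw = pd.Series([3, "", " ", None, np.nan, 7])
coerced = pd.to_numeric(raw, errors="coerce")

assert int(coerced.isna().sum()) == 4   # "", " ", None, np.nan
assert coerced.iloc[0] == 3.0 and coerced.iloc[5] == 7.0
```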

}
)

profiler.update_profile(data_2)
@JGSweets (Collaborator) Aug 2, 2022:

shouldn't the values after this and the values after summing two profiles, e.g. merge, be the same?

Contributor Author:

Initially profiler has data, then it is updated with data_2, so it holds data + data_2. Then profiler2 is initialized with data and merged with profiler, so merged_profile contains 2*data + data_2 while the updated profile contains data + data_2; they shouldn't be the same.

Collaborator:

That makes sense. I thought it was going to be data + data_2.

I might suggest doing that, as we should validate that merge and update provide the same results.

One way would be to do the merge prior to the update to validate.
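The counting in the exchange above can be sketched abstractly, tracking only row counts rather than real profiler objects (Profiler and update_profile in the comments refer to the DataProfiler API):

```python
# Track only how many rows each profile "contains" to see why the two
# results in the discussion differ.
data_rows, data_2_rows = 5, 3

updated = data_rows                # Profiler(data)
updated += data_2_rows             # update_profile(data_2) -> data + data_2

profiler2 = data_rows              # Profiler(data)
merged = updated + profiler2       # merge -> 2*data + data_2

assert updated == data_rows + data_2_rows        # 8
assert merged == 2 * data_rows + data_2_rows     # 13
```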

Comment on lines +1940 to +1944
{
"data_labeler.is_enabled": False,
"structured_options.multiprocess.is_enabled": False,
"null_replication_metrics.is_enabled": True,
}
Collaborator:

nice

auto-merge was automatically disabled August 3, 2022 14:03

Head branch was pushed to by a user without write access

@taylorfturner taylorfturner enabled auto-merge (squash) August 3, 2022 14:10
@JGSweets JGSweets dismissed their stale review August 3, 2022 14:29

PR overhauled

@taylorfturner taylorfturner merged commit b2a2484 into capitalone:main Aug 3, 2022