diff --git a/buckaroo/customizations/polars_commands.py b/buckaroo/customizations/polars_commands.py index 6a3237524..107b9a090 100644 --- a/buckaroo/customizations/polars_commands.py +++ b/buckaroo/customizations/polars_commands.py @@ -6,12 +6,19 @@ #from ..auto_clean.cleaning_commands import (to_bool, to_datetime, to_int, to_float, to_string) class Command(object): - @staticmethod + """A Command's `transform` returns either the new df, or a 2-tuple + `(df, sd_updates)` where sd_updates is a partial SDType + ({col: {key: value}}) to be merged into cleaning_sd. The 2-tuple form + lets a Command thread styling-relevant metadata (e.g. a search term + that becomes a highlight phrase downstream) without round-tripping + through the df.""" + + @staticmethod def transform(df, col, val): return df.with_columns(pl.col(col).fill_null(val)) return df - @staticmethod + @staticmethod def transform_to_py(df, col, val): return " df = df.with_columns(pl.col('%s').fill_null(%r))" % (col, val) @@ -141,6 +148,6 @@ def transform(df, col, val): return df.filter(pl.any_horizontal(pl.col(pl.String).str.contains(val))) - @staticmethod + @staticmethod def transform_to_py(df, col, val): return f" df = df.filter(pl.any_horizontal(pl.col(pl.String).str.contains('{val}')))" diff --git a/buckaroo/dataflow/autocleaning.py b/buckaroo/dataflow/autocleaning.py index 35de2e142..cd423f87d 100644 --- a/buckaroo/dataflow/autocleaning.py +++ b/buckaroo/dataflow/autocleaning.py @@ -107,20 +107,23 @@ def _setup_from_command_kls_list(self, name): self.quick_command_klasses = conf.quick_command_klasses - def _run_df_interpreter(self, df, operations): + def _run_df_interpreter(self, df, operations, initial_sd): full_ops = [{'symbol': 'begin'}] def wrap_set_df(form): """ - wrap each passed in form with a set! call to update the df symbol + Wrap each form so its result passes through apply-result! before + updating df. apply-result! threads any (df, sd_updates) tuple into + the sd binding bound in the lisp env and returns the bare df. """ - return [s("set!"), s("df"), form] + return [s("set!"), s("df"), + [s("apply-result!"), s("sd"), form]] full_ops.extend(map(wrap_set_df, operations)) full_ops.append(s("df")) - if len(full_ops) == 1: - return df - - return self.df_interpreter(full_ops , df) + if not operations: + return df, dict(initial_sd) + + return self.df_interpreter(full_ops, df, initial_sd) def _run_code_generator(self, operations): if len(operations) == 0: @@ -198,7 +201,10 @@ def handle_ops_and_clean(self, df, cleaning_method, quick_command_args, existing return [df, {}, "", []] - cleaned_df = self._run_df_interpreter(df, final_ops) + cleaned_df, op_sd = self._run_df_interpreter(df, final_ops, {}) + if op_sd: + from .styling_core import merge_sds + cleaning_sd = merge_sds(cleaning_sd, op_sd) merged_cleaned_df = self.make_origs(df, cleaned_df, cleaning_sd) generated_code = self._run_code_generator(final_ops) return [merged_cleaned_df, cleaning_sd, generated_code, final_ops] diff --git a/buckaroo/dataflow/dataflow.py b/buckaroo/dataflow/dataflow.py index a33ff51e6..0e915d407 100644 --- a/buckaroo/dataflow/dataflow.py +++ b/buckaroo/dataflow/dataflow.py @@ -404,8 +404,8 @@ def _merged_sd(self, change): def add_command(self, incomingCommandKls): return self.ac_obj.add_command(incomingCommandKls) - def _run_df_interpreter(self, df, operations): - self.ac_obj._run_df_interpreter(df, operations) + def _run_df_interpreter(self, df, operations, initial_sd): + return self.ac_obj._run_df_interpreter(df, operations, initial_sd) def run_code_generator(self, operations): self.ac_obj.run_code_generator(operations) diff --git a/buckaroo/jlisp/configure_utils.py b/buckaroo/jlisp/configure_utils.py index 4a9490d08..88fe068f3 100644 --- a/buckaroo/jlisp/configure_utils.py +++ b/buckaroo/jlisp/configure_utils.py @@ -1,10 +1,27 @@ import pandas as pd from .lispy import make_interpreter + + +def _apply_result(sd_dict, result): + """A Command's transform may return either a df or a 2-tuple + (df, sd_updates). When called from the interpreter wrapper, this + primitive unpacks the tuple form: it merges sd_updates into the live + sd dict bound in the lisp env and returns the bare df so the + enclosing (set! df ...) can update the df binding. For the bare-df + return shape it passes the value through unchanged.""" + if isinstance(result, tuple) and len(result) == 2 and isinstance(result[1], dict): + df, updates = result + for col, kv in updates.items(): + sd_dict.setdefault(col, {}).update(kv) + return df + return result + + def configure_buckaroo(transforms): command_defaults = {} command_patterns = {} - transform_lisp_primitives = {} + transform_lisp_primitives = {'apply-result!': _apply_result} to_py_lisp_primitives = {} for T in transforms: t = T() @@ -13,16 +30,17 @@ def configure_buckaroo(transforms): command_patterns[transform_name] = t.command_pattern transform_lisp_primitives[transform_name] = T.transform to_py_lisp_primitives[transform_name] = T.transform_to_py - + buckaroo_eval, raw_parse = make_interpreter(transform_lisp_primitives) - def buckaroo_transform(instructions, df): + def buckaroo_transform(instructions, df, initial_sd): if isinstance(df, pd.DataFrame): df_copy = df.copy() else: # hack we know it's polars here... just getting something working for now df_copy = df.clone() - ret_val = buckaroo_eval(instructions, {'df':df_copy}) - return ret_val + sd_dict = dict(initial_sd) + ret_df = buckaroo_eval(instructions, {'df': df_copy, 'sd': sd_dict}) + return ret_df, sd_dict convert_to_python, __unused = make_interpreter(to_py_lisp_primitives) def buckaroo_to_py(instructions): diff --git a/tests/unit/commands/command_test.py b/tests/unit/commands/command_test.py index 592d0e93b..0c104ed5c 100644 --- a/tests/unit/commands/command_test.py +++ b/tests/unit/commands/command_test.py @@ -24,7 +24,7 @@ def assert_to_py_same_transform_df(command_kls, operations, test_df): _a, _b, transform_df, transform_to_py = configure_buckaroo([command_kls]) tdf_ops = [{'symbol': 'begin'}] tdf_ops.extend(operations) - tdf = transform_df(tdf_ops, test_df.copy()) + tdf, _sd = transform_df(tdf_ops, test_df.copy(), {}) py_code_string = transform_to_py(operations) edf = result_from_exec(py_code_string, test_df.copy()) diff --git a/tests/unit/commands/pandas_commands_test.py b/tests/unit/commands/pandas_commands_test.py index f3307c47a..0d0fc09b9 100644 --- a/tests/unit/commands/pandas_commands_test.py +++ b/tests/unit/commands/pandas_commands_test.py @@ -34,7 +34,7 @@ def assert_to_py_same_transform_df(command_kls, operations, test_df): _a, _b, transform_df, transform_to_py = configure_buckaroo([command_kls]) tdf_ops = [{'symbol': 'begin'}] tdf_ops.extend(operations) - tdf = transform_df(tdf_ops, test_df.copy()) + tdf, _sd = transform_df(tdf_ops, test_df.copy(), {}) py_code_string = transform_to_py(operations) edf = result_from_exec(py_code_string, test_df.copy()) diff --git a/tests/unit/commands/polars_command_test.py b/tests/unit/commands/polars_command_test.py index a86a00b17..ba0bf9863 100644 --- a/tests/unit/commands/polars_command_test.py +++ b/tests/unit/commands/polars_command_test.py @@ -28,7 +28,7 @@ def assert_to_py_same_transform_df(command_kls, operations, test_df): _a, _b, transform_df, transform_to_py = configure_buckaroo([command_kls]) tdf_ops = [{'symbol': 'begin'}] tdf_ops.extend(operations) - tdf = transform_df(tdf_ops, test_df.clone()) + tdf, _sd = transform_df(tdf_ops, test_df.clone(), {}) py_code_string = transform_to_py(operations) edf = result_from_exec(py_code_string, test_df.clone()) diff --git a/tests/unit/dataflow/autocleaning_pd_test.py b/tests/unit/dataflow/autocleaning_pd_test.py index 118bf9887..fe2547f5e 100644 --- a/tests/unit/dataflow/autocleaning_pd_test.py +++ b/tests/unit/dataflow/autocleaning_pd_test.py @@ -202,10 +202,11 @@ def test_run_df_interpreter(): ac = PandasAutocleaning([ACErrorConf]) df = pd.DataFrame({'a': ["30", "40"]}) - output_df = ac._run_df_interpreter( + output_df, _sd = ac._run_df_interpreter( df, [ - [{'symbol': 'safe_int', 'meta':{'auto_clean': True}}, {'symbol': 'df'}, 'a']]) + [{'symbol': 'safe_int', 'meta':{'auto_clean': True}}, {'symbol': 'df'}, 'a']], + {}) expected = pd.DataFrame({'a': [30, 40]}) assert output_df.to_dict() == expected.to_dict() diff --git a/tests/unit/dataflow/autocleaning_pl_test.py b/tests/unit/dataflow/autocleaning_pl_test.py index 1aa7cb937..a0e799286 100644 --- a/tests/unit/dataflow/autocleaning_pl_test.py +++ b/tests/unit/dataflow/autocleaning_pl_test.py @@ -6,8 +6,9 @@ from buckaroo.dataflow.autocleaning import merge_ops, format_ops, AutocleaningConfig from buckaroo.polars_buckaroo import PolarsAutocleaning from buckaroo.customizations.polars_commands import ( - PlSafeInt, DropCol, FillNA, GroupBy, NoOp + Command, PlSafeInt, DropCol, FillNA, GroupBy, NoOp ) +from buckaroo.jlisp.lisp_utils import s dirty_df = pl.DataFrame( @@ -168,6 +169,39 @@ def test_handle_clean_df(): df = df.with_columns(pl.col('a').cast(pl.Int64, strict=False)) return df""" +class TaggingCommand(Command): + """A Command whose transform returns the 2-tuple (df, sd_updates).""" + command_default = [s('tag'), s('df'), 'col', ''] + command_pattern = [[3, 'tag', 'type', 'string']] + + @staticmethod + def transform(df, col, val): + return df, {col: {'note': val}} + + @staticmethod + def transform_to_py(df, col, val): + return " # tag" + + +class TagConf(AutocleaningConfig): + autocleaning_analysis_klasses = [] + command_klasses = [TaggingCommand] + name = "" + + +def test_transform_can_return_sd_updates_via_2tuple(): + """A Command's transform may return (df, sd_updates); the interpreter + accumulates sd_updates and autocleaning merges them into cleaning_sd.""" + ac = PolarsAutocleaning([TagConf]) + df = pl.DataFrame({'a': [1, 2, 3]}) + op = [{'symbol': 'tag'}, s('df'), 'a', 'hello'] + + _df, cleaning_sd, _gen, _ops = ac.handle_ops_and_clean( + df, cleaning_method='', quick_command_args={}, existing_operations=[op]) + + assert cleaning_sd.get('a', {}).get('note') == 'hello' + + def test_autoclean_codegen(): ac = PolarsAutocleaning([ACConf, NoCleaning]) df = pl.DataFrame({'a': ["30", "40"]})