buckaroo-data · paddymul · May 17, 2026
diff --git a/buckaroo/customizations/polars_commands.py b/buckaroo/customizations/polars_commands.py
@@ -6,12 +6,19 @@
 #from ..auto_clean.cleaning_commands import (to_bool, to_datetime, to_int, to_float, to_string)
 
 class Command(object):
-    @staticmethod 
+    """A Command's `transform` returns either the new df, or a 2-tuple
+    `(df, sd_updates)` where sd_updates is a partial SDType
+    ({col: {key: value}}) to be merged into cleaning_sd. The 2-tuple form
+    lets a Command thread styling-relevant metadata (e.g. a search term
+    that becomes a highlight phrase downstream) without round-tripping
+    through the df."""
+
+    @staticmethod
     def transform(df, col, val):
         return df.with_columns(pl.col(col).fill_null(val))
         return df
 
-    @staticmethod 
+    @staticmethod
     def transform_to_py(df, col, val):
         return "    df = df.with_columns(pl.col('%s').fill_null(%r))" % (col, val)
 
@@ -141,6 +148,6 @@ def transform(df, col, val):
         return df.filter(pl.any_horizontal(pl.col(pl.String).str.contains(val)))
 
 
-    @staticmethod 
+    @staticmethod
     def transform_to_py(df, col, val):
         return f"    df = df.filter(pl.any_horizontal(pl.col(pl.String).str.contains('{val}')))"
diff --git a/buckaroo/dataflow/autocleaning.py b/buckaroo/dataflow/autocleaning.py
@@ -107,20 +107,23 @@ def _setup_from_command_kls_list(self, name):
         self.quick_command_klasses = conf.quick_command_klasses
 
 
-    def _run_df_interpreter(self, df, operations):
+    def _run_df_interpreter(self, df, operations, initial_sd):  # returns a (df, sd) pair
         full_ops = [{'symbol': 'begin'}]
 
         def wrap_set_df(form):
             """
-            wrap each passed in form with a set! call to update the df symbol
+            Wrap a form so its result goes through apply-result! before df is
+            set. apply-result! merges the sd_updates of a (df, sd_updates)
+            tuple into the dict bound to sd and returns the bare df.
             """
-            return [s("set!"), s("df"), form]
+            return [s("set!"), s("df"),
+                    [s("apply-result!"), s("sd"), form]]  # i.e. (set! df (apply-result! sd <form>))
         full_ops.extend(map(wrap_set_df, operations))
         full_ops.append(s("df"))
-        if len(full_ops) == 1:
-            return df
-        
-        return self.df_interpreter(full_ops , df)
+        if not operations:  # nothing to interpret: skip the interpreter entirely
+            return df, dict(initial_sd)  # same df object, plus a shallow copy of initial_sd
+
+        return self.df_interpreter(full_ops, df, initial_sd)  # NOTE(review): presumably also a (df, sd) pair — confirm
 
     def _run_code_generator(self, operations):
         if len(operations) == 0:
@@ -198,7 +201,10 @@ def handle_ops_and_clean(self, df, cleaning_method, quick_command_args, existing
             return [df, {}, "", []]
 
 
-        cleaned_df = self._run_df_interpreter(df, final_ops)
+        cleaned_df, op_sd = self._run_df_interpreter(df, final_ops, {})
+        if op_sd:
+            from .styling_core import merge_sds
+            cleaning_sd = merge_sds(cleaning_sd, op_sd)
         merged_cleaned_df = self.make_origs(df, cleaned_df, cleaning_sd)
         generated_code = self._run_code_generator(final_ops)
         return [merged_cleaned_df, cleaning_sd, generated_code, final_ops]

diff --git a/buckaroo/dataflow/dataflow.py b/buckaroo/dataflow/dataflow.py
@@ -404,8 +404,8 @@ def _merged_sd(self, change):
     def add_command(self, incomingCommandKls):
         return self.ac_obj.add_command(incomingCommandKls)
 
-    def _run_df_interpreter(self, df, operations):
-        self.ac_obj._run_df_interpreter(df, operations)
+    def _run_df_interpreter(self, df, operations, initial_sd):
+        return self.ac_obj._run_df_interpreter(df, operations, initial_sd)  # delegate to ac_obj and hand its result back to the caller
 
     def run_code_generator(self, operations):
         self.ac_obj.run_code_generator(operations)

diff --git a/buckaroo/jlisp/configure_utils.py b/buckaroo/jlisp/configure_utils.py
@@ -1,10 +1,27 @@
 import pandas as pd
 from .lispy import make_interpreter
+
+
+def _apply_result(sd_dict, result):
+    """Unpack a Command transform result for the interpreter wrapper.
+    A (df, sd_updates) 2-tuple has sd_updates merged per column into
+    sd_dict, and the bare df is returned for the enclosing (set! df ...).
+    Any other value is passed through unchanged. Per-column dicts are
+    rebuilt rather than updated in place, so nested dicts that sd_dict
+    shares with a shallow-copied caller sd are never mutated."""
+    if isinstance(result, tuple) and len(result) == 2 and isinstance(result[1], dict):
+        df, updates = result
+        for col, kv in updates.items():
+            sd_dict[col] = {**sd_dict.get(col, {}), **kv}
+        return df
+    return result
+
+
 def configure_buckaroo(transforms):
     command_defaults = {}
     command_patterns = {}
 
-    transform_lisp_primitives = {}
+    transform_lisp_primitives = {'apply-result!': _apply_result}
     to_py_lisp_primitives = {}
     for T in transforms:
         t = T()
@@ -13,16 +30,17 @@ def configure_buckaroo(transforms):
         command_patterns[transform_name] = t.command_pattern
         transform_lisp_primitives[transform_name] = T.transform
         to_py_lisp_primitives[transform_name] = T.transform_to_py
-    
+
     buckaroo_eval, raw_parse = make_interpreter(transform_lisp_primitives)
 
-    def buckaroo_transform(instructions, df):
+    def buckaroo_transform(instructions, df, initial_sd):  # returns (transformed df, accumulated sd)
         if isinstance(df, pd.DataFrame):
             df_copy = df.copy()
         else: # hack we know it's polars here... just getting something working for now
             df_copy = df.clone()
-        ret_val =  buckaroo_eval(instructions, {'df':df_copy})
-        return ret_val
+        sd_dict = dict(initial_sd)  # shallow copy: caller's top-level dict is untouched, nested per-column dicts are still shared
+        ret_df = buckaroo_eval(instructions, {'df': df_copy, 'sd': sd_dict})  # df and sd are the names the instructions refer to
+        return ret_df, sd_dict  # sd_dict holds initial_sd plus whatever apply-result! merged in during evaluation
 
     convert_to_python, __unused = make_interpreter(to_py_lisp_primitives)
     def buckaroo_to_py(instructions):

diff --git a/tests/unit/commands/command_test.py b/tests/unit/commands/command_test.py
@@ -24,7 +24,7 @@ def assert_to_py_same_transform_df(command_kls, operations, test_df):
     _a, _b, transform_df, transform_to_py = configure_buckaroo([command_kls])
     tdf_ops = [{'symbol': 'begin'}]
     tdf_ops.extend(operations)
-    tdf = transform_df(tdf_ops, test_df.copy())
+    tdf, _sd = transform_df(tdf_ops, test_df.copy(), {})
     py_code_string = transform_to_py(operations)
 
     edf = result_from_exec(py_code_string, test_df.copy())

diff --git a/tests/unit/commands/pandas_commands_test.py b/tests/unit/commands/pandas_commands_test.py
@@ -34,7 +34,7 @@ def assert_to_py_same_transform_df(command_kls, operations, test_df):
     _a, _b, transform_df, transform_to_py = configure_buckaroo([command_kls])
     tdf_ops = [{'symbol': 'begin'}]
     tdf_ops.extend(operations)
-    tdf = transform_df(tdf_ops, test_df.copy())
+    tdf, _sd = transform_df(tdf_ops, test_df.copy(), {})
     py_code_string = transform_to_py(operations)
 
     edf = result_from_exec(py_code_string, test_df.copy())

diff --git a/tests/unit/commands/polars_command_test.py b/tests/unit/commands/polars_command_test.py
@@ -28,7 +28,7 @@ def assert_to_py_same_transform_df(command_kls, operations, test_df):
     _a, _b, transform_df, transform_to_py = configure_buckaroo([command_kls])
     tdf_ops = [{'symbol': 'begin'}]
     tdf_ops.extend(operations)
-    tdf = transform_df(tdf_ops, test_df.clone())
+    tdf, _sd = transform_df(tdf_ops, test_df.clone(), {})
     py_code_string = transform_to_py(operations)
 
     edf = result_from_exec(py_code_string, test_df.clone())

diff --git a/tests/unit/dataflow/autocleaning_pd_test.py b/tests/unit/dataflow/autocleaning_pd_test.py
@@ -202,10 +202,11 @@ def test_run_df_interpreter():
     ac = PandasAutocleaning([ACErrorConf])
     df = pd.DataFrame({'a': ["30", "40"]})
 
-    output_df = ac._run_df_interpreter(
+    output_df, _sd = ac._run_df_interpreter(
         df,
         [
-            [{'symbol': 'safe_int', 'meta':{'auto_clean': True}}, {'symbol': 'df'}, 'a']])
+            [{'symbol': 'safe_int', 'meta':{'auto_clean': True}}, {'symbol': 'df'}, 'a']],
+        {})
     expected = pd.DataFrame({'a': [30, 40]})
     assert output_df.to_dict() == expected.to_dict()
 

diff --git a/tests/unit/dataflow/autocleaning_pl_test.py b/tests/unit/dataflow/autocleaning_pl_test.py
@@ -6,8 +6,9 @@
 from buckaroo.dataflow.autocleaning import merge_ops, format_ops, AutocleaningConfig
 from buckaroo.polars_buckaroo import PolarsAutocleaning
 from buckaroo.customizations.polars_commands import (
-    PlSafeInt, DropCol, FillNA, GroupBy, NoOp
+    Command, PlSafeInt, DropCol, FillNA, GroupBy, NoOp
 )
+from buckaroo.jlisp.lisp_utils import s
 
 
 dirty_df = pl.DataFrame(
@@ -168,6 +169,39 @@ def test_handle_clean_df():
     df = df.with_columns(pl.col('a').cast(pl.Int64, strict=False))
     return df"""
 
+class TaggingCommand(Command):
+    """Test Command whose transform returns the 2-tuple (df, sd_updates) rather than a bare df."""
+    command_default = [s('tag'), s('df'), 'col', '']  # NOTE(review): presumably [symbol, df, column, default val] — confirm
+    command_pattern = [[3, 'tag', 'type', 'string']]
+
+    @staticmethod
+    def transform(df, col, val):
+        return df, {col: {'note': val}}  # df passes through untouched; val is recorded under 'note' for col
+
+    @staticmethod
+    def transform_to_py(df, col, val):
+        return "    # tag"  # fixed comment line; col and val are not used
+
+
+class TagConf(AutocleaningConfig):  # test-only config whose sole command is TaggingCommand
+    autocleaning_analysis_klasses = []  # no analysis classes configured
+    command_klasses = [TaggingCommand]
+    name = ""  # NOTE(review): presumably matched by cleaning_method='' in the test that uses this config — confirm
+
+
+def test_transform_can_return_sd_updates_via_2tuple():
+    """sd_updates from a transform returning (df, sd_updates) must reach the
+    cleaning_sd that handle_ops_and_clean hands back."""
+    autocleaner = PolarsAutocleaning([TagConf])
+    frame = pl.DataFrame({'a': [1, 2, 3]})
+    tag_op = [{'symbol': 'tag'}, s('df'), 'a', 'hello']
+    call_kwargs = dict(cleaning_method='', quick_command_args={}, existing_operations=[tag_op])
+
+    _, cleaning_sd, _, _ = autocleaner.handle_ops_and_clean(frame, **call_kwargs)
+    note = cleaning_sd.get('a', {}).get('note')
+    assert note == 'hello'
+
+
 def test_autoclean_codegen():
     ac = PolarsAutocleaning([ACConf, NoCleaning])
     df = pl.DataFrame({'a': ["30", "40"]})