Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 10 additions & 3 deletions buckaroo/customizations/polars_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,19 @@
#from ..auto_clean.cleaning_commands import (to_bool, to_datetime, to_int, to_float, to_string)

class Command(object):
@staticmethod
"""A Command's `transform` returns either the new df, or a 2-tuple
`(df, sd_updates)` where sd_updates is a partial SDType
({col: {key: value}}) to be merged into cleaning_sd. The 2-tuple form
lets a Command thread styling-relevant metadata (e.g. a search term
that becomes a highlight phrase downstream) without round-tripping
through the df."""

@staticmethod
def transform(df, col, val):
return df.with_columns(pl.col(col).fill_null(val))
return df

@staticmethod
@staticmethod
def transform_to_py(df, col, val):
return " df = df.with_columns(pl.col('%s').fill_null(%r))" % (col, val)

Expand Down Expand Up @@ -141,6 +148,6 @@ def transform(df, col, val):
return df.filter(pl.any_horizontal(pl.col(pl.String).str.contains(val)))


@staticmethod
@staticmethod
def transform_to_py(df, col, val):
return f" df = df.filter(pl.any_horizontal(pl.col(pl.String).str.contains('{val}')))"
22 changes: 14 additions & 8 deletions buckaroo/dataflow/autocleaning.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,20 +107,23 @@ def _setup_from_command_kls_list(self, name):
self.quick_command_klasses = conf.quick_command_klasses


def _run_df_interpreter(self, df, operations):
def _run_df_interpreter(self, df, operations, initial_sd):
full_ops = [{'symbol': 'begin'}]

def wrap_set_df(form):
"""
wrap each passed in form with a set! call to update the df symbol
Wrap each form so its result passes through apply-result! before
updating df. apply-result! threads any (df, sd_updates) tuple into
the sd binding bound in the lisp env and returns the bare df.
"""
return [s("set!"), s("df"), form]
return [s("set!"), s("df"),
[s("apply-result!"), s("sd"), form]]
full_ops.extend(map(wrap_set_df, operations))
full_ops.append(s("df"))
if len(full_ops) == 1:
return df
return self.df_interpreter(full_ops , df)
if not operations:
return df, dict(initial_sd)

return self.df_interpreter(full_ops, df, initial_sd)

def _run_code_generator(self, operations):
if len(operations) == 0:
Expand Down Expand Up @@ -198,7 +201,10 @@ def handle_ops_and_clean(self, df, cleaning_method, quick_command_args, existing
return [df, {}, "", []]


cleaned_df = self._run_df_interpreter(df, final_ops)
cleaned_df, op_sd = self._run_df_interpreter(df, final_ops, {})
if op_sd:
from .styling_core import merge_sds
cleaning_sd = merge_sds(cleaning_sd, op_sd)
merged_cleaned_df = self.make_origs(df, cleaned_df, cleaning_sd)
generated_code = self._run_code_generator(final_ops)
return [merged_cleaned_df, cleaning_sd, generated_code, final_ops]
Expand Down
4 changes: 2 additions & 2 deletions buckaroo/dataflow/dataflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -404,8 +404,8 @@ def _merged_sd(self, change):
def add_command(self, incomingCommandKls):
return self.ac_obj.add_command(incomingCommandKls)

def _run_df_interpreter(self, df, operations):
self.ac_obj._run_df_interpreter(df, operations)
def _run_df_interpreter(self, df, operations, initial_sd):
return self.ac_obj._run_df_interpreter(df, operations, initial_sd)

def run_code_generator(self, operations):
self.ac_obj.run_code_generator(operations)
Expand Down
28 changes: 23 additions & 5 deletions buckaroo/jlisp/configure_utils.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,27 @@
import pandas as pd
from .lispy import make_interpreter


def _apply_result(sd_dict, result):
"""A Command's transform may return either a df or a 2-tuple
(df, sd_updates). When called from the interpreter wrapper, this
primitive unpacks the tuple form: it merges sd_updates into the live
sd dict bound in the lisp env and returns the bare df so the
enclosing (set! df ...) can update the df binding. For the bare-df
return shape it passes the value through unchanged."""
if isinstance(result, tuple) and len(result) == 2 and isinstance(result[1], dict):
df, updates = result
for col, kv in updates.items():
sd_dict.setdefault(col, {}).update(kv)
return df
return result


def configure_buckaroo(transforms):
command_defaults = {}
command_patterns = {}

transform_lisp_primitives = {}
transform_lisp_primitives = {'apply-result!': _apply_result}
to_py_lisp_primitives = {}
for T in transforms:
t = T()
Expand All @@ -13,16 +30,17 @@ def configure_buckaroo(transforms):
command_patterns[transform_name] = t.command_pattern
transform_lisp_primitives[transform_name] = T.transform
to_py_lisp_primitives[transform_name] = T.transform_to_py

buckaroo_eval, raw_parse = make_interpreter(transform_lisp_primitives)

def buckaroo_transform(instructions, df):
def buckaroo_transform(instructions, df, initial_sd):
if isinstance(df, pd.DataFrame):
df_copy = df.copy()
else: # hack we know it's polars here... just getting something working for now
df_copy = df.clone()
ret_val = buckaroo_eval(instructions, {'df':df_copy})
return ret_val
sd_dict = dict(initial_sd)
ret_df = buckaroo_eval(instructions, {'df': df_copy, 'sd': sd_dict})
return ret_df, sd_dict

convert_to_python, __unused = make_interpreter(to_py_lisp_primitives)
def buckaroo_to_py(instructions):
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/commands/command_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def assert_to_py_same_transform_df(command_kls, operations, test_df):
_a, _b, transform_df, transform_to_py = configure_buckaroo([command_kls])
tdf_ops = [{'symbol': 'begin'}]
tdf_ops.extend(operations)
tdf = transform_df(tdf_ops, test_df.copy())
tdf, _sd = transform_df(tdf_ops, test_df.copy(), {})
py_code_string = transform_to_py(operations)

edf = result_from_exec(py_code_string, test_df.copy())
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/commands/pandas_commands_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def assert_to_py_same_transform_df(command_kls, operations, test_df):
_a, _b, transform_df, transform_to_py = configure_buckaroo([command_kls])
tdf_ops = [{'symbol': 'begin'}]
tdf_ops.extend(operations)
tdf = transform_df(tdf_ops, test_df.copy())
tdf, _sd = transform_df(tdf_ops, test_df.copy(), {})
py_code_string = transform_to_py(operations)

edf = result_from_exec(py_code_string, test_df.copy())
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/commands/polars_command_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def assert_to_py_same_transform_df(command_kls, operations, test_df):
_a, _b, transform_df, transform_to_py = configure_buckaroo([command_kls])
tdf_ops = [{'symbol': 'begin'}]
tdf_ops.extend(operations)
tdf = transform_df(tdf_ops, test_df.clone())
tdf, _sd = transform_df(tdf_ops, test_df.clone(), {})
py_code_string = transform_to_py(operations)

edf = result_from_exec(py_code_string, test_df.clone())
Expand Down
5 changes: 3 additions & 2 deletions tests/unit/dataflow/autocleaning_pd_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,10 +202,11 @@ def test_run_df_interpreter():
ac = PandasAutocleaning([ACErrorConf])
df = pd.DataFrame({'a': ["30", "40"]})

output_df = ac._run_df_interpreter(
output_df, _sd = ac._run_df_interpreter(
df,
[
[{'symbol': 'safe_int', 'meta':{'auto_clean': True}}, {'symbol': 'df'}, 'a']])
[{'symbol': 'safe_int', 'meta':{'auto_clean': True}}, {'symbol': 'df'}, 'a']],
{})
expected = pd.DataFrame({'a': [30, 40]})
assert output_df.to_dict() == expected.to_dict()

Expand Down
36 changes: 35 additions & 1 deletion tests/unit/dataflow/autocleaning_pl_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@
from buckaroo.dataflow.autocleaning import merge_ops, format_ops, AutocleaningConfig
from buckaroo.polars_buckaroo import PolarsAutocleaning
from buckaroo.customizations.polars_commands import (
PlSafeInt, DropCol, FillNA, GroupBy, NoOp
Command, PlSafeInt, DropCol, FillNA, GroupBy, NoOp
)
from buckaroo.jlisp.lisp_utils import s


dirty_df = pl.DataFrame(
Expand Down Expand Up @@ -168,6 +169,39 @@ def test_handle_clean_df():
df = df.with_columns(pl.col('a').cast(pl.Int64, strict=False))
return df"""

class TaggingCommand(Command):
    """Test-only Command whose transform hands back ``(df, sd_updates)``.

    The df is returned untouched; the sd update records ``val`` under the
    ``'note'`` key of ``col``.
    """

    command_default = [s('tag'), s('df'), 'col', '']
    command_pattern = [[3, 'tag', 'type', 'string']]

    @staticmethod
    def transform(df, col, val):
        sd_updates = {col: {'note': val}}
        return df, sd_updates

    @staticmethod
    def transform_to_py(df, col, val):
        # Code generation has nothing to emit for a pure tagging step.
        return " # tag"


class TagConf(AutocleaningConfig):
    """Autocleaning config exposing only TaggingCommand, with no analysis classes."""

    name = ""
    command_klasses = [TaggingCommand]
    autocleaning_analysis_klasses = []


def test_transform_can_return_sd_updates_via_2tuple():
    """Check that sd_updates from a 2-tuple transform reach cleaning_sd.

    TaggingCommand returns (df, {col: {'note': val}}); after running the
    op through handle_ops_and_clean the note must appear in cleaning_sd.
    """
    frame = pl.DataFrame({'a': [1, 2, 3]})
    tag_op = [{'symbol': 'tag'}, s('df'), 'a', 'hello']
    autocleaner = PolarsAutocleaning([TagConf])

    outcome = autocleaner.handle_ops_and_clean(
        frame,
        cleaning_method='',
        quick_command_args={},
        existing_operations=[tag_op],
    )
    _df, cleaning_sd, _gen, _ops = outcome

    column_sd = cleaning_sd.get('a', {})
    assert column_sd.get('note') == 'hello'


def test_autoclean_codegen():
ac = PolarsAutocleaning([ACConf, NoCleaning])
df = pl.DataFrame({'a': ["30", "40"]})
Expand Down
Loading