Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 46 additions & 12 deletions buckaroo/xorq_buckaroo.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import logging
import traceback
from io import BytesIO
from typing import Any
from typing import Any, Dict, Tuple

import pandas as pd
import pyarrow as pa
Expand All @@ -24,7 +24,8 @@
from .customizations.styling import DefaultMainStyling, DefaultSummaryStatsStyling
from .customizations.xorq_stats_v2 import XORQ_STATS_V2
from .dataflow.autocleaning import (
AutocleaningConfig, PandasAutocleaning, generate_quick_ops, merge_ops, ops_eq)
AutocleaningConfig, PandasAutocleaning, _rekey_op_sd_to_internal,
generate_quick_ops, merge_ops, ops_eq)
from .dataflow.dataflow import CustomizableDataflow
from .dataflow.dataflow_extras import Sampling
from .df_util import old_col_new_col
Expand Down Expand Up @@ -90,11 +91,17 @@ def serialize_sample(cls, df_or_expr):


def _xorq_search(expr, _col, val):
"""Filter rows where any string column contains ``val``.
"""Filter rows where any string column contains ``val``, and emit
``sd_updates`` so the JS-side string displayer highlights matches.

Mirrors the contract of the pandas / polars Search commands: an
empty value short-circuits to a no-op so the frontend can clear
the search by sending ``""``.
Return shape mirrors the SDResult contract from #758:
- bare expr : empty value short-circuits to a no-op
- (expr, sd_updates) : filtered expr + per-string-column highlight

ibis ``StringValue.contains`` is a literal substring match (not
regex), so the term flows as ``highlight_phrase`` (list) rather
than ``highlight_regex`` — matching the filter semantics on the
JS-side string displayer.
"""
if val is None or val == "":
return expr
Expand All @@ -106,7 +113,8 @@ def _xorq_search(expr, _col, val):
for c in string_cols:
c_cond = expr[c].contains(val)
cond = c_cond if cond is None else cond | c_cond
return expr.filter(cond)
sd_updates = {c: {'highlight_phrase': [val]} for c in string_cols}
return expr.filter(cond), sd_updates


class XorqSearch:
Expand Down Expand Up @@ -163,19 +171,45 @@ def handle_ops_and_clean(self, df, cleaning_method, quick_command_args, existing
final_ops = merge_ops(existing_for_merge, quick_ops)
if not final_ops:
return [df, {}, "", []]
result = self._apply_xorq_ops(df, final_ops)
return [result, {}, "", final_ops]
result_expr, sd_updates = self._apply_xorq_ops(df, final_ops)
# Rekey op-supplied sd entries from orig col names onto buckaroo's
# internal a/b/c letter keys, so they merge cleanly with the
# summary_sd that XorqDataflow._get_summary_sd produces (also
# keyed by letter). Every current handler preserves column
# identity (filter ops), so orig→letter is the same on the input
# and the result. The assert guards against a future handler that
# renames/drops/reorders columns silently corrupting the rekey.
assert list(result_expr.columns) == list(df.columns), (
"xorq op changed column identity — _rekey_op_sd_to_internal "
"would mis-map sd entries; rekey against result_expr or "
"thread the orig→letter mapping through _apply_xorq_ops")
cleaning_sd = _rekey_op_sd_to_internal(sd_updates, df)
return [result_expr, cleaning_sd, "", final_ops]

@staticmethod
def _apply_xorq_ops(expr, ops) -> Tuple[Any, Dict[str, Dict[str, Any]]]:
    """Apply ``ops`` to ``expr`` in order and collect op-contributed sd entries.

    Each op is a sequence whose first element names the handler, either
    as a ``{'symbol': name}`` dict or as the bare name. Elements from
    index 2 onward are passed to the handler as positional arguments
    after the running expression. Ops whose symbol has no entry in
    ``_XORQ_OP_HANDLERS`` are skipped.

    A handler may return either a bare expr (legacy) or an
    ``(expr, sd_updates)`` tuple. Tuple results are merged col-by-col
    into the running ``sd_updates`` dict so multiple ops touching the
    same column compose; on a key collision the later op wins.

    Returns:
        ``(expr, sd_updates)``: the expression after every recognised op
        has been applied, and the accumulated per-column sd entries
        (empty when no handler contributed any).
    """
    sd_updates: Dict[str, Dict[str, Any]] = {}
    for op in ops:
        head = op[0]
        sym_name = head['symbol'] if isinstance(head, dict) else head
        handler = _XORQ_OP_HANDLERS.get(sym_name)
        if handler is None:
            # No xorq implementation registered for this op: leave expr as-is.
            continue
        result = handler(expr, *op[2:])
        if not isinstance(result, tuple):
            # Legacy handler shape: just the transformed expression.
            expr = result
            continue
        expr, op_sd = result
        for col, updates in op_sd.items():
            sd_updates.setdefault(col, {}).update(updates)
    return expr, sd_updates


class XorqDataflow(CustomizableDataflow):
Expand Down
52 changes: 52 additions & 0 deletions tests/unit/test_xorq_buckaroo_widget.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,58 @@ def test_empty_search_clears_filter(self):
assert w.df_meta["filtered_rows"] == 5


def _find_cc(column_config, col_name):
for entry in column_config:
if entry.get("col_name") == col_name:
return entry
raise AssertionError(f"col_name {col_name!r} not in column_config")


class TestSearchHighlight:
    """Equivalent of the polars-search highlight wiring from #758, ported
    to the xorq backend. ibis ``StringValue.contains`` is a literal
    substring match, so the search term flows to the JS displayer as
    ``highlight_phrase`` (list), not ``highlight_regex``."""

    @staticmethod
    def _set_search(widget, term):
        """Send ``term`` as the ``search`` quick command on ``widget``.

        The state dict is copied, modified, and assigned back as a whole
        rather than mutated in place (presumably so the widget's change
        observers fire; confirm against the widget implementation).
        """
        state = widget.buckaroo_state.copy()
        state["quick_command_args"] = {"search": [term]}
        widget.buckaroo_state = state

    def test_search_op_delivers_highlight_phrase_into_displayer_args(self):
        """End-to-end through XorqBuckarooWidget — a `search` op should
        plumb its term into ``displayer_args.highlight_phrase`` for every
        ibis-String column and skip non-string columns."""
        w = XorqBuckarooWidget(_searchable_expr())
        self._set_search(w, "admin")

        cc = w.df_display_args["main"]["df_viewer_config"]["column_config"]
        # 'a' is name (string)
        a_args = _find_cc(cc, "a")["displayer_args"]
        assert a_args["displayer"] == "string"
        assert a_args["highlight_phrase"] == ["admin"]
        # 'b' is role (string)
        b_args = _find_cc(cc, "b")["displayer_args"]
        assert b_args["displayer"] == "string"
        assert b_args["highlight_phrase"] == ["admin"]
        # 'c' is score (integer) — no highlight
        c_args = _find_cc(cc, "c")["displayer_args"]
        assert "highlight_phrase" not in c_args

    def test_empty_search_drops_highlight_from_displayer_args(self):
        """Clearing the search box (``""``) should remove the highlight
        from displayer_args, matching the filter going back to no-op."""
        w = XorqBuckarooWidget(_searchable_expr())
        # Search first, then clear, so the test covers removal of an
        # existing highlight rather than a widget that never had one.
        self._set_search(w, "admin")
        self._set_search(w, "")

        cc = w.df_display_args["main"]["df_viewer_config"]["column_config"]
        a_args = _find_cc(cc, "a")["displayer_args"]
        assert "highlight_phrase" not in a_args


def _paginated_expr():
    """Build a ten-row in-memory table: ints in ``a``, single letters ``p``..``y`` in ``b``."""
    columns = {
        "a": [3, 1, 4, 1, 5, 9, 2, 6, 5, 3],
        "b": list("pqrstuvwxy"),
    }
    return xo.memtable(columns)
Expand Down
Loading