From 66274f5f62077fefcbf014b45b0971f803de4f63 Mon Sep 17 00:00:00 2001 From: algorithmiker <104317939+algorithmiker@users.noreply.github.com> Date: Tue, 28 Oct 2025 14:53:12 +0100 Subject: [PATCH] Filtersets V2 Simplifies filtersets. The primary filter type is now a RelDnf (the ids of spans matching a Disjunctive Normal Form over a source). This makes rewrite rules simpler (many -> one (intersection of DNFS)), and also results in better performance, since we can merge many kinds of nodes into one DNF, which corresponds to only a single scan on the data. --- Cargo.lock | 10 + docs/filtersets.md | 31 +-- entrace_core/src/log_provider.rs | 3 +- entrace_query/Cargo.toml | 1 + entrace_query/src/filtersets.rs | 379 ++++++++++++++++++++----------- entrace_query/src/lua_api.rs | 283 +++++++++++------------ entrace_query/src/main.rs | 40 ++-- entrace_script/test_script.lua | 9 +- gui/src/log.rs | 10 +- 9 files changed, 436 insertions(+), 330 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9e73f01..a8cf13c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1366,6 +1366,7 @@ version = "0.1.2" dependencies = [ "anyhow", "entrace_core", + "itertools", "memchr", "mlua", "roaring", @@ -2090,6 +2091,15 @@ version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.15" diff --git a/docs/filtersets.md b/docs/filtersets.md index 48e6abb..31bed82 100644 --- a/docs/filtersets.md +++ b/docs/filtersets.md @@ -1,10 +1,11 @@ -THE FILTERSET CALCULUS +THE FILTERSET CALCULUS (v2) --- ### TYPES ```rust type FiltersetId = usize; +type PreidcateId = usize; struct Predicate { attr: String, rel: Ordering, @@ -14,9 +15,8 @@ 
struct Predicate { enum Filterset { Dead, Primitive(Roaring), - Rel(Predicate, FiltersetId), - RelIntersect(Vec, FiltersetId), - RelUnion(Vec, FiltersetId), + BlackBox(FiltersetId), + RelDnf(Vec>, FiltersetId), And(Vec), Or(Vec), Not(FiltersetId), @@ -30,25 +30,12 @@ enum Filterset { And([... And(children) ...]) -> And(flattened) Or([... Or(children) ...]) -> Or(flattened) Not(Not(X)) -> X - -**Eliminating trivial ops (unimplemeneted)** And([A]) -> A Or([A]) -> A -**REL composition** - -I. Rel(p1, Rel(p2, A)) -> RelIntersect([p1, p2], A) -II. Rel(p, RelIntersect(ps, A)) -> RelIntersect([p] + ps, A) -III. RelIntersect(ps1, RelIntersect(ps2, A)) -> RelIntersect(ps1 + ps2, A) -IV. RelIntersect(ps, Rel(p, A)) -> RelIntersect(ps + [p], A) -V. RelUnion(ps1, RelUnion(ps2, A)) -> RelUnion(ps1 + ps2, A) - +**One Rule To Rule Them All** +1. RelDnf(clauses, RelDnf(clauses2, A)) -> new clauses: c_1 \times c_2 (if the result won't be too big) -**Flattening on the same level (unimplemeneted)** -And([RelIntersect(ps, A), RelIntersect(ps2, A), RelIntersect(_, B) ...]) - -> And([RelIntersect(ps1+ps2, A), RelIntersect(_, B)]) - - This could be hard to detect, instead there could be just: - And([RelIntersect(ps_1, A), .. RelIntersect(ps_n, A)]) -> RelIntersect(ps_1+..+ps_n, A) -Or([RelUnion(ps, A), RelUnion(ps2, A), RelUnion(_, B) ...]) - -> Or([RelUnion(ps1+ps2, A), RelIntersect(_, B)]) - - same applies here. +**RelDnf in Or/And** +1. Or([RelDnf(c, A), RelDnf(c2, A), RelDnf(c3, B), RelDnf(c4, B)]) -> Or([RelDnf(c+c2, A), RelDnf(c3+c4, B)]) +2. And([RelDnf(c, A), RelDnf(c2, A), RelDnf(c3, B), RelDnf(c4, B)]) -> And([RelDnf(c x c2, A), RelDnf(c3 x c4, B)]) (if not too big) diff --git a/entrace_core/src/log_provider.rs b/entrace_core/src/log_provider.rs index 8a8a96e..cf2c972 100644 --- a/entrace_core/src/log_provider.rs +++ b/entrace_core/src/log_provider.rs @@ -35,7 +35,8 @@ pub trait LogProvider { /// This MUST be cheap as the frontend might call this every frame. 
fn len(&self) -> usize; - /// The frontent SHOULD call this at the beginning of each painted frame. + /// The frontent SHOULD call this at the beginning of each painted frame, + /// but there is no guarantee to whether or when it will. /// This runs on the main thread. /// The [LogProvider] implementation MUST ensure that this terminates quickly, /// as it directly affects FPS. diff --git a/entrace_query/Cargo.toml b/entrace_query/Cargo.toml index 24d134d..431fdfa 100644 --- a/entrace_query/Cargo.toml +++ b/entrace_query/Cargo.toml @@ -10,6 +10,7 @@ anyhow = "1.0.100" memchr = "2.7.6" thiserror = "2.0.17" roaring = "0.11.2" +itertools = "0.14.0" [[bin]] name="entrace-query-test" diff --git a/entrace_query/src/filtersets.rs b/entrace_query/src/filtersets.rs index 5bb5534..2cd45e2 100644 --- a/entrace_query/src/filtersets.rs +++ b/entrace_query/src/filtersets.rs @@ -1,13 +1,14 @@ +use itertools::Itertools; use roaring::{MultiOps, RoaringBitmap as Roaring}; use std::collections::HashMap; use std::fmt::{Debug, Write}; use std::{ cmp::Ordering, collections::{HashSet, VecDeque}, - mem, }; pub type FiltersetId = usize; +pub type PredicateId = usize; #[derive(Debug)] pub struct Predicate { pub attr: String, @@ -20,32 +21,41 @@ impl Predicate { } } #[derive(Debug)] -pub enum Filterset { +pub enum Filterset { Dead, Primitive(Roaring), - Rel(Predicate, FiltersetId), - RelIntersect(Vec>, FiltersetId), - RelUnion(Vec>, FiltersetId), + BlackBox(FiltersetId), + RelDnf(Vec>, FiltersetId), + // TODO: HashSet instead of vec? 
could be faster And(Vec), Or(Vec), Not(FiltersetId), } +impl Filterset { + pub fn children(&self) -> ChildrenRef<'_> { + match self { + Filterset::Dead | Filterset::Primitive(_) => ChildrenRef::None, + Filterset::Not(a) | Filterset::BlackBox(a) | Filterset::RelDnf(_, a) => { + ChildrenRef::One(*a) + } + Filterset::And(i) | Filterset::Or(i) => ChildrenRef::Many(i), + } + } +} +#[derive(Debug)] pub enum RewriteAction { None, // Pointer of outer and, and indices to inner ands in its item list CompressAnd(FiltersetId, Vec), CompressOr(FiltersetId, Vec), EliminateNotNot(FiltersetId, FiltersetId, FiltersetId), - /// First Rel, Second Rel, NewArg - NestedRelToIntersect(FiltersetId, FiltersetId, FiltersetId), - /// First Rel, nested RelIntersect, NewArg - ParentRelToIntersect(FiltersetId, FiltersetId, FiltersetId), - /// Parent RelIntersect, nested Rel, nested Rel arg = new arg - CompressRelInRelIntersect(FiltersetId, FiltersetId, FiltersetId), - /// Parent RelIntersect, nested RelIntersect, NewArg - CompressRelIntersect(FiltersetId, FiltersetId, FiltersetId), - /// Parent RelUnion, nested RelUnion, NewArg - CompressRelUnion(FiltersetId, FiltersetId, FiltersetId), + /// Outer DNF, inner DNF, inner DNF source + DnfDnf(FiltersetId, FiltersetId, FiltersetId), + MergeDnfsInOr(FiltersetId, HashMap>), + MergeDnfsInAnd(FiltersetId, HashMap>), + /// Or([A]) -> A + EliminateSingleOr(FiltersetId), + EliminateSingleAnd(FiltersetId), } pub enum ChildrenRef<'a> { None, @@ -53,14 +63,17 @@ pub enum ChildrenRef<'a> { Many(&'a [FiltersetId]), } -pub struct Evaluator, T> { - pub pool: Vec>, +// I don't know what would be optimal, this is just going by feeling +const MAX_DNF_CLAUSES: usize = 128; +const DNFS_IN_AND_MERGE_MAX_CLAUSES: usize = MAX_DNF_CLAUSES / 2; +pub struct Evaluator { + pool: Vec, + pub predicates: Vec>, pub results: HashMap, - pub matcher: M, } -impl, T> Evaluator { - pub fn from_matcher(matcher: M) -> Self { - Self { pool: vec![], results: HashMap::new(), matcher } +impl 
Evaluator { + pub fn new() -> Self { + Self { pool: vec![], predicates: vec![], results: HashMap::new() } } pub fn is_and(&self, id: FiltersetId) -> bool { matches!(self.pool[id], Filterset::And(_)) @@ -68,60 +81,128 @@ impl, T> Evaluator { pub fn is_or(&self, id: FiltersetId) -> bool { matches!(self.pool[id], Filterset::Or(_)) } - /// Returns the action which ended up being executed - pub fn rewrite_one(&mut self, id: FiltersetId) -> RewriteAction { - let mut action = RewriteAction::None; + pub fn is_dnf(&self, id: FiltersetId) -> bool { + matches!(self.pool[id], Filterset::RelDnf(..)) + } + /// Take the value of Filterset::RelDnf at id, and replace it with Dead. + pub fn dead_and_take_dnf(&mut self, id: FiltersetId) -> (Vec>, FiltersetId) { + let Filterset::RelDnf(clauses, src) = + std::mem::replace(&mut self.pool[id], Filterset::Dead) + else { + unreachable!() + }; + (clauses, src) + } + pub fn new_dnf(&mut self, clauses: Vec>>, src: FiltersetId) -> FiltersetId { + let mut out_clauses: Vec> = vec![]; + for inner in clauses { + let and_joined_clause = inner.into_iter().map(|x| self.new_predicate(x)); + out_clauses.push(and_joined_clause.collect()); + } + self.new_filterset(Filterset::RelDnf(out_clauses, src)) + } + pub fn new_filterset(&mut self, f: Filterset) -> FiltersetId { + self.pool.push(f); + self.pool.len() - 1 + } + pub fn new_predicate(&mut self, t: Predicate) -> PredicateId { + self.predicates.push(t); + self.predicates.len() - 1 + } + pub fn len_of_merged_dnf(&self, dnfs: impl Iterator) -> usize { + dnfs.filter_map(|x| match self.pool[x] { + Filterset::RelDnf(ref items, _) => Some(items.len()), + _ => None, + }) + .product() + } + pub fn decide_rewrite_action(&self, id: FiltersetId) -> RewriteAction { match &self.pool[id] { Filterset::And(items) => { + if items.len() == 1 { + return RewriteAction::EliminateSingleAnd(id); + } let ands: Vec = items.iter().copied().filter(|p| self.is_and(*p)).collect(); if !ands.is_empty() { - action = 
RewriteAction::CompressAnd(id, ands); + return RewriteAction::CompressAnd(id, ands); + } + // Try to merge And([RelDnf(c, A), RelDnf(c2, A), RelDnf(c3, B), RelDnf(c4, B)]) + // to And([RelDnf([c & c2], A), RelDnf(c3+c4, B)]) + // will miss duplicate sources, we can't really do anything about that here. + // that'd involve a source deduplication step before rewriting anything else, + // but its not clear how to do that + let dnf_by_source: HashMap> = items + .iter() + .filter_map(|x| match &self.pool[*x] { + Filterset::RelDnf(_cs, src) => Some((*src, *x)), + _ => None, + }) + .into_group_map(); + let can_merge_something = dnf_by_source.iter().any(|(_, ids)| { + ids.len() > 1 + && dnf_by_source.iter().any(|(_, ds)| { + self.len_of_merged_dnf(ds.iter().copied()) + < DNFS_IN_AND_MERGE_MAX_CLAUSES + }) + }); + if can_merge_something { + return RewriteAction::MergeDnfsInAnd(id, dnf_by_source); } } Filterset::Or(items) => { + if items.len() == 1 { + return RewriteAction::EliminateSingleOr(id); + } let ors: Vec = items.iter().copied().filter(|x| self.is_or(*x)).collect(); if !ors.is_empty() { - action = RewriteAction::CompressOr(id, ors); + return RewriteAction::CompressOr(id, ors); + } + // Try to merge Or([RelDnf(c, A), RelDnf(c2, A), RelDnf(c3, B), RelDnf(c4, B)]) + // to Or([RelDnf(c+c2, A), RelDnf(c3+c4, B)]) + // will miss duplicate sources, we can't really do anything about that here. 
+ // that'd involve a source deduplication step before rewriting anything else, + // but its not clear how to do that + let dnf_by_source: HashMap> = items + .iter() + .filter_map(|x| match &self.pool[*x] { + Filterset::RelDnf(_cs, src) => Some((*src, *x)), + _ => None, + }) + .into_group_map(); + let can_merge_something = dnf_by_source.iter().any(|(_, ids)| ids.len() > 1); + if can_merge_something { + return RewriteAction::MergeDnfsInOr(id, dnf_by_source); } } + Filterset::Not(y) => { if let Filterset::Not(q) = &self.pool[*y] { - action = RewriteAction::EliminateNotNot(id, *y, *q) + return RewriteAction::EliminateNotNot(id, *y, *q); } } - Filterset::Rel(_pred, arg) => match &self.pool[*arg] { - Filterset::Rel(_pred2, arg2) => { - action = RewriteAction::NestedRelToIntersect(id, *arg, *arg2); - } - Filterset::RelIntersect(_preds, arg2) => { - action = RewriteAction::ParentRelToIntersect(id, *arg, *arg2); - } - _ => (), - }, - Filterset::RelIntersect(_preds, arg) => match &self.pool[*arg] { - Filterset::Rel(_pred2, arg2) => { - action = RewriteAction::CompressRelInRelIntersect(id, *arg, *arg2); - } - Filterset::RelIntersect(_pred2, arg2) => { - action = RewriteAction::CompressRelIntersect(id, *arg, *arg2); - } - _ => (), - }, - Filterset::RelUnion(_preds, arg) => { - if let Filterset::RelUnion(_preds2, arg2) = &self.pool[*arg] { - action = RewriteAction::CompressRelUnion(id, *arg, *arg2); + Filterset::RelDnf(c1, src) => { + if let Filterset::RelDnf(c2, src2) = &self.pool[*src] + && c1.len().saturating_mul(c2.len()) < MAX_DNF_CLAUSES + { + return RewriteAction::DnfDnf(id, *src, *src2); } } _ => (), } + RewriteAction::None + } + /// Returns the action which ended up being executed + pub fn rewrite_one(&mut self, id: FiltersetId) -> RewriteAction { + let action = self.decide_rewrite_action(id); self.do_rewrite_action(&action); action } /// Very important invariant: we assume anyone who has the index of a Filterset "owns" it, /// so we cannot create dangling references 
(bad references to Dead values) by rewriting. + /// This is not true for primitives (there can be multiple references to a Primitive), but we + /// never rewrite Primitives. pub fn do_rewrite_action(&mut self, action: &RewriteAction) { - use Filterset::Dead; match action { RewriteAction::None => (), RewriteAction::CompressAnd(id, inner_ands) => { @@ -148,77 +229,103 @@ impl, T> Evaluator { let Filterset::Or(ref others) = self.pool[items[*ptr]] else { unreachable!() }; set.extend(others); } - let Filterset::And(ref mut items) = self.pool[*id] else { unreachable!() }; + let Filterset::Or(ref mut items) = self.pool[*id] else { unreachable!() }; items.clear(); items.extend(set); for ptr in inner_ors { self.pool[*ptr] = Filterset::Dead; } } - RewriteAction::EliminateNotNot(not1p, not2p, innerp) => { - self.pool[*not1p] = std::mem::replace(&mut self.pool[*innerp], Filterset::Dead); - self.pool[*not2p] = Filterset::Dead; - } - RewriteAction::NestedRelToIntersect(r1, r2, rel2src) => { - let Filterset::Rel(pred1, _) = std::mem::replace(&mut self.pool[*r1], Dead) else { - unreachable!() - }; - let Filterset::Rel(pred2, _) = std::mem::replace(&mut self.pool[*r2], Dead) else { + RewriteAction::EliminateSingleOr(id) => { + let Filterset::Or(srcs) = std::mem::replace(&mut self.pool[*id], Filterset::Dead) + else { unreachable!() }; - self.pool[*r1] = Filterset::RelIntersect(vec![pred1, pred2], *rel2src); + self.pool.swap(*id, srcs[0]); } - RewriteAction::ParentRelToIntersect(rel, ist, intersrc) => { - let Filterset::Rel(pred0, _) = std::mem::replace(&mut self.pool[*rel], Dead) else { - unreachable!() - }; - let Filterset::RelIntersect(mut ps, _) = mem::replace(&mut self.pool[*ist], Dead) + RewriteAction::EliminateSingleAnd(id) => { + let Filterset::And(srcs) = std::mem::replace(&mut self.pool[*id], Filterset::Dead) else { unreachable!() }; - ps.push(pred0); // TODO: maybe push_first for better selectivity? 
- self.pool[*rel] = Filterset::RelIntersect(ps, *intersrc); + self.pool.swap(*id, srcs[0]); } - RewriteAction::CompressRelInRelIntersect(ist, rel, relsrc) => { - let Filterset::Rel(pred, _) = std::mem::replace(&mut self.pool[*rel], Dead) else { - unreachable!() - }; - let Filterset::RelIntersect(ps, arg) = &mut self.pool[*ist] else { unreachable!() }; - ps.push(pred); - *arg = *relsrc; + RewriteAction::EliminateNotNot(not1p, not2p, innerp) => { + self.pool[*not1p] = std::mem::replace(&mut self.pool[*innerp], Filterset::Dead); + self.pool[*not2p] = Filterset::Dead; } - RewriteAction::CompressRelIntersect(ist1, ist2, arg2) => { - let Filterset::RelIntersect(ps2, _) = - std::mem::replace(&mut self.pool[*ist2], Dead) - else { + RewriteAction::DnfDnf(dnf1, dnf2, src2) => { + let (c2, _) = self.dead_and_take_dnf(*dnf2); + let Filterset::RelDnf(ref mut c1, ref mut src1) = self.pool[*dnf1] else { unreachable!() }; - let Filterset::RelIntersect(ps1, a1) = &mut self.pool[*ist1] else { + // TODO: we could reuse an allocation here, for example by copying c2 to c1 first, + // doing the cartesian product on subranges of c1 and collecting to c2, then + // replacing the vector of dnf1. meh. 
+ let new_clauses: Vec> = c1 + .iter() + .cartesian_product(c2) + .map(|(cl1, cl2)| { + cl1.iter().chain(cl2.iter()).cloned().collect::>() + }) + .collect(); + *c1 = new_clauses; + *src1 = *src2; + } + RewriteAction::MergeDnfsInOr(or, dnfs_by_source) => { + use Filterset::Dead; + let Filterset::Or(cs) = std::mem::replace(&mut self.pool[*or], Dead) else { unreachable!() }; - ps1.extend(ps2); - *a1 = *arg2; + let mut or_clauses: HashSet = HashSet::from_iter(cs.iter().copied()); + for (source, dnfs) in dnfs_by_source.iter() { + if dnfs.len() < 2 { + continue; + } + let (mut firstc, _) = self.dead_and_take_dnf(dnfs[0]); + for dnf in dnfs.iter().skip(1) { + let (c, _) = self.dead_and_take_dnf(*dnf); + or_clauses.remove(dnf); + firstc.extend(c.into_iter()); + } + self.pool[dnfs[0]] = Filterset::RelDnf(firstc, *source); + } + self.pool[*or] = Filterset::Or(or_clauses.into_iter().collect()); } - RewriteAction::CompressRelUnion(u1, u2, a2) => { - let Filterset::RelUnion(ps2, _) = std::mem::replace(&mut self.pool[*u2], Dead) - else { + RewriteAction::MergeDnfsInAnd(and, dnfs_by_source) => { + use Filterset::Dead; + let Filterset::And(cs) = std::mem::replace(&mut self.pool[*and], Dead) else { unreachable!() }; - let Filterset::RelIntersect(ps1, a1) = &mut self.pool[*u1] else { unreachable!() }; - ps1.extend(ps2); - *a1 = *a2; - } - } - } + let mut and_clauses: HashSet = HashSet::from_iter(cs); + for dnfs in dnfs_by_source.values() { + if dnfs.len() < 2 + || self.len_of_merged_dnf(dnfs.iter().copied()) + > DNFS_IN_AND_MERGE_MAX_CLAUSES + { + continue; + } + let new_clause_list: Vec> = dnfs + .iter() + .filter_map(|x| match &self.pool[*x] { + Filterset::RelDnf(items, _) => Some(items.iter()), + _ => None, + }) + .multi_cartesian_product() + .map(|combo| combo.into_iter().flatten().copied().collect()) + .collect(); + let Filterset::RelDnf(firstc, _) = &mut self.pool[dnfs[0]] else { + unreachable!() + }; + *firstc = new_clause_list; - pub fn children(&'_ self, id: FiltersetId) -> 
ChildrenRef<'_> { - match &self.pool[id] { - Filterset::Dead | Filterset::Primitive(_) => ChildrenRef::None, - Filterset::Rel(_, a) - | Filterset::RelIntersect(_, a) - | Filterset::RelUnion(_, a) - | Filterset::Not(a) => ChildrenRef::One(*a), - Filterset::And(i) | Filterset::Or(i) => ChildrenRef::Many(i), + for dnf in dnfs.iter().skip(1) { + let _ = self.dead_and_take_dnf(*dnf); + and_clauses.remove(dnf); + } + } + self.pool[*and] = Filterset::And(and_clauses.into_iter().collect()); + } } } @@ -242,7 +349,7 @@ impl, T> Evaluator { // continue; // } stack2.push(v); - match self.children(v) { + match self.pool[v].children() { ChildrenRef::None => continue, ChildrenRef::One(x) => { stack1.push(x); @@ -264,17 +371,20 @@ impl, T> Evaluator { if !self.results.is_empty() { panic!("Normalizing after there are results is unsafe"); } - let mut worklist = VecDeque::new(); + let mut worklist = VecDeque::with_capacity(self.pool.len()); let (post_order, parent_of) = self.post_order(root); + worklist.extend(post_order.iter().copied()); - // First, scan the entire tree from the leaves up (by a postorder), and try to simplify. - // If we rewrote something, mark the parent for rewriting too. - pub fn inner, T>( - this: &mut Evaluator, x: FiltersetId, worklist: &mut VecDeque, + pub fn inner( + this: &mut Evaluator, x: FiltersetId, worklist: &mut VecDeque, parent_of: &[usize], root: FiltersetId, ) { - let action_taken = this.rewrite_one(x); - if !matches!(action_taken, RewriteAction::None) && x != root { + // reach a local fixpoint before queuing parent + let mut any_action = false; + while !matches!(this.rewrite_one(x), RewriteAction::None) { + any_action = true; + } + if any_action && x != root { let parent = parent_of[x]; if parent == usize::MAX { panic!("Don't know parent of {x} even though it was rewritten. 
This is a bug."); @@ -282,9 +392,7 @@ impl, T> Evaluator { worklist.push_back(parent); } } - for x in post_order { - inner(self, x, &mut worklist, &parent_of, root); - } + // While there were children rewritten, rewrite the parents (so rewrite until there are no // changes left) while let Some(x) = worklist.pop_front() { @@ -296,7 +404,7 @@ impl, T> Evaluator { /// Guarantees that `results[id]` will exist. /// WARNING: because of how Not() is implemented, the Roaring in results[id] might contain ids /// beyond the end of the actual data. Please clamp it to your actual data ID range. - pub fn materialize(&mut self, id: FiltersetId) { + pub fn materialize(&mut self, matcher: &impl Matcher, id: FiltersetId) { let mut stack = vec![(id, false)]; // "two-phase scheduling" algorithm. a node can either be "ready", meaning we can materialize it right // away, or "unready" which means we need to materialize its children first. @@ -311,7 +419,7 @@ impl, T> Evaluator { while let Some((node, ready)) = stack.pop() { if !ready { stack.push((node, true)); - match self.children(node) { + match self.pool[node].children() { ChildrenRef::None => (), ChildrenRef::One(x) => { stack.push((x, false)); @@ -333,20 +441,9 @@ impl, T> Evaluator { Filterset::Primitive(bm) => { self.results.insert(node, bm.clone()); } - Filterset::Rel(predicate, src) => { - let source_result = &self.results[src]; - let matches = self.matcher.subset_matching(predicate, source_result); - self.results.insert(node, matches); - } - Filterset::RelIntersect(predicates, src) => { + Filterset::BlackBox(src) => { let source_result = &self.results[src]; - let matches = self.matcher.subset_matching_all(predicates, source_result); - self.results.insert(node, matches); - } - Filterset::RelUnion(predicates, src) => { - let source_result = &self.results[src]; - let matches = self.matcher.subset_matching_either(predicates, source_result); - self.results.insert(node, matches); + self.results.insert(node, source_result.clone()); 
} Filterset::And(items) => { self.results.insert(node, items.iter().map(|x| &self.results[x]).union()); @@ -363,12 +460,21 @@ impl, T> Evaluator { // records beyond the actual record count. self.results.insert(node, Roaring::full() - source_result); } + Filterset::RelDnf(items, src) => { + let source_result = &self.results[src]; + let this_result = matcher.subset_matching_dnf( + items.iter().map(|x| x.iter().map(|y| &self.predicates[*y])), + source_result, + ); + + self.results.insert(node, this_result); + } } } } } -impl, T: Debug> Evaluator { +impl Evaluator { /// Pretty-print the graph in GraphViz .dot pub fn dot(&mut self, root: FiltersetId) -> String { let mut out = String::from("digraph D {\n"); @@ -376,8 +482,7 @@ impl, T: Debug> Evaluator { while let Some(v) = stack.pop() { let node = format!("{:?}", &self.pool[v]).replace('"', "'"); writeln!(out, " n{v} [label=\"{node}\"];").ok(); - let children = self.children(v); - match children { + match self.pool[v].children() { ChildrenRef::None => (), ChildrenRef::One(a) => { stack.push(a); @@ -395,13 +500,23 @@ impl, T: Debug> Evaluator { out } } + +impl Default for Evaluator { + fn default() -> Self { + Self::new() + } +} pub trait Matcher { + /// Note: for good performance, you SHOULD implement [Matcher::subset_matching_dnf], as the default + /// implementation calls this a lot, generating lots of slow scans. 
fn subset_matching(&self, predicate: &Predicate, input: &Roaring) -> Roaring; - fn subset_matching_all(&self, predicates: &[Predicate], input: &Roaring) -> Roaring { - predicates.iter().map(|x| self.subset_matching(x, input)).intersection() - } - fn subset_matching_either(&self, predicates: &[Predicate], input: &Roaring) -> Roaring { - predicates.iter().map(|x| self.subset_matching(x, input)).union() + fn subset_matching_dnf<'a, O, I>(&self, predicates: O, input: &Roaring) -> Roaring + where + O: Iterator, + I: Iterator>, + T: 'a, + { + predicates.map(|x| x.map(|y| self.subset_matching(y, input)).intersection()).union() } } pub struct YesManMatcher(); diff --git a/entrace_query/src/lua_api.rs b/entrace_query/src/lua_api.rs index b36aedd..0fd0d28 100644 --- a/entrace_query/src/lua_api.rs +++ b/entrace_query/src/lua_api.rs @@ -6,6 +6,7 @@ use std::{ ops::RangeInclusive, rc::Rc, sync::{Arc, RwLock}, + time::Instant, }; use anyhow::bail; @@ -19,7 +20,7 @@ use roaring::RoaringBitmap; use crate::{ QueryError, TraceProvider, - filtersets::{Filterset, Matcher, Predicate}, + filtersets::{Filterset, Matcher, Predicate, PredicateId}, lua_value::{LuaValueRef, LuaValueRefRef}, }; fn level_to_u8(level: &entrace_core::LevelContainer) -> u8 { @@ -360,10 +361,10 @@ pub fn span_matches_filter( // root: 1 // items: { // { type = "prim_list"; value = [1,2,3];}, -// { type = "rel", target = "", relation = "", value = "", src = 0 }, +// { type = "rel_dnf", clauses = {{ target = "", relation = "", value = ""}}, src = 0}, // } // -// Valid item types are: "prim_list", "prim_range", "rel", "rel_intersect", "rel_union", +// Valid item types are: "prim_list", "prim_range", "rel_dnf", // "intersect", "union", "invert" /// en_filterset_from_list() @@ -420,94 +421,24 @@ pub fn en_filterset_from_range(lua: &Lua, (start, end): (usize, usize)) -> mlua: /// relation: a relation, one of "EQ", "LT", "GT" /// value: a constant to compare with /// src: filterset -/// outputs: a filterset with the 
filter as an item +/// outputs: { type = "filterset", root = 1, items = { src = 0, {type = "rel_dnf", src = 0, clauses = {{ target, relation, value}} }}}, pub fn en_filter(lua: &Lua, (filter, src): (Table, Table)) -> mlua::Result { let old_items: Table = src.get("items")?; let items_len = old_items.len()?; let new_items = deepcopy_table(lua, old_items)?; - let filter2 = deepcopy_table(lua, filter)?; - filter2.set("type", "rel")?; - filter2.set("src", items_len.saturating_sub(1))?; - new_items.push(filter2)?; - let fs = lua.create_table()?; - fs.set("type", "filterset")?; - fs.set("root", items_len)?; - fs.set("items", new_items)?; - Ok(fs) -} -/// en_filter_all() -/// input: -/// filters: list of filter tables: table with -/// target: name of variable eg. "message" or "meta.filename" -/// relation: a relation, one of "EQ", "LT", "GT" -/// value: a constant to compare with -/// src: filterset -/// outputs: a filterset that matches an item if all filters match it (the intersection of filters) -/// { type: "filterset", -/// root: 1, -/// items: { -/// { type = "prim_list", value = {1,2,3}}, -/// { type = "rel_intersect", -/// filters = { -/// { target = "", relation = "EQ", value = ""}, -/// { target = "", relation = "EQ", value = ""} -/// }, -/// src = 0, -/// } -/// } -/// -/// This is the same as the en_filterset_intersect of the filters, or doing en_filter({}, en_filter({}, x)), -/// but faster. The filterset evaluator will try to rewrite to this form if possible. -pub fn en_filter_all(lua: &Lua, (filters, src): (Table, Table)) -> mlua::Result
{ - let old_items: Table = src.get("items")?; - let items_len = old_items.len()?; - let new_items = deepcopy_table(lua, old_items)?; + let old_root: usize = src.get("root")?; - let intersect_filter = lua.create_table()?; - intersect_filter.set("type", "rel_intersect")?; - intersect_filter.set("src", items_len.saturating_sub(1))?; - intersect_filter.set("filters", filters)?; - new_items.push(intersect_filter)?; - - let fs = lua.create_table()?; - fs.set("type", "filterset")?; - fs.set("root", items_len)?; - fs.set("items", new_items)?; - Ok(fs) -} - -/// en_filter_any() -/// input: -/// filters: list of filter tables: table with -/// target: name of variable eg. "message" or "meta.filename" -/// relation: a relation, one of "EQ", "LT", "GT" -/// value: a constant to compare with -/// src: filterset -/// outputs: a filterset that matches an item if all filters match it (the intersection of filters) -/// { type: "filterset", -/// root: 1, -/// items: { -/// { type = "prim_list", value = {1,2,3}}, -/// { type = "rel_union", -/// filters = { -/// { target = "", relation = "EQ", value = ""}, -/// { target = "", relation = "EQ", value = ""} -/// }, -/// src = 0, -/// } -/// } -/// -/// This is the same as the en_filterset_union of the filters, but faster. The filterset evaluator will try to rewrite to this form if possible. -pub fn en_filter_any(lua: &Lua, (filters, src): (Table, Table)) -> mlua::Result
{ - let old_items: Table = src.get("items")?; - let items_len = old_items.len()?; - let new_items = deepcopy_table(lua, old_items)?; + let dnf_filter = lua.create_table()?; + dnf_filter.set("type", "rel_dnf")?; + dnf_filter.set("src", old_root)?; + let clauses_outer = lua.create_table()?; + let clauses_inner = lua.create_table()?; + let pred2 = deepcopy_table(lua, filter)?; - let union_filter = lua.create_table()?; - union_filter.set("type", "rel_union")?; - union_filter.set("src", items_len.saturating_sub(1))?; - union_filter.set("filters", filters)?; - new_items.push(union_filter)?; + clauses_inner.push(pred2)?; + clauses_outer.push(clauses_inner)?; + dnf_filter.set("clauses", clauses_outer)?; + new_items.push(dnf_filter)?; let fs = lua.create_table()?; fs.set("type", "filterset")?; @@ -649,7 +580,52 @@ pub fn en_filterset_intersect(lua: &Lua, filters: Table) -> mlua::Result
Ok(fs) } -/// en_filter() +/// en_filterset_dnf() +/// input: +/// filters: a list of list of filter descriptions, which is interpreted as a DNF clause list. +/// (this example would be (a=1 AND c=0) OR (b=1) +/// { +/// { +/// { target = "a", relation = "EQ", value = "1", src = 0 }, +/// { target = "c", relation = "EQ", value = "0", src = 0 }, +/// } +/// { +/// { target = "b", relation = "EQ", value = "1", src = 0}, +/// } +/// } +/// source: a filterset +/// outputs: a filterset that matches an item if satisfies either of the AND clauses +/// { type: "filterset", +/// root: 1, +/// items: { +/// { type = "prim_list", value = {1,2,3}}, +/// { type = "rel_dnf", src = 0, +/// clauses = { +/// { +/// { target = "a", relation = "EQ", value = "1", src = 0 }, +/// { target = "c", relation = "EQ", value = "0", src = 0 }, +/// } +/// { +/// { target = "b", relation = "EQ", value = "1", src = 0}, +/// } +/// } +/// } +/// } +pub fn en_filterset_dnf(lua: &Lua, (clauses, src): (Table, Table)) -> mlua::Result
{ + let new_fs = deepcopy_table(lua, src)?; + let old_root: usize = new_fs.get("root")?; + let items: Table = new_fs.get("items")?; + + let dnf_item = lua.create_table()?; + dnf_item.set("type", "rel_dnf")?; + dnf_item.set("clauses", deepcopy_table(lua, clauses)?)?; + dnf_item.set("src", old_root)?; + items.push(dnf_item)?; + new_fs.set("root", items.len()? - 1)?; + Ok(new_fs) +} + +/// en_filterset_not() /// input: filterset /// outputs: a filterset that matches an item exactly if it is not in the filterset. pub fn en_filterset_not(lua: &Lua, filterset: Table) -> mlua::Result
{ @@ -687,7 +663,9 @@ fn parse_predicate(t: &Table) -> mlua::Result> { }; Ok(Predicate { attr, rel, constant: en_value }) } -fn item_to_filterset(item: &Table) -> mlua::Result> { +fn item_to_filterset( + item: &Table, mut add_predicate: impl FnMut(Predicate) -> PredicateId, +) -> mlua::Result { let ty: String = item.get("type")?; match ty.as_str() { "prim_list" => { @@ -700,30 +678,31 @@ fn item_to_filterset(item: &Table) -> mlua::Result> { let bm = RoaringBitmap::from_sorted_iter(start..=end).into_lua_err()?; Ok(Filterset::Primitive(bm)) } - "rel" => { - let src: usize = item.get("src")?; - let pred = parse_predicate(item)?; - Ok(Filterset::Rel(pred, src)) - } - "rel_intersect" => { - // { type = "rel_intersect", - // filters = { - // { target = "", relation = "EQ", value = ""}, + "rel_dnf" => { + // { type = "rel_dnf", + // clauses = { + // { + // { target = "", relation = "GT", value = ""}, + // { target = "", relation = "LT", value = ""}, + // }, // }, // src = 0, // } - let filters: Vec
= item.get("filters")?; - let predicates: mlua::Result> = filters.iter().map(parse_predicate).collect(); - let predicates = predicates?; - let src: usize = item.get("src")?; - Ok(Filterset::RelIntersect(predicates, src)) - } - "rel_union" => { - let filters: Vec
= item.get("filters")?; - let predicates: mlua::Result> = filters.iter().map(parse_predicate).collect(); - let predicates = predicates?; + let clauses: Table = item.get("clauses")?; + let clauses_len = clauses.len()? as usize; + let mut new_clauses = Vec::with_capacity(clauses_len); + for i in 1..=clauses_len { + let anded_clauses: Table = clauses.get(i)?; + let anded_clauses_len = anded_clauses.len()? as usize; + let mut new_clause = Vec::with_capacity(anded_clauses_len); + for j in 1..=anded_clauses_len { + let predicate = anded_clauses.get(j)?; + new_clause.push(add_predicate(parse_predicate(&predicate)?)); + } + new_clauses.push(new_clause); + } let src: usize = item.get("src")?; - Ok(Filterset::RelUnion(predicates, src)) + Ok(Filterset::RelDnf(new_clauses, src)) } "intersect" => { // { type: "intersect", srcs = { 1, 3 }} @@ -738,7 +717,13 @@ fn item_to_filterset(item: &Table) -> mlua::Result> { pub struct EnMatcher<'a, L: LogProvider> { pub log: &'a L, } -pub fn predicate_to_en_predicate(p: &Predicate) -> (&str, bool, &Ordering, &EnValue) { +pub struct EnPredicate<'a> { + target: &'a str, + target_is_meta: bool, + rel: Ordering, + con: &'a EnValue, +} +pub fn predicate_to_en_predicate<'a>(p: &'a Predicate) -> EnPredicate<'a> { let Predicate { attr, rel, constant: con } = p; let mut target = attr.as_str(); let mut target_is_meta = false; @@ -746,50 +731,45 @@ pub fn predicate_to_en_predicate(p: &Predicate) -> (&str, bool, &Orderi target = stripped; target_is_meta = true; } - (target, target_is_meta, rel, con) + EnPredicate { target, target_is_meta, rel: *rel, con } } impl Matcher for EnMatcher<'_, L> { fn subset_matching( &self, predicate: &Predicate, input: &RoaringBitmap, ) -> RoaringBitmap { let mut res = input.clone(); - let (target, target_is_meta, rel, con) = predicate_to_en_predicate(predicate); + let EnPredicate { target, target_is_meta, rel, con } = predicate_to_en_predicate(predicate); for id in input { - let matches_here = 
span_matches_filter(self.log, id, target, target_is_meta, *rel, con); + let matches_here = span_matches_filter(self.log, id, target, target_is_meta, rel, con); if !matches_here { res.remove(id); } } res } - fn subset_matching_all( - &self, predicates: &[Predicate<EnValue>], input: &RoaringBitmap, - ) -> RoaringBitmap { - let mut res = input.clone(); - let en_predicates: Vec<(&str, bool, &Ordering, &EnValue)> = - predicates.iter().map(predicate_to_en_predicate).collect(); - for id in input { - let all_matches = en_predicates.iter().all(|(target, t_is_meta, rel, con)| { - span_matches_filter(self.log, id, target, *t_is_meta, **rel, con) - }); - if !all_matches { - res.remove(id); - } - } - res - } - fn subset_matching_either( - &self, predicates: &[Predicate<EnValue>], input: &RoaringBitmap, - ) -> RoaringBitmap { - let mut res = input.clone(); - let en_predicates: Vec<(&str, bool, &Ordering, &EnValue)> = - predicates.iter().map(predicate_to_en_predicate).collect(); - for id in input { - let any_matches = en_predicates.iter().any(|(target, t_is_meta, rel, con)| { - span_matches_filter(self.log, id, target, *t_is_meta, **rel, con) - }); - if !any_matches { - res.remove(id); + + fn subset_matching_dnf<'a, O, I>(&self, clauses: O, input: &RoaringBitmap) -> RoaringBitmap + where + O: Iterator<Item = I>, + I: Iterator<Item = &'a Predicate<EnValue>>, + EnValue: 'a, + { + let mut res = RoaringBitmap::new(); + let predicates_prepared: Vec<Vec<EnPredicate<'a>>> = + clauses.map(|x| x.map(|y| predicate_to_en_predicate(y)).collect()).collect(); + 'outer: for id in input { + for anded_clause in predicates_prepared.iter() { + let mut matches_in_and = true; + for predicate in anded_clause { + let EnPredicate { target, target_is_meta, rel, con } = predicate; + matches_in_and &= + span_matches_filter(self.log, id, target, *target_is_meta, *rel, con) + } + // OR join -> matches in one anded clause means the whole thing matches + if matches_in_and { + res.insert(id); + continue 'outer; + } } } res @@ -799,25 +779,29 @@ impl<L: LogProvider> Matcher<EnValue> for EnMatcher<'_, L> { /// of
operations into a concrete list of matching indices. /// In some lazy languages, this operation is called "force". pub fn en_filterset_materialize( - log: &impl LogProvider, _lua: &Lua, -) -> impl Fn(Table) -> mlua::Result<Vec<u32>> { + log: &impl LogProvider, lua: &Lua, +) -> impl Fn(Table) -> mlua::Result<Table>
{ |filterset: Table| { - let matcher = EnMatcher { log }; - let mut evaluator = crate::filtersets::Evaluator::from_matcher(matcher); + let mut evaluator = crate::filtersets::Evaluator::new(); let root: usize = filterset.get("root")?; let items: Table = filterset.get("items")?; let item_cnt = items.len()?; for i in 1..=item_cnt { let item: Table = items.get(i)?; - let fs = item_to_filterset(&item)?; - evaluator.pool.push(fs); + let fs = item_to_filterset(&item, |p| evaluator.new_predicate(p))?; + evaluator.new_filterset(fs); } + let nstart = Instant::now(); evaluator.normalize(root); - evaluator.materialize(root); - let result: Vec<u32> = evaluator.results[&root].iter().collect(); + eprintln!("normalization took {:?}", nstart.elapsed()); + let matcher = EnMatcher { log }; + evaluator.materialize(&matcher, root); - Ok(result) + let tstart = Instant::now(); + let table = lua.create_sequence_from(evaluator.results[&root].iter())?; + eprintln!("allocating results table to lua took {:?}", tstart.elapsed()); + Ok(table) } } struct DynAdapter<'a>(&'a dyn LogProvider); @@ -906,10 +890,9 @@ macro_rules!
lua_setup_with_wrappers { globals.set("en_filterset_from_list", $lua.create_function(en_filterset_from_list)?)?; globals.set("en_filterset_from_range", $lua.create_function(en_filterset_from_range)?)?; globals.set("en_filter", $lua.create_function(en_filter)?)?; - globals.set("en_filter_all", $lua.create_function(en_filter_all)?)?; - globals.set("en_filter_any", $lua.create_function(en_filter_any)?)?; globals.set("en_filterset_union", $lua.create_function(en_filterset_union)?)?; globals.set("en_filterset_intersect", $lua.create_function(en_filterset_intersect)?)?; + globals.set("en_filterset_dnf", $lua.create_function(en_filterset_dnf)?)?; globals.set("en_filterset_not", $lua.create_function(en_filterset_not)?)?; globals.set( "en_filterset_materialize", diff --git a/entrace_query/src/main.rs b/entrace_query/src/main.rs index 885473a..0266575 100644 --- a/entrace_query/src/main.rs +++ b/entrace_query/src/main.rs @@ -1,29 +1,25 @@ use std::cmp::Ordering; use entrace_core::EnValue; -use entrace_query::filtersets::{Evaluator, Filterset, Predicate, YesManMatcher}; +use entrace_query::filtersets::{Evaluator, Filterset, Predicate}; use roaring::RoaringBitmap as Roaring; fn main() { // Motivating example: filter people with (180<height<195, 75<weight<90) OR iq=120 - let mut evaluator = Evaluator::<EnValue>::from_matcher(YesManMatcher()); - evaluator.pool.push(Filterset::Primitive(Roaring::full())); - evaluator - .pool - .push(Filterset::Rel(Predicate::new("height", Ordering::Greater, EnValue::U64(180)), 0)); - evaluator - .pool - .push(Filterset::Rel(Predicate::new("height", Ordering::Less, EnValue::U64(195)), 1)); - evaluator - .pool - .push(Filterset::Rel(Predicate::new("weight", Ordering::Greater, EnValue::U64(75)), 2)); - evaluator - .pool - .push(Filterset::Rel(Predicate::new("weight", Ordering::Less, EnValue::U64(90)), 3)); - evaluator - .pool - .push(Filterset::Rel(Predicate::new("iq", Ordering::Equal, EnValue::U64(120)), 0)); - evaluator.pool.push(Filterset::Or(vec![4, 5])); - println!("Before:\n{}", evaluator.dot(6)); -
evaluator.normalize(6); - println!("After:\n{}", evaluator.dot(6)); + let mut evaluator = Evaluator::<EnValue>::new(); + use EnValue::*; + use Ordering::*; + let src = evaluator.new_filterset(Filterset::Primitive(Roaring::full())); + let height_lower = + evaluator.new_dnf(vec![vec![Predicate::new("height", Greater, U64(180))]], src); + let height_upper = evaluator.new_dnf(vec![vec![Predicate::new("height", Less, U64(195))]], src); + let height_and = evaluator.new_filterset(Filterset::And(vec![height_lower, height_upper])); + let weight_lower = + evaluator.new_dnf(vec![vec![Predicate::new("weight", Greater, U64(75))]], height_and); + let weight_upper = + evaluator.new_dnf(vec![vec![Predicate::new("weight", Less, U64(90))]], weight_lower); + let iq = evaluator.new_dnf(vec![vec![Predicate::new("iq", Equal, U64(120))]], 0); + let or = evaluator.new_filterset(Filterset::Or(vec![weight_upper, iq])); + println!("Before:\n{}", evaluator.dot(or)); + evaluator.normalize(or); + println!("After:\n{}", evaluator.dot(or)); } diff --git a/entrace_script/test_script.lua b/entrace_script/test_script.lua index 2aee43d..11e5bdf 100644 --- a/entrace_script/test_script.lua +++ b/entrace_script/test_script.lua @@ -6,4 +6,11 @@ local message_matches = en_filter(msg_filter_desc, base) breadth_filter_desc = { target = "breadth", value = 1, relation = "GT" } local breadth_matches = en_filter(breadth_filter_desc, base) -print(en_pretty_table(breadth_matches)) + +both_matches = en_filterset_dnf({ + { + { target = "breadth", value = 1, relation = "GT" }, + { target = "message", value = "constructed node", relation = "EQ" } + } +}, base) +print(en_pretty_table(both_matches)) diff --git a/gui/src/log.rs b/gui/src/log.rs index c16e7af..a10bc2e 100644 --- a/gui/src/log.rs +++ b/gui/src/log.rs @@ -9,7 +9,7 @@ use entrace_core::{ LogProvider, display_error_context, remote::{IETEvent, Notify, NotifyExt}, }; -use tracing::info; +use tracing::{info, trace}; use crate::{ benchmarkers::SamplingBenchmark, @@ -85,7
+85,13 @@ impl LogState { self.tree_view.update_tree(Some(tree_benchmark), std::iter::once(0), ctx); } pub fn on_frame(&self, notifier: &impl Notify) { - self.trace_provider.write().unwrap().frame_callback(); + if let Ok(mut q) = self.trace_provider.try_write() { + q.frame_callback(); + } else { + trace!( + "Cannot acquire write lock on trace provider, next frame_callback will be delayed" + ) + } if let Some(ref rx) = self.event_rx { while let Ok(y) = rx.try_recv() { match y {