From 3208116c523c32f4c8a3f8e7311a9fc35d6ff17a Mon Sep 17 00:00:00 2001 From: Mikhail Date: Thu, 20 Nov 2025 11:41:18 +0400 Subject: [PATCH 1/4] Add stack vector (pre-0.12.x) (#573) The PR reduces the number of allocations by using the arrayvec crate, which allocates memory on the stack instead of the heap. * the number of allocations is reduced (-19%, memory-usage/brave-list-initial/alloc-count); * build time is improved (about -10%, memory-usage/brave-list-initial); * the token limit is increased to 256; * the test expectations were updated (one case hit the old token limit). --- Cargo.lock | 7 +++++ Cargo.toml | 1 + src/filters/fb_network_builder.rs | 6 ++-- src/filters/network.rs | 50 +++++++++++++++---------------- src/request.rs | 4 +-- src/utils.rs | 28 +++++++++-------- tests/matching.rs | 2 +- tests/unit/filters/network.rs | 8 ++--- 8 files changed, 58 insertions(+), 48 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9574b656..5f2f2fc8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7,6 +7,7 @@ name = "adblock" version = "0.11.1" dependencies = [ "addr", + "arrayvec", "base64", "bitflags", "criterion", @@ -105,6 +106,12 @@ version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dde20b3d026af13f561bdd0f15edf01fc734f0dafcedbaf42bba506a9517f223" +[[package]] +name = "arrayvec" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" + [[package]] name = "autocfg" version = "1.1.0" diff --git a/Cargo.toml b/Cargo.toml index 2a615ad0..b9fa55e5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -39,6 +39,7 @@ rustc-hash = { version = "1.1.0", default-features = false } memchr = "2.4" base64 = "0.22" rmp-serde = "0.15" +arrayvec = "0.7" cssparser = { version = "0.34", optional = true } selectors = { version = "0.26", optional = true } precomputed-hash = "0.1" diff --git a/src/filters/fb_network_builder.rs b/src/filters/fb_network_builder.rs index e569a3fb..ebbeb940 100644 --- a/src/filters/fb_network_builder.rs +++ b/src/filters/fb_network_builder.rs @@ -7,6 +7,7 @@ use flatbuffers::WIPOffset; use crate::filters::fb_builder::EngineFlatBuilder; use crate::filters::network::{FilterTokens, NetworkFilter}; use crate::filters::token_selector::TokenSelector; +use crate::utils::TokensBuffer; use crate::filters::network::NetworkFilterMaskHelper; use crate::flatbuffers::containers::flat_multimap::FlatMultiMapBuilder; @@ -134,6 +135,7 @@ impl<'a> FlatSerialize<'a, EngineFlatBuilder<'a>> for NetworkFilterListBuilder { let mut optimizable = HashMap::>::new(); let mut token_frequencies = TokenSelector::new(rule_list.filters.len()); + let mut tokens_buffer = TokensBuffer::default(); { for network_filter in rule_list.filters { @@ -157,7 +159,7 @@ impl<'a> FlatSerialize<'a, EngineFlatBuilder<'a>> for NetworkFilterListBuilder { } }; - let multi_tokens = network_filter.get_tokens_optimized(); + let multi_tokens = network_filter.get_tokens_optimized(&mut tokens_buffer); match multi_tokens { FilterTokens::Empty => { // No tokens, add to fallback bucket (token 0) @@ -171,7 +173,7 @@ impl<'a> FlatSerialize<'a, EngineFlatBuilder<'a>> for NetworkFilterListBuilder { } } FilterTokens::Other(tokens) => { - let best_token = token_frequencies.select_least_used_token(&tokens); + let best_token = token_frequencies.select_least_used_token(tokens); token_frequencies.record_usage(best_token); store_filter(best_token); } diff --git a/src/filters/network.rs b/src/filters/network.rs
index 043b7825..01b73baf 100644 --- a/src/filters/network.rs +++ b/src/filters/network.rs @@ -15,9 +15,7 @@ use crate::filters::abstract_network::{ use crate::lists::ParseOptions; use crate::regex_manager::RegexManager; use crate::request; -use crate::utils::{self, Hash}; - -pub(crate) const TOKENS_BUFFER_SIZE: usize = 200; +use crate::utils::{self, Hash, TokensBuffer}; /// For now, only support `$removeparam` with simple alphanumeric/dash/underscore patterns. static VALID_PARAM: Lazy = Lazy::new(|| Regex::new(r"^[a-zA-Z0-9_\-]+$").unwrap()); @@ -312,10 +310,10 @@ pub enum FilterPart { } #[derive(Debug, PartialEq)] -pub enum FilterTokens { +pub enum FilterTokens<'a> { Empty, - OptDomains(Vec), - Other(Vec), + OptDomains(&'a [Hash]), + Other(&'a [Hash]), } pub struct FilterPartIterator<'a> { @@ -885,17 +883,21 @@ impl NetworkFilter { #[deprecated(since = "0.11.1", note = "use get_tokens_optimized instead")] pub fn get_tokens(&self) -> Vec> { - match self.get_tokens_optimized() { + let mut tokens_buffer = TokensBuffer::default(); + match self.get_tokens_optimized(&mut tokens_buffer) { FilterTokens::OptDomains(domains) => { - domains.into_iter().map(|domain| vec![domain]).collect() + domains.iter().map(|domain| vec![*domain]).collect() } - FilterTokens::Other(tokens) => vec![tokens], + FilterTokens::Other(tokens) => vec![tokens.to_vec()], FilterTokens::Empty => vec![], } } - pub fn get_tokens_optimized(&self) -> FilterTokens { - let mut tokens: Vec = Vec::with_capacity(TOKENS_BUFFER_SIZE); + pub fn get_tokens_optimized<'a>( + &'a self, + tokens_buffer: &'a mut TokensBuffer, + ) -> FilterTokens<'a> { + tokens_buffer.clear(); // If there is only one domain and no domain negation, we also use this // domain as a token. @@ -905,7 +907,7 @@ impl NetworkFilter { { if let Some(domains) = self.opt_domains.as_ref() { if let Some(domain) = domains.first() { - tokens.push(*domain) + tokens_buffer.push(*domain); } } } @@ -918,7 +920,7 @@ impl NetworkFilter { (self.is_plain() || self.is_regex()) && !self.is_right_anchor(); let skip_first_token = self.is_right_anchor(); - utils::tokenize_filter_to(f, skip_first_token, skip_last_token, &mut tokens); + utils::tokenize_filter_to(f, skip_first_token, skip_last_token, tokens_buffer); } } FilterPart::AnyOf(_) => (), // across AnyOf set of filters no single token is guaranteed to match to a request @@ -928,45 +930,43 @@ impl NetworkFilter { // Append tokens from hostname, if any if !self.mask.contains(NetworkFilterMask::IS_HOSTNAME_REGEX) { if let Some(hostname) = self.hostname.as_ref() { - utils::tokenize_to(hostname, &mut tokens); + utils::tokenize_to(hostname, tokens_buffer); } } else if let Some(hostname) = self.hostname.as_ref() { // Find last dot to tokenize the prefix let last_dot_pos = hostname.rfind('.'); if let Some(last_dot_pos) = last_dot_pos { - utils::tokenize_to(&hostname[..last_dot_pos], &mut tokens); + utils::tokenize_to(&hostname[..last_dot_pos], tokens_buffer); } } - if tokens.is_empty() && self.mask.contains(NetworkFilterMask::IS_REMOVEPARAM) { + if tokens_buffer.is_empty() && self.mask.contains(NetworkFilterMask::IS_REMOVEPARAM) { if let Some(removeparam) = &self.modifier_option { if VALID_PARAM.is_match(removeparam) { - utils::tokenize_to(&removeparam.to_ascii_lowercase(), &mut tokens); + utils::tokenize_to(&removeparam.to_ascii_lowercase(), tokens_buffer); } } } // If we got no tokens for the filter/hostname part, then we will dispatch // this filter in multiple buckets based on the domains option. 
- if tokens.is_empty() && self.opt_domains.is_some() && self.opt_not_domains.is_none() { + if tokens_buffer.is_empty() && self.opt_domains.is_some() && self.opt_not_domains.is_none() + { if let Some(opt_domains) = self.opt_domains.as_ref() { if !opt_domains.is_empty() { - return FilterTokens::OptDomains(opt_domains.clone()); + return FilterTokens::OptDomains(opt_domains); } } FilterTokens::Empty } else { // Add optional token for protocol if self.for_http() && !self.for_https() { - tokens.push(utils::fast_hash("http")); + tokens_buffer.push(utils::fast_hash("http")); } else if self.for_https() && !self.for_http() { - tokens.push(utils::fast_hash("https")); + tokens_buffer.push(utils::fast_hash("https")); } - // Remake a vector to drop extra capacity. - let mut t = Vec::with_capacity(tokens.len()); - t.extend(tokens); - FilterTokens::Other(t) + FilterTokens::Other(tokens_buffer.as_slice()) } } } diff --git a/src/request.rs b/src/request.rs index 5d853cb4..fd5409f1 100644 --- a/src/request.rs +++ b/src/request.rs @@ -239,11 +239,11 @@ impl Request { } fn calculate_tokens(url_lower_cased: &str) -> Vec { - let mut tokens = vec![]; + let mut tokens = utils::TokensBuffer::default(); utils::tokenize_pooled(url_lower_cased, &mut tokens); // Add zero token as a fallback to wildcard rule bucket tokens.push(0); - tokens + tokens.into_iter().collect() } #[cfg(test)] diff --git a/src/utils.rs b/src/utils.rs index b9cbe995..a24a7052 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -6,6 +6,8 @@ use seahash::hash; #[cfg(target_pointer_width = "32")] use seahash::reference::hash; +pub use arrayvec::ArrayVec; + pub type Hash = u64; // A smaller version of Hash that is used in serialized format. @@ -27,16 +29,16 @@ fn is_allowed_filter(ch: char) -> bool { ch.is_alphanumeric() || ch == '%' } -pub(crate) const TOKENS_BUFFER_SIZE: usize = 128; -pub(crate) const TOKENS_BUFFER_RESERVED: usize = 1; -const TOKENS_MAX: usize = TOKENS_BUFFER_SIZE - TOKENS_BUFFER_RESERVED; +/// A fixed-size array-like vector of hashes with maximum capacity of 256. +/// Used instead of Vec to avoid heap allocations.
+pub type TokensBuffer = ArrayVec; fn fast_tokenizer_no_regex( pattern: &str, is_allowed_code: &dyn Fn(char) -> bool, skip_first_token: bool, skip_last_token: bool, - tokens_buffer: &mut Vec, + tokens_buffer: &mut TokensBuffer, ) { // let mut tokens_buffer_index = 0; let mut inside: bool = false; @@ -44,8 +46,8 @@ fn fast_tokenizer_no_regex( let mut preceding_ch: Option = None; // Used to check if a '*' is not just before a token for (i, c) in pattern.char_indices() { - if tokens_buffer.len() >= TOKENS_MAX { - return; + if tokens_buffer.capacity() - tokens_buffer.len() <= 1 { + return; // reserve one free slot for the zero token } if is_allowed_code(c) { if !inside { @@ -75,17 +77,17 @@ fn fast_tokenizer_no_regex( } } -pub(crate) fn tokenize_pooled(pattern: &str, tokens_buffer: &mut Vec) { +pub(crate) fn tokenize_pooled(pattern: &str, tokens_buffer: &mut TokensBuffer) { fast_tokenizer_no_regex(pattern, &is_allowed_filter, false, false, tokens_buffer); } pub fn tokenize(pattern: &str) -> Vec { - let mut tokens_buffer: Vec = Vec::with_capacity(TOKENS_BUFFER_SIZE); + let mut tokens_buffer = TokensBuffer::default(); tokenize_to(pattern, &mut tokens_buffer); - tokens_buffer + tokens_buffer.into_iter().collect() } -pub(crate) fn tokenize_to(pattern: &str, tokens_buffer: &mut Vec) { +pub(crate) fn tokenize_to(pattern: &str, tokens_buffer: &mut TokensBuffer) { fast_tokenizer_no_regex(pattern, &is_allowed_filter, false, false, tokens_buffer); } @@ -95,21 +97,21 @@ pub(crate) fn tokenize_filter( skip_first_token: bool, skip_last_token: bool, ) -> Vec { - let mut tokens_buffer: Vec = Vec::with_capacity(TOKENS_BUFFER_SIZE); + let mut tokens_buffer = TokensBuffer::default(); tokenize_filter_to( pattern, skip_first_token, skip_last_token, &mut tokens_buffer, ); - tokens_buffer + tokens_buffer.into_iter().collect() } pub(crate) fn tokenize_filter_to( pattern: &str, skip_first_token: bool, skip_last_token: bool, - tokens_buffer: &mut Vec, + tokens_buffer: &mut TokensBuffer, ) { fast_tokenizer_no_regex( pattern, diff --git a/tests/matching.rs b/tests/matching.rs index d4f26934..e8649c07 100644 --- a/tests/matching.rs +++ b/tests/matching.rs @@ -199,5 +199,5 @@ fn check_rule_matching_browserlike() { let (blocked, passes) = bench_rule_matching_browserlike(&engine, &requests); let msg = "The number of blocked/passed requests has changed. 
".to_string() + "If this is expected, update the expected values in the test."; - assert_eq!((blocked, passes), (106860, 136085), "{msg}"); + assert_eq!((blocked, passes), (106861, 136084), "{msg}"); } diff --git a/tests/unit/filters/network.rs b/tests/unit/filters/network.rs index b22e0a52..6d636831 100644 --- a/tests/unit/filters/network.rs +++ b/tests/unit/filters/network.rs @@ -1191,12 +1191,10 @@ mod parse_tests { fn test_simple_pattern_tokenization() { let rule = "||some.primewire.c*/sw$script,1p"; let filter = NetworkFilter::parse(rule, true, ParseOptions::default()).unwrap(); + let mut tokens_buffer = utils::TokensBuffer::default(); assert_eq!( - filter.get_tokens_optimized(), - FilterTokens::Other(vec![ - utils::fast_hash("some"), - utils::fast_hash("primewire") - ]) + filter.get_tokens_optimized(&mut tokens_buffer), + FilterTokens::Other(&[utils::fast_hash("some"), utils::fast_hash("primewire")]) ); } } From b3f66ea8ccdf65c447b3db55a1339cbd7b4cefe1 Mon Sep 17 00:00:00 2001 From: Mikhail Date: Thu, 20 Nov 2025 11:41:49 +0400 Subject: [PATCH 2/4] Pr dont use old methods in tests (pre-0.12.x) (#572) The PR removes impl NetworkMatchable for NetworkFilter and port the related tests to the flatbuffer impl (that we actually use in production) --- src/blocker.rs | 28 ++++-- src/engine.rs | 10 +++ src/filters/network.rs | 36 +++----- src/filters/network_matchers.rs | 35 -------- src/regex_manager.rs | 2 +- tests/legacy_harness.rs | 35 ++++---- tests/matching.rs | 39 +++++--- tests/unit/filters/network_matchers.rs | 63 ++++++++++--- tests/unit/optimizer.rs | 120 ++++++++----------------- tests/unit/regex_manager.rs | 79 +++++++++------- 10 files changed, 225 insertions(+), 222 deletions(-) diff --git a/src/blocker.rs b/src/blocker.rs index 4e062119..b87bd45e 100644 --- a/src/blocker.rs +++ b/src/blocker.rs @@ -79,6 +79,11 @@ pub struct Blocker { pub(crate) filter_data_context: FilterDataContextRef, } +#[cfg(feature = "single-thread")] +pub(crate) type RegexManagerRef<'a> = std::cell::RefMut<'a, RegexManager>; +#[cfg(not(feature = "single-thread"))] +pub(crate) type RegexManagerRef<'a> = std::sync::MutexGuard<'a, RegexManager>; + impl Blocker { /// Decide if a network request (usually from WebRequest API) should be /// blocked, redirected or allowed. @@ -130,10 +135,14 @@ impl Blocker { self.get_list(NetworkFilterListId::TaggedFiltersAll) } - #[cfg(feature = "single-thread")] - fn borrow_regex_manager(&self) -> std::cell::RefMut<'_, RegexManager> { + /// Borrow mutable reference to the regex manager for the ['Blocker`]. + /// Only one caller can borrow the regex manager at a time. 
+ pub(crate) fn borrow_regex_manager(&self) -> RegexManagerRef<'_> { + #[cfg(feature = "single-thread")] #[allow(unused_mut)] let mut manager = self.regex_manager.borrow_mut(); + #[cfg(not(feature = "single-thread"))] + let mut manager = self.regex_manager.lock().unwrap(); #[cfg(not(target_arch = "wasm32"))] manager.update_time(); @@ -141,13 +150,6 @@ impl Blocker { manager } - #[cfg(not(feature = "single-thread"))] - fn borrow_regex_manager(&self) -> std::sync::MutexGuard<'_, RegexManager> { - let mut manager = self.regex_manager.lock().unwrap(); - manager.update_time(); - manager - } - pub fn check_generic_hide(&self, hostname_request: &Request) -> bool { let mut regex_manager = self.borrow_regex_manager(); self.generic_hide() @@ -155,6 +157,14 @@ impl Blocker { .is_some() } + #[cfg(test)] + pub(crate) fn check_exceptions(&self, request: &Request) -> bool { + let mut regex_manager = self.borrow_regex_manager(); + self.exceptions() + .check(request, &HashSet::new(), &mut regex_manager) + .is_some() + } + pub fn check_parameterised( &self, request: &Request, diff --git a/src/engine.rs b/src/engine.rs index 47962030..7867e0cd 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -139,6 +139,11 @@ impl Engine { self.blocker.check(request, &self.resources) } + #[cfg(test)] + pub(crate) fn check_network_request_exceptions(&self, request: &Request) -> bool { + self.blocker.check_exceptions(request) + } + pub fn check_network_request_subset( &self, request: &Request, @@ -266,6 +271,11 @@ impl Engine { self.blocker.set_regex_discard_policy(new_discard_policy); } + #[cfg(test)] + pub fn borrow_regex_manager(&self) -> crate::blocker::RegexManagerRef<'_> { + self.blocker.borrow_regex_manager() + } + #[cfg(feature = "debug-info")] pub fn discard_regex(&mut self, regex_id: u64) { self.blocker.discard_regex(regex_id); diff --git a/src/filters/network.rs b/src/filters/network.rs index 01b73baf..a06c8859 100644 --- a/src/filters/network.rs +++ b/src/filters/network.rs @@ -969,6 +969,18 @@ impl NetworkFilter { FilterTokens::Other(tokens_buffer.as_slice()) } } + + #[cfg(test)] + pub(crate) fn matches_test(&self, request: &request::Request) -> bool { + let filter_set = crate::FilterSet::new_with_rules(vec![self.clone()], vec![], true); + let engine = crate::Engine::from_filter_set(filter_set, true); + + if self.is_exception() { + engine.check_network_request_exceptions(request) + } else { + engine.check_network_request(request).matched + } + } } impl NetworkFilterMaskHelper for NetworkFilter { @@ -993,30 +1005,6 @@ pub trait NetworkMatchable { fn matches_test(&self, request: &request::Request) -> bool; } -impl NetworkMatchable for NetworkFilter { - fn matches(&self, request: &request::Request, regex_manager: &mut RegexManager) -> bool { - use crate::filters::network_matchers::{ - check_excluded_domains, check_included_domains, check_options, check_pattern, - }; - check_options(self.mask, request) - && check_included_domains(self.opt_domains.as_deref(), request) - && check_excluded_domains(self.opt_not_domains.as_deref(), request) - && check_pattern( - self.mask, - self.filter.iter(), - self.hostname.as_deref(), - (self as *const NetworkFilter) as u64, - request, - regex_manager, - ) - } - - #[cfg(test)] - fn matches_test(&self, request: &request::Request) -> bool { - self.matches(request, &mut RegexManager::default()) - } -} - // --------------------------------------------------------------------------- // Filter parsing // --------------------------------------------------------------------------- diff --git 
a/src/filters/network_matchers.rs b/src/filters/network_matchers.rs index 62fe7cf6..e28fbd66 100644 --- a/src/filters/network_matchers.rs +++ b/src/filters/network_matchers.rs @@ -414,22 +414,6 @@ pub fn check_options(mask: NetworkFilterMask, request: &request::Request) -> boo true } -#[inline] -pub fn check_included_domains(opt_domains: Option<&[Hash]>, request: &request::Request) -> bool { - // Source URL must be among these domains to match - if let Some(included_domains) = opt_domains.as_ref() { - if let Some(source_hashes) = request.source_hostname_hashes.as_ref() { - if source_hashes - .iter() - .all(|h| !utils::bin_lookup(included_domains, *h)) - { - return false; - } - } - } - true -} - #[inline] pub fn check_included_domains_mapped( opt_domains: Option<&[u32]>, @@ -454,25 +438,6 @@ pub fn check_included_domains_mapped( true } -#[inline] -pub fn check_excluded_domains( - opt_not_domains: Option<&[Hash]>, - request: &request::Request, -) -> bool { - if let Some(excluded_domains) = opt_not_domains.as_ref() { - if let Some(source_hashes) = request.source_hostname_hashes.as_ref() { - if source_hashes - .iter() - .any(|h| utils::bin_lookup(excluded_domains, *h)) - { - return false; - } - } - } - - true -} - #[inline] pub fn check_excluded_domains_mapped( opt_not_domains: Option<&[u32]>, diff --git a/src/regex_manager.rs b/src/regex_manager.rs index 5713b620..2d22b7f6 100644 --- a/src/regex_manager.rs +++ b/src/regex_manager.rs @@ -15,7 +15,7 @@ use std::time::Duration; #[cfg(test)] #[cfg(not(target_arch = "wasm32"))] -use mock_instant::global::Instant; +use mock_instant::thread_local::Instant; #[cfg(not(test))] #[cfg(not(target_arch = "wasm32"))] use std::time::Instant; diff --git a/tests/legacy_harness.rs b/tests/legacy_harness.rs index dc5d8a4e..1cb133d6 100644 --- a/tests/legacy_harness.rs +++ b/tests/legacy_harness.rs @@ -1,9 +1,8 @@ mod legacy_test_filters { use adblock::filters::network::NetworkFilter; use adblock::filters::network::NetworkFilterMask; - use adblock::filters::network::NetworkMatchable; - use adblock::regex_manager::RegexManager; use adblock::request::Request; + use adblock::Engine; fn test_filter<'a>( raw_filter: &str, @@ -35,12 +34,15 @@ mod legacy_test_filters { filter.filter ); + let engine = Engine::from_rules_debug([raw_filter], Default::default()); + for to_block in blocked { assert!( - filter.matches( - &Request::new(to_block, "https://example.com", "other").unwrap(), - &mut RegexManager::default() - ), + engine + .check_network_request( + &Request::new(to_block, "https://example.com", "other").unwrap(), + ) + .matched, "Expected filter {} to match {}", raw_filter, &to_block @@ -49,10 +51,11 @@ mod legacy_test_filters { for to_pass in not_blocked { assert!( - !filter.matches( - &Request::new(to_pass, "https://example.com", "other").unwrap(), - &mut RegexManager::default() - ), + !engine + .check_network_request( + &Request::new(to_pass, "https://example.com", "other").unwrap(), + ) + .matched, "Expected filter {} to pass {}", raw_filter, &to_pass @@ -302,14 +305,12 @@ mod legacy_test_filters { ); // explicit, separate testcase construction of the "script" option as it is not the deafult - let filter = NetworkFilter::parse( - "||googlesyndication.com/safeframe/$third-party,script", - true, - Default::default(), - ) - .unwrap(); let request = Request::new("http://tpc.googlesyndication.com/safeframe/1-0-2/html/container.html#xpc=sf-gdn-exp-2&p=http%3A//slashdot.org;", "https://this-is-always-third-party.com", "script").unwrap(); - 
assert!(filter.matches(&request, &mut RegexManager::default())); + let engine = Engine::from_rules_debug( + ["||googlesyndication.com/safeframe/$third-party,script"], + Default::default(), + ); + assert!(engine.check_network_request(&request).matched); } } diff --git a/tests/matching.rs b/tests/matching.rs index e8649c07..9c9b6a7d 100644 --- a/tests/matching.rs +++ b/tests/matching.rs @@ -1,5 +1,4 @@ -use adblock::filters::network::{NetworkFilter, NetworkFilterMaskHelper, NetworkMatchable}; -use adblock::regex_manager::RegexManager; +use adblock::filters::network::{NetworkFilter, NetworkFilterMask, NetworkFilterMaskHelper}; use adblock::request::Request; use adblock::resources::{MimeType, Resource, ResourceType}; use adblock::Engine; @@ -70,18 +69,38 @@ fn check_filter_matching() { "Could not parse filter {filter}" ); let network_filter = network_filter_res.unwrap(); + let mut filters = vec![network_filter.clone()]; + if network_filter.is_exception() { + let mut original_filter = network_filter.clone(); + original_filter + .mask + .set(NetworkFilterMask::IS_EXCEPTION, false); + filters.push(original_filter); + } + let filter_set = adblock::FilterSet::new_with_rules(filters, vec![], true); + let engine = adblock::Engine::from_filter_set(filter_set, true); let request_res = Request::new(&req.url, &req.sourceUrl, &req.r#type); // The dataset has cases where URL is set to just "http://" or "https://", which we do not support if let Ok(request) = request_res { - assert!( - network_filter.matches(&request, &mut RegexManager::default()), - "Expected {} to match {} at {}, typed {}", - filter, - req.url, - req.sourceUrl, - req.r#type - ); + let result = engine.check_network_request(&request); + if !network_filter.is_exception() { + assert!( + result.matched, + "Expected {} to match {} at {}, typed {}", + filter, req.url, req.sourceUrl, req.r#type + ); + } else { + assert!( + !result.matched && result.exception.is_some(), + "Expected {} exception to match {} at {}, typed {}", + filter, + req.url, + req.sourceUrl, + req.r#type + ); + } + requests_checked += 1; } } diff --git a/tests/unit/filters/network_matchers.rs b/tests/unit/filters/network_matchers.rs index f35cb25a..7701bcf2 100644 --- a/tests/unit/filters/network_matchers.rs +++ b/tests/unit/filters/network_matchers.rs @@ -350,9 +350,38 @@ mod match_tests { } fn check_options(filter: &NetworkFilter, request: &request::Request) -> bool { + let mut mapping = HashMap::new(); + let opt_domains = filter.opt_domains.clone().map(|domains| { + domains + .iter() + .map(|domain| { + mapping.insert(*domain, *domain as u32); + *domain as u32 + }) + .collect::>() + }); + + let opt_not_domains = filter.opt_not_domains.clone().map(|domains| { + domains + .iter() + .map(|domain| { + mapping.insert(*domain, *domain as u32); + *domain as u32 + }) + .collect::>() + }); + super::super::check_options(filter.mask, request) - && super::super::check_included_domains(filter.opt_domains.as_deref(), request) - && super::super::check_excluded_domains(filter.opt_not_domains.as_deref(), request) + && super::super::check_included_domains_mapped( + opt_domains.as_deref(), + request, + &mapping, + ) + && super::super::check_excluded_domains_mapped( + opt_not_domains.as_deref(), + request, + &mapping, + ) } #[test] @@ -659,22 +688,34 @@ mod match_tests { } #[test] - #[ignore] // Not going to handle lookaround regexes #[cfg(feature = "debug-info")] fn check_lookaround_regex_handled() { { + use crate::Engine; let filter = 
r#"/^https?:\/\/([0-9a-z\-]+\.)?(9anime|animeland|animenova|animeplus|animetoon|animewow|gamestorrent|goodanime|gogoanime|igg-games|kimcartoon|memecenter|readcomiconline|toonget|toonova|watchcartoononline)\.[a-z]{2,4}\/(?!([Ee]xternal|[Ii]mages|[Ss]cripts|[Uu]ploads|ac|ajax|assets|combined|content|cov|cover|(img\/bg)|(img\/icon)|inc|jwplayer|player|playlist-cat-rss|static|thumbs|wp-content|wp-includes)\/)(.*)/$image,other,script,~third-party,xmlhttprequest,domain=~animeland.hu"#; - let network_filter = NetworkFilter::parse(filter, true, Default::default()).unwrap(); - let url = "https://data.foo.com/9VjjrjU9Or2aqkb8PDiqTBnULPgeI48WmYEHkYer"; - let source = "http://123movies.com"; + let engine = Engine::from_rules(vec![filter], Default::default()); + let url = "https://9anime.to/watch/episode-1"; + let source = "https://9anime.to"; let request = request::Request::new(url, source, "script").unwrap(); - let mut regex_manager = RegexManager::default(); - assert!(regex_manager.get_compiled_regex_count() == 0); + assert_eq!( + engine + .get_debug_info() + .regex_debug_info + .compiled_regex_count, + 0 + ); + // Regex can't be compiled, so no match. assert!( - network_filter.matches(&request, &mut regex_manager), - "Expected match for {filter} on {url}" + !engine.check_network_request(&request).matched, + "Expected no match for {filter} on {url}" + ); + assert_eq!( + engine + .get_debug_info() + .regex_debug_info + .compiled_regex_count, + 1 ); - assert!(regex_manager.get_compiled_regex_count() == 1); } } diff --git a/tests/unit/optimizer.rs b/tests/unit/optimizer.rs index a7ad2c45..d4779d20 100644 --- a/tests/unit/optimizer.rs +++ b/tests/unit/optimizer.rs @@ -3,10 +3,8 @@ mod optimization_tests_pattern_group { #[cfg(test)] mod optimization_tests_pattern_group_tests { use super::*; - use crate::filters::network::NetworkMatchable; use crate::lists; use crate::regex_manager::CompiledRegex; - use crate::regex_manager::RegexManager; use crate::request::Request; use regex::bytes::RegexSetBuilder as BytesRegexSetBuilder; @@ -18,20 +16,14 @@ mod optimization_tests_pattern_group { ); } - fn check_match( - regex_manager: &mut RegexManager, - filter: &NetworkFilter, - url_path: &str, - matches: bool, - ) { - let is_match = filter.matches( + fn check_match(filter: &NetworkFilter, url_path: &str, matches: bool) { + let is_match = filter.matches_test( &Request::new( ("https://example.com/".to_string() + url_path).as_str(), "https://google.com", "", ) .unwrap(), - regex_manager, ); assert!( is_match == matches, @@ -95,38 +87,22 @@ mod optimization_tests_pattern_group { fused.to_string(), "/static/ad- <+> /static/ad. 
<+> /static/ad/* <+> /static/ads/* <+> /static/adv/*" ); - let mut regex_manager = RegexManager::default(); - check_match(&mut regex_manager, &fused, "/static/ad-", true); - check_match(&mut regex_manager, &fused, "/static/ad.", true); - check_match(&mut regex_manager, &fused, "/static/ad%", false); - check_match(&mut regex_manager, &fused, "/static/ads-", false); - check_match(&mut regex_manager, &fused, "/static/ad/", true); - check_match(&mut regex_manager, &fused, "/static/ad", false); - check_match(&mut regex_manager, &fused, "/static/ad/foobar", true); - check_match( - &mut regex_manager, - &fused, - "/static/ad/foobar/asd?q=1", - true, - ); - check_match(&mut regex_manager, &fused, "/static/ads/", true); - check_match(&mut regex_manager, &fused, "/static/ads", false); - check_match(&mut regex_manager, &fused, "/static/ads/foobar", true); - check_match( - &mut regex_manager, - &fused, - "/static/ads/foobar/asd?q=1", - true, - ); - check_match(&mut regex_manager, &fused, "/static/adv/", true); - check_match(&mut regex_manager, &fused, "/static/adv", false); - check_match(&mut regex_manager, &fused, "/static/adv/foobar", true); - check_match( - &mut regex_manager, - &fused, - "/static/adv/foobar/asd?q=1", - true, - ); + check_match(&fused, "/static/ad-", true); + check_match(&fused, "/static/ad.", true); + check_match(&fused, "/static/ad%", false); + check_match(&fused, "/static/ads-", false); + check_match(&fused, "/static/ad/", true); + check_match(&fused, "/static/ad", false); + check_match(&fused, "/static/ad/foobar", true); + check_match(&fused, "/static/ad/foobar/asd?q=1", true); + check_match(&fused, "/static/ads/", true); + check_match(&fused, "/static/ads", false); + check_match(&fused, "/static/ads/foobar", true); + check_match(&fused, "/static/ads/foobar/asd?q=1", true); + check_match(&fused, "/static/adv/", true); + check_match(&fused, "/static/adv", false); + check_match(&fused, "/static/adv/foobar", true); + check_match(&fused, "/static/adv/foobar/asd?q=1", true); } #[test] @@ -325,10 +301,8 @@ mod optimization_tests_pattern_group { } */ use super::super::*; - use crate::filters::network::NetworkMatchable; use crate::lists; use crate::regex_manager::CompiledRegex; - use crate::regex_manager::RegexManager; use crate::request::Request; use regex::bytes::RegexSetBuilder as BytesRegexSetBuilder; @@ -340,20 +314,14 @@ mod optimization_tests_pattern_group { ); } - fn check_match( - regex_manager: &mut RegexManager, - filter: &NetworkFilter, - url_path: &str, - matches: bool, - ) { - let is_match = filter.matches( + fn check_match(filter: &NetworkFilter, url_path: &str, matches: bool) { + let is_match = filter.matches_test( &Request::new( ("https://example.com/".to_string() + url_path).as_str(), "https://google.com", "", ) .unwrap(), - regex_manager, ); assert!( is_match == matches, @@ -417,38 +385,22 @@ mod optimization_tests_pattern_group { fused.to_string(), "/static/ad- <+> /static/ad. 
<+> /static/ad/* <+> /static/ads/* <+> /static/adv/*" ); - let mut regex_manager = RegexManager::default(); - check_match(&mut regex_manager, &fused, "/static/ad-", true); - check_match(&mut regex_manager, &fused, "/static/ad.", true); - check_match(&mut regex_manager, &fused, "/static/ad%", false); - check_match(&mut regex_manager, &fused, "/static/ads-", false); - check_match(&mut regex_manager, &fused, "/static/ad/", true); - check_match(&mut regex_manager, &fused, "/static/ad", false); - check_match(&mut regex_manager, &fused, "/static/ad/foobar", true); - check_match( - &mut regex_manager, - &fused, - "/static/ad/foobar/asd?q=1", - true, - ); - check_match(&mut regex_manager, &fused, "/static/ads/", true); - check_match(&mut regex_manager, &fused, "/static/ads", false); - check_match(&mut regex_manager, &fused, "/static/ads/foobar", true); - check_match( - &mut regex_manager, - &fused, - "/static/ads/foobar/asd?q=1", - true, - ); - check_match(&mut regex_manager, &fused, "/static/adv/", true); - check_match(&mut regex_manager, &fused, "/static/adv", false); - check_match(&mut regex_manager, &fused, "/static/adv/foobar", true); - check_match( - &mut regex_manager, - &fused, - "/static/adv/foobar/asd?q=1", - true, - ); + check_match(&fused, "/static/ad-", true); + check_match(&fused, "/static/ad.", true); + check_match(&fused, "/static/ad%", false); + check_match(&fused, "/static/ads-", false); + check_match(&fused, "/static/ad/", true); + check_match(&fused, "/static/ad", false); + check_match(&fused, "/static/ad/foobar", true); + check_match(&fused, "/static/ad/foobar/asd?q=1", true); + check_match(&fused, "/static/ads/", true); + check_match(&fused, "/static/ads", false); + check_match(&fused, "/static/ads/foobar", true); + check_match(&fused, "/static/ads/foobar/asd?q=1", true); + check_match(&fused, "/static/adv/", true); + check_match(&fused, "/static/adv", false); + check_match(&fused, "/static/adv/foobar", true); + check_match(&fused, "/static/adv/foobar/asd?q=1", true); } #[test] diff --git a/tests/unit/regex_manager.rs b/tests/unit/regex_manager.rs index fdd9d107..739781dd 100644 --- a/tests/unit/regex_manager.rs +++ b/tests/unit/regex_manager.rs @@ -2,13 +2,12 @@ mod tests { use super::super::*; - use crate::filters::network::{NetworkFilter, NetworkMatchable}; - use crate::request; + use crate::{request, Engine}; - use mock_instant::global::MockClock; + use mock_instant::thread_local::MockClock; - fn make_filter(line: &str) -> NetworkFilter { - NetworkFilter::parse(line, true, Default::default()).unwrap() + fn make_engine(line: &str) -> Engine { + Engine::from_rules(vec![line], Default::default()) } fn make_request(url: &str) -> request::Request { @@ -25,45 +24,63 @@ mod tests { #[test] fn simple_match() { - let mut regex_manager = RegexManager::default(); - regex_manager.update_time(); + let engine = make_engine("||geo*.hltv.org^"); - let filter = make_filter("||geo*.hltv.org^"); - assert!(filter.matches(&make_request("https://geo2.hltv.org/"), &mut regex_manager)); + assert!( + engine + .check_network_request(&make_request("https://geo2.hltv.org/")) + .matched + ); + + let regex_manager = engine.borrow_regex_manager(); assert_eq!(get_active_regex_count(®ex_manager), 1); assert_eq!(regex_manager.get_debug_regex_data().len(), 1); } #[test] fn discard_and_recreate() { - let mut regex_manager = RegexManager::default(); - regex_manager.update_time(); + let engine = make_engine("||geo*.hltv.org^"); - let filter = make_filter("||geo*.hltv.org^"); - 
assert!(filter.matches(&make_request("https://geo2.hltv.org/"), &mut regex_manager)); - assert_eq!(regex_manager.get_compiled_regex_count(), 1); - assert_eq!(get_active_regex_count(®ex_manager), 1); + assert!( + engine + .check_network_request(&make_request("https://geo2.hltv.org/")) + .matched + ); - MockClock::advance(DEFAULT_DISCARD_UNUSED_TIME - Duration::from_secs(1)); - regex_manager.update_time(); - // The entry shouldn't be discarded because was used during - // last REGEX_MANAGER_DISCARD_TIME. - assert_eq!(get_active_regex_count(®ex_manager), 1); + { + let regex_manager = engine.borrow_regex_manager(); + assert_eq!(regex_manager.get_compiled_regex_count(), 1); + assert_eq!(get_active_regex_count(®ex_manager), 1); + } - // The entry is entry is outdated, but should be discarded only - // in the next cleanup() call. The call was 2 sec ago and is throttled - // now. - MockClock::advance(DEFAULT_CLEAN_UP_INTERVAL - Duration::from_secs(1)); - regex_manager.update_time(); - assert_eq!(get_active_regex_count(®ex_manager), 1); + { + let regex_manager = engine.borrow_regex_manager(); + MockClock::advance(DEFAULT_DISCARD_UNUSED_TIME - Duration::from_secs(1)); + // The entry shouldn't be discarded because was used during + // last REGEX_MANAGER_DISCARD_TIME. + assert_eq!(get_active_regex_count(®ex_manager), 1); + + // The entry is entry is outdated, but should be discarded only + // in the next cleanup() call. The call was 2 sec ago and is throttled + // now. + MockClock::advance(DEFAULT_CLEAN_UP_INTERVAL - Duration::from_secs(1)); + assert_eq!(get_active_regex_count(®ex_manager), 1); + } - MockClock::advance(Duration::from_secs(2)); - regex_manager.update_time(); - // The entry is now outdated & cleanup() should be called => discard. - assert_eq!(get_active_regex_count(®ex_manager), 0); + { + MockClock::advance(Duration::from_secs(2)); + let regex_manager = engine.borrow_regex_manager(); + // The entry is now outdated & cleanup() should be called => discard. + assert_eq!(get_active_regex_count(®ex_manager), 0); + } // The entry is recreated, get_compiled_regex_count() increased +1. 
- assert!(filter.matches(&make_request("https://geo2.hltv.org/"), &mut regex_manager)); + assert!( + engine + .check_network_request(&make_request("https://geo2.hltv.org/")) + .matched + ); + let regex_manager = engine.borrow_regex_manager(); assert_eq!(regex_manager.get_compiled_regex_count(), 2); assert_eq!(get_active_regex_count(®ex_manager), 1); } From 6b54aafe4d327b3a226c4b185a19e15fcc2e13ba Mon Sep 17 00:00:00 2001 From: Mikhail Atuchin Date: Fri, 21 Nov 2025 12:15:00 +0400 Subject: [PATCH 3/4] Adjust public API before 0.12.x release --- src/filters/fb_network.rs | 5 ----- src/filters/fb_network_builder.rs | 2 +- src/filters/network.rs | 21 +++------------------ src/utils.rs | 2 +- tests/unit/filters/network.rs | 2 +- 5 files changed, 6 insertions(+), 26 deletions(-) diff --git a/src/filters/fb_network.rs b/src/filters/fb_network.rs index 13efa9a0..505caa17 100644 --- a/src/filters/fb_network.rs +++ b/src/filters/fb_network.rs @@ -171,9 +171,4 @@ impl NetworkMatchable for FlatNetworkFilter<'_> { regex_manager, ) } - - #[cfg(test)] - fn matches_test(&self, request: &Request) -> bool { - self.matches(request, &mut RegexManager::default()) - } } diff --git a/src/filters/fb_network_builder.rs b/src/filters/fb_network_builder.rs index ebbeb940..115bcbcc 100644 --- a/src/filters/fb_network_builder.rs +++ b/src/filters/fb_network_builder.rs @@ -159,7 +159,7 @@ impl<'a> FlatSerialize<'a, EngineFlatBuilder<'a>> for NetworkFilterListBuilder { } }; - let multi_tokens = network_filter.get_tokens_optimized(&mut tokens_buffer); + let multi_tokens = network_filter.get_tokens(&mut tokens_buffer); match multi_tokens { FilterTokens::Empty => { // No tokens, add to fallback bucket (token 0) diff --git a/src/filters/network.rs b/src/filters/network.rs index a06c8859..66885142 100644 --- a/src/filters/network.rs +++ b/src/filters/network.rs @@ -310,7 +310,7 @@ pub enum FilterPart { } #[derive(Debug, PartialEq)] -pub enum FilterTokens<'a> { +pub(crate) enum FilterTokens<'a> { Empty, OptDomains(&'a [Hash]), Other(&'a [Hash]), @@ -881,19 +881,7 @@ impl NetworkFilter { ) } - #[deprecated(since = "0.11.1", note = "use get_tokens_optimized instead")] - pub fn get_tokens(&self) -> Vec> { - let mut tokens_buffer = TokensBuffer::default(); - match self.get_tokens_optimized(&mut tokens_buffer) { - FilterTokens::OptDomains(domains) => { - domains.iter().map(|domain| vec![*domain]).collect() - } - FilterTokens::Other(tokens) => vec![tokens.to_vec()], - FilterTokens::Empty => vec![], - } - } - - pub fn get_tokens_optimized<'a>( + pub(crate) fn get_tokens<'a>( &'a self, tokens_buffer: &'a mut TokensBuffer, ) -> FilterTokens<'a> { @@ -998,11 +986,8 @@ impl fmt::Display for NetworkFilter { } } -pub trait NetworkMatchable { +pub(crate) trait NetworkMatchable { fn matches(&self, request: &request::Request, regex_manager: &mut RegexManager) -> bool; - - #[cfg(test)] - fn matches_test(&self, request: &request::Request) -> bool; } // --------------------------------------------------------------------------- diff --git a/src/utils.rs b/src/utils.rs index a24a7052..6930ac11 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -31,7 +31,7 @@ fn is_allowed_filter(ch: char) -> bool { /// A fixed-size array-like vector of hashes with maximum capacity of 256. /// Used instread of Vec to avoid heap allocations. 
-pub type TokensBuffer = ArrayVec; +pub(crate) type TokensBuffer = ArrayVec; fn fast_tokenizer_no_regex( pattern: &str, diff --git a/tests/unit/filters/network.rs b/tests/unit/filters/network.rs index 6d636831..67823c1d 100644 --- a/tests/unit/filters/network.rs +++ b/tests/unit/filters/network.rs @@ -1193,7 +1193,7 @@ mod parse_tests { let filter = NetworkFilter::parse(rule, true, ParseOptions::default()).unwrap(); let mut tokens_buffer = utils::TokensBuffer::default(); assert_eq!( - filter.get_tokens_optimized(&mut tokens_buffer), + filter.get_tokens(&mut tokens_buffer), FilterTokens::Other(&[utils::fast_hash("some"), utils::fast_hash("primewire")]) ); } From 29cf12d01b1a840eb860867c7c16b55de58a1eb8 Mon Sep 17 00:00:00 2001 From: Mikhail Atuchin Date: Thu, 20 Nov 2025 11:45:47 +0400 Subject: [PATCH 4/4] v0.12.0 - more memory/performance enhancements Added: - `arrayvec` dependency Changed: - Improved performance and memory usage when parsing filters Removed: - `FilterTokens`, `get_tokens`, `get_tokens_optimized`, and `NetworkMatchable` are now crate-internal only --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- js/Cargo.toml | 2 +- package-lock.json | 4 ++-- package.json | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5f2f2fc8..1b8b2fc1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,7 +4,7 @@ version = 4 [[package]] name = "adblock" -version = "0.11.1" +version = "0.12.0" dependencies = [ "addr", "arrayvec", @@ -46,7 +46,7 @@ dependencies = [ [[package]] name = "adblock-rs" -version = "0.11.1" +version = "0.12.0" dependencies = [ "adblock", "neon", diff --git a/Cargo.toml b/Cargo.toml index b9fa55e5..dc3848cc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "adblock" -version = "0.11.1" +version = "0.12.0" authors = ["Anton Lazarev ", "Andrius Aucinas"] edition = "2021" diff --git a/js/Cargo.toml b/js/Cargo.toml index 21725037..ab77cf7d 100644 --- a/js/Cargo.toml +++ b/js/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "adblock-rs" -version = "0.11.1" +version = "0.12.0" authors = ["Anton Lazarev ", "Andrius Aucinas"] edition = "2021" license = "MPL-2.0" diff --git a/package-lock.json b/package-lock.json index 792a5106..aa13dcb5 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "adblock-rs", - "version": "0.11.1", + "version": "0.12.0", "lockfileVersion": 2, "requires": true, "packages": { "": { "name": "adblock-rs", - "version": "0.11.1", + "version": "0.12.0", "hasInstallScript": true, "license": "MPL-2.0", "dependencies": { diff --git a/package.json b/package.json index b030baa9..dacea721 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "adblock-rs", - "version": "0.11.1", + "version": "0.12.0", "description": "Very fast, Rust-based, native implementation of ad-blocker engine for Node", "keywords": [ "adblock",
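The pattern PATCH 1 adopts is a caller-owned, fixed-capacity buffer that lives on the stack: the tokenizer writes hashes into it and stops one slot short of capacity so the fallback zero token can always be appended. Below is a minimal sketch of that idea, assuming only the `arrayvec` crate; `fast_hash` and the splitting rule are simplified stand-ins for the crate's seahash-based hashing and its real tokenizer.

```rust
use arrayvec::ArrayVec;

type Hash = u64;
// 256 hashes live inline (on the stack or inside the parent struct),
// so filling the buffer performs no heap allocation.
type TokensBuffer = ArrayVec<Hash, 256>;

// Stand-in hash; the real crate hashes tokens with seahash.
fn fast_hash(s: &str) -> Hash {
    s.bytes()
        .fold(0u64, |h, b| h.wrapping_mul(31).wrapping_add(b as u64))
}

// Split on non-alphanumeric separators and hash each token, keeping one
// slot free so a zero "wildcard bucket" token can still be pushed later.
fn tokenize_to(pattern: &str, out: &mut TokensBuffer) {
    for token in pattern.split(|c: char| !c.is_alphanumeric()) {
        if out.capacity() - out.len() <= 1 {
            return; // reserve one free slot for the zero token
        }
        if !token.is_empty() {
            out.push(fast_hash(token));
        }
    }
}

fn main() {
    let mut buffer = TokensBuffer::new();
    tokenize_to("some.primewire.example/sw", &mut buffer);
    buffer.push(0); // fallback bucket token, as in Request::calculate_tokens
    println!("{} tokens collected without touching the heap", buffer.len());
}
```

The same buffer is cleared and reused across calls rather than reallocated per filter, which is where the reported ~19% drop in allocation count comes from.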
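The switch from owned `Vec`s to `FilterTokens<'a>` works because the buffer outlives the returned value: instead of the old "remake a vector to drop extra capacity" copy, the filter now hands back a slice borrowing either the caller's buffer or its own `opt_domains`. A simplified sketch under the same assumptions follows; `Filter` here is a reduced stand-in for `NetworkFilter`.

```rust
use arrayvec::ArrayVec;

type Hash = u64;
type TokensBuffer = ArrayVec<Hash, 256>;

#[derive(Debug, PartialEq)]
enum FilterTokens<'a> {
    Empty,
    OptDomains(&'a [Hash]),
    Other(&'a [Hash]),
}

struct Filter {
    opt_domains: Option<Vec<Hash>>,
    pattern_tokens: Vec<Hash>, // stand-in for the tokenized filter pattern
}

impl Filter {
    fn get_tokens<'a>(&'a self, buffer: &'a mut TokensBuffer) -> FilterTokens<'a> {
        buffer.clear();
        // Fill the caller-owned buffer; tokens past capacity are dropped.
        let _ = buffer.try_extend_from_slice(&self.pattern_tokens);
        if buffer.is_empty() {
            match self.opt_domains.as_deref() {
                Some(domains) if !domains.is_empty() => FilterTokens::OptDomains(domains),
                _ => FilterTokens::Empty,
            }
        } else {
            // Borrow the buffer instead of copying into a fresh Vec.
            FilterTokens::Other(buffer.as_slice())
        }
    }
}

fn main() {
    let filter = Filter { opt_domains: None, pattern_tokens: vec![1, 2, 3] };
    let mut buffer = TokensBuffer::new(); // one buffer reused across all filters
    assert_eq!(filter.get_tokens(&mut buffer), FilterTokens::Other(&[1, 2, 3]));
}
```

Because the return value borrows the buffer, each caller must consume it before reusing the buffer for the next filter, which matches how the flatbuffer serializer loops over `rule_list.filters`.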
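PATCH 2's `RegexManagerRef<'a>` alias folds the two previously `cfg`-gated accessors into one: with the `single-thread` feature the guard is a `RefMut`, otherwise a `MutexGuard`, and both deref-mut to `RegexManager`, so call sites stay identical. A self-contained sketch of that pattern; the `Blocker` and `RegexManager` bodies are reduced stand-ins, and the `single-thread` feature would need to be declared in `Cargo.toml`.

```rust
#[derive(Default)]
struct RegexManager {
    compiled: usize,
}

#[cfg(feature = "single-thread")]
type RegexManagerCell = std::cell::RefCell<RegexManager>;
#[cfg(not(feature = "single-thread"))]
type RegexManagerCell = std::sync::Mutex<RegexManager>;

#[cfg(feature = "single-thread")]
type RegexManagerRef<'a> = std::cell::RefMut<'a, RegexManager>;
#[cfg(not(feature = "single-thread"))]
type RegexManagerRef<'a> = std::sync::MutexGuard<'a, RegexManager>;

struct Blocker {
    regex_manager: RegexManagerCell,
}

impl Blocker {
    // Only one caller may hold the guard at a time; both guard types
    // deref-mut to RegexManager, so the calling code never changes.
    fn borrow_regex_manager(&self) -> RegexManagerRef<'_> {
        #[cfg(feature = "single-thread")]
        return self.regex_manager.borrow_mut();
        #[cfg(not(feature = "single-thread"))]
        return self.regex_manager.lock().unwrap();
    }
}

fn main() {
    let blocker = Blocker {
        regex_manager: RegexManagerCell::default(),
    };
    let mut manager = blocker.borrow_regex_manager();
    manager.compiled += 1;
    println!("compiled regexes: {}", manager.compiled);
}
```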
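The test migration in PATCH 2 routes matching through the public `Engine`/`Request` API, the code path production actually uses, instead of calling `NetworkFilter::matches` with a hand-made `RegexManager`. Here is a standalone example mirroring the calls in the updated `tests/legacy_harness.rs`; it assumes the `adblock` crate as a dependency.

```rust
use adblock::request::Request;
use adblock::Engine;

fn main() {
    // Build an engine from a single filter, as the ported tests now do.
    let engine = Engine::from_rules_debug(
        ["||googlesyndication.com/safeframe/$third-party,script"],
        Default::default(),
    );
    // Request::new(url, source_url, request_type)
    let request = Request::new(
        "http://tpc.googlesyndication.com/safeframe/1-0-2/html/container.html",
        "https://this-is-always-third-party.com",
        "script",
    )
    .unwrap();
    // Matching goes through the flatbuffer-backed engine, not NetworkFilter.
    assert!(engine.check_network_request(&request).matched);
}
```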