From 20fbd0d278fe1ed2eeb702c562d45bbf9fb99e3e Mon Sep 17 00:00:00 2001 From: Mikhail Atuchin Date: Fri, 25 Mar 2022 20:08:43 +0700 Subject: [PATCH 1/4] Optimize regexp memory usage --- src/data_format/legacy.rs | 2 +- src/filters/network.rs | 28 +++++++++++----------------- 2 files changed, 12 insertions(+), 18 deletions(-) diff --git a/src/data_format/legacy.rs b/src/data_format/legacy.rs index 70b2f138..fc95fe1a 100644 --- a/src/data_format/legacy.rs +++ b/src/data_format/legacy.rs @@ -195,7 +195,7 @@ impl From for NetworkFilter { id: v.id, opt_domains_union: v.opt_domains_union, opt_not_domains_union: v.opt_not_domains_union, - regex: std::sync::Arc::new(std::sync::RwLock::new(None)), + regex: std::cell::RefCell::new(None), } } } diff --git a/src/filters/network.rs b/src/filters/network.rs index 65a51dd7..812fa887 100644 --- a/src/filters/network.rs +++ b/src/filters/network.rs @@ -4,7 +4,8 @@ use once_cell::sync::Lazy; use crate::url_parser::parse_url; use std::fmt; -use std::sync::{Arc, RwLock}; +use std::cell::RefCell; +use std::sync::Arc; use crate::request; use crate::utils; @@ -446,14 +447,13 @@ pub struct NetworkFilter { pub opt_not_domains_union: Option, // Regex compild lazily, using "Interior Mutability" - // Arc (Atomic Reference Counter) allows for cloned NetworkFilters - // to point to the same RwLock and what is inside. - // RwLock allows for concurrent access when reading as well as writing - // from the inside. - // When the Regex hasn't been compiled, is stored, afterwards Arc to Some + // RefCell allows for cloned NetworkFilters to point to the same Option and what is inside. + // When the Regex hasn't been compiled, is stored, afterwards Arc to CompiledRegex // to avoid expensive cloning of the Regex itself. + // Non-thread safe, should be used from a single thread. + // Atomic Arc is used for the backward-compatibilty with the other code. #[serde(skip_serializing, skip_deserializing)] - pub(crate) regex: Arc>>> + pub(crate) regex: RefCell>> } // TODO - restrict the API so that this is always true - i.e. lazy-calculate IDs from actual data, @@ -831,7 +831,7 @@ impl NetworkFilter { id: utils::fast_hash(&line), opt_domains_union, opt_not_domains_union, - regex: Arc::new(RwLock::new(None)) + regex: RefCell::new(None), }) } @@ -1057,15 +1057,9 @@ impl NetworkMatchable for NetworkFilter { if !self.is_regex() && !self.is_complete_regex() { return Arc::new(CompiledRegex::MatchAll); } - // Create a new scope to contain the lifetime of the - // dynamic read borrow - { - let cache = self.regex.as_ref().read().unwrap(); - if cache.is_some() { - return cache.as_ref().unwrap().clone(); - } + if let Some(cache) = &*self.regex.borrow() { + return cache.clone(); } - let mut cache = self.regex.as_ref().write().unwrap(); let regex = compile_regex( &self.filter, self.is_right_anchor(), @@ -1073,7 +1067,7 @@ impl NetworkMatchable for NetworkFilter { self.is_complete_regex(), ); let arc_regex = Arc::new(regex); - *cache = Some(arc_regex.clone()); + *self.regex.borrow_mut() = Some(arc_regex.clone()); arc_regex } } From c9791595575eae30109b95222bc270d4dd037f9a Mon Sep 17 00:00:00 2001 From: Mikhail Atuchin Date: Mon, 28 Mar 2022 13:55:32 +0700 Subject: [PATCH 2/4] Fix static in tests --- tests/live.rs | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/live.rs b/tests/live.rs index b1a19fca..cb8b4d1c 100644 --- a/tests/live.rs +++ b/tests/live.rs @@ -52,7 +52,7 @@ pub struct RemoteFilterSource { /// Fetch all filters once and store them in a lazy-loaded static variable to avoid unnecessary /// network traffic. -static ALL_FILTERS: once_cell::sync::Lazy = once_cell::sync::Lazy::new(|| { +static ALL_FILTERS: once_cell::sync::Lazy> = once_cell::sync::Lazy::new(|| { async fn get_all_filters() -> adblock::lists::FilterSet { use futures::FutureExt; @@ -91,11 +91,11 @@ static ALL_FILTERS: once_cell::sync::Lazy = once_cell } let async_runtime = Runtime::new().expect("Could not start Tokio runtime"); - async_runtime.block_on(get_all_filters()) + std::sync::Mutex::new(async_runtime.block_on(get_all_filters())) }); fn get_blocker_engine() -> Engine { - let mut engine = Engine::from_filter_set(ALL_FILTERS.clone(), true); + let mut engine = Engine::from_filter_set(ALL_FILTERS.lock().unwrap().clone(), true); engine.use_tags(&["fb-embeds", "twitter-embeds"]); @@ -134,7 +134,7 @@ fn get_blocker_engine_deserialized_ios() -> Engine { .lines() .map(|s| s.to_owned()) .collect(); - + let engine = Engine::from_rules_parametrised(&filters, Default::default(), true, false); engine } @@ -204,7 +204,7 @@ fn check_live_deserialized_specific_urls() { fn check_live_from_filterlists() { let engine = get_blocker_engine(); let requests = load_requests(); - + for req in requests { let checked = engine.check_network_urls(&req.url, &req.sourceUrl, &req.r#type); assert_eq!(checked.matched, req.blocked, @@ -217,7 +217,7 @@ fn check_live_from_filterlists() { fn check_live_deserialized_file() { let engine = get_blocker_engine_deserialized(); let requests = load_requests(); - + for req in requests { println!("Checking {:?}", req); let checked = engine.check_network_urls(&req.url, &req.sourceUrl, &req.r#type); @@ -232,7 +232,7 @@ fn check_live_deserialized_file() { fn check_live_deserialized_ios() { let engine = get_blocker_engine_deserialized_ios(); let requests = load_requests(); - + for req in requests { let checked = engine.check_network_urls(&req.url, &req.sourceUrl, &req.r#type); assert_eq!(checked.matched, req.blocked, @@ -252,7 +252,7 @@ fn check_live_redirects() { let resources = assemble_web_accessible_resources(war_dir, redirect_engine_path); engine.use_resources(&resources); - { + { let checked = engine.check_network_urls( "https://c.amazon-adsystem.com/aax2/amzn_ads.js", "https://aussieexotics.com/", @@ -274,17 +274,17 @@ fn check_live_redirects() { checked.filter, checked.exception); assert!(checked.redirect.is_some()); } - + } #[test] /// Ensure that two different engines loaded from the same textual filter set serialize to /// identical buffers. fn stable_serialization() { - let engine1 = Engine::from_filter_set(ALL_FILTERS.clone(), true); + let engine1 = Engine::from_filter_set(ALL_FILTERS.lock().unwrap().clone(), true); let ser1 = engine1.serialize_raw().unwrap(); - let engine2 = Engine::from_filter_set(ALL_FILTERS.clone(), true); + let engine2 = Engine::from_filter_set(ALL_FILTERS.lock().unwrap().clone(), true); let ser2 = engine2.serialize_raw().unwrap(); assert_eq!(ser1, ser2); @@ -294,7 +294,7 @@ fn stable_serialization() { /// Ensure that one engine's serialization result can be exactly reproduced by another engine after /// deserializing from it. fn stable_serialization_through_load() { - let engine1 = Engine::from_filter_set(ALL_FILTERS.clone(), true); + let engine1 = Engine::from_filter_set(ALL_FILTERS.lock().unwrap().clone(), true); let ser1 = engine1.serialize_raw().unwrap(); let mut engine2 = Engine::new(true); From f8cb5f7edc59034c095890df623acf5e2514854a Mon Sep 17 00:00:00 2001 From: Mikhail Atuchin Date: Tue, 29 Mar 2022 14:32:57 +0700 Subject: [PATCH 3/4] add a feature to support the old impl --- Cargo.toml | 3 +- README.md | 5 ++++ src/data_format/legacy.rs | 2 +- src/filters/network.rs | 59 +++++++++++++++++++++++++++++++++++---- 4 files changed, 61 insertions(+), 8 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 13e04cba..104f680d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -78,7 +78,7 @@ harness = false [features] # If disabling default features, consider explicitly re-enabling the # "embedded-domain-resolver" feature. -default = ["embedded-domain-resolver", "full-regex-handling", "object-pooling"] +default = ["embedded-domain-resolver", "full-regex-handling", "object-pooling", "thread-safety"] full-domain-matching = [] metrics = [] full-regex-handling = [] @@ -87,3 +87,4 @@ css-validation = ["cssparser", "selectors"] content-blocking = ["serde_json"] embedded-domain-resolver = ["addr"] # Requires setting an external domain resolver if disabled. resource-assembler = ["serde_json"] +thread-safety = [] # Slower but thread-safe implementation. Consider disabling it if you don't need multithreading. diff --git a/README.md b/README.md index ffe9b957..7b9cfe37 100644 --- a/README.md +++ b/README.md @@ -88,3 +88,8 @@ By default, `adblock-rust` ships with a built-in domain resolution implementatio `adblock-rust` uses uBlock Origin-compatible resources for scriptlet injection and redirect rules. The `resource-assembler` feature allows `adblock-rust` to parse these resources directly from the file formats used by the uBlock Origin repository. + +### Thread safety + +The `thread-safety` feature enabled by default and allows to use the engine from multiple threads. Disabling the feature optimizes speed and memory usage by dropping some synchronization primitives. +Consider disabling this feature if you don't use multiply threads in your application. In Brave browser this feature is disabled. diff --git a/src/data_format/legacy.rs b/src/data_format/legacy.rs index fc95fe1a..3bd53a45 100644 --- a/src/data_format/legacy.rs +++ b/src/data_format/legacy.rs @@ -195,7 +195,7 @@ impl From for NetworkFilter { id: v.id, opt_domains_union: v.opt_domains_union, opt_not_domains_union: v.opt_not_domains_union, - regex: std::cell::RefCell::new(None), + regex: crate::filters::network::RegExStorage::default(), } } } diff --git a/src/filters/network.rs b/src/filters/network.rs index 812fa887..090e1a39 100644 --- a/src/filters/network.rs +++ b/src/filters/network.rs @@ -4,7 +4,6 @@ use once_cell::sync::Lazy; use crate::url_parser::parse_url; use std::fmt; -use std::cell::RefCell; use std::sync::Arc; use crate::request; @@ -426,6 +425,54 @@ fn parse_filter_options(raw_options: &str, opts: ParseOptions) -> Result>>>, + + #[cfg(not(feature = "thread-safety"))] + pub(crate) regex: std::cell::RefCell>>, +} + +impl RegExStorage { + pub fn default() -> RegExStorage { + #[cfg(feature = "thread-safety")] + { + RegExStorage {regex: Arc::new(std::sync::RwLock::new(None))} + } + + #[cfg(not(feature = "thread-safety"))] + { + RegExStorage {regex: std::cell::RefCell::new(None)} + } + } + + pub fn get(&self) -> Option> { + #[cfg(feature = "thread-safety")] + if let Some(cache) = &*self.regex.read().unwrap() { + return Some(cache.clone()); + } + + #[cfg(not(feature = "thread-safety"))] + if let Some(cache) = &*self.regex.borrow() { + return Some(cache.clone()); + } + None + } + + pub fn set(&self, regex: Arc) { + #[cfg(feature = "thread-safety")] + { + *self.regex.write().unwrap() = Some(regex); + } + + #[cfg(not(feature = "thread-safety"))] + { + *self.regex.borrow_mut() = Some(regex); + } + } +} + #[derive(Debug, Clone, Serialize, Deserialize)] pub struct NetworkFilter { pub mask: NetworkFilterMask, @@ -453,7 +500,7 @@ pub struct NetworkFilter { // Non-thread safe, should be used from a single thread. // Atomic Arc is used for the backward-compatibilty with the other code. #[serde(skip_serializing, skip_deserializing)] - pub(crate) regex: RefCell>> + pub(crate) regex: RegExStorage, } // TODO - restrict the API so that this is always true - i.e. lazy-calculate IDs from actual data, @@ -831,7 +878,7 @@ impl NetworkFilter { id: utils::fast_hash(&line), opt_domains_union, opt_not_domains_union, - regex: RefCell::new(None), + regex: RegExStorage::default(), }) } @@ -1057,8 +1104,8 @@ impl NetworkMatchable for NetworkFilter { if !self.is_regex() && !self.is_complete_regex() { return Arc::new(CompiledRegex::MatchAll); } - if let Some(cache) = &*self.regex.borrow() { - return cache.clone(); + if let Some(cache) = self.regex.get() { + return cache; } let regex = compile_regex( &self.filter, @@ -1067,7 +1114,7 @@ impl NetworkMatchable for NetworkFilter { self.is_complete_regex(), ); let arc_regex = Arc::new(regex); - *self.regex.borrow_mut() = Some(arc_regex.clone()); + self.regex.set(arc_regex.clone()); arc_regex } } From 62d7e27b7ab1d0d619cf0ac7c0f08100278a8383 Mon Sep 17 00:00:00 2001 From: Anton Lazarev Date: Tue, 29 Mar 2022 17:46:29 -0700 Subject: [PATCH 4/4] style fixes and updated feature name --- Cargo.toml | 6 +-- README.md | 4 +- src/data_format/legacy.rs | 2 +- src/engine.rs | 9 ++++ src/filters/network.rs | 86 ++++++++++++++++++--------------------- 5 files changed, 54 insertions(+), 53 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 104f680d..c8e8e3f0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -78,13 +78,13 @@ harness = false [features] # If disabling default features, consider explicitly re-enabling the # "embedded-domain-resolver" feature. -default = ["embedded-domain-resolver", "full-regex-handling", "object-pooling", "thread-safety"] +default = ["embedded-domain-resolver", "full-regex-handling", "object-pooling", "unsync-regex-caching"] full-domain-matching = [] metrics = [] full-regex-handling = [] -object-pooling = ["lifeguard"] +object-pooling = ["lifeguard"] # disables `Send` and `Sync` on `Engine`. +unsync-regex-caching = [] # disables `Send` and `Sync` on `Engine`. css-validation = ["cssparser", "selectors"] content-blocking = ["serde_json"] embedded-domain-resolver = ["addr"] # Requires setting an external domain resolver if disabled. resource-assembler = ["serde_json"] -thread-safety = [] # Slower but thread-safe implementation. Consider disabling it if you don't need multithreading. diff --git a/README.md b/README.md index 7b9cfe37..abbfeb0f 100644 --- a/README.md +++ b/README.md @@ -91,5 +91,5 @@ The `resource-assembler` feature allows `adblock-rust` to parse these resources ### Thread safety -The `thread-safety` feature enabled by default and allows to use the engine from multiple threads. Disabling the feature optimizes speed and memory usage by dropping some synchronization primitives. -Consider disabling this feature if you don't use multiply threads in your application. In Brave browser this feature is disabled. +The `object-pooling` and `unsync-regex-caching` features enable optimizations for rule matching speed and the amount of memory used by the engine. +These features can be disabled to make the engine `Send + Sync`, although it is recommended to only access the engine on a single thread to maintain optimal performance. diff --git a/src/data_format/legacy.rs b/src/data_format/legacy.rs index 3bd53a45..91e069b6 100644 --- a/src/data_format/legacy.rs +++ b/src/data_format/legacy.rs @@ -195,7 +195,7 @@ impl From for NetworkFilter { id: v.id, opt_domains_union: v.opt_domains_union, opt_not_domains_union: v.opt_not_domains_union, - regex: crate::filters::network::RegExStorage::default(), + regex: crate::filters::network::RegexStorage::default(), } } } diff --git a/src/engine.rs b/src/engine.rs index a362f85c..7bdf16f4 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -268,6 +268,15 @@ impl Engine { } } +/// Static assertions for `Engine: Send + Sync` traits. +#[cfg(not(any(feature = "object-pooling", feature = "optimized-regex-cache")))] +fn _assertions() { + fn _assert_send() {} + fn _assert_sync() {} + + _assert_send::(); + _assert_sync::(); +} #[cfg(test)] mod tests { diff --git a/src/filters/network.rs b/src/filters/network.rs index 090e1a39..f206950d 100644 --- a/src/filters/network.rs +++ b/src/filters/network.rs @@ -425,52 +425,49 @@ fn parse_filter_options(raw_options: &str, opts: ParseOptions) -> Result>>>, - - #[cfg(not(feature = "thread-safety"))] - pub(crate) regex: std::cell::RefCell>>, +pub struct RegexStorage { + /// Atomic Arc is used for compatibilty with code that accesses the adblock engine from + /// multiple threads. + #[cfg(not(feature = "unsync-regex-caching"))] + pub(crate) regex: Arc>>>, + + /// RefCell allows for cloned NetworkFilters to point to the same Option and what is inside. + /// When the Regex hasn't been compiled, is stored, afterwards Arc to CompiledRegex + /// to avoid expensive cloning of the Regex itself. + /// Non-thread safe, should be used from a single thread. + #[cfg(feature = "unsync-regex-caching")] + pub(crate) regex: std::cell::RefCell>>, } -impl RegExStorage { - pub fn default() -> RegExStorage { - #[cfg(feature = "thread-safety")] - { - RegExStorage {regex: Arc::new(std::sync::RwLock::new(None))} - } - - #[cfg(not(feature = "thread-safety"))] - { - RegExStorage {regex: std::cell::RefCell::new(None)} - } - } - - pub fn get(&self) -> Option> { - #[cfg(feature = "thread-safety")] - if let Some(cache) = &*self.regex.read().unwrap() { - return Some(cache.clone()); - } +impl RegexStorage { + pub fn get(&self) -> Option> { + #[cfg(not(feature = "unsync-regex-caching"))] + if let Some(cache) = &*self.regex.read().unwrap() { + return Some(Arc::clone(cache)); + } - #[cfg(not(feature = "thread-safety"))] - if let Some(cache) = &*self.regex.borrow() { - return Some(cache.clone()); + #[cfg(feature = "unsync-regex-caching")] + if let Some(cache) = &*self.regex.borrow() { + return Some(Arc::clone(cache)); + } + None } - None - } - pub fn set(&self, regex: Arc) { - #[cfg(feature = "thread-safety")] - { - *self.regex.write().unwrap() = Some(regex); - } + pub fn set(&self, regex: Arc) { + #[cfg(not(feature = "unsync-regex-caching"))] + { + *self.regex.write().unwrap() = Some(regex); + } - #[cfg(not(feature = "thread-safety"))] - { - *self.regex.borrow_mut() = Some(regex); + #[cfg(feature = "unsync-regex-caching")] + { + *self.regex.borrow_mut() = Some(regex); + } } - } } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -493,14 +490,9 @@ pub struct NetworkFilter { pub opt_domains_union: Option, pub opt_not_domains_union: Option, - // Regex compild lazily, using "Interior Mutability" - // RefCell allows for cloned NetworkFilters to point to the same Option and what is inside. - // When the Regex hasn't been compiled, is stored, afterwards Arc to CompiledRegex - // to avoid expensive cloning of the Regex itself. - // Non-thread safe, should be used from a single thread. - // Atomic Arc is used for the backward-compatibilty with the other code. + // Regex compiled lazily and cached using interior mutability. #[serde(skip_serializing, skip_deserializing)] - pub(crate) regex: RegExStorage, + pub(crate) regex: RegexStorage, } // TODO - restrict the API so that this is always true - i.e. lazy-calculate IDs from actual data, @@ -878,7 +870,7 @@ impl NetworkFilter { id: utils::fast_hash(&line), opt_domains_union, opt_not_domains_union, - regex: RegExStorage::default(), + regex: RegexStorage::default(), }) } @@ -1105,7 +1097,7 @@ impl NetworkMatchable for NetworkFilter { return Arc::new(CompiledRegex::MatchAll); } if let Some(cache) = self.regex.get() { - return cache; + return cache; } let regex = compile_regex( &self.filter,