diff --git a/src/cosmetic_filter_cache_builder.rs b/src/cosmetic_filter_cache_builder.rs index 9e0c3e4e..8ee3ed01 100644 --- a/src/cosmetic_filter_cache_builder.rs +++ b/src/cosmetic_filter_cache_builder.rs @@ -6,6 +6,7 @@ use crate::cosmetic_filter_cache::ProceduralOrActionFilter; use crate::cosmetic_filter_utils::SpecificFilterType; use crate::cosmetic_filter_utils::{encode_script_with_permission, key_from_selector}; use crate::filters::cosmetic::{CosmeticFilter, CosmeticFilterMask, CosmeticFilterOperator}; +use crate::filters::fb_builder::{EngineFlatBuilder, ShareableString}; use crate::filters::flatbuffer_generated::fb; use crate::flatbuffers::containers::flat_map::FlatMapBuilder; use crate::flatbuffers::containers::flat_multimap::FlatMultiMapBuilder; @@ -27,18 +28,18 @@ use flatbuffers::WIPOffset; /// See HostnameSpecificRules declaration for more details. #[derive(Default)] struct HostnameRule { - unhide: Vec, - uninject_script: Vec, - procedural_action: Vec, - procedural_action_exception: Vec, + unhide: Vec, + uninject_script: Vec, + procedural_action: Vec, + procedural_action_exception: Vec, } -impl<'a, B: FlatBuilder<'a>> FlatSerialize<'a, B> for HostnameRule { +impl<'a> FlatSerialize<'a, EngineFlatBuilder<'a>> for HostnameRule { type Output = WIPOffset>; fn serialize( value: Self, - builder: &mut B, + builder: &mut EngineFlatBuilder<'a>, ) -> flatbuffers::WIPOffset> { let unhide = serialize_vec_opt(value.unhide, builder); let uninject_script = serialize_vec_opt(value.uninject_script, builder); @@ -69,29 +70,29 @@ pub(crate) struct CosmeticFilterCacheBuilder { complex_class_rules: HashMapBuilder, complex_id_rules: HashMapBuilder, - hostname_hide: FlatMultiMapBuilder, - hostname_inject_script: FlatMultiMapBuilder, + hostname_hide: FlatMultiMapBuilder, + hostname_inject_script: FlatMultiMapBuilder, specific_rules: HashMap, } impl CosmeticFilterCacheBuilder { - pub fn from_rules(rules: Vec) -> Self { + pub fn from_rules(rules: Vec, builder: &mut EngineFlatBuilder) -> Self { let mut self_ = Self::default(); for rule in rules { - self_.add_filter(rule) + self_.add_filter(rule, builder); } self_ } - pub fn add_filter(&mut self, rule: CosmeticFilter) { + pub fn add_filter(&mut self, rule: CosmeticFilter, builder: &mut EngineFlatBuilder) { if rule.has_hostname_constraint() { if let Some(generic_rule) = rule.hidden_generic_rule() { self.add_generic_filter(generic_rule); } - self.store_hostname_rule(rule); + self.store_hostname_rule(rule, builder); } else { self.add_generic_filter(rule); } @@ -139,7 +140,7 @@ impl CosmeticFilterCacheBuilder { } } - fn store_hostname_rule(&mut self, rule: CosmeticFilter) { + fn store_hostname_rule(&mut self, rule: CosmeticFilter, builder: &mut EngineFlatBuilder) { use SpecificFilterType::*; let unhide = rule.mask.contains(CosmeticFilterMask::UNHIDE); @@ -171,45 +172,74 @@ impl CosmeticFilterCacheBuilder { .chain(rule.hostnames.unwrap_or_default()) .chain(rule.entities.unwrap_or_default()); - tokens_to_insert.for_each(|t| self.store_hostname_filter(&t, kind.clone())); + self.store_hostname_filter(tokens_to_insert, &kind, builder); + let negated = kind.negated(); let tokens_to_insert_negated = std::iter::empty() .chain(rule.not_hostnames.unwrap_or_default()) .chain(rule.not_entities.unwrap_or_default()); - let negated = kind.negated(); - - tokens_to_insert_negated.for_each(|t| self.store_hostname_filter(&t, negated.clone())); + self.store_hostname_filter(tokens_to_insert_negated, &negated, builder); } - fn store_hostname_filter(&mut self, token: &Hash, kind: SpecificFilterType) { + fn store_hostname_filter( + &mut self, + tokens: impl IntoIterator, + kind: &SpecificFilterType, + builder: &mut EngineFlatBuilder, + ) { use SpecificFilterType::*; match kind { // Handle hide and inject_script at top level for better deduplication Hide(s) => { - self.hostname_hide.insert(*token, s); + let mut shareable_string = None; + for token in tokens { + let s = shareable_string.get_or_insert_with(|| builder.add_shareable_string(s)); + self.hostname_hide.insert(token, s.clone()); + } } InjectScript((s, permission)) => { - let encoded_script = encode_script_with_permission(s, permission); - self.hostname_inject_script.insert(*token, encoded_script); + let mut shareable_string = None; + for token in tokens { + let s = shareable_string.get_or_insert_with(|| { + builder.add_shareable_string(&encode_script_with_permission(s, permission)) + }); + self.hostname_inject_script.insert(token, s.clone()); + } } // Handle remaining types through HostnameRule Unhide(s) => { - let entry = self.specific_rules.entry(*token).or_default(); - entry.unhide.push(s); + let mut shareable_string = None; + for token in tokens { + let s = shareable_string.get_or_insert_with(|| builder.add_shareable_string(s)); + let entry = self.specific_rules.entry(token).or_default(); + entry.unhide.push(s.clone()); + } } UninjectScript((s, _)) => { - let entry = self.specific_rules.entry(*token).or_default(); - entry.uninject_script.push(s); + let mut shareable_string = None; + for token in tokens { + let s = shareable_string.get_or_insert_with(|| builder.add_shareable_string(s)); + let entry = self.specific_rules.entry(token).or_default(); + entry.uninject_script.push(s.clone()); + } } ProceduralOrAction(s) => { - let entry = self.specific_rules.entry(*token).or_default(); - entry.procedural_action.push(s); + let mut shareable_string = None; + for token in tokens { + let s = shareable_string.get_or_insert_with(|| builder.add_shareable_string(s)); + let entry = self.specific_rules.entry(token).or_default(); + entry.procedural_action.push(s.clone()); + } } ProceduralOrActionException(s) => { - let entry = self.specific_rules.entry(*token).or_default(); - entry.procedural_action_exception.push(s); + let mut shareable_string = None; + for token in tokens { + let s = shareable_string.get_or_insert_with(|| builder.add_shareable_string(s)); + let entry = self.specific_rules.entry(token).or_default(); + entry.procedural_action_exception.push(s.clone()); + } } } } @@ -227,10 +257,13 @@ impl<'a, B: FlatBuilder<'a>> FlatSerialize<'a, B> for StringVector { } } -impl<'a, B: FlatBuilder<'a>> FlatSerialize<'a, B> for CosmeticFilterCacheBuilder { +impl<'a> FlatSerialize<'a, EngineFlatBuilder<'a>> for CosmeticFilterCacheBuilder { type Output = WIPOffset>; - fn serialize(value: Self, builder: &mut B) -> WIPOffset> { + fn serialize( + value: Self, + builder: &mut EngineFlatBuilder<'a>, + ) -> WIPOffset> { let complex_class_rules = HashMapBuilder::finish(value.complex_class_rules, builder); let complex_id_rules = HashMapBuilder::finish(value.complex_id_rules, builder); diff --git a/src/cosmetic_filter_utils.rs b/src/cosmetic_filter_utils.rs index be6a12f5..d9417c97 100644 --- a/src/cosmetic_filter_utils.rs +++ b/src/cosmetic_filter_utils.rs @@ -86,17 +86,16 @@ impl SpecificFilterType { /// Encodes permission bits in the last 2 ascii chars of a script string /// Returns the script with permission appended -pub(crate) fn encode_script_with_permission( - mut script: String, - permission: PermissionMask, -) -> String { +pub(crate) fn encode_script_with_permission(script: &str, permission: &PermissionMask) -> String { const HEX_CHARS: &[u8; 16] = b"0123456789abcdef"; let high = (permission.to_bits() >> 4) as usize; let low = (permission.to_bits() & 0x0f) as usize; - script.push(HEX_CHARS[high] as char); - script.push(HEX_CHARS[low] as char); - script + let mut encoded_script = String::with_capacity(script.len() + 2); + encoded_script.push_str(script); + encoded_script.push(HEX_CHARS[high] as char); + encoded_script.push(HEX_CHARS[low] as char); + encoded_script } /// Decodes permission bits from the last 2 ascii chars of a script string @@ -133,7 +132,7 @@ mod tests { let script = "console.log('测试 🚀 emoji')".to_string(); let permission = PermissionMask::from_bits(permission); - let encoded = encode_script_with_permission(script.clone(), permission); + let encoded = encode_script_with_permission(&script, &permission); let (decoded_permission, decoded_script) = decode_script_with_permission(&encoded); assert_eq!(decoded_permission.to_bits(), permission.to_bits()); diff --git a/src/data_format/mod.rs b/src/data_format/mod.rs index d62dca13..35d0ff94 100644 --- a/src/data_format/mod.rs +++ b/src/data_format/mod.rs @@ -17,7 +17,7 @@ const ADBLOCK_RUST_DAT_MAGIC: [u8; 4] = [0xd1, 0xd9, 0x3a, 0xaf]; /// The version of the data format. /// If the data format version is incremented, the data is considered as incompatible. -const ADBLOCK_RUST_DAT_VERSION: u8 = 2; +const ADBLOCK_RUST_DAT_VERSION: u8 = 3; /// The total length of the header prefix (magic + version + seahash) const HEADER_PREFIX_LENGTH: usize = 4 + 1 + 8; diff --git a/src/engine.rs b/src/engine.rs index 6037b05a..47962030 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -328,7 +328,7 @@ fn make_flatbuffer( let mut builder = EngineFlatBuilder::default(); let network_rules_builder = NetworkRulesBuilder::from_rules(network_filters, optimize); let network_rules = FlatSerialize::serialize(network_rules_builder, &mut builder); - let cosmetic_rules = CosmeticFilterCacheBuilder::from_rules(cosmetic_filters); + let cosmetic_rules = CosmeticFilterCacheBuilder::from_rules(cosmetic_filters, &mut builder); let cosmetic_rules = FlatSerialize::serialize(cosmetic_rules, &mut builder); builder.finish(network_rules, cosmetic_rules) } diff --git a/src/filters/fb_builder.rs b/src/filters/fb_builder.rs index 6ea470dd..323cb109 100644 --- a/src/filters/fb_builder.rs +++ b/src/filters/fb_builder.rs @@ -5,17 +5,24 @@ use std::collections::HashMap; use flatbuffers::WIPOffset; use crate::filters::fb_network_builder::NetworkFilterListBuilder; -use crate::flatbuffers::containers::flat_serialize::{FlatBuilder, WIPFlatVec}; +use crate::flatbuffers::containers::flat_serialize::{FlatBuilder, FlatSerialize, WIPFlatVec}; use crate::flatbuffers::unsafe_tools::VerifiedFlatbufferMemory; use crate::utils::Hash; use super::flat::fb; +#[derive(Clone, Default)] +pub(crate) struct ShareableString { + index: Option, +} + #[derive(Default)] pub(crate) struct EngineFlatBuilder<'a> { fb_builder: flatbuffers::FlatBufferBuilder<'a>, unique_domains_hashes: Vec, unique_domains_hashes_map: HashMap, + shared_strings: Vec>, + shared_strings_original: Vec, } impl<'a> EngineFlatBuilder<'a> { @@ -29,6 +36,15 @@ impl<'a> EngineFlatBuilder<'a> { index } + pub fn add_shareable_string(&mut self, s: &str) -> ShareableString { + let wip_offset = self.fb_builder.create_string(s); + self.shared_strings.push(wip_offset); + self.shared_strings_original.push(s.to_string()); + ShareableString { + index: Some(self.shared_strings.len() - 1), + } + } + pub fn finish( &mut self, network_rules: WIPFlatVec<'a, NetworkFilterListBuilder, EngineFlatBuilder<'a>>, @@ -58,3 +74,14 @@ impl<'a> FlatBuilder<'a> for EngineFlatBuilder<'a> { &mut self.fb_builder } } + +impl<'a> FlatSerialize<'a, EngineFlatBuilder<'a>> for ShareableString { + type Output = WIPOffset<&'a str>; + fn serialize(value: Self, builder: &mut EngineFlatBuilder<'a>) -> Self::Output { + if let Some(index) = value.index { + builder.shared_strings[index] + } else { + builder.raw_builder().create_shared_string("") + } + } +} diff --git a/tests/unit/engine.rs b/tests/unit/engine.rs index 65c983d1..51e5f4a8 100644 --- a/tests/unit/engine.rs +++ b/tests/unit/engine.rs @@ -183,7 +183,7 @@ mod tests { fn deserialization_generate_simple() { let mut engine = Engine::from_rules(["ad-banner"], Default::default()); let data = engine.serialize().to_vec(); - const EXPECTED_HASH: u64 = 884296823183764168; + const EXPECTED_HASH: u64 = 10945714988765761881; assert_eq!(hash(&data), EXPECTED_HASH, "{HASH_MISMATCH_MSG}"); engine.deserialize(&data).unwrap(); } @@ -193,7 +193,7 @@ mod tests { let mut engine = Engine::from_rules(["ad-banner$tag=abc"], Default::default()); engine.use_tags(&["abc"]); let data = engine.serialize().to_vec(); - const EXPECTED_HASH: u64 = 7887643884738497753; + const EXPECTED_HASH: u64 = 4608037684406751718; assert_eq!(hash(&data), EXPECTED_HASH, "{HASH_MISMATCH_MSG}"); engine.deserialize(&data).unwrap(); } @@ -221,8 +221,8 @@ mod tests { #[cfg(feature = "debug-info")] { let debug_info = engine.get_debug_info(); - let low_bound = 9_500_000; - let high_bound = 10_000_000; + let low_bound = 8_000_000; + let high_bound = 8_500_000; assert!( debug_info.flatbuffer_size >= low_bound, "Expected size >= {} bytes, got {}", @@ -237,9 +237,9 @@ mod tests { ); } let expected_hash: u64 = if cfg!(feature = "css-validation") { - 18094146314477408965 + 9439492009815519037 } else { - 8215024964158872824 + 14803842039735157685 }; assert_eq!(hash(&data), expected_hash, "{HASH_MISMATCH_MSG}");