Skip to content

Commit

Permalink
Resolve binast#298 - In content streams, reserve 0 for "next in prelude"
Browse files Browse the repository at this point in the history
As it turns out, the most common value we fetch from a content stream is the "next in prelude". Experiments (see issue binast#298) indicate that reserving 0 for "next in prelude" improves compression of most streams. This is what this
patch does.
  • Loading branch information
Yoric committed Feb 13, 2019
1 parent 186788c commit 80e9563
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 6 deletions.
26 changes: 22 additions & 4 deletions crates/binjs_io/src/entropy/read/content_decoders.rs
Expand Up @@ -29,6 +29,11 @@ pub struct DictionaryStreamDecoder<'a, T> {

/// The name of this dictionary. Used for debugging/error reporting.
name: SharedString,

/// The latest index we have read from the prelude dictionary.
///
/// Used to decode the special symbol `0`.
latest_in_prelude: Option<usize>,
}
impl<'a, T> DictionaryStreamDecoder<'a, T> {
/// Create a decoder from a shared dictionary, a prelude dictionary and a stream of varnum-encoded values.
Expand All @@ -50,6 +55,7 @@ impl<'a, T> DictionaryStreamDecoder<'a, T> {
prelude_dictionary,
stream: stream.map(Cursor::new),
name,
latest_in_prelude: None,
}
}
}
Expand Down Expand Up @@ -79,8 +85,20 @@ where
Ok(result) => result as usize,
Err(err) => return Some(Err(TokenReaderError::ReadError(err))),
};

debug!(target: "read", "DictionaryStreamDecoder::next index: {}", index);
let index = if index == 0 {
// Special case: 0 is reserved for "next in prelude".
self.shared_dictionary.len()
+ match self.latest_in_prelude {
None => 0,
Some(i) => i + 1,
}
} else {
// Everything else is off by one.
index - 1
};
debug!(target: "read", "DictionaryStreamDecoder::next index converted to: {}", index);
debug!(target: "read", "DictionaryStreamDecoder::next dictionaries have {}, {} elements", self.shared_dictionary.len(), self.prelude_dictionary.len());
if index < self.shared_dictionary.len() {
debug!(target: "read", "That's in the shared dictionary");
return Some(Ok(self.shared_dictionary[index].clone()));
Expand All @@ -90,9 +108,9 @@ where
index - self.shared_dictionary.len(),
self.prelude_dictionary
);
return Some(Ok(self.prelude_dictionary
[index - self.shared_dictionary.len()]
.clone()));
let index_in_prelude = index - self.shared_dictionary.len();
self.latest_in_prelude = Some(index_in_prelude);
return Some(Ok(self.prelude_dictionary[index_in_prelude].clone()));
}
return Some(Err(TokenReaderError::BadDictionaryIndex {
index: index as u32,
Expand Down
25 changes: 23 additions & 2 deletions crates/binjs_io/src/entropy/write/mod.rs
Expand Up @@ -308,9 +308,30 @@ impl Encoder {

// Write (and possibly dump) data.
// In the current implementation, we just ignore any information other than the index.

// We reserve number 0 for "next in prelude", as it is a very common pattern, and having
// successive 0s improves compression (confirmed by issue #298).
// Any other index i is written as i + 1.
let mut latest_in_prelude = None;
for item in vec {
let index = item.raw();
lazy_stream.write_varnum(index as u32)?;
let varnum = match *item {
// References to the shared dictionary are shifted by 1.
Index::Shared(i) => i + 1,
// References to the prelude dictionary may be 0 (for next) or shifted by 1.
Index::Prelude(i) => {
let previous_plus_one = match latest_in_prelude {
None => 0,
Some(previous) => previous + 1,
};
latest_in_prelude = Some(i);
if previous_plus_one == i {
0
} else {
i + 1
}
}
};
lazy_stream.write_varnum(varnum as u32)?;
}

Self::flush_stream(name, lazy_stream, out)
Expand Down

0 comments on commit 80e9563

Please sign in to comment.