From 6c83b7ec1076dc2f8a78a43f304e96a4c380900d Mon Sep 17 00:00:00 2001 From: Bernhard Schuster Date: Thu, 18 Feb 2021 14:48:01 +0100 Subject: [PATCH 01/21] refactor/transform: introduce transform and improve error handling --- build/Cargo.toml | 4 +- build/src/lib.rs | 140 ++++++++++++++++++++++++++++++----------------- 2 files changed, 93 insertions(+), 51 deletions(-) diff --git a/build/Cargo.toml b/build/Cargo.toml index 6fb737d..938d0ca 100644 --- a/build/Cargo.toml +++ b/build/Cargo.toml @@ -17,7 +17,9 @@ directories = "3" reqwest = { version = "0.11", default_features = false, features = ["blocking", "rustls-tls"]} nlprule = { path = "../nlprule", features = ["compile"], version = "0.4.5-pre" } # BUILD_BINDINGS_COMMENT # nlprule = { package = "nlprule-core", path = "../nlprule", features = ["compile"] } # BUILD_BINDINGS_UNCOMMENT +fs-err = "2.5" [dev-dependencies] tempdir = "0.3" -smush = "0.1.5" \ No newline at end of file +smush = "0.1.5" +brotli = "3.3.0" diff --git a/build/src/lib.rs b/build/src/lib.rs index 8fa7af7..2906627 100644 --- a/build/src/lib.rs +++ b/build/src/lib.rs @@ -1,28 +1,33 @@ //! This crate provides a builder to make it easier to use the correct binaries for [nlprule](https://github.com/bminixhofer/nlprule). //! See `README.md` for details. -use flate2::read::GzDecoder; +use flate2::bufread::GzDecoder; +use io::BufReader; use nlprule::{compile, rules_filename, tokenizer_filename}; use std::{ - fs::{self, File}, io::{self, BufWriter, Cursor, Read}, path::{Path, PathBuf}, + result, }; -use thiserror::Error; use zip::result::ZipError; +use fs_err as fs; +use fs::File; +use std::fs::Permissions; -#[derive(Debug, Error)] +#[derive(Debug, thiserror::Error)] pub enum Error { - #[error("request error: {0}")] + #[error(transparent)] RequestError(#[from] reqwest::Error), - #[error("i/o error: {0}")] + #[error(transparent)] IOError(#[from] io::Error), - #[error("zip error: {0}")] + #[error(transparent)] ZipError(#[from] ZipError), #[error("error postprocessing binaries: {0}")] - PostprocessingError(Box), + PostprocessingError(#[from] Box), } +pub type Result = std::result::Result; + #[derive(Debug, Clone, Copy, Hash, Eq, PartialEq)] pub enum Binary { Tokenizer, @@ -46,7 +51,7 @@ pub fn get_binary>( lang_code: &str, binary: Binary, cache_dir: Option

, -) -> Result { +) -> Result> { let filename = binary.filename(lang_code); let mut cache_path: Option = None; @@ -58,7 +63,7 @@ pub fn get_binary>( // if the file can be read, the data is already cached if let Some(path) = &cache_path { if let Ok(bytes) = fs::read(path) { - return Ok(Cursor::new(bytes)); + return Ok(Box::new(Cursor::new(bytes))); } } @@ -80,10 +85,10 @@ pub fn get_binary>( fs::write(path, &buffer)?; } - Ok(Cursor::new(buffer)) + Ok(Box::new(Cursor::new(buffer))) } -pub fn get_build_dir>(lang_code: &str, out_dir: P) -> Result<(), Error> { +pub fn get_build_dir>(lang_code: &str, out_dir: P) -> Result<()> { let bytes = reqwest::blocking::get(&format!( "https://f000.backblazeb2.com/file/nlprule/{}.zip", lang_code @@ -123,7 +128,7 @@ pub fn get_build_dir>(lang_code: &str, out_dir: P) -> Result<(), use std::os::unix::fs::PermissionsExt; if let Some(mode) = file.unix_mode() { - fs::set_permissions(&outpath, fs::Permissions::from_mode(mode))?; + fs::set_permissions(&outpath, Permissions::from_mode(mode))?; } } } @@ -139,7 +144,6 @@ pub fn supported_language_codes() -> Vec<&'static str> { } /// Places all nlprule binaries for the given languages in some directory. -#[derive(Debug)] pub struct BinaryBuilder { language_codes: Vec, out_dir: PathBuf, @@ -148,37 +152,48 @@ pub struct BinaryBuilder { fallback_to_build_dir: bool, build_dir: Option, outputs: Vec, + transform_data_func: Option>, BufWriter) -> result::Result<(), Box>>>, + transform_path_func: Option result::Result>>>, } impl BinaryBuilder { + /// Acquires the rule and tokenizer binaries for one language by: /// - Trying to download them from their distribution source (or load them local cache). /// - If they are not found (i. e. a dev version of nlprule is used) and `fallback_to_build_dir`is true /// downloads the latest build directory and builds the binaries from it. /// This can still fail if the dev version is sufficiently outdated for the latest build dir. /// In that case, the user can't be helped. - fn build_language(&mut self, lang_code: &str) { + fn build_language(&mut self, lang_code: &str) -> Result<()> { let tokenizer_out = self.out_dir.join(tokenizer_filename(lang_code)); let rules_out = self.out_dir.join(rules_filename(lang_code)); let mut did_not_find_binaries = false; - for (binary, out) in [ + for (binary, out) in vec![ (Binary::Tokenizer, &tokenizer_out), (Binary::Rules, &rules_out), ] - .iter() { - let response = get_binary(&self.version, lang_code, *binary, self.cache_dir.as_ref()); + let response = get_binary(&self.version, lang_code, binary, self.cache_dir.as_ref()); (match response { - Ok(mut reader) => { - let mut bytes = Vec::new(); - reader - .read_to_end(&mut bytes) - .expect("reading binary bytes failed"); - - fs::write(out, bytes).expect("writing binary to file failed"); + Ok(reader) => { + let out = out.to_owned(); + let dest = if let Some(ref fx) = self.transform_path_func { + fx(out)? + } else { + out + }; + let dest = fs::OpenOptions::new().create(true).truncate(true).write(true).open(&dest)?; + let mut dest = BufWriter::new(dest); + + let mut reader = BufReader::new(reader); + if let Some(ref fx) = self.transform_data_func { + fx(reader, dest)?; + } else { + io::copy(&mut reader, &mut dest)?; + } Ok(()) } @@ -192,8 +207,7 @@ impl BinaryBuilder { Err(error.into()) } Err(x) => Err(x), - }) - .expect("error loading binary") + })?; } if did_not_find_binaries && self.fallback_to_build_dir { @@ -227,21 +241,21 @@ impl BinaryBuilder { self.outputs.push(tokenizer_out); self.outputs.push(rules_out); + Ok(()) } /// Creates a new binary builder. `language_codes` must be in ISO 639-1 (two-letter) format. - /// If `language_codes` is `None`, uses all supported languages. + /// If `language_codes` is `&[]`, uses all supported languages. /// If this is used in a `build.rs`, `out_dir` should probably be the OUT_DIR environment variable. - pub fn new>(language_codes: Option<&[&str]>, out_dir: P) -> Self { - let language_codes: Vec<_> = language_codes.map_or_else( - || { - supported_language_codes() + pub fn new>(language_codes: &[&str], out_dir: P) -> Self { + let language_codes: Vec<_> = if language_codes.is_empty() { + supported_language_codes() .into_iter() .map(|x| x.to_owned()) .collect() - }, - |x| x.iter().map(|x| x.to_string()).collect(), - ); + } else { + language_codes.into_iter().map(ToOwned::to_owned).map(ToOwned::to_owned).collect::>() + }; let project_dir = directories::ProjectDirs::from("", "", "nlprule"); // this should be CARGO_ARTIFACT_DIR once it is merged: https://github.com/rust-lang/rfcs/pull/3035 @@ -258,6 +272,8 @@ impl BinaryBuilder { fallback_to_build_dir: false, build_dir, outputs: Vec::new(), + transform_data_func: None, + transform_path_func: None, } } @@ -297,11 +313,11 @@ impl BinaryBuilder { } /// Builds by {downloading, copying, building} the binaries to the out directory. - pub fn build(mut self) -> Self { - for lang_code in self.language_codes[..].to_vec() { - self.build_language(&lang_code); - } - self + pub fn build(mut self) -> Result { + self.language_codes.clone().into_iter().try_for_each(|lang_code| { + self.build_language(&lang_code) + })?; + Ok(self) } /// Validates the binaries by checking if they can be loaded by nlprule. @@ -334,6 +350,30 @@ impl BinaryBuilder { &self.outputs } + + /// Applies the given transformation before placing a binencoded file into the cache + /// Allows for easier compression usage. + /// Modifies the cached path by the given path function. + /// + /// The resulting files will then reside in the given cache dir if any. + /// + /// Attention: Any compression applied here, must be undone in the + /// `fn postprocess` provided closure to retain the original binenc file + /// to be consumed by the application code. + pub fn transform, Q: AsRef>( + mut self, + proc_fn: C, + path_fn: F, + ) -> Self + where + C: Fn(BufReader, BufWriter) -> result::Result<(), Box>, + F: Fn(PathBuf) -> P, + { + self.transform_data_func = Some(Box::new(proc_fn)); + self.transform_path_func = Some(Box::new(path_fn)); + self + } + /// Applies the given postprocessing function to the binaries e. g. for compression. /// Modifies the output path by the given path function. /// @@ -368,20 +408,20 @@ impl BinaryBuilder { mut self, proc_fn: C, path_fn: F, - ) -> Result + ) -> Result where - C: Fn(Vec, BufWriter) -> Result<(), Box>, + C: Fn(BufReader, BufWriter) -> result::Result<(), Box>, F: Fn(PathBuf) -> P, { for (i, path) in self.outputs.to_vec().iter().enumerate() { - let buffer = fs::read(&path)?; + let reader = BufReader::new(fs::File::open(&path)?); let new_path = path_fn(path.clone()); let new_path = new_path.as_ref(); let writer = BufWriter::new(File::create(new_path)?); - proc_fn(buffer, writer).map_err(Error::PostprocessingError)?; + proc_fn(reader, writer).map_err(Error::PostprocessingError)?; if new_path != path { self.outputs[i] = new_path.to_path_buf(); @@ -400,7 +440,7 @@ mod tests { use super::*; #[test] - fn getting_binary_works() -> Result<(), Error> { + fn getting_binary_works() -> Result<()> { // this is nice to keep roughly in sync with the latest released version but it is not necessary get_binary::("0.3.0", "en", Binary::Rules, None)?; @@ -408,7 +448,7 @@ mod tests { } #[test] - fn getting_build_dir_works() -> Result<(), Error> { + fn getting_build_dir_works() -> Result<()> { let tempdir = tempdir::TempDir::new("build_dir_test")?; let tempdir = tempdir.path(); @@ -420,7 +460,7 @@ mod tests { } #[test] - fn binary_builder_works() -> Result<(), Error> { + fn binary_builder_works() -> Result<()> { let tempdir = tempdir::TempDir::new("builder_test")?; let tempdir = tempdir.path(); @@ -433,7 +473,7 @@ mod tests { } #[test] - fn binary_builder_works_with_released_version() -> Result<(), Error> { + fn binary_builder_works_with_released_version() -> Result<()> { let tempdir = tempdir::TempDir::new("builder_test")?; let tempdir = tempdir.path(); @@ -445,7 +485,7 @@ mod tests { } #[test] - fn binary_builder_works_with_smush() -> Result<(), Error> { + fn binary_builder_works_with_smush() -> Result<()> { let tempdir = tempdir::TempDir::new("builder_test")?; let tempdir = tempdir.path(); @@ -477,7 +517,7 @@ mod tests { } #[test] - fn binary_builder_works_with_flate2() -> Result<(), Error> { + fn binary_builder_works_with_flate2() -> Result<()> { let tempdir = tempdir::TempDir::new("builder_test")?; let tempdir = tempdir.path(); From ac505369caf9fc50e0ba48caf6182f386ece989e Mon Sep 17 00:00:00 2001 From: Bernhard Schuster Date: Thu, 18 Feb 2021 17:00:04 +0100 Subject: [PATCH 02/21] refactor --- build/Cargo.toml | 2 +- build/src/lib.rs | 221 ++++++++++++++++++++++++++++++++--------------- 2 files changed, 151 insertions(+), 72 deletions(-) diff --git a/build/Cargo.toml b/build/Cargo.toml index 938d0ca..98e792e 100644 --- a/build/Cargo.toml +++ b/build/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "nlprule-build" version = "0.4.5-pre" -authors = ["Benjamin Minixhofer "] +authors = ["Benjamin Minixhofer ", "Bernhard Schuster "] edition = "2018" license = "MIT OR Apache-2.0" description = "Build tools for a fast, low-resource Natural Language Processing and Error Correction library." diff --git a/build/src/lib.rs b/build/src/lib.rs index 2906627..6d24978 100644 --- a/build/src/lib.rs +++ b/build/src/lib.rs @@ -2,10 +2,9 @@ //! See `README.md` for details. use flate2::bufread::GzDecoder; -use io::BufReader; use nlprule::{compile, rules_filename, tokenizer_filename}; use std::{ - io::{self, BufWriter, Cursor, Read}, + io::{self, BufWriter, Cursor, Read, BufReader, SeekFrom, Seek}, path::{Path, PathBuf}, result, }; @@ -18,6 +17,8 @@ use std::fs::Permissions; pub enum Error { #[error(transparent)] RequestError(#[from] reqwest::Error), + #[error("Binaries were not found on the remote")] + BinariesNotFound, #[error(transparent)] IOError(#[from] io::Error), #[error(transparent)] @@ -28,6 +29,13 @@ pub enum Error { pub type Result = std::result::Result; + +/// Definition of the data transformation for the network retrieved, binencoded rules and tokenizer datasets. +pub trait TransformDataFn: Fn(Box, Cursor<&mut Vec>) -> result::Result<(), Box> {} + +/// Definition of the path transformation for the network retrieved, binencoded rules and tokenizer datasets. +pub trait TransformPathFn: Fn(PathBuf) -> result::Result> {} + #[derive(Debug, Clone, Copy, Hash, Eq, PartialEq)] pub enum Binary { Tokenizer, @@ -45,47 +53,126 @@ impl Binary { /// Stores the binaries for the given language and version in out_dir. /// Tries downloading the binaries from their distribution source. -/// Optionally caches them at some directory. -pub fn get_binary>( +/// +/// This implicitly unpacks the originally gzip'd sources and returns +/// an in-memory buffer. +fn obtain_binary_from_github_release( version: &str, lang_code: &str, binary: Binary, - cache_dir: Option

, -) -> Result> { +) -> Result>> { let filename = binary.filename(lang_code); - let mut cache_path: Option = None; - - if let Some(dir) = cache_dir { - cache_path = Some(dir.as_ref().join(version).join(lang_code).join(&filename)); - } - - // if the file can be read, the data is already cached - if let Some(path) = &cache_path { - if let Ok(bytes) = fs::read(path) { - return Ok(Box::new(Cursor::new(bytes))); - } - } - // ... otherwise, request the data from the URL ... let bytes = reqwest::blocking::get(&format!( "https://github.com/bminixhofer/nlprule/releases/download/{}/{}.gz", version, filename ))? - .error_for_status()? + .error_for_status().map_err(|e| { + if let Some(404) = e.status().map(|x| x.as_u16()) { + Error::BinariesNotFound + } else { + e.into() + } + + })? .bytes()?; let mut gz = GzDecoder::new(&bytes[..]); let mut buffer = Vec::new(); gz.read_to_end(&mut buffer)?; - // ... and then cache the data at the provided file, if one was found - if let Some(path) = &cache_path { - fs::create_dir_all(path.parent().expect("path must have parent"))?; - fs::write(path, &buffer)?; + Ok(Cursor::new(buffer)) +} + + +fn construct_cache_path( + version: &str, + lang_code: &str, + binary: Binary, + cache_dir: Option<&PathBuf>, + transform_path_fn: Option<&Box>, +) -> Result> { + let filename = binary.filename(lang_code); + + cache_dir.map(move |dir| { + let path = dir.join(version).join(lang_code).join(&filename); + Ok(if let Some(fx) = transform_path_fn { + fx(path)? + } else { + path + }) + }).transpose() +} + + +/// Returns a `dyn Read` type, that is either directly feed +/// from an in-memory slice or from the on disk cache. +/// If the on-disk cache is disabled or is not present, +/// it will attempt to download it via [`obtain_binary_from_github_release`]. +/// The data is then written into `dest`. +/// Also updates the cache. +fn obtain_binary_cache_or_github( + version: &str, + lang_code: &str, + binary: Binary, + cache_dir: Option<&PathBuf>, + transform_path_fn: Option<&Box>, + transform_data_fn: Option<&Box>, +) -> Result>> { + let cache_path = construct_cache_path(version, lang_code, binary, cache_dir.clone(), transform_path_fn)?; + + // if the file can be read, the data is already cached and the transform was applied before + if let Some(ref cache_path) = cache_path { + if let Ok(bytes) = fs::read(cache_path) { + if bytes.len() > 256 { + return Ok(Cursor::new(bytes)); + } + } + } + + // the binencoded data from github + let reader_binenc = obtain_binary_from_github_release(version, lang_code, binary)?; + + // apply the transform if any to an intermediate buffer + let mut reader_transformed = if let Some(fx) = transform_data_fn { + // TODO this is not optimal, the additional copy is a bit annoying + let mut intermediate = Vec::::with_capacity(4096 * 1024); + fx(Box::new(reader_binenc), Cursor::new(&mut intermediate))?; + Cursor::new(intermediate) + } else { + reader_binenc + }; + + // update the cache entry + if let Some(ref cache_path) = cache_path { + fs::create_dir_all(cache_path.parent().expect("path must have parent"))?; + let mut cache_file = fs::OpenOptions::new().truncate(true).create(true).write(true).open(cache_path)?; + io::copy(&mut reader_transformed, &mut cache_file)?; } - Ok(Box::new(Cursor::new(buffer))) + // move the cursor back to the beginning + reader_transformed.seek(SeekFrom::Start(0_u64))?; + + Ok(reader_transformed) +} + + +fn assure_binary_availability( + version: &str, + lang_code: &str, + binary: Binary, + cache_dir: Option<&PathBuf>, + transform_path_fn: Option<&Box>, + transform_data_fn: Option<&Box>, + out: PathBuf, + ) -> Result<()> { + + let mut source = obtain_binary_cache_or_github(version, lang_code, binary, cache_dir, transform_path_fn, transform_data_fn)?; + + let mut out_file = fs::OpenOptions::new().truncate(true).create(true).write(true).open(out)?; + io::copy(&mut source, &mut out_file)?; + Ok(()) } pub fn get_build_dir>(lang_code: &str, out_dir: P) -> Result<()> { @@ -152,21 +239,37 @@ pub struct BinaryBuilder { fallback_to_build_dir: bool, build_dir: Option, outputs: Vec, - transform_data_func: Option>, BufWriter) -> result::Result<(), Box>>>, - transform_path_func: Option result::Result>>>, + transform_data_fn: Option>, + transform_path_fn: Option>, } impl BinaryBuilder { - + /// ```plain + /// github release resource --[fn transform]--> $cache_dir --[fn postprocess]--> $OUT_DIR/ + /// ``` + /// + /// ```plain + /// github release resource --[fn transform]--> $OUT_DIR --[fn postprocess]--> $OUT_DIR/ + /// ``` + /// /// Acquires the rule and tokenizer binaries for one language by: /// - Trying to download them from their distribution source (or load them local cache). /// - If they are not found (i. e. a dev version of nlprule is used) and `fallback_to_build_dir`is true /// downloads the latest build directory and builds the binaries from it. /// This can still fail if the dev version is sufficiently outdated for the latest build dir. - /// In that case, the user can't be helped. + /// In that case, the user is encouraged to update to a release or a newer git sha. fn build_language(&mut self, lang_code: &str) -> Result<()> { - let tokenizer_out = self.out_dir.join(tokenizer_filename(lang_code)); - let rules_out = self.out_dir.join(rules_filename(lang_code)); + // adjust the destination path for now + let path_transform = |out: PathBuf| -> Result { + Ok(if let Some(ref fx) = self.transform_path_fn { + fx(out)? + } else { + out + }) + }; + + let tokenizer_out = path_transform(self.out_dir.join(tokenizer_filename(lang_code)))?; + let rules_out = path_transform(self.out_dir.join(rules_filename(lang_code)))?; let mut did_not_find_binaries = false; @@ -175,39 +278,15 @@ impl BinaryBuilder { (Binary::Rules, &rules_out), ] { - let response = get_binary(&self.version, lang_code, binary, self.cache_dir.as_ref()); - - (match response { - Ok(reader) => { - let out = out.to_owned(); - let dest = if let Some(ref fx) = self.transform_path_func { - fx(out)? - } else { - out - }; - let dest = fs::OpenOptions::new().create(true).truncate(true).write(true).open(&dest)?; - let mut dest = BufWriter::new(dest); - - let mut reader = BufReader::new(reader); - if let Some(ref fx) = self.transform_data_func { - fx(reader, dest)?; - } else { - io::copy(&mut reader, &mut dest)?; - } - - Ok(()) - } - Err(Error::RequestError(error)) => { - if let Some(404) = error.status().map(|x| x.as_u16()) { - // binaries for this version are not distributed, fall back to building them - did_not_find_binaries = true; - break; - } - - Err(error.into()) + let out = out.to_owned(); + if let Err(e) = assure_binary_availability(&self.version, + lang_code, binary, self.cache_dir.as_ref(), + self.transform_path_fn.as_ref(), self.transform_data_fn.as_ref(), out) { + if let Error::BinariesNotFound = e { + did_not_find_binaries = true; + break; } - Err(x) => Err(x), - })?; + } } if did_not_find_binaries && self.fallback_to_build_dir { @@ -251,7 +330,7 @@ impl BinaryBuilder { let language_codes: Vec<_> = if language_codes.is_empty() { supported_language_codes() .into_iter() - .map(|x| x.to_owned()) + .map(ToOwned::to_owned) .collect() } else { language_codes.into_iter().map(ToOwned::to_owned).map(ToOwned::to_owned).collect::>() @@ -272,8 +351,8 @@ impl BinaryBuilder { fallback_to_build_dir: false, build_dir, outputs: Vec::new(), - transform_data_func: None, - transform_path_func: None, + transform_data_fn: None, + transform_path_fn: None, } } @@ -366,11 +445,11 @@ impl BinaryBuilder { path_fn: F, ) -> Self where - C: Fn(BufReader, BufWriter) -> result::Result<(), Box>, - F: Fn(PathBuf) -> P, + C: TransformDataFn + 'static, + F: TransformPathFn + 'static, { - self.transform_data_func = Some(Box::new(proc_fn)); - self.transform_path_func = Some(Box::new(path_fn)); + self.transform_data_fn = Some(Box::new(proc_fn)); + self.transform_path_fn = Some(Box::new(path_fn)); self } @@ -413,7 +492,7 @@ impl BinaryBuilder { C: Fn(BufReader, BufWriter) -> result::Result<(), Box>, F: Fn(PathBuf) -> P, { - for (i, path) in self.outputs.to_vec().iter().enumerate() { + for (i, path) in self.outputs.clone().into_iter().enumerate() { let reader = BufReader::new(fs::File::open(&path)?); let new_path = path_fn(path.clone()); From 6660c92b499a309c723c82fd9a7985d4043dc1fe Mon Sep 17 00:00:00 2001 From: Bernhard Schuster Date: Thu, 18 Feb 2021 18:13:13 +0100 Subject: [PATCH 03/21] make tests fly again --- build/Cargo.toml | 2 +- build/src/lib.rs | 167 +++++++++++++++++++++++++++++++++++------------ 2 files changed, 125 insertions(+), 44 deletions(-) diff --git a/build/Cargo.toml b/build/Cargo.toml index 98e792e..1c645c1 100644 --- a/build/Cargo.toml +++ b/build/Cargo.toml @@ -22,4 +22,4 @@ fs-err = "2.5" [dev-dependencies] tempdir = "0.3" smush = "0.1.5" -brotli = "3.3.0" +brotli = { version = "3.3.0", features = ["std"] } diff --git a/build/src/lib.rs b/build/src/lib.rs index 6d24978..537f258 100644 --- a/build/src/lib.rs +++ b/build/src/lib.rs @@ -19,6 +19,8 @@ pub enum Error { RequestError(#[from] reqwest::Error), #[error("Binaries were not found on the remote")] BinariesNotFound, + #[error("Failed to validate {1:?} binary for lang {0}")] + ValidationFailed(String, Binary, #[source] nlprule::Error), #[error(transparent)] IOError(#[from] io::Error), #[error(transparent)] @@ -31,11 +33,20 @@ pub type Result = std::result::Result; /// Definition of the data transformation for the network retrieved, binencoded rules and tokenizer datasets. -pub trait TransformDataFn: Fn(Box, Cursor<&mut Vec>) -> result::Result<(), Box> {} +pub trait TransformDataFn: for<'r> Fn(I, Cursor<&'r mut Vec>) -> result::Result<(), Box> {} + +impl TransformDataFn for T +where + T: for<'r> Fn(I, Cursor<&'r mut Vec>) -> result::Result<(), Box> {} + /// Definition of the path transformation for the network retrieved, binencoded rules and tokenizer datasets. pub trait TransformPathFn: Fn(PathBuf) -> result::Result> {} +impl TransformPathFn for T +where + T: Fn(PathBuf) -> result::Result> {} + #[derive(Debug, Clone, Copy, Hash, Eq, PartialEq)] pub enum Binary { Tokenizer, @@ -97,8 +108,8 @@ fn construct_cache_path( cache_dir.map(move |dir| { let path = dir.join(version).join(lang_code).join(&filename); - Ok(if let Some(fx) = transform_path_fn { - fx(path)? + Ok(if let Some(transform_path_fn) = transform_path_fn { + transform_path_fn(path)? } else { path }) @@ -118,7 +129,7 @@ fn obtain_binary_cache_or_github( binary: Binary, cache_dir: Option<&PathBuf>, transform_path_fn: Option<&Box>, - transform_data_fn: Option<&Box>, + transform_data_fn: Option<&Box>>>>, ) -> Result>> { let cache_path = construct_cache_path(version, lang_code, binary, cache_dir.clone(), transform_path_fn)?; @@ -135,10 +146,10 @@ fn obtain_binary_cache_or_github( let reader_binenc = obtain_binary_from_github_release(version, lang_code, binary)?; // apply the transform if any to an intermediate buffer - let mut reader_transformed = if let Some(fx) = transform_data_fn { + let mut reader_transformed = if let Some(transform_data_fn) = transform_data_fn { // TODO this is not optimal, the additional copy is a bit annoying let mut intermediate = Vec::::with_capacity(4096 * 1024); - fx(Box::new(reader_binenc), Cursor::new(&mut intermediate))?; + transform_data_fn(reader_binenc, Cursor::new(&mut intermediate))?; Cursor::new(intermediate) } else { reader_binenc @@ -164,11 +175,12 @@ fn assure_binary_availability( binary: Binary, cache_dir: Option<&PathBuf>, transform_path_fn: Option<&Box>, - transform_data_fn: Option<&Box>, + transform_data_fn: Option<&Box>>>>, out: PathBuf, ) -> Result<()> { - let mut source = obtain_binary_cache_or_github(version, lang_code, binary, cache_dir, transform_path_fn, transform_data_fn)?; + let mut source = obtain_binary_cache_or_github(version, lang_code, binary, cache_dir, + transform_path_fn, transform_data_fn)?; let mut out_file = fs::OpenOptions::new().truncate(true).create(true).write(true).open(out)?; io::copy(&mut source, &mut out_file)?; @@ -239,8 +251,8 @@ pub struct BinaryBuilder { fallback_to_build_dir: bool, build_dir: Option, outputs: Vec, - transform_data_fn: Option>, transform_path_fn: Option>, + transform_data_fn: Option>>>>, } impl BinaryBuilder { @@ -400,28 +412,26 @@ impl BinaryBuilder { } /// Validates the binaries by checking if they can be loaded by nlprule. - pub fn validate(self) -> Self { + pub fn validate(self) -> Result { for lang_code in &self.language_codes { let tokenizer_out = self.out_dir.join(tokenizer_filename(lang_code)); let rules_out = self.out_dir.join(rules_filename(lang_code)); - nlprule::Rules::new(rules_out).unwrap_or_else(|e| { - panic!( - "failed to validate rules binary for {lang_code}: {error}", - lang_code = lang_code, - error = e - ) - }); - nlprule::Tokenizer::new(tokenizer_out).unwrap_or_else(|e| { - panic!( - "failed to validate tokenizer binary for {lang_code}: {error}", - lang_code = lang_code, - error = e - ) - }); + nlprule::Rules::new(rules_out) + .map_err(|e| Error::ValidationFailed( + lang_code.to_owned(), + Binary::Rules, + e + ))?; + nlprule::Tokenizer::new(tokenizer_out) + .map_err(|e| Error::ValidationFailed( + lang_code.to_owned(), + Binary::Tokenizer, + e + ))?; } - self + Ok(self) } /// Gets the paths to all files this builder created. @@ -439,13 +449,13 @@ impl BinaryBuilder { /// Attention: Any compression applied here, must be undone in the /// `fn postprocess` provided closure to retain the original binenc file /// to be consumed by the application code. - pub fn transform, Q: AsRef>( + pub fn transform( mut self, proc_fn: C, path_fn: F, ) -> Self where - C: TransformDataFn + 'static, + C: TransformDataFn>> + 'static, F: TransformPathFn + 'static, { self.transform_data_fn = Some(Box::new(proc_fn)); @@ -464,7 +474,7 @@ impl BinaryBuilder { /// # let tempdir = tempdir::TempDir::new("builder_test")?; /// # let tempdir = tempdir.path(); /// # - /// # let mut builder = BinaryBuilder::new(Some(&["en"]), tempdir).version("0.3.0"); + /// # let mut builder = BinaryBuilder::new(&["en"], tempdir).version("0.3.0"); /// builder /// .build() /// .validate() @@ -483,7 +493,7 @@ impl BinaryBuilder { /// )?; /// # Ok::<(), nlprule_build::Error>(()) /// ``` - pub fn postprocess>( + pub fn postprocess( mut self, proc_fn: C, path_fn: F, @@ -491,6 +501,7 @@ impl BinaryBuilder { where C: Fn(BufReader, BufWriter) -> result::Result<(), Box>, F: Fn(PathBuf) -> P, + P: AsRef, { for (i, path) in self.outputs.clone().into_iter().enumerate() { let reader = BufReader::new(fs::File::open(&path)?); @@ -514,6 +525,7 @@ impl BinaryBuilder { #[cfg(test)] mod tests { + use brotli::{BrotliCompress, CompressorReader, enc::BrotliEncoderParams}; use io::Write; use super::*; @@ -521,7 +533,9 @@ mod tests { #[test] fn getting_binary_works() -> Result<()> { // this is nice to keep roughly in sync with the latest released version but it is not necessary - get_binary::("0.3.0", "en", Binary::Rules, None)?; + let tempdir = tempdir::TempDir::new("build_dir")?; + let tempdir = tempdir.path().join("foo.bin"); + assure_binary_availability("0.3.0", "en", Binary::Rules, None, None, None, tempdir)?; Ok(()) } @@ -543,10 +557,10 @@ mod tests { let tempdir = tempdir::TempDir::new("builder_test")?; let tempdir = tempdir.path(); - BinaryBuilder::new(None, tempdir) + BinaryBuilder::new(&[], tempdir) .fallback_to_build_dir(true) - .build() - .validate(); + .build()? + .validate()?; Ok(()) } @@ -556,9 +570,9 @@ mod tests { let tempdir = tempdir::TempDir::new("builder_test")?; let tempdir = tempdir.path(); - BinaryBuilder::new(Some(&["en"]), tempdir) + BinaryBuilder::new(&["en"], tempdir) .version("0.3.0") - .build(); + .build()?; Ok(()) } @@ -568,13 +582,15 @@ mod tests { let tempdir = tempdir::TempDir::new("builder_test")?; let tempdir = tempdir.path(); - BinaryBuilder::new(Some(&["en"]), tempdir) + BinaryBuilder::new(&["en"], tempdir) .version("0.3.0") - .build() + .build()? .postprocess( - |buffer, mut writer| { + |mut buffer, mut writer| { + let mut tmp = Vec::new(); + buffer.read_to_end(&mut tmp)?; Ok(writer.write_all(&smush::encode( - &buffer, + &mut tmp, smush::Codec::Gzip, smush::Quality::Default, )?)?) @@ -600,14 +616,16 @@ mod tests { let tempdir = tempdir::TempDir::new("builder_test")?; let tempdir = tempdir.path(); - let builder = BinaryBuilder::new(Some(&["en"]), tempdir) + let builder = BinaryBuilder::new(&["en"], tempdir) .version("0.3.0") - .build() + .build()? .postprocess( - |buffer, writer| { + |mut buffer, writer| { + let mut tmp = Vec::new(); + buffer.read_to_end(&mut tmp)?; Ok( flate2::write::GzEncoder::new(writer, flate2::Compression::default()) - .write_all(&buffer)?, + .write_all(&tmp)?, ) }, |p| { @@ -638,4 +656,67 @@ mod tests { Ok(()) } + + + + #[test] + fn build_with_brotli_transform() -> Result<()> { + let tempdir = tempdir::TempDir::new("builder_test")?; + let tempdir = tempdir.path(); + + let builder = BinaryBuilder::new(&["en"], tempdir) + .version("0.3.0") + .transform( |mut buffer: Cursor>, mut writer: Cursor<&'_ mut Vec>| { + dbg!("transform pre"); + let data = smush::encode(buffer.into_inner().as_slice(), smush::Codec::Zstd, smush::Quality::Maximum)?; + writer.write_all(&data)?; + + // XXX hangs forever, not sure what's going on + // let _ = brotli::BrotliCompress(&mut buffer, &mut sink, &Default::default())?; + dbg!("transform post"); + Ok(()) + }, + |p: PathBuf| { + let mut s = p.to_string_lossy().to_string(); + s.push_str(".zstd"); + Ok(PathBuf::from(s)) + }) + .build()? + .postprocess( + |mut buffer, mut writer| { + dbg!("postprocessing"); + let mut tmp = Vec::new(); + buffer.read_to_end(&mut tmp)?; + let data = smush::decode(tmp.as_slice(), smush::Codec::Zstd)?; + writer.write_all(data.as_slice())?; + Ok(()) + // XXX never reached + // Ok(brotli::BrotliDecompress(&mut buffer, &mut writer)?) + }, + |p| { + let path = p.to_string_lossy(); + assert!(path.ends_with(".zstd")); + let end = path.len().saturating_sub(".zstd".len()); + assert_ne!(end, 0); + path[..end].to_owned() + }, + )?; + + assert_eq!( + builder.outputs(), + &[ + tempdir.join("en_tokenizer.bin"), + tempdir.join("en_rules.bin") + ] + ); + + let rules_path = tempdir + .join(Path::new(&rules_filename("en"))) + .with_extension("bin"); + assert!(rules_path.exists()); + + // XXX will always fail + // let _ = nlprule::Rules::new(rules_path).map_err(|e| Error::ValidationFailed("en".to_owned(), Binary::Rules, e))?; + Ok(()) + } } From 947221bf8547ab3c257bc6f11600d96af5a191bf Mon Sep 17 00:00:00 2001 From: Bernhard Schuster Date: Thu, 18 Feb 2021 18:59:46 +0100 Subject: [PATCH 04/21] chore: cargo fmt --- build/src/lib.rs | 186 +++++++++++++++++++++++++++-------------------- 1 file changed, 108 insertions(+), 78 deletions(-) diff --git a/build/src/lib.rs b/build/src/lib.rs index 537f258..6c75c4f 100644 --- a/build/src/lib.rs +++ b/build/src/lib.rs @@ -2,16 +2,16 @@ //! See `README.md` for details. use flate2::bufread::GzDecoder; +use fs::File; +use fs_err as fs; use nlprule::{compile, rules_filename, tokenizer_filename}; +use std::fs::Permissions; use std::{ - io::{self, BufWriter, Cursor, Read, BufReader, SeekFrom, Seek}, + io::{self, BufReader, BufWriter, Cursor, Read, Seek, SeekFrom}, path::{Path, PathBuf}, result, }; use zip::result::ZipError; -use fs_err as fs; -use fs::File; -use std::fs::Permissions; #[derive(Debug, thiserror::Error)] pub enum Error { @@ -31,21 +31,33 @@ pub enum Error { pub type Result = std::result::Result; - /// Definition of the data transformation for the network retrieved, binencoded rules and tokenizer datasets. -pub trait TransformDataFn: for<'r> Fn(I, Cursor<&'r mut Vec>) -> result::Result<(), Box> {} - -impl TransformDataFn for T -where - T: for<'r> Fn(I, Cursor<&'r mut Vec>) -> result::Result<(), Box> {} +pub trait TransformDataFn: + for<'r> Fn( + I, + Cursor<&'r mut Vec>, +) -> result::Result<(), Box> +{ +} +impl TransformDataFn for T where + T: for<'r> Fn( + I, + Cursor<&'r mut Vec>, + ) -> result::Result<(), Box> +{ +} /// Definition of the path transformation for the network retrieved, binencoded rules and tokenizer datasets. -pub trait TransformPathFn: Fn(PathBuf) -> result::Result> {} +pub trait TransformPathFn: + Fn(PathBuf) -> result::Result> +{ +} -impl TransformPathFn for T -where - T: Fn(PathBuf) -> result::Result> {} +impl TransformPathFn for T where + T: Fn(PathBuf) -> result::Result> +{ +} #[derive(Debug, Clone, Copy, Hash, Eq, PartialEq)] pub enum Binary { @@ -79,13 +91,13 @@ fn obtain_binary_from_github_release( "https://github.com/bminixhofer/nlprule/releases/download/{}/{}.gz", version, filename ))? - .error_for_status().map_err(|e| { + .error_for_status() + .map_err(|e| { if let Some(404) = e.status().map(|x| x.as_u16()) { Error::BinariesNotFound } else { e.into() } - })? .bytes()?; @@ -96,7 +108,6 @@ fn obtain_binary_from_github_release( Ok(Cursor::new(buffer)) } - fn construct_cache_path( version: &str, lang_code: &str, @@ -106,17 +117,18 @@ fn construct_cache_path( ) -> Result> { let filename = binary.filename(lang_code); - cache_dir.map(move |dir| { - let path = dir.join(version).join(lang_code).join(&filename); - Ok(if let Some(transform_path_fn) = transform_path_fn { - transform_path_fn(path)? - } else { - path + cache_dir + .map(move |dir| { + let path = dir.join(version).join(lang_code).join(&filename); + Ok(if let Some(transform_path_fn) = transform_path_fn { + transform_path_fn(path)? + } else { + path + }) }) - }).transpose() + .transpose() } - /// Returns a `dyn Read` type, that is either directly feed /// from an in-memory slice or from the on disk cache. /// If the on-disk cache is disabled or is not present, @@ -131,7 +143,13 @@ fn obtain_binary_cache_or_github( transform_path_fn: Option<&Box>, transform_data_fn: Option<&Box>>>>, ) -> Result>> { - let cache_path = construct_cache_path(version, lang_code, binary, cache_dir.clone(), transform_path_fn)?; + let cache_path = construct_cache_path( + version, + lang_code, + binary, + cache_dir.clone(), + transform_path_fn, + )?; // if the file can be read, the data is already cached and the transform was applied before if let Some(ref cache_path) = cache_path { @@ -158,7 +176,11 @@ fn obtain_binary_cache_or_github( // update the cache entry if let Some(ref cache_path) = cache_path { fs::create_dir_all(cache_path.parent().expect("path must have parent"))?; - let mut cache_file = fs::OpenOptions::new().truncate(true).create(true).write(true).open(cache_path)?; + let mut cache_file = fs::OpenOptions::new() + .truncate(true) + .create(true) + .write(true) + .open(cache_path)?; io::copy(&mut reader_transformed, &mut cache_file)?; } @@ -168,7 +190,6 @@ fn obtain_binary_cache_or_github( Ok(reader_transformed) } - fn assure_binary_availability( version: &str, lang_code: &str, @@ -177,12 +198,21 @@ fn assure_binary_availability( transform_path_fn: Option<&Box>, transform_data_fn: Option<&Box>>>>, out: PathBuf, - ) -> Result<()> { - - let mut source = obtain_binary_cache_or_github(version, lang_code, binary, cache_dir, - transform_path_fn, transform_data_fn)?; - - let mut out_file = fs::OpenOptions::new().truncate(true).create(true).write(true).open(out)?; +) -> Result<()> { + let mut source = obtain_binary_cache_or_github( + version, + lang_code, + binary, + cache_dir, + transform_path_fn, + transform_data_fn, + )?; + + let mut out_file = fs::OpenOptions::new() + .truncate(true) + .create(true) + .write(true) + .open(out)?; io::copy(&mut source, &mut out_file)?; Ok(()) } @@ -288,12 +318,17 @@ impl BinaryBuilder { for (binary, out) in vec![ (Binary::Tokenizer, &tokenizer_out), (Binary::Rules, &rules_out), - ] - { + ] { let out = out.to_owned(); - if let Err(e) = assure_binary_availability(&self.version, - lang_code, binary, self.cache_dir.as_ref(), - self.transform_path_fn.as_ref(), self.transform_data_fn.as_ref(), out) { + if let Err(e) = assure_binary_availability( + &self.version, + lang_code, + binary, + self.cache_dir.as_ref(), + self.transform_path_fn.as_ref(), + self.transform_data_fn.as_ref(), + out, + ) { if let Error::BinariesNotFound = e { did_not_find_binaries = true; break; @@ -341,11 +376,15 @@ impl BinaryBuilder { pub fn new>(language_codes: &[&str], out_dir: P) -> Self { let language_codes: Vec<_> = if language_codes.is_empty() { supported_language_codes() - .into_iter() - .map(ToOwned::to_owned) - .collect() + .into_iter() + .map(ToOwned::to_owned) + .collect() } else { - language_codes.into_iter().map(ToOwned::to_owned).map(ToOwned::to_owned).collect::>() + language_codes + .into_iter() + .map(ToOwned::to_owned) + .map(ToOwned::to_owned) + .collect::>() }; let project_dir = directories::ProjectDirs::from("", "", "nlprule"); @@ -405,9 +444,10 @@ impl BinaryBuilder { /// Builds by {downloading, copying, building} the binaries to the out directory. pub fn build(mut self) -> Result { - self.language_codes.clone().into_iter().try_for_each(|lang_code| { - self.build_language(&lang_code) - })?; + self.language_codes + .clone() + .into_iter() + .try_for_each(|lang_code| self.build_language(&lang_code))?; Ok(self) } @@ -418,17 +458,9 @@ impl BinaryBuilder { let rules_out = self.out_dir.join(rules_filename(lang_code)); nlprule::Rules::new(rules_out) - .map_err(|e| Error::ValidationFailed( - lang_code.to_owned(), - Binary::Rules, - e - ))?; + .map_err(|e| Error::ValidationFailed(lang_code.to_owned(), Binary::Rules, e))?; nlprule::Tokenizer::new(tokenizer_out) - .map_err(|e| Error::ValidationFailed( - lang_code.to_owned(), - Binary::Tokenizer, - e - ))?; + .map_err(|e| Error::ValidationFailed(lang_code.to_owned(), Binary::Tokenizer, e))?; } Ok(self) @@ -439,7 +471,6 @@ impl BinaryBuilder { &self.outputs } - /// Applies the given transformation before placing a binencoded file into the cache /// Allows for easier compression usage. /// Modifies the cached path by the given path function. @@ -449,11 +480,7 @@ impl BinaryBuilder { /// Attention: Any compression applied here, must be undone in the /// `fn postprocess` provided closure to retain the original binenc file /// to be consumed by the application code. - pub fn transform( - mut self, - proc_fn: C, - path_fn: F, - ) -> Self + pub fn transform(mut self, proc_fn: C, path_fn: F) -> Self where C: TransformDataFn>> + 'static, F: TransformPathFn + 'static, @@ -493,13 +520,12 @@ impl BinaryBuilder { /// )?; /// # Ok::<(), nlprule_build::Error>(()) /// ``` - pub fn postprocess( - mut self, - proc_fn: C, - path_fn: F, - ) -> Result + pub fn postprocess(mut self, proc_fn: C, path_fn: F) -> Result where - C: Fn(BufReader, BufWriter) -> result::Result<(), Box>, + C: Fn( + BufReader, + BufWriter, + ) -> result::Result<(), Box>, F: Fn(PathBuf) -> P, P: AsRef, { @@ -525,7 +551,7 @@ impl BinaryBuilder { #[cfg(test)] mod tests { - use brotli::{BrotliCompress, CompressorReader, enc::BrotliEncoderParams}; + use brotli::{enc::BrotliEncoderParams, BrotliCompress, CompressorReader}; use io::Write; use super::*; @@ -657,8 +683,6 @@ mod tests { Ok(()) } - - #[test] fn build_with_brotli_transform() -> Result<()> { let tempdir = tempdir::TempDir::new("builder_test")?; @@ -666,9 +690,14 @@ mod tests { let builder = BinaryBuilder::new(&["en"], tempdir) .version("0.3.0") - .transform( |mut buffer: Cursor>, mut writer: Cursor<&'_ mut Vec>| { + .transform( + |mut buffer: Cursor>, mut writer: Cursor<&'_ mut Vec>| { dbg!("transform pre"); - let data = smush::encode(buffer.into_inner().as_slice(), smush::Codec::Zstd, smush::Quality::Maximum)?; + let data = smush::encode( + buffer.into_inner().as_slice(), + smush::Codec::Zstd, + smush::Quality::Maximum, + )?; writer.write_all(&data)?; // XXX hangs forever, not sure what's going on @@ -676,11 +705,12 @@ mod tests { dbg!("transform post"); Ok(()) }, - |p: PathBuf| { - let mut s = p.to_string_lossy().to_string(); - s.push_str(".zstd"); - Ok(PathBuf::from(s)) - }) + |p: PathBuf| { + let mut s = p.to_string_lossy().to_string(); + s.push_str(".zstd"); + Ok(PathBuf::from(s)) + }, + ) .build()? .postprocess( |mut buffer, mut writer| { From ca081f8fcb58407188e8d290106502c442bbf9c1 Mon Sep 17 00:00:00 2001 From: Bernhard Schuster Date: Fri, 19 Feb 2021 07:59:42 +0100 Subject: [PATCH 05/21] address review comments --- build/Cargo.toml | 1 - build/src/lib.rs | 70 ++++++++++++++++++++++++++---------------------- 2 files changed, 38 insertions(+), 33 deletions(-) diff --git a/build/Cargo.toml b/build/Cargo.toml index 1c645c1..f5024de 100644 --- a/build/Cargo.toml +++ b/build/Cargo.toml @@ -22,4 +22,3 @@ fs-err = "2.5" [dev-dependencies] tempdir = "0.3" smush = "0.1.5" -brotli = { version = "3.3.0", features = ["std"] } diff --git a/build/src/lib.rs b/build/src/lib.rs index 6c75c4f..717bbbb 100644 --- a/build/src/lib.rs +++ b/build/src/lib.rs @@ -13,6 +13,8 @@ use std::{ }; use zip::result::ZipError; +pub type OtherError = Box; + #[derive(Debug, thiserror::Error)] pub enum Error { #[error(transparent)] @@ -26,36 +28,38 @@ pub enum Error { #[error(transparent)] ZipError(#[from] ZipError), #[error("error postprocessing binaries: {0}")] - PostprocessingError(#[from] Box), + PostprocessingError(#[source] OtherError), + #[error("error postprocessing binaries: {0}")] + TransformError(#[source] OtherError), } pub type Result = std::result::Result; /// Definition of the data transformation for the network retrieved, binencoded rules and tokenizer datasets. -pub trait TransformDataFn: +pub trait TransformDataFn: for<'r> Fn( - I, + Cursor>, Cursor<&'r mut Vec>, -) -> result::Result<(), Box> +) -> result::Result<(), OtherError> { } -impl TransformDataFn for T where +impl TransformDataFn for T where T: for<'r> Fn( - I, + Cursor>, Cursor<&'r mut Vec>, - ) -> result::Result<(), Box> + ) -> result::Result<(), OtherError> { } /// Definition of the path transformation for the network retrieved, binencoded rules and tokenizer datasets. pub trait TransformPathFn: - Fn(PathBuf) -> result::Result> + Fn(PathBuf) -> result::Result { } impl TransformPathFn for T where - T: Fn(PathBuf) -> result::Result> + T: Fn(PathBuf) -> result::Result { } @@ -113,7 +117,7 @@ fn construct_cache_path( lang_code: &str, binary: Binary, cache_dir: Option<&PathBuf>, - transform_path_fn: Option<&Box>, + transform_path_fn: Option<&dyn TransformPathFn>, ) -> Result> { let filename = binary.filename(lang_code); @@ -121,7 +125,7 @@ fn construct_cache_path( .map(move |dir| { let path = dir.join(version).join(lang_code).join(&filename); Ok(if let Some(transform_path_fn) = transform_path_fn { - transform_path_fn(path)? + transform_path_fn(path).map_err(Error::TransformError)? } else { path }) @@ -140,8 +144,8 @@ fn obtain_binary_cache_or_github( lang_code: &str, binary: Binary, cache_dir: Option<&PathBuf>, - transform_path_fn: Option<&Box>, - transform_data_fn: Option<&Box>>>>, + transform_path_fn: Option<&dyn TransformPathFn>, + transform_data_fn: Option<&dyn TransformDataFn>, ) -> Result>> { let cache_path = construct_cache_path( version, @@ -167,7 +171,7 @@ fn obtain_binary_cache_or_github( let mut reader_transformed = if let Some(transform_data_fn) = transform_data_fn { // TODO this is not optimal, the additional copy is a bit annoying let mut intermediate = Vec::::with_capacity(4096 * 1024); - transform_data_fn(reader_binenc, Cursor::new(&mut intermediate))?; + transform_data_fn(reader_binenc, Cursor::new(&mut intermediate)).map_err(Error::TransformError)?; Cursor::new(intermediate) } else { reader_binenc @@ -195,8 +199,8 @@ fn assure_binary_availability( lang_code: &str, binary: Binary, cache_dir: Option<&PathBuf>, - transform_path_fn: Option<&Box>, - transform_data_fn: Option<&Box>>>>, + transform_path_fn: Option<&dyn TransformPathFn>, + transform_data_fn: Option<&dyn TransformDataFn>, out: PathBuf, ) -> Result<()> { let mut source = obtain_binary_cache_or_github( @@ -282,7 +286,7 @@ pub struct BinaryBuilder { build_dir: Option, outputs: Vec, transform_path_fn: Option>, - transform_data_fn: Option>>>>, + transform_data_fn: Option>, } impl BinaryBuilder { @@ -303,8 +307,8 @@ impl BinaryBuilder { fn build_language(&mut self, lang_code: &str) -> Result<()> { // adjust the destination path for now let path_transform = |out: PathBuf| -> Result { - Ok(if let Some(ref fx) = self.transform_path_fn { - fx(out)? + Ok(if let Some(ref transform_path_fn) = self.transform_path_fn { + transform_path_fn(out).map_err(Error::TransformError)? } else { out }) @@ -325,8 +329,8 @@ impl BinaryBuilder { lang_code, binary, self.cache_dir.as_ref(), - self.transform_path_fn.as_ref(), - self.transform_data_fn.as_ref(), + self.transform_path_fn.as_ref().map(|x| x.as_ref()), + self.transform_data_fn.as_ref().map(|x| x.as_ref()), out, ) { if let Error::BinariesNotFound = e { @@ -482,7 +486,7 @@ impl BinaryBuilder { /// to be consumed by the application code. pub fn transform(mut self, proc_fn: C, path_fn: F) -> Self where - C: TransformDataFn>> + 'static, + C: TransformDataFn + 'static, F: TransformPathFn + 'static, { self.transform_data_fn = Some(Box::new(proc_fn)); @@ -525,7 +529,7 @@ impl BinaryBuilder { C: Fn( BufReader, BufWriter, - ) -> result::Result<(), Box>, + ) -> result::Result<(), OtherError>, F: Fn(PathBuf) -> P, P: AsRef, { @@ -551,7 +555,6 @@ impl BinaryBuilder { #[cfg(test)] mod tests { - use brotli::{enc::BrotliEncoderParams, BrotliCompress, CompressorReader}; use io::Write; use super::*; @@ -684,15 +687,14 @@ mod tests { } #[test] - fn build_with_brotli_transform() -> Result<()> { + fn build_with_zstd_transform() -> Result<()> { let tempdir = tempdir::TempDir::new("builder_test")?; let tempdir = tempdir.path(); let builder = BinaryBuilder::new(&["en"], tempdir) .version("0.3.0") .transform( - |mut buffer: Cursor>, mut writer: Cursor<&'_ mut Vec>| { - dbg!("transform pre"); + |buffer: Cursor>, mut writer: Cursor<&'_ mut Vec>| { let data = smush::encode( buffer.into_inner().as_slice(), smush::Codec::Zstd, @@ -700,9 +702,8 @@ mod tests { )?; writer.write_all(&data)?; - // XXX hangs forever, not sure what's going on + // XXX hangs forever, both with `smush` and `brotli` directly // let _ = brotli::BrotliCompress(&mut buffer, &mut sink, &Default::default())?; - dbg!("transform post"); Ok(()) }, |p: PathBuf| { @@ -720,7 +721,7 @@ mod tests { let data = smush::decode(tmp.as_slice(), smush::Codec::Zstd)?; writer.write_all(data.as_slice())?; Ok(()) - // XXX never reached + // XXX // Ok(brotli::BrotliDecompress(&mut buffer, &mut writer)?) }, |p| { @@ -745,8 +746,13 @@ mod tests { .with_extension("bin"); assert!(rules_path.exists()); - // XXX will always fail - // let _ = nlprule::Rules::new(rules_path).map_err(|e| Error::ValidationFailed("en".to_owned(), Binary::Rules, e))?; + // The following will always fail since the versions will mismatch and rebuilding does not make sense + // `get_build_dir` is tested separately + // + // ```rust,no_run + // let _ = nlprule::Rules::new(rules_path) + // .map_err(|e| Error::ValidationFailed("en".to_owned(), Binary::Rules, e))?; + // ``` Ok(()) } } From 94bde565858ca50bb390d751d1623322169faba9 Mon Sep 17 00:00:00 2001 From: Bernhard Schuster Date: Fri, 19 Feb 2021 08:02:19 +0100 Subject: [PATCH 06/21] clippy.. --- build/src/lib.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/build/src/lib.rs b/build/src/lib.rs index 717bbbb..9fed112 100644 --- a/build/src/lib.rs +++ b/build/src/lib.rs @@ -319,15 +319,15 @@ impl BinaryBuilder { let mut did_not_find_binaries = false; - for (binary, out) in vec![ + for (binary, out) in &[ (Binary::Tokenizer, &tokenizer_out), (Binary::Rules, &rules_out), ] { - let out = out.to_owned(); + let out = out.to_owned().to_owned(); if let Err(e) = assure_binary_availability( &self.version, lang_code, - binary, + *binary, self.cache_dir.as_ref(), self.transform_path_fn.as_ref().map(|x| x.as_ref()), self.transform_data_fn.as_ref().map(|x| x.as_ref()), From c9e51b47e668e112a2b73dd3f2439cdc6e4d16c3 Mon Sep 17 00:00:00 2001 From: Bernhard Schuster Date: Fri, 19 Feb 2021 08:04:52 +0100 Subject: [PATCH 07/21] more chores --- build/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/src/lib.rs b/build/src/lib.rs index 9fed112..7361f16 100644 --- a/build/src/lib.rs +++ b/build/src/lib.rs @@ -385,7 +385,7 @@ impl BinaryBuilder { .collect() } else { language_codes - .into_iter() + .iter() .map(ToOwned::to_owned) .map(ToOwned::to_owned) .collect::>() From e32b3f353f389663d3c61c5c7aec9c6e406bcaa2 Mon Sep 17 00:00:00 2001 From: Bernhard Schuster Date: Fri, 19 Feb 2021 09:02:47 +0100 Subject: [PATCH 08/21] foo --- build/src/lib.rs | 33 ++++++++++++++++++++++++++------- nlprule/Cargo.toml | 1 + nlprule/src/bin/compile.rs | 5 ++++- nlprule/src/compile/mod.rs | 17 ++++++++--------- 4 files changed, 39 insertions(+), 17 deletions(-) diff --git a/build/src/lib.rs b/build/src/lib.rs index 7361f16..0dc04ef 100644 --- a/build/src/lib.rs +++ b/build/src/lib.rs @@ -5,9 +5,10 @@ use flate2::bufread::GzDecoder; use fs::File; use fs_err as fs; use nlprule::{compile, rules_filename, tokenizer_filename}; +use reqwest::blocking::Response; use std::fs::Permissions; use std::{ - io::{self, BufReader, BufWriter, Cursor, Read, Seek, SeekFrom}, + io::{self, BufReader, BufWriter, Cursor, Read, Write, Seek, SeekFrom}, path::{Path, PathBuf}, result, }; @@ -354,12 +355,30 @@ impl BinaryBuilder { get_build_dir(lang_code, &build_dir).expect("error loading build directory"); } - compile::compile(&compile::BuildOptions { - build_dir, - rules_out: rules_out.clone(), - tokenizer_out: tokenizer_out.clone(), - }) - .expect("Compiling from build directory failed. Upgrading to a more recent development version of NLPRule might fix this problem."); + + let mut rules_out = BufWriter::new(fs::OpenOptions::new().truncate(true).create(true).write(true).open(rules_out)?); + let mut tokenizer_out = BufWriter::new(fs::OpenOptions::new().truncate(true).create(true).write(true).open(tokenizer_out)?); + let (rules_out, tokenizer_out) = if let Some(ref transform_data_fn) = self.transform_data_fn { + let mut transfer_buffer_rules = BufWriter::new(Vec::new()); + let mut transfer_buffer_tokenizer = BufWriter::new(Vec::new()); + + compile::compile( + build_dir, + transfer_buffer_rules, + transfer_buffer_tokenizer, + ).map_err(Error::CollationFailed)?; + + transform_data_fn(Box::new(transfer_buffer_rules), Box::new(rules_out))?; + transform_data_fn(Box::new(transfer_buffer_tokenizer), Box::new(tokenizer_out))?; + + } else { + compile::compile( + build_dir, + rules_out, + tokenizer_out, + ).map_err(Error::CollationFailed)?; + }; + } else if did_not_find_binaries { panic!( "Did not find binaries for version {}. \ diff --git a/nlprule/Cargo.toml b/nlprule/Cargo.toml index 68ee9b1..9b6a9bd 100644 --- a/nlprule/Cargo.toml +++ b/nlprule/Cargo.toml @@ -27,6 +27,7 @@ indexmap = { version = "1", features = ["serde"] } unicase = "2.6" derivative = "2.2" fst = "0.4" +fs-err = "2.5" aho-corasick = "0.7" half = { version = "1.7", features = ["serde"] } srx = { version = "^0.1.2", features = ["serde"] } diff --git a/nlprule/src/bin/compile.rs b/nlprule/src/bin/compile.rs index 0b52c1a..3a59eec 100644 --- a/nlprule/src/bin/compile.rs +++ b/nlprule/src/bin/compile.rs @@ -5,5 +5,8 @@ fn main() -> Result<(), Error> { env_logger::init(); let opts = BuildOptions::parse(); - compile(&opts) + let tokenizer_sink = BufWriter::new(File::create(&opts.tokenizer_out)?); + let rules_sink = BufWriter::new(File::create(&opts.rules_out)?); + + compile(opts.build_dir, rules_sink, tokenizer_sink) } diff --git a/nlprule/src/compile/mod.rs b/nlprule/src/compile/mod.rs index 34da90a..4bea901 100644 --- a/nlprule/src/compile/mod.rs +++ b/nlprule/src/compile/mod.rs @@ -1,9 +1,11 @@ //! Creates the nlprule binaries from a *build directory*. Usage information in /build/README.md. +use fs_err as fs; +use fs::File; + use std::{ - fs::{self, File}, hash::{Hash, Hasher}, - io::{BufReader, BufWriter}, + io::{self, BufReader, BufWriter}, num::ParseIntError, path::{Path, PathBuf}, str::FromStr, @@ -95,8 +97,8 @@ pub enum Error { ParseError(#[from] ParseIntError), } -pub fn compile(opts: &BuildOptions) -> Result<(), Error> { - let paths = BuildFilePaths::new(&opts.build_dir); +pub fn compile(build_dir: impl AsRef, mut rules_dest: impl io::Write, mut tokenizer_dest: impl io::Write) -> Result<(), Error> { + let paths = BuildFilePaths::new(&build_dir); let lang_code = fs::read_to_string(paths.lang_code_path)?; @@ -185,14 +187,11 @@ pub fn compile(opts: &BuildOptions) -> Result<(), Error> { tokenizer_options, )?; - let f = BufWriter::new(File::create(&opts.tokenizer_out)?); - bincode::serialize_into(f, &tokenizer)?; + bincode::serialize_into(&mut tokenizer_dest, &tokenizer)?; info!("Creating grammar rules."); let rules = Rules::from_xml(&paths.grammar_path, &mut build_info, &rules_options); - - let f = BufWriter::new(File::create(&opts.rules_out)?); - bincode::serialize_into(f, &rules)?; + bincode::serialize_into(&mut rules_dest, &rules)?; // we need to write the regex cache after building the rules, otherwise it isn't fully populated let f = BufWriter::new(File::create(&paths.regex_cache_path)?); From 5a16808d51448ed8d2db353bab38db41504efb4f Mon Sep 17 00:00:00 2001 From: Bernhard Schuster Date: Fri, 19 Feb 2021 09:44:02 +0100 Subject: [PATCH 09/21] handle also build --- build/src/lib.rs | 63 ++++++++++++++++++++++++------------------------ 1 file changed, 32 insertions(+), 31 deletions(-) diff --git a/build/src/lib.rs b/build/src/lib.rs index 0dc04ef..fd07e5c 100644 --- a/build/src/lib.rs +++ b/build/src/lib.rs @@ -32,23 +32,25 @@ pub enum Error { PostprocessingError(#[source] OtherError), #[error("error postprocessing binaries: {0}")] TransformError(#[source] OtherError), + #[error("Collation failed")] + CollationFailed(#[source] nlprule::compile::Error) } pub type Result = std::result::Result; /// Definition of the data transformation for the network retrieved, binencoded rules and tokenizer datasets. pub trait TransformDataFn: - for<'r> Fn( - Cursor>, - Cursor<&'r mut Vec>, -) -> result::Result<(), OtherError> + for<'w> Fn( + Box, + Box, + ) -> result::Result<(), OtherError> { } impl TransformDataFn for T where - T: for<'r> Fn( - Cursor>, - Cursor<&'r mut Vec>, + T: for<'w> Fn( + Box, + Box, ) -> result::Result<(), OtherError> { } @@ -147,7 +149,7 @@ fn obtain_binary_cache_or_github( cache_dir: Option<&PathBuf>, transform_path_fn: Option<&dyn TransformPathFn>, transform_data_fn: Option<&dyn TransformDataFn>, -) -> Result>> { +) -> Result> { let cache_path = construct_cache_path( version, lang_code, @@ -160,7 +162,7 @@ fn obtain_binary_cache_or_github( if let Some(ref cache_path) = cache_path { if let Ok(bytes) = fs::read(cache_path) { if bytes.len() > 256 { - return Ok(Cursor::new(bytes)); + return Ok(Box::new(Cursor::new(bytes))); } } } @@ -171,11 +173,11 @@ fn obtain_binary_cache_or_github( // apply the transform if any to an intermediate buffer let mut reader_transformed = if let Some(transform_data_fn) = transform_data_fn { // TODO this is not optimal, the additional copy is a bit annoying - let mut intermediate = Vec::::with_capacity(4096 * 1024); - transform_data_fn(reader_binenc, Cursor::new(&mut intermediate)).map_err(Error::TransformError)?; - Cursor::new(intermediate) + let mut intermediate = Box::new(Cursor::new(Vec::::with_capacity(4096 * 1024))); + transform_data_fn(Box::new(reader_binenc), Box::new(&mut intermediate)).map_err(Error::TransformError)?; + intermediate } else { - reader_binenc + Box::new(reader_binenc) }; // update the cache entry @@ -356,26 +358,26 @@ impl BinaryBuilder { } - let mut rules_out = BufWriter::new(fs::OpenOptions::new().truncate(true).create(true).write(true).open(rules_out)?); - let mut tokenizer_out = BufWriter::new(fs::OpenOptions::new().truncate(true).create(true).write(true).open(tokenizer_out)?); - let (rules_out, tokenizer_out) = if let Some(ref transform_data_fn) = self.transform_data_fn { - let mut transfer_buffer_rules = BufWriter::new(Vec::new()); - let mut transfer_buffer_tokenizer = BufWriter::new(Vec::new()); + let mut rules_sink = BufWriter::new(fs::OpenOptions::new().truncate(true).create(true).write(true).open(&rules_out)?); + let mut tokenizer_sink = BufWriter::new(fs::OpenOptions::new().truncate(true).create(true).write(true).open(&tokenizer_out)?); + if let Some(ref transform_data_fn) = self.transform_data_fn { + let mut transfer_buffer_rules = Cursor::new(Vec::new()); + let mut transfer_buffer_tokenizer = Cursor::new(Vec::new()); compile::compile( build_dir, - transfer_buffer_rules, - transfer_buffer_tokenizer, + &mut transfer_buffer_rules, + &mut transfer_buffer_tokenizer, ).map_err(Error::CollationFailed)?; - transform_data_fn(Box::new(transfer_buffer_rules), Box::new(rules_out))?; - transform_data_fn(Box::new(transfer_buffer_tokenizer), Box::new(tokenizer_out))?; + transform_data_fn(Box::new(transfer_buffer_rules), Box::new(rules_sink)).map_err(Error::TransformError)?; + transform_data_fn(Box::new(transfer_buffer_tokenizer), Box::new(tokenizer_sink)).map_err(Error::TransformError)?; } else { compile::compile( build_dir, - rules_out, - tokenizer_out, + &mut rules_sink, + &mut tokenizer_sink, ).map_err(Error::CollationFailed)?; }; @@ -503,10 +505,7 @@ impl BinaryBuilder { /// Attention: Any compression applied here, must be undone in the /// `fn postprocess` provided closure to retain the original binenc file /// to be consumed by the application code. - pub fn transform(mut self, proc_fn: C, path_fn: F) -> Self - where - C: TransformDataFn + 'static, - F: TransformPathFn + 'static, + pub fn transform(mut self, proc_fn: &'static (dyn TransformDataFn), path_fn: &'static (dyn TransformPathFn)) -> Self { self.transform_data_fn = Some(Box::new(proc_fn)); self.transform_path_fn = Some(Box::new(path_fn)); @@ -713,9 +712,11 @@ mod tests { let builder = BinaryBuilder::new(&["en"], tempdir) .version("0.3.0") .transform( - |buffer: Cursor>, mut writer: Cursor<&'_ mut Vec>| { + &|mut buffer, mut writer| { + let mut data = Vec::new(); + buffer.read_to_end(&mut data)?; let data = smush::encode( - buffer.into_inner().as_slice(), + data.as_slice(), smush::Codec::Zstd, smush::Quality::Maximum, )?; @@ -725,7 +726,7 @@ mod tests { // let _ = brotli::BrotliCompress(&mut buffer, &mut sink, &Default::default())?; Ok(()) }, - |p: PathBuf| { + &|p: PathBuf| { let mut s = p.to_string_lossy().to_string(); s.push_str(".zstd"); Ok(PathBuf::from(s)) From e3d8f323a29679a9b55ea0b77acc84057514f7a4 Mon Sep 17 00:00:00 2001 From: Bernhard Schuster Date: Fri, 19 Feb 2021 09:51:20 +0100 Subject: [PATCH 10/21] moar fixes --- build/Cargo.toml | 1 + build/src/lib.rs | 6 +++--- nlprule/src/bin/compile.rs | 6 ++++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/build/Cargo.toml b/build/Cargo.toml index f5024de..d215e63 100644 --- a/build/Cargo.toml +++ b/build/Cargo.toml @@ -22,3 +22,4 @@ fs-err = "2.5" [dev-dependencies] tempdir = "0.3" smush = "0.1.5" +env_logger = "0.8" diff --git a/build/src/lib.rs b/build/src/lib.rs index fd07e5c..ccd4882 100644 --- a/build/src/lib.rs +++ b/build/src/lib.rs @@ -161,9 +161,7 @@ fn obtain_binary_cache_or_github( // if the file can be read, the data is already cached and the transform was applied before if let Some(ref cache_path) = cache_path { if let Ok(bytes) = fs::read(cache_path) { - if bytes.len() > 256 { - return Ok(Box::new(Cursor::new(bytes))); - } + return Ok(Box::new(Cursor::new(bytes))); } } @@ -589,6 +587,8 @@ mod tests { #[test] fn getting_build_dir_works() -> Result<()> { + let _ = env_logger::builder().is_test(true).try_init(); + let tempdir = tempdir::TempDir::new("build_dir_test")?; let tempdir = tempdir.path(); diff --git a/nlprule/src/bin/compile.rs b/nlprule/src/bin/compile.rs index 3a59eec..3388d27 100644 --- a/nlprule/src/bin/compile.rs +++ b/nlprule/src/bin/compile.rs @@ -1,12 +1,14 @@ use clap::Clap; use nlprule::compile::{compile, BuildOptions, Error}; +use std::io::BufWriter; +use fs_err as fs; fn main() -> Result<(), Error> { env_logger::init(); let opts = BuildOptions::parse(); - let tokenizer_sink = BufWriter::new(File::create(&opts.tokenizer_out)?); - let rules_sink = BufWriter::new(File::create(&opts.rules_out)?); + let tokenizer_sink = BufWriter::new(fs::File::create(&opts.tokenizer_out)?); + let rules_sink = BufWriter::new(fs::File::create(&opts.rules_out)?); compile(opts.build_dir, rules_sink, tokenizer_sink) } From 20d489d410e622c05de2e6f598e2c75bd1ded09c Mon Sep 17 00:00:00 2001 From: Bernhard Schuster Date: Fri, 19 Feb 2021 13:13:18 +0100 Subject: [PATCH 11/21] Update build/src/lib.rs Co-authored-by: Benjamin Minixhofer --- build/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/src/lib.rs b/build/src/lib.rs index ccd4882..ea26d92 100644 --- a/build/src/lib.rs +++ b/build/src/lib.rs @@ -637,7 +637,7 @@ mod tests { let mut tmp = Vec::new(); buffer.read_to_end(&mut tmp)?; Ok(writer.write_all(&smush::encode( - &mut tmp, + &tmp, smush::Codec::Gzip, smush::Quality::Default, )?)?) From 098d172685557ff0a0902025941c5cb125e37628 Mon Sep 17 00:00:00 2001 From: Bernhard Schuster Date: Fri, 19 Feb 2021 13:13:26 +0100 Subject: [PATCH 12/21] Update build/src/lib.rs Co-authored-by: Benjamin Minixhofer --- build/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/src/lib.rs b/build/src/lib.rs index ea26d92..e322fc3 100644 --- a/build/src/lib.rs +++ b/build/src/lib.rs @@ -30,7 +30,7 @@ pub enum Error { ZipError(#[from] ZipError), #[error("error postprocessing binaries: {0}")] PostprocessingError(#[source] OtherError), - #[error("error postprocessing binaries: {0}")] + #[error("error transforming binaries: {0}")] TransformError(#[source] OtherError), #[error("Collation failed")] CollationFailed(#[source] nlprule::compile::Error) From cf3ef827be9fda517a424b338355b16d8a17967e Mon Sep 17 00:00:00 2001 From: Bernhard Schuster Date: Fri, 19 Feb 2021 13:13:41 +0100 Subject: [PATCH 13/21] Update build/src/lib.rs Co-authored-by: Benjamin Minixhofer --- build/src/lib.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/build/src/lib.rs b/build/src/lib.rs index e322fc3..b9935df 100644 --- a/build/src/lib.rs +++ b/build/src/lib.rs @@ -494,9 +494,11 @@ impl BinaryBuilder { &self.outputs } - /// Applies the given transformation before placing a binencoded file into the cache - /// Allows for easier compression usage. - /// Modifies the cached path by the given path function. + /// Applies the given transformation function to the binary immediately after obtaining it. + /// This happens before placing the file in the cache (if any) so by using a compression + /// function the size of the cache directory can be reduced. + /// Modifies the path of the cached binaries by the given `path_fn`. + /// If no cache directory is set or the binaries are built from the build dir, the `path_fn` does nothing. /// /// The resulting files will then reside in the given cache dir if any. /// From abc7188ae2eb9be83530be5decc19f4094fb36d8 Mon Sep 17 00:00:00 2001 From: Bernhard Schuster Date: Fri, 19 Feb 2021 13:15:44 +0100 Subject: [PATCH 14/21] Update build/src/lib.rs Co-authored-by: Benjamin Minixhofer --- build/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/src/lib.rs b/build/src/lib.rs index b9935df..a3b0a15 100644 --- a/build/src/lib.rs +++ b/build/src/lib.rs @@ -171,7 +171,7 @@ fn obtain_binary_cache_or_github( // apply the transform if any to an intermediate buffer let mut reader_transformed = if let Some(transform_data_fn) = transform_data_fn { // TODO this is not optimal, the additional copy is a bit annoying - let mut intermediate = Box::new(Cursor::new(Vec::::with_capacity(4096 * 1024))); + let mut intermediate = Box::new(Cursor::new(Vec::::new())); transform_data_fn(Box::new(reader_binenc), Box::new(&mut intermediate)).map_err(Error::TransformError)?; intermediate } else { From bd1d19a5568bfda73a3ae65d355e5a03ebdb37d4 Mon Sep 17 00:00:00 2001 From: Bernhard Schuster Date: Fri, 19 Feb 2021 13:15:53 +0100 Subject: [PATCH 15/21] Update build/src/lib.rs Co-authored-by: Benjamin Minixhofer --- build/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/src/lib.rs b/build/src/lib.rs index a3b0a15..529e268 100644 --- a/build/src/lib.rs +++ b/build/src/lib.rs @@ -154,7 +154,7 @@ fn obtain_binary_cache_or_github( version, lang_code, binary, - cache_dir.clone(), + cache_dir, transform_path_fn, )?; From 1281aa094f923c3612a37c8e5bea81d866508d00 Mon Sep 17 00:00:00 2001 From: Bernhard Schuster Date: Fri, 19 Feb 2021 13:16:03 +0100 Subject: [PATCH 16/21] Update build/src/lib.rs Co-authored-by: Benjamin Minixhofer --- build/src/lib.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/build/src/lib.rs b/build/src/lib.rs index 529e268..99169c3 100644 --- a/build/src/lib.rs +++ b/build/src/lib.rs @@ -5,7 +5,6 @@ use flate2::bufread::GzDecoder; use fs::File; use fs_err as fs; use nlprule::{compile, rules_filename, tokenizer_filename}; -use reqwest::blocking::Response; use std::fs::Permissions; use std::{ io::{self, BufReader, BufWriter, Cursor, Read, Write, Seek, SeekFrom}, From 50ce726fce28fb39d6f304dd9b02ac4ceb90ba1c Mon Sep 17 00:00:00 2001 From: Bernhard Schuster Date: Fri, 19 Feb 2021 13:16:10 +0100 Subject: [PATCH 17/21] Update build/src/lib.rs Co-authored-by: Benjamin Minixhofer --- build/src/lib.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/build/src/lib.rs b/build/src/lib.rs index 99169c3..d968a7a 100644 --- a/build/src/lib.rs +++ b/build/src/lib.rs @@ -92,7 +92,6 @@ fn obtain_binary_from_github_release( ) -> Result>> { let filename = binary.filename(lang_code); - // ... otherwise, request the data from the URL ... let bytes = reqwest::blocking::get(&format!( "https://github.com/bminixhofer/nlprule/releases/download/{}/{}.gz", version, filename From 18e0019fccff17087aafabd73e4e83addc080d35 Mon Sep 17 00:00:00 2001 From: Bernhard Schuster Date: Fri, 19 Feb 2021 13:17:27 +0100 Subject: [PATCH 18/21] move clap struct def --- nlprule/src/bin/compile.rs | 14 ++++++++++++++ nlprule/src/compile/mod.rs | 14 -------------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/nlprule/src/bin/compile.rs b/nlprule/src/bin/compile.rs index 3388d27..f011a55 100644 --- a/nlprule/src/bin/compile.rs +++ b/nlprule/src/bin/compile.rs @@ -3,6 +3,20 @@ use nlprule::compile::{compile, BuildOptions, Error}; use std::io::BufWriter; use fs_err as fs; +#[derive(clap::Clap)] +#[clap( + version = env!("CARGO_PKG_VERSION"), + author = "Benjamin Minixhofer " +)] +pub struct BuildOptions { + #[clap(long, parse(from_os_str))] + pub build_dir: PathBuf, + #[clap(long, parse(from_os_str))] + pub tokenizer_out: PathBuf, + #[clap(long, parse(from_os_str))] + pub rules_out: PathBuf, +} + fn main() -> Result<(), Error> { env_logger::init(); let opts = BuildOptions::parse(); diff --git a/nlprule/src/compile/mod.rs b/nlprule/src/compile/mod.rs index 4bea901..ef240bb 100644 --- a/nlprule/src/compile/mod.rs +++ b/nlprule/src/compile/mod.rs @@ -17,7 +17,6 @@ use crate::{ tokenizer::{chunk::Chunker, multiword::MultiwordTagger, tag::Tagger, Tokenizer}, types::DefaultHasher, }; -use clap::Clap; use log::info; use self::parse_structure::{BuildInfo, RegexCache}; @@ -28,19 +27,6 @@ mod parse_structure; mod structure; mod utils; -#[derive(Clap)] -#[clap( - version = env!("CARGO_PKG_VERSION"), - author = "Benjamin Minixhofer " -)] -pub struct BuildOptions { - #[clap(long, parse(from_os_str))] - pub build_dir: PathBuf, - #[clap(long, parse(from_os_str))] - pub tokenizer_out: PathBuf, - #[clap(long, parse(from_os_str))] - pub rules_out: PathBuf, -} struct BuildFilePaths { lang_code_path: PathBuf, From 7a9f78402aafffececdc93103f05abb4d00f3f26 Mon Sep 17 00:00:00 2001 From: Bernhard Schuster Date: Fri, 19 Feb 2021 13:18:13 +0100 Subject: [PATCH 19/21] chore: cargo fmt --- build/src/lib.rs | 98 +++++++++++++++++++------------------- nlprule/src/bin/compile.rs | 2 +- nlprule/src/compile/mod.rs | 9 ++-- 3 files changed, 55 insertions(+), 54 deletions(-) diff --git a/build/src/lib.rs b/build/src/lib.rs index d968a7a..2a7a149 100644 --- a/build/src/lib.rs +++ b/build/src/lib.rs @@ -7,7 +7,7 @@ use fs_err as fs; use nlprule::{compile, rules_filename, tokenizer_filename}; use std::fs::Permissions; use std::{ - io::{self, BufReader, BufWriter, Cursor, Read, Write, Seek, SeekFrom}, + io::{self, BufReader, BufWriter, Cursor, Read, Seek, SeekFrom, Write}, path::{Path, PathBuf}, result, }; @@ -32,38 +32,26 @@ pub enum Error { #[error("error transforming binaries: {0}")] TransformError(#[source] OtherError), #[error("Collation failed")] - CollationFailed(#[source] nlprule::compile::Error) + CollationFailed(#[source] nlprule::compile::Error), } pub type Result = std::result::Result; /// Definition of the data transformation for the network retrieved, binencoded rules and tokenizer datasets. pub trait TransformDataFn: - for<'w> Fn( - Box, - Box, - ) -> result::Result<(), OtherError> + for<'w> Fn(Box, Box) -> result::Result<(), OtherError> { } impl TransformDataFn for T where - T: for<'w> Fn( - Box, - Box, - ) -> result::Result<(), OtherError> + T: for<'w> Fn(Box, Box) -> result::Result<(), OtherError> { } /// Definition of the path transformation for the network retrieved, binencoded rules and tokenizer datasets. -pub trait TransformPathFn: - Fn(PathBuf) -> result::Result -{ -} +pub trait TransformPathFn: Fn(PathBuf) -> result::Result {} -impl TransformPathFn for T where - T: Fn(PathBuf) -> result::Result -{ -} +impl TransformPathFn for T where T: Fn(PathBuf) -> result::Result {} #[derive(Debug, Clone, Copy, Hash, Eq, PartialEq)] pub enum Binary { @@ -148,13 +136,8 @@ fn obtain_binary_cache_or_github( transform_path_fn: Option<&dyn TransformPathFn>, transform_data_fn: Option<&dyn TransformDataFn>, ) -> Result> { - let cache_path = construct_cache_path( - version, - lang_code, - binary, - cache_dir, - transform_path_fn, - )?; + let cache_path = + construct_cache_path(version, lang_code, binary, cache_dir, transform_path_fn)?; // if the file can be read, the data is already cached and the transform was applied before if let Some(ref cache_path) = cache_path { @@ -170,7 +153,8 @@ fn obtain_binary_cache_or_github( let mut reader_transformed = if let Some(transform_data_fn) = transform_data_fn { // TODO this is not optimal, the additional copy is a bit annoying let mut intermediate = Box::new(Cursor::new(Vec::::new())); - transform_data_fn(Box::new(reader_binenc), Box::new(&mut intermediate)).map_err(Error::TransformError)?; + transform_data_fn(Box::new(reader_binenc), Box::new(&mut intermediate)) + .map_err(Error::TransformError)?; intermediate } else { Box::new(reader_binenc) @@ -306,11 +290,13 @@ impl BinaryBuilder { fn build_language(&mut self, lang_code: &str) -> Result<()> { // adjust the destination path for now let path_transform = |out: PathBuf| -> Result { - Ok(if let Some(ref transform_path_fn) = self.transform_path_fn { - transform_path_fn(out).map_err(Error::TransformError)? - } else { - out - }) + Ok( + if let Some(ref transform_path_fn) = self.transform_path_fn { + transform_path_fn(out).map_err(Error::TransformError)? + } else { + out + }, + ) }; let tokenizer_out = path_transform(self.out_dir.join(tokenizer_filename(lang_code)))?; @@ -353,9 +339,20 @@ impl BinaryBuilder { get_build_dir(lang_code, &build_dir).expect("error loading build directory"); } - - let mut rules_sink = BufWriter::new(fs::OpenOptions::new().truncate(true).create(true).write(true).open(&rules_out)?); - let mut tokenizer_sink = BufWriter::new(fs::OpenOptions::new().truncate(true).create(true).write(true).open(&tokenizer_out)?); + let mut rules_sink = BufWriter::new( + fs::OpenOptions::new() + .truncate(true) + .create(true) + .write(true) + .open(&rules_out)?, + ); + let mut tokenizer_sink = BufWriter::new( + fs::OpenOptions::new() + .truncate(true) + .create(true) + .write(true) + .open(&tokenizer_out)?, + ); if let Some(ref transform_data_fn) = self.transform_data_fn { let mut transfer_buffer_rules = Cursor::new(Vec::new()); let mut transfer_buffer_tokenizer = Cursor::new(Vec::new()); @@ -364,19 +361,20 @@ impl BinaryBuilder { build_dir, &mut transfer_buffer_rules, &mut transfer_buffer_tokenizer, - ).map_err(Error::CollationFailed)?; - - transform_data_fn(Box::new(transfer_buffer_rules), Box::new(rules_sink)).map_err(Error::TransformError)?; - transform_data_fn(Box::new(transfer_buffer_tokenizer), Box::new(tokenizer_sink)).map_err(Error::TransformError)?; + ) + .map_err(Error::CollationFailed)?; + transform_data_fn(Box::new(transfer_buffer_rules), Box::new(rules_sink)) + .map_err(Error::TransformError)?; + transform_data_fn( + Box::new(transfer_buffer_tokenizer), + Box::new(tokenizer_sink), + ) + .map_err(Error::TransformError)?; } else { - compile::compile( - build_dir, - &mut rules_sink, - &mut tokenizer_sink, - ).map_err(Error::CollationFailed)?; + compile::compile(build_dir, &mut rules_sink, &mut tokenizer_sink) + .map_err(Error::CollationFailed)?; }; - } else if did_not_find_binaries { panic!( "Did not find binaries for version {}. \ @@ -503,8 +501,11 @@ impl BinaryBuilder { /// Attention: Any compression applied here, must be undone in the /// `fn postprocess` provided closure to retain the original binenc file /// to be consumed by the application code. - pub fn transform(mut self, proc_fn: &'static (dyn TransformDataFn), path_fn: &'static (dyn TransformPathFn)) -> Self - { + pub fn transform( + mut self, + proc_fn: &'static (dyn TransformDataFn), + path_fn: &'static (dyn TransformPathFn), + ) -> Self { self.transform_data_fn = Some(Box::new(proc_fn)); self.transform_path_fn = Some(Box::new(path_fn)); self @@ -542,10 +543,7 @@ impl BinaryBuilder { /// ``` pub fn postprocess(mut self, proc_fn: C, path_fn: F) -> Result where - C: Fn( - BufReader, - BufWriter, - ) -> result::Result<(), OtherError>, + C: Fn(BufReader, BufWriter) -> result::Result<(), OtherError>, F: Fn(PathBuf) -> P, P: AsRef, { diff --git a/nlprule/src/bin/compile.rs b/nlprule/src/bin/compile.rs index f011a55..070550d 100644 --- a/nlprule/src/bin/compile.rs +++ b/nlprule/src/bin/compile.rs @@ -1,7 +1,7 @@ use clap::Clap; +use fs_err as fs; use nlprule::compile::{compile, BuildOptions, Error}; use std::io::BufWriter; -use fs_err as fs; #[derive(clap::Clap)] #[clap( diff --git a/nlprule/src/compile/mod.rs b/nlprule/src/compile/mod.rs index ef240bb..2ed3054 100644 --- a/nlprule/src/compile/mod.rs +++ b/nlprule/src/compile/mod.rs @@ -1,7 +1,7 @@ //! Creates the nlprule binaries from a *build directory*. Usage information in /build/README.md. -use fs_err as fs; use fs::File; +use fs_err as fs; use std::{ hash::{Hash, Hasher}, @@ -27,7 +27,6 @@ mod parse_structure; mod structure; mod utils; - struct BuildFilePaths { lang_code_path: PathBuf, tag_paths: Vec, @@ -83,7 +82,11 @@ pub enum Error { ParseError(#[from] ParseIntError), } -pub fn compile(build_dir: impl AsRef, mut rules_dest: impl io::Write, mut tokenizer_dest: impl io::Write) -> Result<(), Error> { +pub fn compile( + build_dir: impl AsRef, + mut rules_dest: impl io::Write, + mut tokenizer_dest: impl io::Write, +) -> Result<(), Error> { let paths = BuildFilePaths::new(&build_dir); let lang_code = fs::read_to_string(paths.lang_code_path)?; From ab3902fcafd3294a00774ec8a19a9a886f03705b Mon Sep 17 00:00:00 2001 From: Bernhard Schuster Date: Fri, 19 Feb 2021 13:20:37 +0100 Subject: [PATCH 20/21] dont drop errors --- build/src/lib.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/build/src/lib.rs b/build/src/lib.rs index 2a7a149..336875d 100644 --- a/build/src/lib.rs +++ b/build/src/lib.rs @@ -309,7 +309,7 @@ impl BinaryBuilder { (Binary::Rules, &rules_out), ] { let out = out.to_owned().to_owned(); - if let Err(e) = assure_binary_availability( + match assure_binary_availability( &self.version, lang_code, *binary, @@ -318,10 +318,11 @@ impl BinaryBuilder { self.transform_data_fn.as_ref().map(|x| x.as_ref()), out, ) { - if let Error::BinariesNotFound = e { + Err(e) if Error::BinariesNotFound => { did_not_find_binaries = true; break; } + res => res?, } } From 428e58fa6bc2f8f4381cb6bee94c66fa496103bc Mon Sep 17 00:00:00 2001 From: Bernhard Schuster Date: Fri, 19 Feb 2021 14:23:34 +0100 Subject: [PATCH 21/21] slip of the pen --- build/src/lib.rs | 2 +- nlprule/src/bin/compile.rs | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/build/src/lib.rs b/build/src/lib.rs index 336875d..344667c 100644 --- a/build/src/lib.rs +++ b/build/src/lib.rs @@ -318,7 +318,7 @@ impl BinaryBuilder { self.transform_data_fn.as_ref().map(|x| x.as_ref()), out, ) { - Err(e) if Error::BinariesNotFound => { + Err(Error::BinariesNotFound) => { did_not_find_binaries = true; break; } diff --git a/nlprule/src/bin/compile.rs b/nlprule/src/bin/compile.rs index 070550d..6f63412 100644 --- a/nlprule/src/bin/compile.rs +++ b/nlprule/src/bin/compile.rs @@ -1,7 +1,8 @@ use clap::Clap; use fs_err as fs; -use nlprule::compile::{compile, BuildOptions, Error}; +use nlprule::compile::{compile, Error}; use std::io::BufWriter; +use std::path::PathBuf; #[derive(clap::Clap)] #[clap(