-
Notifications
You must be signed in to change notification settings - Fork 41
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
9f50576
commit 6923d40
Showing
9 changed files
with
320 additions
and
148 deletions.
There are no files selected for viewing
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
use lazy_static::lazy_static; | ||
use once_cell::sync::OnceCell; | ||
use onig::{Captures, Regex}; | ||
use serde::{Deserialize, Serialize}; | ||
|
||
pub mod parallelism; | ||
pub mod regex; | ||
|
||
#[derive(Debug, Serialize, Deserialize, Clone)] | ||
pub struct CacheString<S: AsRef<str>> { | ||
string: S, | ||
#[serde(skip)] | ||
lower: OnceCell<String>, | ||
} | ||
|
||
impl<S: AsRef<str>> PartialEq for CacheString<S> { | ||
fn eq(&self, other: &Self) -> bool { | ||
other.as_str() == self.as_str() | ||
} | ||
} | ||
|
||
impl<S: AsRef<str>> From<S> for CacheString<S> { | ||
fn from(string: S) -> Self { | ||
CacheString { | ||
lower: OnceCell::new(), | ||
string, | ||
} | ||
} | ||
} | ||
|
||
impl<S: AsRef<str>> CacheString<S> { | ||
pub fn to_lowercase(&self) -> &str { | ||
self.lower | ||
.get_or_init(|| self.string.as_ref().to_lowercase()) | ||
.as_str() | ||
} | ||
|
||
pub fn as_str(&self) -> &str { | ||
self.string.as_ref() | ||
} | ||
|
||
pub fn inner(self) -> S { | ||
self.string | ||
} | ||
} | ||
|
||
/// Applies `func` to the first character of `string` and appends the untouched remainder.
///
/// Returns an empty string when `string` is empty; `func` is not called in that case.
/// `func` is invoked at most once, so any `FnOnce(char) -> String` is accepted
/// (every `Fn` closure still qualifies).
// see https://stackoverflow.com/questions/38406793/why-is-capitalizing-the-first-letter-of-a-string-so-convoluted-in-rust
pub fn apply_to_first<F>(string: &str, func: F) -> String
where
    F: FnOnce(char) -> String,
{
    let mut chars = string.chars();
    match chars.next() {
        Some(first) => {
            // Reuse the buffer produced by `func` instead of building a new one.
            let mut result = func(first);
            result.push_str(chars.as_str());
            result
        }
        None => String::new(),
    }
}
|
||
/// Returns `true` if the first character is uppercase and no later character is.
///
/// The empty string is not title case.
pub fn is_title_case(string: &str) -> bool {
    let mut chars = string.chars();
    match chars.next() {
        Some(first) if first.is_uppercase() => chars.all(|c| !c.is_uppercase()),
        _ => false,
    }
}
|
||
/// Returns `true` if `string` contains no lowercase character.
///
/// Strings without any cased characters, including the empty string, count as uppercase.
pub fn is_uppercase(string: &str) -> bool {
    string.chars().all(|c| !c.is_lowercase())
}
|
||
// see https://github.com/rust-onig/rust-onig/issues/59#issuecomment-340160520 | ||
pub fn dollar_replace(mut replacement: String, caps: &Captures) -> String { | ||
for i in 1..caps.len() { | ||
replacement = replacement.replace(&format!("${}", i), caps.at(i).unwrap_or("")); | ||
} | ||
replacement | ||
} | ||
|
||
// remove duplicate whitespaces | ||
pub fn normalize_whitespace(string: &str) -> String { | ||
lazy_static! { | ||
static ref REGEX: Regex = Regex::new(r"(\s)\s+").unwrap(); | ||
} | ||
|
||
REGEX.replace_all(string, |caps: &Captures| caps.at(1).unwrap().to_string()) | ||
} | ||
|
||
/// Returns the splitting characters, concatenated into a single string.
#[inline]
pub fn splitting_chars() -> &'static str {
    const SPLITTING_CHARS: &str = r##"«»'’`´‘],.:;!?/\()<=>„“”"+#…*"##;
    SPLITTING_CHARS
}
|
||
/// Returns the characters that must not be preceded by whitespace, as a single string.
#[inline]
pub fn no_space_chars() -> &'static str {
    const NO_SPACE_CHARS: &str = r##","##;
    NO_SPACE_CHARS
}
|
||
pub fn fix_nospace_chars(text: &str) -> String { | ||
text.char_indices() | ||
.filter(|(i, c)| { | ||
if c.is_whitespace() { | ||
!no_space_chars() | ||
.chars() | ||
.any(|nospace_c| text[(i + c.len_utf8())..].starts_with(nospace_c)) | ||
} else { | ||
true | ||
} | ||
}) | ||
.map(|x| x.1) | ||
.collect() | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,197 @@ | ||
//! | ||
//! This module defines helpers to allow optional Rayon usage. | ||
//! Copied from huggingface/tokenizers v0.1.1. Only change is "TOKENIZERS_PARALLELISM" to "NLPRULE_PARALLELISM" | ||
//! | ||
|
||
use std::sync::atomic::{AtomicBool, Ordering};

use rayon::iter::IterBridge;
use rayon::prelude::*;
use rayon_cond::CondIterator;
|
||
pub const ENV_VARIABLE: &str = "NLPRULE_PARALLELISM"; | ||
|
||
// Reading/Writing this variable should always happen on the main thread | ||
static mut USED_PARALLELISM: bool = false; | ||
|
||
/// Check if the NLPRULE_PARALLELISM env variable has been explicitly set | ||
pub fn is_parallelism_configured() -> bool { | ||
std::env::var(ENV_VARIABLE).is_ok() | ||
} | ||
|
||
/// Check if at some point we used a parallel iterator | ||
pub fn has_parallelism_been_used() -> bool { | ||
unsafe { USED_PARALLELISM } | ||
} | ||
|
||
/// Get the currently set value for `NLPRULE_PARALLELISM` env variable | ||
pub fn get_parallelism() -> bool { | ||
match std::env::var(ENV_VARIABLE) { | ||
Ok(mut v) => { | ||
v.make_ascii_lowercase(); | ||
!matches!(v.as_ref(), "" | "off" | "false" | "f" | "no" | "n" | "0") | ||
} | ||
Err(_) => true, // If we couldn't get the variable, we use the default | ||
} | ||
} | ||
|
||
/// Set the value for `NLPRULE_PARALLELISM` for the current process | ||
pub fn set_parallelism(val: bool) { | ||
std::env::set_var(ENV_VARIABLE, if val { "true" } else { "false" }) | ||
} | ||
|
||
/// Allows to convert into an iterator that can be executed either parallelly or serially. | ||
/// | ||
/// The choice is made according to the currently set `NLPRULE_PARALLELISM` environment variable. | ||
/// This variable can have one of the following values | ||
/// - False => "" (empty value), "false", "f", "off", "no", "n", "0" | ||
/// - True => Any other value | ||
/// | ||
pub trait MaybeParallelIterator<P, S> | ||
where | ||
P: ParallelIterator, | ||
S: Iterator<Item = P::Item>, | ||
{ | ||
/// Convert ourself in a CondIterator, that will be executed either in parallel or serially, | ||
/// based solely on the `NLPRULE_PARALLELISM` environment variable | ||
fn into_maybe_par_iter(self) -> CondIterator<P, S>; | ||
/// Convert ourself in a CondIterator, that will be executed either in parallel or serially, | ||
/// based on both the `NLPRULE_PARALLELISM` environment variable and the provided bool. | ||
/// Both must be true to run with parallelism activated. | ||
fn into_maybe_par_iter_cond(self, cond: bool) -> CondIterator<P, S>; | ||
} | ||
|
||
impl<P, S, I> MaybeParallelIterator<P, S> for I | ||
where | ||
I: IntoParallelIterator<Iter = P, Item = P::Item> + IntoIterator<IntoIter = S, Item = S::Item>, | ||
P: ParallelIterator, | ||
S: Iterator<Item = P::Item>, | ||
{ | ||
fn into_maybe_par_iter(self) -> CondIterator<P, S> { | ||
let parallelism = get_parallelism(); | ||
if parallelism { | ||
unsafe { USED_PARALLELISM = true }; | ||
} | ||
CondIterator::new(self, parallelism) | ||
} | ||
|
||
fn into_maybe_par_iter_cond(self, cond: bool) -> CondIterator<P, S> { | ||
if cond { | ||
self.into_maybe_par_iter() | ||
} else { | ||
CondIterator::from_serial(self) | ||
} | ||
} | ||
} | ||
|
||
/// Shared reference version of MaybeParallelIterator, works the same but returns an iterator | ||
/// over references, does not consume self | ||
pub trait MaybeParallelRefIterator<'data, P, S> | ||
where | ||
P: ParallelIterator, | ||
S: Iterator<Item = P::Item>, | ||
P::Item: 'data, | ||
{ | ||
fn maybe_par_iter(&'data self) -> CondIterator<P, S>; | ||
fn maybe_par_iter_cond(&'data self, cond: bool) -> CondIterator<P, S>; | ||
} | ||
|
||
impl<'data, P, S, I: 'data + ?Sized> MaybeParallelRefIterator<'data, P, S> for I | ||
where | ||
&'data I: MaybeParallelIterator<P, S>, | ||
P: ParallelIterator, | ||
S: Iterator<Item = P::Item>, | ||
P::Item: 'data, | ||
{ | ||
fn maybe_par_iter(&'data self) -> CondIterator<P, S> { | ||
self.into_maybe_par_iter() | ||
} | ||
|
||
fn maybe_par_iter_cond(&'data self, cond: bool) -> CondIterator<P, S> { | ||
self.into_maybe_par_iter_cond(cond) | ||
} | ||
} | ||
|
||
/// Exclusive reference version of MaybeParallelIterator, works the same but returns an iterator | ||
/// over mutable references, does not consume self | ||
pub trait MaybeParallelRefMutIterator<'data, P, S> | ||
where | ||
P: ParallelIterator, | ||
S: Iterator<Item = P::Item>, | ||
P::Item: 'data, | ||
{ | ||
fn maybe_par_iter_mut(&'data mut self) -> CondIterator<P, S>; | ||
fn maybe_par_iter_mut_cond(&'data mut self, cond: bool) -> CondIterator<P, S>; | ||
} | ||
|
||
impl<'data, P, S, I: 'data + ?Sized> MaybeParallelRefMutIterator<'data, P, S> for I | ||
where | ||
&'data mut I: MaybeParallelIterator<P, S>, | ||
P: ParallelIterator, | ||
S: Iterator<Item = P::Item>, | ||
P::Item: 'data, | ||
{ | ||
fn maybe_par_iter_mut(&'data mut self) -> CondIterator<P, S> { | ||
self.into_maybe_par_iter() | ||
} | ||
|
||
fn maybe_par_iter_mut_cond(&'data mut self, cond: bool) -> CondIterator<P, S> { | ||
self.into_maybe_par_iter_cond(cond) | ||
} | ||
} | ||
|
||
/// Converts any serial iterator into a CondIterator, that can either run parallelly or serially. | ||
pub trait MaybeParallelBridge<T, S> | ||
where | ||
S: Iterator<Item = T> + Send, | ||
T: Send, | ||
{ | ||
fn maybe_par_bridge(self) -> CondIterator<IterBridge<S>, S>; | ||
fn maybe_par_bridge_cond(self, cond: bool) -> CondIterator<IterBridge<S>, S>; | ||
} | ||
|
||
impl<T, S> MaybeParallelBridge<T, S> for S | ||
where | ||
S: Iterator<Item = T> + Send, | ||
T: Send, | ||
{ | ||
fn maybe_par_bridge(self) -> CondIterator<IterBridge<S>, S> { | ||
let iter = CondIterator::from_serial(self); | ||
|
||
if get_parallelism() { | ||
unsafe { USED_PARALLELISM = true }; | ||
CondIterator::from_parallel(iter.into_parallel().right().unwrap()) | ||
} else { | ||
iter | ||
} | ||
} | ||
|
||
fn maybe_par_bridge_cond(self, cond: bool) -> CondIterator<IterBridge<S>, S> { | ||
if cond { | ||
self.maybe_par_bridge() | ||
} else { | ||
CondIterator::from_serial(self) | ||
} | ||
} | ||
} | ||
|
||
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the shared, exclusive and consuming conversions over the same vector
    /// and checks the sums before and after doubling every element in place.
    #[test]
    fn test_maybe_parallel_iterator() {
        let mut numbers = vec![1u32, 2, 3, 4, 5, 6];

        let initial_sum = numbers.maybe_par_iter().sum::<u32>();
        assert_eq!(initial_sum, 21);

        let doubled_sum = numbers
            .maybe_par_iter_mut()
            .map(|n| {
                *n *= 2;
                *n
            })
            .sum::<u32>();
        assert_eq!(doubled_sum, 42);

        // The doubling must be visible through later shared and consuming iteration.
        let shared_sum = numbers.maybe_par_iter().sum::<u32>();
        assert_eq!(shared_sum, 42);
        let consumed_sum = numbers.into_maybe_par_iter().sum::<u32>();
        assert_eq!(consumed_sum, 42);
    }
}
Oops, something went wrong.