Skip to content

Commit

Permalink
add NLPRULE_PARALLELISM env var
Browse files Browse the repository at this point in the history
  • Loading branch information
bminixhofer committed Jan 7, 2021
1 parent 9f50576 commit 6923d40
Show file tree
Hide file tree
Showing 9 changed files with 320 additions and 148 deletions.
30 changes: 0 additions & 30 deletions bindings/python/test.py

This file was deleted.

1 change: 1 addition & 0 deletions nlprule/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ enum_dispatch = "0.3.4"
indexmap = { version = "1", features = ["serde"]}
once_cell = "1.5"

rayon-cond = "0.1.0"
rayon = "1.5"

clap = { version = "3.0.0-beta.1", optional = true }
Expand Down
2 changes: 1 addition & 1 deletion nlprule/src/composition.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use crate::{
tokenizer::Token,
utils::{CacheString, SerializeRegex},
utils::{regex::SerializeRegex, CacheString},
};
use enum_dispatch::enum_dispatch;
use lazy_static::lazy_static;
Expand Down
2 changes: 1 addition & 1 deletion nlprule/src/filter/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use crate::tokenizer::Tokenizer;
use crate::utils::SerializeRegex;
use crate::utils::regex::SerializeRegex;
use crate::{composition::MatchGraph, Error};
use enum_dispatch::enum_dispatch;
use serde::{Deserialize, Serialize};
Expand Down
2 changes: 1 addition & 1 deletion nlprule/src/from_structure/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use crate::{
AndAtom, Atom, Composition, Matcher, NotAtom, OffsetAtom, OrAtom, Part, Quantifier,
TrueAtom, WordDataMatcher,
},
utils::SerializeRegex,
utils::regex::SerializeRegex,
};
use crate::{filter::get_filter, tokenizer::OwnedWordData};
use crate::{rule, tokenizer::OwnedWord};
Expand Down
5 changes: 2 additions & 3 deletions nlprule/src/rule/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,11 @@ use crate::{
tokenizer::{
finalize, IncompleteToken, OwnedWord, OwnedWordData, Token, Tokenizer, Word, WordData,
},
utils::{self, SerializeRegex},
utils::{self, parallelism::MaybeParallelRefIterator, regex::SerializeRegex},
};
use itertools::Itertools;
use log::{error, info, warn};
use onig::Captures;
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use std::{
borrow::Cow,
Expand Down Expand Up @@ -1037,7 +1036,7 @@ impl Rules {

let mut output: Vec<_> = self
.rules
.par_iter()
.maybe_par_iter()
.enumerate()
.filter(|(_, x)| x.on())
.map(|(i, rule)| {
Expand Down
109 changes: 109 additions & 0 deletions nlprule/src/utils/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
use lazy_static::lazy_static;
use once_cell::sync::OnceCell;
use onig::{Captures, Regex};
use serde::{Deserialize, Serialize};

pub mod parallelism;
pub mod regex;

/// A string-like value bundled with a lazily computed lowercase copy of itself.
///
/// `S` is the underlying storage and only needs to be viewable as `&str`.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct CacheString<S: AsRef<str>> {
// The wrapped string; this is the only field that takes part in (de)serialization.
string: S,
// Lowercase form of `string`, filled in on the first `to_lowercase` call.
// Skipped by serde, so it starts out empty again after deserialization.
#[serde(skip)]
lower: OnceCell<String>,
}

/// Equality looks only at the wrapped text; the lowercase cache is ignored.
impl<S> PartialEq for CacheString<S>
where
    S: AsRef<str>,
{
    fn eq(&self, rhs: &Self) -> bool {
        let ours: &str = self.string.as_ref();
        let theirs: &str = rhs.string.as_ref();
        theirs == ours
    }
}

/// Wraps a string-like value; the lowercase cache starts out empty.
impl<S> From<S> for CacheString<S>
where
    S: AsRef<str>,
{
    fn from(string: S) -> Self {
        let lower = OnceCell::new();
        Self { string, lower }
    }
}

impl<S: AsRef<str>> CacheString<S> {
pub fn to_lowercase(&self) -> &str {
self.lower
.get_or_init(|| self.string.as_ref().to_lowercase())
.as_str()
}

pub fn as_str(&self) -> &str {
self.string.as_ref()
}

pub fn inner(self) -> S {
self.string
}
}

// see https://stackoverflow.com/questions/38406793/why-is-capitalizing-the-first-letter-of-a-string-so-convoluted-in-rust
/// Applies `func` to the first character of `string` and appends the untouched remainder.
///
/// Returns an empty string when `string` is empty; `func` is not called in that case.
pub fn apply_to_first<F>(string: &str, func: F) -> String
where
    F: Fn(char) -> String,
{
    let mut chars = string.chars();

    if let Some(head) = chars.next() {
        // `chars` has advanced past the first character, so `as_str` is the tail.
        let mut result = func(head);
        result.push_str(chars.as_str());
        result
    } else {
        String::new()
    }
}

/// Returns `true` when the first character is uppercase and no later character is.
///
/// An empty string is not title case.
pub fn is_title_case(string: &str) -> bool {
    let mut chars = string.chars();

    match chars.next() {
        Some(first) if first.is_uppercase() => chars.all(|c| !c.is_uppercase()),
        _ => false,
    }
}

/// Returns `true` when `string` contains no lowercase character.
///
/// Strings without any cased characters (including the empty string) count as uppercase.
pub fn is_uppercase(string: &str) -> bool {
    string.chars().all(|c| !c.is_lowercase())
}

// see https://github.com/rust-onig/rust-onig/issues/59#issuecomment-340160520
/// Expands `$N` group references in `replacement` with the text captured by group `N`.
///
/// The template is scanned once from left to right:
/// - A `$` followed by digits refers to the longest digit prefix that is a valid group
///   number (`1..caps.len()`), so `$10` means group 10 when that group exists and
///   group 1 followed by a literal `0` otherwise.
/// - Group 0 is never expanded and numbers with a leading zero are not recognized,
///   so `$0` and `$01` are copied through verbatim, as is any `$N` without a matching group.
/// - A group that did not participate in the match expands to the empty string.
/// - Inserted capture text is never rescanned, so a `$N` that appears inside a captured
///   value is left untouched.
pub fn dollar_replace(replacement: String, caps: &Captures) -> String {
    let n_groups = caps.len();
    let mut expanded = String::with_capacity(replacement.len());
    let mut rest = replacement.as_str();

    while let Some(dollar) = rest.find('$') {
        // Everything before the `$` is literal text.
        expanded.push_str(&rest[..dollar]);
        let after = &rest[dollar + 1..];

        // Group numbers are written without leading zeros.
        let n_digits = if after.starts_with('0') {
            0
        } else {
            after.bytes().take_while(|b| b.is_ascii_digit()).count()
        };

        // Prefer the longest digit prefix that names an existing group.
        let reference = (1..=n_digits).rev().find_map(|len| {
            after[..len]
                .parse::<usize>()
                .ok()
                .filter(|group| (1..n_groups).contains(group))
                .map(|group| (group, len))
        });

        match reference {
            Some((group, len)) => {
                expanded.push_str(caps.at(group).unwrap_or(""));
                rest = &after[len..];
            }
            None => {
                // Not a valid reference: keep the `$` as-is and continue after it.
                expanded.push('$');
                rest = after;
            }
        }
    }

    expanded.push_str(rest);
    expanded
}

// remove duplicate whitespaces
pub fn normalize_whitespace(string: &str) -> String {
lazy_static! {
static ref REGEX: Regex = Regex::new(r"(\s)\s+").unwrap();
}

REGEX.replace_all(string, |caps: &Captures| caps.at(1).unwrap().to_string())
}

/// Returns the set of splitting characters as a single string, one entry per `char`.
#[inline]
pub fn splitting_chars() -> &'static str {
    const SPLITTING_CHARS: &str = r##"«»'’`´‘],.:;!?/\()<=>„“”"+#…*"##;
    SPLITTING_CHARS
}

/// Returns the characters that must not be preceded by whitespace, one entry per `char`.
#[inline]
pub fn no_space_chars() -> &'static str {
    const NO_SPACE_CHARS: &str = r##","##;
    NO_SPACE_CHARS
}

/// Removes every whitespace character that is immediately followed by one of the
/// characters returned by `no_space_chars`.
///
/// All other characters are copied through unchanged.
pub fn fix_nospace_chars(text: &str) -> String {
    let mut fixed = String::new();

    for (idx, ch) in text.char_indices() {
        if ch.is_whitespace() {
            // Text that starts right after this whitespace character.
            let following = &text[(idx + ch.len_utf8())..];
            let before_nospace_char = no_space_chars()
                .chars()
                .any(|nospace_c| following.starts_with(nospace_c));

            if before_nospace_char {
                continue;
            }
        }

        fixed.push(ch);
    }

    fixed
}
197 changes: 197 additions & 0 deletions nlprule/src/utils/parallelism.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
//!
//! This module defines helpers to allow optional Rayon usage.
//! Copied from huggingface/tokenizers v0.1.1. Only change is "TOKENIZERS_PARALLELISM" to "NLPRULE_PARALLELISM"
//!

use rayon::iter::IterBridge;
use rayon::prelude::*;
use rayon_cond::CondIterator;

pub const ENV_VARIABLE: &str = "NLPRULE_PARALLELISM";

// Reading/Writing this variable should always happen on the main thread
static mut USED_PARALLELISM: bool = false;

/// Check if the NLPRULE_PARALLELISM env variable has been explicitly set
pub fn is_parallelism_configured() -> bool {
std::env::var(ENV_VARIABLE).is_ok()
}

/// Check if at some point we used a parallel iterator
pub fn has_parallelism_been_used() -> bool {
unsafe { USED_PARALLELISM }
}

/// Get the currently set value for `NLPRULE_PARALLELISM` env variable
pub fn get_parallelism() -> bool {
match std::env::var(ENV_VARIABLE) {
Ok(mut v) => {
v.make_ascii_lowercase();
!matches!(v.as_ref(), "" | "off" | "false" | "f" | "no" | "n" | "0")
}
Err(_) => true, // If we couldn't get the variable, we use the default
}
}

/// Set the value for `NLPRULE_PARALLELISM` for the current process
pub fn set_parallelism(val: bool) {
std::env::set_var(ENV_VARIABLE, if val { "true" } else { "false" })
}

/// Allows to convert into an iterator that can be executed either parallelly or serially.
///
/// The choice is made according to the currently set `NLPRULE_PARALLELISM` environment variable.
/// This variable can have one of the following values
/// - False => "" (empty value), "false", "f", "off", "no", "n", "0"
/// - True => Any other value
///
pub trait MaybeParallelIterator<P, S>
where
P: ParallelIterator,
S: Iterator<Item = P::Item>,
{
/// Convert ourself in a CondIterator, that will be executed either in parallel or serially,
/// based solely on the `NLPRULE_PARALLELISM` environment variable
fn into_maybe_par_iter(self) -> CondIterator<P, S>;
/// Convert ourself in a CondIterator, that will be executed either in parallel or serially,
/// based on both the `NLPRULE_PARALLELISM` environment variable and the provided bool.
/// Both must be true to run with parallelism activated.
fn into_maybe_par_iter_cond(self, cond: bool) -> CondIterator<P, S>;
}

impl<P, S, I> MaybeParallelIterator<P, S> for I
where
I: IntoParallelIterator<Iter = P, Item = P::Item> + IntoIterator<IntoIter = S, Item = S::Item>,
P: ParallelIterator,
S: Iterator<Item = P::Item>,
{
fn into_maybe_par_iter(self) -> CondIterator<P, S> {
let parallelism = get_parallelism();
if parallelism {
unsafe { USED_PARALLELISM = true };
}
CondIterator::new(self, parallelism)
}

fn into_maybe_par_iter_cond(self, cond: bool) -> CondIterator<P, S> {
if cond {
self.into_maybe_par_iter()
} else {
CondIterator::from_serial(self)
}
}
}

/// Shared reference version of MaybeParallelIterator, works the same but returns an iterator
/// over references, does not consume self
pub trait MaybeParallelRefIterator<'data, P, S>
where
P: ParallelIterator,
S: Iterator<Item = P::Item>,
P::Item: 'data,
{
fn maybe_par_iter(&'data self) -> CondIterator<P, S>;
fn maybe_par_iter_cond(&'data self, cond: bool) -> CondIterator<P, S>;
}

impl<'data, P, S, I: 'data + ?Sized> MaybeParallelRefIterator<'data, P, S> for I
where
&'data I: MaybeParallelIterator<P, S>,
P: ParallelIterator,
S: Iterator<Item = P::Item>,
P::Item: 'data,
{
fn maybe_par_iter(&'data self) -> CondIterator<P, S> {
self.into_maybe_par_iter()
}

fn maybe_par_iter_cond(&'data self, cond: bool) -> CondIterator<P, S> {
self.into_maybe_par_iter_cond(cond)
}
}

/// Exclusive reference version of MaybeParallelIterator, works the same but returns an iterator
/// over mutable references, does not consume self
pub trait MaybeParallelRefMutIterator<'data, P, S>
where
P: ParallelIterator,
S: Iterator<Item = P::Item>,
P::Item: 'data,
{
fn maybe_par_iter_mut(&'data mut self) -> CondIterator<P, S>;
fn maybe_par_iter_mut_cond(&'data mut self, cond: bool) -> CondIterator<P, S>;
}

impl<'data, P, S, I: 'data + ?Sized> MaybeParallelRefMutIterator<'data, P, S> for I
where
&'data mut I: MaybeParallelIterator<P, S>,
P: ParallelIterator,
S: Iterator<Item = P::Item>,
P::Item: 'data,
{
fn maybe_par_iter_mut(&'data mut self) -> CondIterator<P, S> {
self.into_maybe_par_iter()
}

fn maybe_par_iter_mut_cond(&'data mut self, cond: bool) -> CondIterator<P, S> {
self.into_maybe_par_iter_cond(cond)
}
}

/// Converts any serial iterator into a CondIterator, that can either run parallelly or serially.
pub trait MaybeParallelBridge<T, S>
where
S: Iterator<Item = T> + Send,
T: Send,
{
fn maybe_par_bridge(self) -> CondIterator<IterBridge<S>, S>;
fn maybe_par_bridge_cond(self, cond: bool) -> CondIterator<IterBridge<S>, S>;
}

impl<T, S> MaybeParallelBridge<T, S> for S
where
S: Iterator<Item = T> + Send,
T: Send,
{
fn maybe_par_bridge(self) -> CondIterator<IterBridge<S>, S> {
let iter = CondIterator::from_serial(self);

if get_parallelism() {
unsafe { USED_PARALLELISM = true };
CondIterator::from_parallel(iter.into_parallel().right().unwrap())
} else {
iter
}
}

fn maybe_par_bridge_cond(self, cond: bool) -> CondIterator<IterBridge<S>, S> {
if cond {
self.maybe_par_bridge()
} else {
CondIterator::from_serial(self)
}
}
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Exercises the shared-reference, mutable-reference and consuming adapters on a `Vec`.
    #[test]
    fn test_maybe_parallel_iterator() {
        let mut values = vec![1u32, 2, 3, 4, 5, 6];

        // Borrowing iteration leaves the data untouched.
        let initial_sum = values.maybe_par_iter().sum::<u32>();
        assert_eq!(initial_sum, 21);

        // Mutable iteration doubles every element in place.
        let doubled_sum = values
            .maybe_par_iter_mut()
            .map(|value| {
                *value *= 2;
                *value
            })
            .sum::<u32>();
        assert_eq!(doubled_sum, 42);

        // The doubling is visible through both the borrowing and the consuming adapter.
        let borrowed_sum = values.maybe_par_iter().sum::<u32>();
        assert_eq!(borrowed_sum, 42);
        let owned_sum = values.into_maybe_par_iter().sum::<u32>();
        assert_eq!(owned_sum, 42);
    }
}

0 comments on commit 6923d40

Please sign in to comment.