remove skip mask, use fnv, id words and cache in regex
bminixhofer committed Jan 14, 2021
1 parent a86e10b commit fd894d3
Showing 15 changed files with 292 additions and 252 deletions.
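The commit title is terse, so a note on "id words": rather than storing every word and lemma as an owned string or Cow<str>, each distinct string is registered once and referred to by a numeric id afterwards, which makes comparisons and hashing cheap. A minimal sketch of that interning idea, using the fnv hasher this commit adds (WordIdMap and its methods are illustrative names, not nlprule's actual types):

use fnv::FnvHashMap;

/// Hypothetical interner: maps each distinct word to a stable u32 id.
#[derive(Default)]
struct WordIdMap {
    ids: FnvHashMap<String, u32>,
    words: Vec<String>,
}

impl WordIdMap {
    /// Returns the existing id for `word`, or assigns the next free one.
    fn intern(&mut self, word: &str) -> u32 {
        if let Some(&id) = self.ids.get(word) {
            return id;
        }
        let id = self.words.len() as u32;
        self.words.push(word.to_owned());
        self.ids.insert(word.to_owned(), id);
        id
    }

    /// Looks the text back up from its id.
    fn resolve(&self, id: u32) -> &str {
        &self.words[id as usize]
    }
}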
15 changes: 10 additions & 5 deletions bindings/python/src/lib.rs
@@ -271,7 +271,12 @@ impl PyTagger {
                 self.options.use_compound_split_heuristic,
             )
             .into_iter()
-            .map(|x| (x.lemma.to_string(), self.tagger.id_to_tag(x.pos_id)))
+            .map(|x| {
+                (
+                    x.lemma.as_ref().to_string(),
+                    self.tagger.id_to_tag(x.pos_id),
+                )
+            })
             .collect()
     }

@@ -316,7 +321,7 @@ impl PyToken {
 impl PyToken {
     #[getter]
     fn text(&self) -> &str {
-        &self.token.word.text
+        self.token.word.text.as_ref()
    }

#[getter]
@@ -330,7 +335,7 @@ impl PyToken {
             .word
             .tags
             .iter()
-            .map(|x| (x.lemma.as_str(), self.tagger.id_to_tag(x.pos_id)))
+            .map(|x| (x.lemma.as_ref(), self.tagger.id_to_tag(x.pos_id)))
             .collect()
     }

@@ -342,10 +347,10 @@
             .tags
             .iter()
             .filter_map(|x| {
-                if x.lemma.is_empty() {
+                if x.lemma.as_ref().is_empty() {
                     None
                 } else {
-                    Some(x.lemma.as_str())
+                    Some(x.lemma.as_ref())
                 }
             })
             .collect();
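Throughout the Python bindings, x.lemma.to_string() and x.lemma.as_str() become as_ref() calls. That suggests lemma changed from a Cow<str> to an id-carrying string type that exposes its text via AsRef<str>. A sketch of such a type under that assumption (IdString is an invented name, not the codebase's):

/// Hypothetical text-plus-id pair; AsRef<str> recovers the text.
struct IdString {
    text: String,
    id: u32,
}

impl IdString {
    /// The interned id, usable for cheap comparisons.
    fn id(&self) -> u32 {
        self.id
    }
}

impl AsRef<str> for IdString {
    fn as_ref(&self) -> &str {
        &self.text
    }
}

fn lemma_text(lemma: &IdString) -> String {
    // mirrors `x.lemma.as_ref().to_string()` in the diff above
    lemma.as_ref().to_string()
}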
1 change: 1 addition & 0 deletions nlprule/Cargo.toml
@@ -22,6 +22,7 @@ enum_dispatch = "0.3.4"
 indexmap = { version = "1", features = ["serde"]}
 unicase = "2.6.0"
 derivative = "2.1.3"
+fnv = "1.0.7"
 
 rayon-cond = "0.1.0"
 rayon = "1.5"
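fnv implements the Fowler–Noll–Vo hash, which is usually faster than the standard library's default SipHash for the small keys (numeric ids, short strings) this commit introduces, at the cost of DoS resistance. The crate's type aliases are drop-in replacements for the std collections:

use fnv::{FnvHashMap, FnvHashSet};

fn main() {
    // Same API as std::collections::HashMap, different hasher.
    let mut tag_names: FnvHashMap<u32, &str> = FnvHashMap::default();
    tag_names.insert(0, "NN");
    assert_eq!(tag_names.get(&0), Some(&"NN"));

    let mut seen: FnvHashSet<u32> = FnvHashSet::default();
    seen.insert(0);
    assert!(seen.contains(&0));
}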
7 changes: 3 additions & 4 deletions nlprule/src/bin/compile.rs
@@ -59,9 +59,10 @@ fn main() {
         &opts.tag_paths,
         &opts.tag_remove_paths,
         &tokenizer_options.extra_tags,
+        &common_words,
     )
     .unwrap();
-    let mut tokenizer = Tokenizer::from_xml(
+    let tokenizer = Tokenizer::from_xml(
         opts.disambiguation_path,
         Arc::new(tagger),
         if let Some(path) = opts.chunker_path {
@@ -74,13 +75,11 @@
         tokenizer_options,
     )
     .unwrap();
-    tokenizer.populate_cache(&common_words);
 
     let f = BufWriter::new(File::create(&opts.out_tokenizer_path).unwrap());
     bincode::serialize_into(f, &tokenizer).unwrap();
 
-    let mut rules = Rules::from_xml(opts.grammar_path, tokenizer.tagger(), rules_options);
-    rules.populate_cache(&common_words);
+    let rules = Rules::from_xml(opts.grammar_path, tokenizer.tagger(), rules_options);
 
     let f = BufWriter::new(File::create(&opts.out_rules_path).unwrap());
     bincode::serialize_into(f, &rules).unwrap();
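Both populate_cache calls are gone and common_words is passed to Tagger::from_dumps instead, so any word-level caching now happens once at construction time and the tokenizer and rules no longer need to be mut. A rough sketch of that constructor-time pattern (the signature is assumed for illustration; the real from_dumps also takes dump paths and tag options):

use std::collections::HashSet;

struct Tagger {
    /// Cache for frequent words, built eagerly instead of filled in later.
    common_cache: Vec<(String, Vec<u32>)>,
}

impl Tagger {
    fn from_dumps(common_words: &HashSet<String>) -> Self {
        let common_cache = common_words
            .iter()
            .map(|word| (word.clone(), Vec::new())) // real code would look up tags here
            .collect();
        Tagger { common_cache }
    }
}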
2 changes: 1 addition & 1 deletion nlprule/src/filter/mod.rs
@@ -50,7 +50,7 @@ impl Filterable for NoDisambiguationEnglishPartialPosTagFilter {
         let tokens = &group.tokens(graph.tokens());
 
         tokens.iter().all(|token| {
-            if let Some(captures) = self.regexp.captures(&token.word.text) {
+            if let Some(captures) = self.regexp.captures(&token.word.text.as_ref()) {
                 // get group 2 because `must_fully_match` adds one group
                 let tags = tokenizer.tagger().get_tags(
                     &captures.at(2).unwrap(),
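The only change in this filter is going through as_ref() on the id-backed text, but the commit title also mentions a cache in the regex layer. One plausible shape for that, offered as a guess rather than nlprule's actual implementation (which uses the onig engine, as the captures.at(2) call above shows), is to precompute match results for the common words and fall back to the engine for everything else:

use fnv::FnvHashMap;

/// Hypothetical wrapper memoizing is_match for frequent words.
struct CachedRegex {
    regex: regex::Regex,
    cache: FnvHashMap<String, bool>,
}

impl CachedRegex {
    fn new(regex: regex::Regex, common_words: &[String]) -> Self {
        let cache = common_words
            .iter()
            .map(|w| (w.clone(), regex.is_match(w)))
            .collect();
        CachedRegex { regex, cache }
    }

    fn is_match(&self, text: &str) -> bool {
        match self.cache.get(text) {
            Some(&hit) => hit,                 // cached common word
            None => self.regex.is_match(text), // anything else: run the engine
        }
    }
}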
30 changes: 14 additions & 16 deletions nlprule/src/rule/disambiguation.rs
@@ -1,7 +1,6 @@
 use crate::types::*;
 use itertools::Itertools;
 use serde::{Deserialize, Serialize};
-use std::borrow::Cow;
 
 use super::engine::composition::PosMatcher;

@@ -63,7 +62,8 @@ impl Disambiguation {
                 either::Left(data) => {
                     token.word.tags.retain(|x| {
                         !(x.pos_id == data.pos_id
-                            && (data.lemma.is_empty() || x.lemma == data.lemma))
+                            && (data.lemma.as_ref().is_empty()
+                                || x.lemma == data.lemma.as_ref_id()))
                     });
                 }
                 either::Right(filter) => {
@@ -79,21 +79,19 @@
                 match data_or_filter {
                     either::Left(limit) => {
                         for token in group.into_iter() {
-                            let last = token
-                                .word
-                                .tags
-                                .get(0)
-                                .map_or(token.word.text, |x| x.lemma.as_ref())
-                                .to_string();
+                            let last = token.word.tags.get(0).map_or_else(
+                                || token.word.text.clone(),
+                                |x| x.lemma.clone(),
+                            );
 
                             token.word.tags.retain(|x| x.pos_id == limit.pos_id);
 
                             if token.word.tags.is_empty() {
                                 token.word.tags.push(WordData::new(
                                     if retain_last {
-                                        Cow::Owned(last)
+                                        last
                                     } else {
-                                        token.word.text.into()
+                                        token.word.text.clone()
                                     },
                                     limit.pos_id,
                                 ));
@@ -113,10 +111,10 @@
                 for (group, data) in groups.into_iter().zip(datas) {
                     for token in group.into_iter() {
                         let data = WordData::new(
-                            if data.lemma.is_empty() {
-                                token.word.text
+                            if data.lemma.as_ref().is_empty() {
+                                token.word.text.clone()
                             } else {
-                                data.lemma.as_str()
+                                data.lemma.as_ref_id()
                             },
                             data.pos_id,
                         );
@@ -135,10 +133,10 @@
                 for (group, data) in groups.into_iter().zip(datas) {
                     for token in group.into_iter() {
                         let data = WordData::new(
-                            if data.lemma.is_empty() {
-                                token.word.text
+                            if data.lemma.as_ref().is_empty() {
+                                token.word.text.clone()
                             } else {
-                                data.lemma.as_str()
+                                data.lemma.as_ref_id()
                             },
                             data.pos_id,
                         );
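Cow<str> disappears from this file entirely; lemmas are now cloned or compared through the new id-aware accessors. The as_ref_id() calls suggest an owned lemma type with a borrowed view whose equality check can run on the id alone. A sketch of that pair under those assumptions (the type names are invented; only as_ref_id mirrors the diff):

/// Hypothetical owned lemma: text plus interned id.
#[derive(Clone)]
struct WordId {
    text: String,
    id: u32,
}

/// Hypothetical borrowed view of a WordId.
#[derive(Clone, Copy)]
struct WordIdRef<'a> {
    text: &'a str,
    id: u32,
}

impl WordId {
    fn as_ref_id(&self) -> WordIdRef<'_> {
        WordIdRef { text: &self.text, id: self.id }
    }
}

// Comparing an owned lemma against a borrowed one, as in
// `x.lemma == data.lemma.as_ref_id()` above, only needs the ids.
impl PartialEq<WordIdRef<'_>> for WordId {
    fn eq(&self, other: &WordIdRef<'_>) -> bool {
        self.id == other.id
    }
}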
