Skip to content

Commit

Permalink
optimize code
Browse files Browse the repository at this point in the history
  • Loading branch information
blueshen committed Jul 4, 2023
1 parent e1ff90a commit f39a6c4
Show file tree
Hide file tree
Showing 6 changed files with 84 additions and 60 deletions.
10 changes: 10 additions & 0 deletions src/core/char_util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,16 @@ pub fn regularize_str(input: &str) -> String {
regular_str
}

/// Regularizes every character of `input` (full-width → half-width,
/// upper → lower, etc. per `regularize`) and pairs it with its
/// classified `CharType`.
///
/// Single pass over the string: the original pre-sized the `Vec` with
/// `input.chars().count()`, which walked the UTF-8 sequence a second
/// time; `collect()` sizes from the iterator's hint instead.
#[allow(dead_code)]
pub fn regular_char_enum(input: &str) -> Vec<(char, CharType)> {
    input
        .chars()
        .map(|c| {
            let r_char = regularize(c);
            (r_char, CharType::from(r_char))
        })
        .collect()
}

pub fn utf8_slice(s: &str, begin: usize, end: usize) -> &str {
if end < begin {
return "";
Expand Down
46 changes: 26 additions & 20 deletions src/core/ik_arbitrator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,12 @@ pub struct IKArbitrator {}
impl IKArbitrator {
pub fn process(
&mut self,
org_lexemes: &OrderedLinkedList<Lexeme>,
mode: TokenMode,
orgin_lexemes: &OrderedLinkedList<Lexeme>,
mode: &TokenMode,
) -> HashMap<usize, LexemePath> {
let mut path_map = HashMap::<usize, LexemePath>::new();
let mut cross_path = LexemePath::new();
for org_lexeme in org_lexemes.iter() {
for org_lexeme in orgin_lexemes.iter() {
if !cross_path.add_cross_lexeme(org_lexeme) {
if self.need_add_path(&cross_path, mode) {
path_map.insert(cross_path.begin() as usize, cross_path);
Expand All @@ -41,53 +41,59 @@ impl IKArbitrator {

fn judge(&self, cur_node: Option<&Link<Lexeme>>) -> Option<LexemePath> {
let mut path_options = BTreeSet::new();
let mut option = LexemePath::new();
let mut lexeme_stack = self.forward_path(cur_node, &mut option);
path_options.insert(option.clone());
let mut option_path = LexemePath::new();
let mut lexeme_stack = self.forward_path(cur_node, &mut option_path);
path_options.insert(option_path.clone());
while let Some(node) = lexeme_stack.pop() {
self.back_path(node, &mut option);
self.forward_path(node, &mut option);
path_options.insert(option.clone());
self.back_path(node, &mut option_path);
self.forward_path(node, &mut option_path);
path_options.insert(option_path.clone());
}
// pick first one
let mut best = None;
// pick first one as best
let mut best_path = None;
if !path_options.is_empty() {
for o in path_options.into_iter() {
best = Some(o);
best_path = Some(o);
break;
}
}
best
best_path
// after rust 1.66.0
// path_options.pop_first()
}

fn need_add_path(&self, cross_path: &LexemePath, mode: TokenMode) -> bool {
cross_path.len() == 1 || !(mode == TokenMode::SEARCH)
fn need_add_path(&self, cross_path: &LexemePath, mode: &TokenMode) -> bool {
match mode {
TokenMode::INDEX => return true,
_ => {}
}
cross_path.len() == 1
}

fn forward_path<'a>(
&'a self,
cur_node: Option<&'a Link<Lexeme>>,
option: &mut LexemePath,
option_path: &mut LexemePath,
) -> Vec<Option<&Link<Lexeme>>> {
let mut conflict_stack: Vec<Option<&Link<Lexeme>>> = Vec::new();
let mut cur = cur_node;
while let Some(node) = cur {
let ref_node = unsafe { node.as_ref() }; // safety
let c = ref_node.ref_val();
if !option.add_not_cross_lexeme(c) {
if !option_path.add_not_cross_lexeme(c) {
conflict_stack.push(cur);
}
cur = ref_node.next.as_ref();
}
conflict_stack
}

fn back_path(&self, l: Option<&Link<Lexeme>>, option: &mut LexemePath) {
fn back_path(&self, l: Option<&Link<Lexeme>>, option_path: &mut LexemePath) {
if let Some(node) = l {
let ref_node = unsafe { node.as_ref() }; // safety
let lexeme = ref_node.ref_val();
while option.check_cross(lexeme) {
option.remove_tail();
while option_path.check_cross(lexeme) {
option_path.remove_tail();
}
}
}
Expand Down
66 changes: 37 additions & 29 deletions src/core/ik_segmenter.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
use crate::core::char_util::CharType;
use crate::core::char_util::{regularize_str, utf8_len};
use crate::core::char_util::{regularize_str, utf8_len, CharType};
use crate::core::cjk_segmenter::CJKSegmenter;
use crate::core::cn_quantifier_segmenter::CnQuantifierSegmenter;
use crate::core::ik_arbitrator::IKArbitrator;
Expand All @@ -11,7 +10,7 @@ use crate::core::segmentor::Segmenter;
use crate::dict::dictionary::GLOBAL_DICT;
use std::collections::{HashMap, LinkedList};

#[derive(Clone, Copy, PartialEq)]
#[derive(Debug)]
pub enum TokenMode {
INDEX,
SEARCH,
Expand Down Expand Up @@ -49,14 +48,17 @@ impl IKSegmenter {
}
}

let mut path_map = self.arbitrator.process(&origin_lexemes, mode);
let mut path_map = self.arbitrator.process(&origin_lexemes, &mode);
let mut results = self.output_to_result(&mut path_map, input);
let mut final_results = Vec::with_capacity(results.len());
// remove stop word
let mut result = results.pop_front();
while let Some(ref mut result_value) = result {
if mode == TokenMode::SEARCH {
self.compound(&mut results, result_value);
match &mode {
TokenMode::SEARCH => {
self.compound(&mut results, result_value);
}
_ => {}
}
if !GLOBAL_DICT.lock().unwrap().is_stop_word(
input,
Expand Down Expand Up @@ -137,35 +139,41 @@ impl IKSegmenter {

fn compound(&self, results: &mut LinkedList<Lexeme>, result: &mut Lexeme) {
if !results.is_empty() {
if LexemeType::ARABIC == result.lexeme_type() {
let mut append_ok = false;
let next_lexeme = results.front();
next_lexeme.map(|next| match next.lexeme_type() {
LexemeType::CNUM => {
append_ok = result.append(next, LexemeType::CNUM);
}
LexemeType::COUNT => {
append_ok = result.append(next, LexemeType::CQUAN);
match result.lexeme_type() {
LexemeType::ARABIC => {
let mut append_ok = false;
let next_lexeme = results.front();
next_lexeme.map(|next| match next.lexeme_type() {
LexemeType::CNUM => {
append_ok = result.append(next, LexemeType::CNUM);
}
LexemeType::COUNT => {
append_ok = result.append(next, LexemeType::CQUAN);
}
_ => {}
});
if append_ok {
results.pop_front();
}
_ => {}
});
if append_ok {
results.pop_front();
}
_ => {}
}

if LexemeType::CNUM == result.lexeme_type() && !results.is_empty() {
let mut append_ok = false;
let next_lexeme = results.front();
next_lexeme.map(|next| match next.lexeme_type() {
LexemeType::COUNT => {
append_ok = result.append(next, LexemeType::CQUAN);
match result.lexeme_type() {
LexemeType::CNUM if !results.is_empty() => {
let mut append_ok = false;
let next_lexeme = results.front();
next_lexeme.map(|next| match next.lexeme_type() {
LexemeType::COUNT => {
append_ok = result.append(next, LexemeType::CQUAN);
}
_ => {}
});
if append_ok {
results.pop_front();
}
_ => {}
});
if append_ok {
results.pop_front();
}
_ => {}
}
}
}
Expand Down
6 changes: 3 additions & 3 deletions src/core/lexeme.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use crate::core::char_util::utf8_slice;
use std::cmp::Ordering;
use std::ops::Range;

#[derive(Debug, PartialEq, Clone)]
#[derive(Debug, Clone)]
pub enum LexemeType {
UNKNOWN,
ENGLISH,
Expand Down Expand Up @@ -86,8 +86,8 @@ impl Lexeme {
}
}

pub fn lexeme_type(&self) -> LexemeType {
self.lexeme_type.clone()
pub fn lexeme_type(&self) -> &LexemeType {
&self.lexeme_type
}

pub fn begin_pos(&self) -> usize {
Expand Down
14 changes: 7 additions & 7 deletions src/core/lexeme_path.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,11 @@ impl LexemePath {
} else {
self.lexeme_list.insert(lexeme.clone());
self.payload_len += lexeme.len();
let head = self.lexeme_list.peek_front(); // peekFirst();
let head = self.lexeme_list.peek_front();
if let Some(h) = head {
self.begin = h.begin_pos() as i32;
}
let tail = self.lexeme_list.peek_back(); // peekLast();
let tail = self.lexeme_list.peek_back();
if let Some(t) = tail {
self.end = t.end_pos() as i32;
}
Expand Down Expand Up @@ -153,13 +153,13 @@ impl Display for LexemePath {
impl Clone for LexemePath {
fn clone(&self) -> Self {
let mut the_copy = LexemePath::new();
the_copy.begin = self.begin;
the_copy.end = self.end;
the_copy.payload_len = self.payload_len;
the_copy.begin = self.begin();
the_copy.end = self.end();
the_copy.payload_len = self.payload_len();
for lexeme in self.lexeme_list.iter() {
the_copy.lexeme_list.insert(lexeme.clone());
}
return the_copy;
the_copy
}
}

Expand Down Expand Up @@ -215,7 +215,7 @@ impl PartialEq for LexemePath {
fn eq(&self, other: &Self) -> bool {
return if self.begin() == other.begin()
&& self.end() == other.end()
&& self.payload_len == other.payload_len
&& self.payload_len() == other.payload_len()
&& self.lexeme_list.len() == other.lexeme_list.len()
{
for (a, b) in self.lexeme_list.iter().zip(other.lexeme_list.iter()) {
Expand Down
2 changes: 1 addition & 1 deletion src/dict/hit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ const UNMATCH: u8 = 0b00000000;
const MATCH: u8 = 0b00000001;
const PREFIX: u8 = 0b00000010;

#[derive(Debug, Clone)]
#[derive(Debug)]
pub struct Hit {
hit_state: u8,
pos: Range<usize>,
Expand Down

0 comments on commit f39a6c4

Please sign in to comment.