From 1d20762d3032cbae7ad688ebdd6b702cfd0efcaf Mon Sep 17 00:00:00 2001 From: blueshen Date: Mon, 26 Jun 2023 18:29:30 +0800 Subject: [PATCH] method name layout --- README.md | 11 ++-- src/core/char_util.rs | 22 +++---- src/core/cjk_segmenter.rs | 4 +- src/core/cn_quantifier_segmenter.rs | 18 +----- src/core/ik_arbitrator.rs | 12 ++-- src/core/ik_segmenter.rs | 15 +++-- src/core/lexeme.rs | 16 ++--- src/core/lexeme_path.rs | 95 +++++++++++++++-------------- src/core/ordered_linked_list.rs | 2 +- src/dict/dictionary.rs | 2 +- src/dict/hit.rs | 29 ++++----- src/dict/trie.rs | 10 ++- tests/test-ik.rs | 4 +- 13 files changed, 113 insertions(+), 127 deletions(-) diff --git a/README.md b/README.md index 830d67a..979a766 100644 --- a/README.md +++ b/README.md @@ -55,12 +55,6 @@ mod test { } ``` - -# Usage for Tantivy - -use [tantivy-ik](https://github.com/blueshen/tantivy-ik) project - - # BenchMark High performance @@ -70,6 +64,11 @@ ik_tokenize_benchmark time: [19.366 µs 19.572 µs 19.850 µs] change: [-1.5364% -0.4029% +0.7357%] (p = 0.51 > 0.05) ``` +# Usage for Tantivy + +use [tantivy-ik](https://github.com/blueshen/tantivy-ik) project + + --- Welcome to rust developer and search engine developer join us, and maintain this project together! diff --git a/src/core/char_util.rs b/src/core/char_util.rs index 287d8d1..ef1a315 100644 --- a/src/core/char_util.rs +++ b/src/core/char_util.rs @@ -9,21 +9,19 @@ pub enum CharType { OtherCjk, } -impl TryFrom for CharType { - type Error = (); - - fn try_from(input: char) -> Result { +impl From for CharType { + fn from(input: char) -> Self { if input >= '0' && input <= '9' { - return Ok(CharType::ARABIC); + return CharType::ARABIC; } else if (input >= 'a' && input <= 'z') || (input >= 'A' && input <= 'Z') { - return Ok(CharType::ENGLISH); + return CharType::ENGLISH; } else { if let Some(ub) = unicode_blocks::find_unicode_block(input) { if ub == unicode_blocks::CJK_UNIFIED_IDEOGRAPHS || ub == unicode_blocks::CJK_COMPATIBILITY_IDEOGRAPHS || ub == unicode_blocks::CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A { - return Ok(CharType::CHINESE); + return CharType::CHINESE; } else if ub == unicode_blocks::HALFWIDTH_AND_FULLWIDTH_FORMS || ub == unicode_blocks::HANGUL_SYLLABLES || ub == unicode_blocks::HANGUL_JAMO @@ -32,18 +30,18 @@ impl TryFrom for CharType { || ub == unicode_blocks::KATAKANA || ub == unicode_blocks::KATAKANA_PHONETIC_EXTENSIONS { - return Ok(CharType::OtherCjk); + return CharType::OtherCjk; } } } - return Ok(CharType::USELESS); + return CharType::USELESS; } } // identify CharType Of char -pub fn char_type_of(input: char) -> CharType { - CharType::try_from(input).unwrap() -} +// pub fn char_type_of(input: char) -> CharType { +// CharType::from(input) +// } // full char -> half char && lowercase pub fn regularize(input: char) -> char { diff --git a/src/core/cjk_segmenter.rs b/src/core/cjk_segmenter.rs index 904a7c3..e61878d 100644 --- a/src/core/cjk_segmenter.rs +++ b/src/core/cjk_segmenter.rs @@ -22,12 +22,12 @@ impl Segmenter for CJKSegmenter { CharType::USELESS => {} _ => { let char_count = utf8_len(input); - let hit_options = GLOBAL_DICT.lock().unwrap().match_in_main_dict_with_offset( + let hits = GLOBAL_DICT.lock().unwrap().match_in_main_dict_with_offset( input, cursor, char_count - cursor, ); - for hit in hit_options.iter() { + for hit in hits.iter() { if hit.is_match() { let new_lexeme = Lexeme::new(hit.pos.clone(), LexemeType::CNWORD); origin_lexemes.insert(new_lexeme); diff --git a/src/core/cn_quantifier_segmenter.rs b/src/core/cn_quantifier_segmenter.rs index 5cd18cb..f99e6f8 100644 --- a/src/core/cn_quantifier_segmenter.rs +++ b/src/core/cn_quantifier_segmenter.rs @@ -125,7 +125,7 @@ impl CnQuantifierSegmenter { if self.start.is_some() && self.end.is_some() { return true; } - if origin_lexemes.is_empty() { + if origin_lexemes.empty() { return false; } else { let last = origin_lexemes.peek_back(); @@ -133,7 +133,7 @@ impl CnQuantifierSegmenter { if lexeme.lexeme_type == LexemeType::ARABIC || lexeme.lexeme_type == LexemeType::CNUM { - if lexeme.get_end_position() == cursor { + if lexeme.end_position() == cursor { return true; } } @@ -143,19 +143,7 @@ impl CnQuantifierSegmenter { } fn initial_state(&self) -> bool { - return match self.start { - None => match self.end { - None => { - true - } - Some(_) => { - false - } - }, - Some(_) => { - false - } - } + self.start.is_none() && self.end.is_none() } fn reset_state(&mut self) { diff --git a/src/core/ik_arbitrator.rs b/src/core/ik_arbitrator.rs index 9076169..391712c 100644 --- a/src/core/ik_arbitrator.rs +++ b/src/core/ik_arbitrator.rs @@ -17,11 +17,11 @@ impl IKArbitrator { for org_lexeme in org_lexemes.iter() { if !cross_path.add_cross_lexeme(org_lexeme) { if self.need_add_path(&cross_path, mode) { - path_map.insert(cross_path.get_path_begin() as usize, cross_path); + path_map.insert(cross_path.path_begin() as usize, cross_path); } else { - let judge_result = self.judge(cross_path.get_head()); + let judge_result = self.judge(cross_path.head_node()); if let Some(path) = judge_result { - path_map.insert(path.get_path_begin() as usize, path); + path_map.insert(path.path_begin() as usize, path); } } cross_path = LexemePath::new(); @@ -29,11 +29,11 @@ impl IKArbitrator { } } if self.need_add_path(&cross_path, mode) { - path_map.insert(cross_path.get_path_begin() as usize, cross_path); + path_map.insert(cross_path.path_begin() as usize, cross_path); } else { - let judge_result = self.judge(cross_path.get_head()); + let judge_result = self.judge(cross_path.head_node()); if let Some(path) = judge_result { - path_map.insert(path.get_path_begin() as usize, path); + path_map.insert(path.path_begin() as usize, path); } } path_map diff --git a/src/core/ik_segmenter.rs b/src/core/ik_segmenter.rs index 23642de..42e6a8b 100644 --- a/src/core/ik_segmenter.rs +++ b/src/core/ik_segmenter.rs @@ -1,4 +1,3 @@ -use crate::core::char_util::char_type_of; use crate::core::char_util::regularize_str; use crate::core::char_util::CharType; use crate::core::cjk_segmenter::CJKSegmenter; @@ -44,7 +43,7 @@ impl IKSegmenter { let input = regular_str.as_str(); let mut origin_lexemes = OrderedLinkedList::::new(); for (cursor, curr_char) in input.chars().enumerate() { - let curr_char_type = char_type_of(curr_char); + let curr_char_type = CharType::from(curr_char); for segmenter in self.segmenters.iter_mut() { segmenter.analyze(input, cursor, curr_char_type, &mut origin_lexemes); } @@ -61,8 +60,8 @@ impl IKSegmenter { } if !GLOBAL_DICT.lock().unwrap().is_stop_word( input, - result_value.get_begin_position(), - result_value.get_length(), + result_value.begin_position(), + result_value.length(), ) { result_value.parse_lexeme_text(input); final_results.push(result_value.clone()) @@ -82,7 +81,7 @@ impl IKSegmenter { let char_count = input.chars().count(); while index < char_count { let curr_char = input.chars().nth(index).unwrap(); - let cur_char_type = char_type_of(curr_char); + let cur_char_type = CharType::from(curr_char); if CharType::USELESS == cur_char_type { index += 1; continue; @@ -92,12 +91,12 @@ impl IKSegmenter { let mut cur_lexeme = p.poll_first(); while let Some(ref lexeme) = cur_lexeme { results.push_back(lexeme.clone()); - index = lexeme.get_end_position(); + index = lexeme.end_position(); cur_lexeme = p.poll_first(); if let Some(ref lexeme) = cur_lexeme { - while index < lexeme.get_begin_position() { + while index < lexeme.begin_position() { let curr_char = input.chars().nth(index).unwrap(); - let cur_char_type = char_type_of(curr_char); + let cur_char_type = CharType::from(curr_char); self.add_single_lexeme(&mut results, cur_char_type, index); index += 1; } diff --git a/src/core/lexeme.rs b/src/core/lexeme.rs index d141fc4..56e737d 100644 --- a/src/core/lexeme.rs +++ b/src/core/lexeme.rs @@ -60,9 +60,9 @@ impl PartialEq for Lexeme { impl PartialOrd for Lexeme { fn partial_cmp(&self, other: &Self) -> Option { - return if self.get_begin_position() < other.get_begin_position() { + return if self.begin_position() < other.begin_position() { Some(Ordering::Less) - } else if self.get_begin_position() == other.get_begin_position() { + } else if self.begin_position() == other.begin_position() { if self.pos.len() > other.pos.len() { Some(Ordering::Less) } else if self.pos.len() == other.pos.len() { @@ -86,29 +86,29 @@ impl Lexeme { } } - pub fn get_begin_position(&self) -> usize { + pub fn begin_position(&self) -> usize { self.offset + self.pos.start } - pub fn get_end_position(&self) -> usize { + pub fn end_position(&self) -> usize { self.offset + self.pos.end } - pub fn get_length(&self) -> usize { + pub fn length(&self) -> usize { self.pos.len() } - pub fn get_lexeme_text(&self) -> &str { + pub fn lexeme_text(&self) -> &str { &self.lexeme_text } pub fn parse_lexeme_text(&mut self, input: &str) { - let sub_text = utf8_slice(input, self.get_begin_position(), self.get_end_position()); + let sub_text = utf8_slice(input, self.begin_position(), self.end_position()); self.lexeme_text = sub_text.to_string(); } pub fn append(&mut self, l: &Lexeme, lexeme_type: LexemeType) -> bool { - if self.get_end_position() == l.get_begin_position() { + if self.end_position() == l.begin_position() { self.pos.end = l.pos.end; self.lexeme_type = lexeme_type; return true; diff --git a/src/core/lexeme_path.rs b/src/core/lexeme_path.rs index 9bc74ae..e15d8e1 100644 --- a/src/core/lexeme_path.rs +++ b/src/core/lexeme_path.rs @@ -21,18 +21,18 @@ impl LexemePath { } pub fn add_cross_lexeme(&mut self, lexeme: &Lexeme) -> bool { - return if self.lexeme_list.is_empty() { + return if self.lexeme_list.empty() { self.lexeme_list.insert(lexeme.clone()); - self.path_begin = lexeme.get_begin_position() as i32; - self.path_end = lexeme.get_end_position() as i32; - self.payload_length += lexeme.get_length(); + self.path_begin = lexeme.begin_position() as i32; + self.path_end = lexeme.end_position() as i32; + self.payload_length += lexeme.length(); true } else if self.check_cross(&lexeme) { self.lexeme_list.insert(lexeme.clone()); - if lexeme.get_end_position() > self.get_path_end() as usize { - self.path_end = lexeme.get_end_position() as i32; + if lexeme.end_position() > self.path_end() as usize { + self.path_end = lexeme.end_position() as i32; } - self.payload_length = self.get_path_length(); + self.payload_length = self.path_length(); true } else { false @@ -40,24 +40,24 @@ impl LexemePath { } pub fn add_not_cross_lexeme(&mut self, lexeme: &Lexeme) -> bool { - return if self.lexeme_list.is_empty() { + return if self.lexeme_list.empty() { self.lexeme_list.insert(lexeme.clone()); - self.path_begin = lexeme.get_begin_position() as i32; - self.path_end = lexeme.get_end_position() as i32; - self.payload_length += lexeme.get_length(); + self.path_begin = lexeme.begin_position() as i32; + self.path_end = lexeme.end_position() as i32; + self.payload_length += lexeme.length(); true } else if self.check_cross(lexeme) { false } else { self.lexeme_list.insert(lexeme.clone()); - self.payload_length += lexeme.get_length(); + self.payload_length += lexeme.length(); let head = self.lexeme_list.peek_front(); // peekFirst(); if let Some(h) = head { - self.path_begin = h.get_begin_position() as i32; + self.path_begin = h.begin_position() as i32; } let tail = self.lexeme_list.peek_back(); // peekLast(); if let Some(t) = tail { - self.path_end = t.get_end_position() as i32; + self.path_end = t.end_position() as i32; } true }; @@ -65,56 +65,59 @@ impl LexemePath { pub fn remove_tail(&mut self) -> Option { let tail = self.lexeme_list.pop_back(); - if self.lexeme_list.is_empty() { + if self.lexeme_list.empty() { self.path_begin = -1; self.path_end = -1; self.payload_length = 0usize; } else { - self.payload_length -= tail.as_ref().unwrap().get_length(); + self.payload_length -= tail.as_ref().unwrap().length(); let new_tail = self.lexeme_list.peek_back(); if let Some(new) = new_tail { - self.path_end = new.get_end_position() as i32; + self.path_end = new.end_position() as i32; } } return tail; } pub fn check_cross(&self, lexeme: &Lexeme) -> bool { - let l_begin = lexeme.get_begin_position(); - let l_length = lexeme.get_length(); - let cross = (l_begin >= self.get_path_begin() as usize - && l_begin < self.get_path_end() as usize) - || (self.get_path_begin() as usize >= l_begin - && (self.get_path_begin() as usize) < (l_begin + l_length)); + let l_begin = lexeme.begin_position(); + let l_length = lexeme.length(); + let cross = (l_begin >= self.path_begin() as usize && l_begin < self.path_end() as usize) + || (self.path_begin() as usize >= l_begin + && (self.path_begin() as usize) < (l_begin + l_length)); cross } - pub fn get_path_begin(&self) -> i32 { + pub fn path_begin(&self) -> i32 { self.path_begin } - pub fn get_path_end(&self) -> i32 { + pub fn path_end(&self) -> i32 { self.path_end } - pub fn get_path_length(&self) -> usize { + pub fn path_length(&self) -> usize { (self.path_end - self.path_begin) as usize } - pub fn get_xweight(&self) -> usize { + pub fn payload_length(&self) -> usize { + self.payload_length + } + + pub fn x_weight(&self) -> usize { let mut product = 1; for lexeme in self.lexeme_list.iter() { - product *= lexeme.get_length(); + product *= lexeme.length(); } return product; } - pub fn get_pweight(&self) -> usize { + pub fn p_weight(&self) -> usize { let mut p_weight = 0; let mut p = 0; for lexeme in self.lexeme_list.iter() { p += 1; - p_weight += p * lexeme.get_length(); + p_weight += p * lexeme.length(); } return p_weight; } @@ -127,7 +130,7 @@ impl LexemePath { self.lexeme_list.pop_front() } - pub fn get_head(&self) -> Option<&Link> { + pub fn head_node(&self) -> Option<&Link> { self.lexeme_list.head_node() } } @@ -137,9 +140,9 @@ impl Display for LexemePath { write!( f, "path_begin:{}, path_end:{}, payload_length:{}, lexeme_list:{}", - self.get_path_begin(), - self.get_path_end(), - self.payload_length, + self.path_begin(), + self.path_end(), + self.payload_length(), self.lexeme_list ) } @@ -166,9 +169,9 @@ impl Ord for LexemePath { impl PartialOrd for LexemePath { fn partial_cmp(&self, o: &Self) -> Option { - if self.payload_length > o.payload_length { + if self.payload_length() > o.payload_length() { return Some(Ordering::Less); - } else if self.payload_length < o.payload_length { + } else if self.payload_length() < o.payload_length() { return Some(Ordering::Greater); } else { if self.size() < o.size() { @@ -176,24 +179,24 @@ impl PartialOrd for LexemePath { } else if self.size() > o.size() { return Some(Ordering::Greater); } else { - if self.get_path_length() > o.get_path_length() { + if self.path_length() > o.path_length() { return Some(Ordering::Less); - } else if self.get_path_length() < o.get_path_length() { + } else if self.path_length() < o.path_length() { return Some(Ordering::Greater); } else { - if self.get_path_end() > o.get_path_end() { + if self.path_end() > o.path_end() { return Some(Ordering::Less); - } else if self.get_path_end() < o.get_path_end() { + } else if self.path_end() < o.path_end() { return Some(Ordering::Greater); } else { - if self.get_xweight() > o.get_xweight() { + if self.x_weight() > o.x_weight() { return Some(Ordering::Less); - } else if self.get_xweight() < o.get_xweight() { + } else if self.x_weight() < o.x_weight() { return Some(Ordering::Greater); } else { - if self.get_pweight() > o.get_pweight() { + if self.p_weight() > o.p_weight() { return Some(Ordering::Less); - } else if self.get_pweight() < o.get_pweight() { + } else if self.p_weight() < o.p_weight() { return Some(Ordering::Greater); } } @@ -208,8 +211,8 @@ impl PartialOrd for LexemePath { impl Eq for LexemePath {} impl PartialEq for LexemePath { fn eq(&self, other: &Self) -> bool { - return if self.get_path_begin() == other.get_path_begin() - && self.get_path_end() == other.get_path_end() + return if self.path_begin() == other.path_begin() + && self.path_end() == other.path_end() && self.payload_length == other.payload_length && self.lexeme_list.length() == other.lexeme_list.length() { diff --git a/src/core/ordered_linked_list.rs b/src/core/ordered_linked_list.rs index c595cf8..0f1bf9d 100644 --- a/src/core/ordered_linked_list.rs +++ b/src/core/ordered_linked_list.rs @@ -75,7 +75,7 @@ impl OrderedLinkedList { self.length } - pub fn is_empty(&self) -> bool { + pub fn empty(&self) -> bool { self.length == 0 } diff --git a/src/dict/dictionary.rs b/src/dict/dictionary.rs index 22674a4..9344a35 100644 --- a/src/dict/dictionary.rs +++ b/src/dict/dictionary.rs @@ -108,7 +108,7 @@ impl Dictionary { return true; } } - return false; + false } fn load_main_dict(&mut self) -> bool { diff --git a/src/dict/hit.rs b/src/dict/hit.rs index 4dda415..87239a9 100644 --- a/src/dict/hit.rs +++ b/src/dict/hit.rs @@ -1,22 +1,31 @@ use std::ops::Range; -const UNMATCH: u32 = 0x00000000; -const MATCH: u32 = 0x00000001; -const PREFIX: u32 = 0x00000010; +const UNMATCH: u8 = 0b00000000; +const MATCH: u8 = 0b00000001; +const PREFIX: u8 = 0b00000010; #[derive(Debug, Clone)] pub struct Hit { - pub hit_state: u32, + pub hit_state: u8, pub pos: Range, } -impl Hit { - pub fn new() -> Self { +impl Default for Hit { + fn default() -> Self { Hit { hit_state: UNMATCH, pos: 0..0, } } +} + +impl Hit { + pub fn new_with_pos(pos: Range) -> Self { + let mut hit = Hit::default(); + hit.pos = pos; + hit + } + pub fn set_match(&mut self) { self.hit_state = self.hit_state | MATCH; } @@ -32,14 +41,6 @@ impl Hit { (self.hit_state & PREFIX) > 0 } - #[allow(dead_code)] - pub fn set_unmatch(&mut self) { - self.hit_state = UNMATCH; - } - #[allow(dead_code)] - pub fn is_unmatch(&self) -> bool { - self.hit_state == UNMATCH - } #[allow(dead_code)] pub fn length(&self) -> usize { self.pos.len() diff --git a/src/dict/trie.rs b/src/dict/trie.rs index 0aac663..6bbc0b9 100644 --- a/src/dict/trie.rs +++ b/src/dict/trie.rs @@ -26,7 +26,7 @@ impl TrieNode { TrieNode { value: Some(c), final_state, - child_nodes: HashMap::new(), + child_nodes: HashMap::with_capacity(32), } } @@ -34,7 +34,7 @@ impl TrieNode { TrieNode { value: None, final_state: false, - child_nodes: HashMap::new(), + child_nodes: HashMap::with_capacity(32), } } @@ -122,8 +122,7 @@ impl TrieNode { break; } if current_node.final_state { - let mut hit = Hit::new(); - hit.pos = offset..end + 1; + let mut hit = Hit::new_with_pos(offset..end + 1); hit.set_match(); if current_node.has_childs() { hit.set_prefix(); @@ -134,8 +133,7 @@ impl TrieNode { end = counter; } if !current_node.is_root() { - let mut hit = Hit::new(); - hit.pos = offset..end + 1; + let mut hit = Hit::new_with_pos(offset..end + 1); if current_node.final_state { hit.set_match(); } diff --git a/tests/test-ik.rs b/tests/test-ik.rs index 6f62263..e3894df 100644 --- a/tests/test-ik.rs +++ b/tests/test-ik.rs @@ -85,7 +85,7 @@ mod test { let mut token_texts = Vec::new(); for token in tokens.iter() { // println!("{:?}", token); - token_texts.push(token.get_lexeme_text()); + token_texts.push(token.lexeme_text()); } assert_eq!(expect, token_texts); } @@ -96,7 +96,7 @@ mod test { let mut token_texts = Vec::new(); for token in tokens.iter() { // println!("{:?}", token); - token_texts.push(token.get_lexeme_text()); + token_texts.push(token.lexeme_text()); } assert_eq!(expect, token_texts); }