Skip to content

Commit

Permalink
method name layout
Browse files Browse the repository at this point in the history
  • Loading branch information
blueshen committed Jun 26, 2023
1 parent 4d41648 commit 1d20762
Show file tree
Hide file tree
Showing 13 changed files with 113 additions and 127 deletions.
11 changes: 5 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,12 +55,6 @@ mod test {
}

```

# Usage for Tantivy

use [tantivy-ik](https://github.com/blueshen/tantivy-ik) project


# Benchmark

High performance
Expand All @@ -70,6 +64,11 @@ ik_tokenize_benchmark time: [19.366 µs 19.572 µs 19.850 µs]
change: [-1.5364% -0.4029% +0.7357%] (p = 0.51 > 0.05)

```
# Usage for Tantivy

use [tantivy-ik](https://github.com/blueshen/tantivy-ik) project


---
Rust developers and search engine developers are welcome to join us and help maintain this project together!

Expand Down
22 changes: 10 additions & 12 deletions src/core/char_util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,19 @@ pub enum CharType {
OtherCjk,
}

impl TryFrom<char> for CharType {
type Error = ();

fn try_from(input: char) -> Result<Self, Self::Error> {
impl From<char> for CharType {
fn from(input: char) -> Self {
if input >= '0' && input <= '9' {
return Ok(CharType::ARABIC);
return CharType::ARABIC;
} else if (input >= 'a' && input <= 'z') || (input >= 'A' && input <= 'Z') {
return Ok(CharType::ENGLISH);
return CharType::ENGLISH;
} else {
if let Some(ub) = unicode_blocks::find_unicode_block(input) {
if ub == unicode_blocks::CJK_UNIFIED_IDEOGRAPHS
|| ub == unicode_blocks::CJK_COMPATIBILITY_IDEOGRAPHS
|| ub == unicode_blocks::CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
{
return Ok(CharType::CHINESE);
return CharType::CHINESE;
} else if ub == unicode_blocks::HALFWIDTH_AND_FULLWIDTH_FORMS
|| ub == unicode_blocks::HANGUL_SYLLABLES
|| ub == unicode_blocks::HANGUL_JAMO
Expand All @@ -32,18 +30,18 @@ impl TryFrom<char> for CharType {
|| ub == unicode_blocks::KATAKANA
|| ub == unicode_blocks::KATAKANA_PHONETIC_EXTENSIONS
{
return Ok(CharType::OtherCjk);
return CharType::OtherCjk;
}
}
}
return Ok(CharType::USELESS);
return CharType::USELESS;
}
}

// identify CharType Of char
pub fn char_type_of(input: char) -> CharType {
CharType::try_from(input).unwrap()
}
// pub fn char_type_of(input: char) -> CharType {
// CharType::from(input)
// }

// full char -> half char && lowercase
pub fn regularize(input: char) -> char {
Expand Down
4 changes: 2 additions & 2 deletions src/core/cjk_segmenter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,12 @@ impl Segmenter for CJKSegmenter {
CharType::USELESS => {}
_ => {
let char_count = utf8_len(input);
let hit_options = GLOBAL_DICT.lock().unwrap().match_in_main_dict_with_offset(
let hits = GLOBAL_DICT.lock().unwrap().match_in_main_dict_with_offset(
input,
cursor,
char_count - cursor,
);
for hit in hit_options.iter() {
for hit in hits.iter() {
if hit.is_match() {
let new_lexeme = Lexeme::new(hit.pos.clone(), LexemeType::CNWORD);
origin_lexemes.insert(new_lexeme);
Expand Down
18 changes: 3 additions & 15 deletions src/core/cn_quantifier_segmenter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -125,15 +125,15 @@ impl CnQuantifierSegmenter {
if self.start.is_some() && self.end.is_some() {
return true;
}
if origin_lexemes.is_empty() {
if origin_lexemes.empty() {
return false;
} else {
let last = origin_lexemes.peek_back();
if let Some(lexeme) = last {
if lexeme.lexeme_type == LexemeType::ARABIC
|| lexeme.lexeme_type == LexemeType::CNUM
{
if lexeme.get_end_position() == cursor {
if lexeme.end_position() == cursor {
return true;
}
}
Expand All @@ -143,19 +143,7 @@ impl CnQuantifierSegmenter {
}

fn initial_state(&self) -> bool {
return match self.start {
None => match self.end {
None => {
true
}
Some(_) => {
false
}
},
Some(_) => {
false
}
}
self.start.is_none() && self.end.is_none()
}

fn reset_state(&mut self) {
Expand Down
12 changes: 6 additions & 6 deletions src/core/ik_arbitrator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,23 +17,23 @@ impl IKArbitrator {
for org_lexeme in org_lexemes.iter() {
if !cross_path.add_cross_lexeme(org_lexeme) {
if self.need_add_path(&cross_path, mode) {
path_map.insert(cross_path.get_path_begin() as usize, cross_path);
path_map.insert(cross_path.path_begin() as usize, cross_path);
} else {
let judge_result = self.judge(cross_path.get_head());
let judge_result = self.judge(cross_path.head_node());
if let Some(path) = judge_result {
path_map.insert(path.get_path_begin() as usize, path);
path_map.insert(path.path_begin() as usize, path);
}
}
cross_path = LexemePath::new();
cross_path.add_cross_lexeme(org_lexeme);
}
}
if self.need_add_path(&cross_path, mode) {
path_map.insert(cross_path.get_path_begin() as usize, cross_path);
path_map.insert(cross_path.path_begin() as usize, cross_path);
} else {
let judge_result = self.judge(cross_path.get_head());
let judge_result = self.judge(cross_path.head_node());
if let Some(path) = judge_result {
path_map.insert(path.get_path_begin() as usize, path);
path_map.insert(path.path_begin() as usize, path);
}
}
path_map
Expand Down
15 changes: 7 additions & 8 deletions src/core/ik_segmenter.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
use crate::core::char_util::char_type_of;
use crate::core::char_util::regularize_str;
use crate::core::char_util::CharType;
use crate::core::cjk_segmenter::CJKSegmenter;
Expand Down Expand Up @@ -44,7 +43,7 @@ impl IKSegmenter {
let input = regular_str.as_str();
let mut origin_lexemes = OrderedLinkedList::<Lexeme>::new();
for (cursor, curr_char) in input.chars().enumerate() {
let curr_char_type = char_type_of(curr_char);
let curr_char_type = CharType::from(curr_char);
for segmenter in self.segmenters.iter_mut() {
segmenter.analyze(input, cursor, curr_char_type, &mut origin_lexemes);
}
Expand All @@ -61,8 +60,8 @@ impl IKSegmenter {
}
if !GLOBAL_DICT.lock().unwrap().is_stop_word(
input,
result_value.get_begin_position(),
result_value.get_length(),
result_value.begin_position(),
result_value.length(),
) {
result_value.parse_lexeme_text(input);
final_results.push(result_value.clone())
Expand All @@ -82,7 +81,7 @@ impl IKSegmenter {
let char_count = input.chars().count();
while index < char_count {
let curr_char = input.chars().nth(index).unwrap();
let cur_char_type = char_type_of(curr_char);
let cur_char_type = CharType::from(curr_char);
if CharType::USELESS == cur_char_type {
index += 1;
continue;
Expand All @@ -92,12 +91,12 @@ impl IKSegmenter {
let mut cur_lexeme = p.poll_first();
while let Some(ref lexeme) = cur_lexeme {
results.push_back(lexeme.clone());
index = lexeme.get_end_position();
index = lexeme.end_position();
cur_lexeme = p.poll_first();
if let Some(ref lexeme) = cur_lexeme {
while index < lexeme.get_begin_position() {
while index < lexeme.begin_position() {
let curr_char = input.chars().nth(index).unwrap();
let cur_char_type = char_type_of(curr_char);
let cur_char_type = CharType::from(curr_char);
self.add_single_lexeme(&mut results, cur_char_type, index);
index += 1;
}
Expand Down
16 changes: 8 additions & 8 deletions src/core/lexeme.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,9 @@ impl PartialEq for Lexeme {

impl PartialOrd for Lexeme {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
return if self.get_begin_position() < other.get_begin_position() {
return if self.begin_position() < other.begin_position() {
Some(Ordering::Less)
} else if self.get_begin_position() == other.get_begin_position() {
} else if self.begin_position() == other.begin_position() {
if self.pos.len() > other.pos.len() {
Some(Ordering::Less)
} else if self.pos.len() == other.pos.len() {
Expand All @@ -86,29 +86,29 @@ impl Lexeme {
}
}

pub fn get_begin_position(&self) -> usize {
pub fn begin_position(&self) -> usize {
self.offset + self.pos.start
}

pub fn get_end_position(&self) -> usize {
pub fn end_position(&self) -> usize {
self.offset + self.pos.end
}

pub fn get_length(&self) -> usize {
pub fn length(&self) -> usize {
self.pos.len()
}

pub fn get_lexeme_text(&self) -> &str {
pub fn lexeme_text(&self) -> &str {
&self.lexeme_text
}

pub fn parse_lexeme_text(&mut self, input: &str) {
let sub_text = utf8_slice(input, self.get_begin_position(), self.get_end_position());
let sub_text = utf8_slice(input, self.begin_position(), self.end_position());
self.lexeme_text = sub_text.to_string();
}

pub fn append(&mut self, l: &Lexeme, lexeme_type: LexemeType) -> bool {
if self.get_end_position() == l.get_begin_position() {
if self.end_position() == l.begin_position() {
self.pos.end = l.pos.end;
self.lexeme_type = lexeme_type;
return true;
Expand Down
Loading

0 comments on commit 1d20762

Please sign in to comment.