Skip to content

Commit

Permalink
release 0.3.5
Browse files Browse the repository at this point in the history
  • Loading branch information
blueshen committed Jun 27, 2023
1 parent 4de2539 commit 8733601
Show file tree
Hide file tree
Showing 12 changed files with 149 additions and 158 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
<!-- Badges section here. -->
[![Crates.io](https://img.shields.io/badge/crates.io-0.3.3-green)](https://crates.io/crates/ik-rs)
[![Crates.io](https://img.shields.io/badge/crates.io-0.3.5-green)](https://crates.io/crates/ik-rs)
[![License](https://img.shields.io/badge/license-LGPL--2.1-blue)](./LICENSE)
[![Open Source Love](https://badges.frapsoft.com/os/v1/open-source.svg?v=103)](https://github.com/blueshen/ik-rs/releases)
[![Build Status](https://app.travis-ci.com/blueshen/ik-rs.svg?branch=main)](https://app.travis-ci.com/github/blueshen/ik-rs)
Expand All @@ -18,7 +18,7 @@
## add to Cargo.toml
```toml
[dependencies]
ik-rs = "0.3.3"
ik-rs = "0.3.5"
```

## Chinese Segment
Expand Down
6 changes: 3 additions & 3 deletions src/core/char_util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,12 @@ pub fn regularize(input: char) -> char {
} else if input_code >= 'A' as u32 && input_code <= 'Z' as u32 {
input_code += 32; // lowercase
}
let to_char = char::from_u32(input_code).unwrap();
to_char
char::from_u32(input_code).unwrap()
}

pub fn regularize_str(input: &str) -> String {
let mut regular_str = "".to_string();
let mut regular_str = String::from("");
// let chars = input.chars().map(|c| regularize(c)).collect::<Vec<char>>();
for c in input.chars() {
regular_str.push(regularize(c));
}
Expand Down
2 changes: 1 addition & 1 deletion src/core/cjk_segmenter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ impl Segmenter for CJKSegmenter {
);
for hit in hits.iter() {
if hit.is_match() {
let new_lexeme = Lexeme::new(hit.pos.clone(), LexemeType::CNWORD);
let new_lexeme = Lexeme::new(hit.pos(), LexemeType::CNWORD);
origin_lexemes.insert(new_lexeme);
}
}
Expand Down
32 changes: 12 additions & 20 deletions src/core/cn_quantifier_segmenter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,11 +57,9 @@ impl CnQuantifierSegmenter {
let char_count = utf8_len(input);
if self.initial_state() {
match curr_char_type {
CharType::CHINESE => {
if self.chn_number_chars.contains(&curr_char) {
self.start = Some(cursor);
self.end = Some(cursor);
}
CharType::CHINESE if self.chn_number_chars.contains(&curr_char) => {
self.start = Some(cursor);
self.end = Some(cursor);
}
_ => {}
}
Expand All @@ -79,10 +77,7 @@ impl CnQuantifierSegmenter {
}
if let Some(index) = self.end {
if index == (char_count - 1) {
let new_lexeme = Lexeme::new(
(self.start.unwrap())..(self.end.unwrap() + 1),
LexemeType::CNUM,
);
let new_lexeme = Lexeme::new((self.start.unwrap())..index + 1, LexemeType::CNUM);
origin_lexemes.insert(new_lexeme);
self.reset_state();
}
Expand All @@ -107,7 +102,7 @@ impl CnQuantifierSegmenter {
);
for hit in hits.iter() {
if hit.is_match() {
let new_lexeme = Lexeme::new(hit.pos.clone(), LexemeType::COUNT);
let new_lexeme = Lexeme::new(hit.pos(), LexemeType::COUNT);
origin_lexemes.insert(new_lexeme);
}
}
Expand All @@ -127,19 +122,16 @@ impl CnQuantifierSegmenter {
}
if origin_lexemes.empty() {
return false;
} else {
let last = origin_lexemes.peek_back();
if let Some(lexeme) = last {
if lexeme.lexeme_type == LexemeType::ARABIC
|| lexeme.lexeme_type == LexemeType::CNUM
{
if lexeme.end_position() == cursor {
return true;
}
}
let last = origin_lexemes.peek_back();
if let Some(lexeme) = last {
if lexeme.lexeme_type == LexemeType::ARABIC || lexeme.lexeme_type == LexemeType::CNUM {
if lexeme.end_pos() == cursor {
return true;
}
}
return false;
}
return false;
}

fn initial_state(&self) -> bool {
Expand Down
10 changes: 5 additions & 5 deletions src/core/ik_arbitrator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,23 +17,23 @@ impl IKArbitrator {
for org_lexeme in org_lexemes.iter() {
if !cross_path.add_cross_lexeme(org_lexeme) {
if self.need_add_path(&cross_path, mode) {
path_map.insert(cross_path.path_begin() as usize, cross_path);
path_map.insert(cross_path.begin() as usize, cross_path);
} else {
let judge_result = self.judge(cross_path.head_node());
if let Some(path) = judge_result {
path_map.insert(path.path_begin() as usize, path);
path_map.insert(path.begin() as usize, path);
}
}
cross_path = LexemePath::new();
cross_path.add_cross_lexeme(org_lexeme);
}
}
if self.need_add_path(&cross_path, mode) {
path_map.insert(cross_path.path_begin() as usize, cross_path);
path_map.insert(cross_path.begin() as usize, cross_path);
} else {
let judge_result = self.judge(cross_path.head_node());
if let Some(path) = judge_result {
path_map.insert(path.path_begin() as usize, path);
path_map.insert(path.begin() as usize, path);
}
}
path_map
Expand Down Expand Up @@ -61,7 +61,7 @@ impl IKArbitrator {
}

fn need_add_path(&self, cross_path: &LexemePath, mode: TokenMode) -> bool {
cross_path.size() == 1 || !(mode == TokenMode::SEARCH)
cross_path.len() == 1 || !(mode == TokenMode::SEARCH)
}

fn forward_path<'a>(
Expand Down
25 changes: 14 additions & 11 deletions src/core/ik_segmenter.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use crate::core::char_util::regularize_str;
use crate::core::char_util::CharType;
use crate::core::char_util::{regularize_str, utf8_len};
use crate::core::cjk_segmenter::CJKSegmenter;
use crate::core::cn_quantifier_segmenter::CnQuantifierSegmenter;
use crate::core::ik_arbitrator::IKArbitrator;
Expand Down Expand Up @@ -51,7 +51,7 @@ impl IKSegmenter {

let mut path_map = self.arbitrator.process(&mut origin_lexemes, mode);
let mut results = self.output_to_result(&mut path_map, input);
let mut final_results = Vec::new();
let mut final_results = Vec::with_capacity(results.len());
// remove stop word
let mut result = results.pop_front();
while let Some(ref mut result_value) = result {
Expand All @@ -60,8 +60,8 @@ impl IKSegmenter {
}
if !GLOBAL_DICT.lock().unwrap().is_stop_word(
input,
result_value.begin_position(),
result_value.length(),
result_value.begin_pos(),
result_value.len(),
) {
result_value.parse_lexeme_text(input);
final_results.push(result_value.clone())
Expand All @@ -78,7 +78,7 @@ impl IKSegmenter {
) -> LinkedList<Lexeme> {
let mut results = LinkedList::new();
let mut index = 0usize;
let char_count = input.chars().count();
let char_count = utf8_len(input);
while index < char_count {
let curr_char = input.chars().nth(index).unwrap();
let cur_char_type = CharType::from(curr_char);
Expand All @@ -91,10 +91,10 @@ impl IKSegmenter {
let mut cur_lexeme = p.poll_first();
while let Some(ref lexeme) = cur_lexeme {
results.push_back(lexeme.clone());
index = lexeme.end_position();
index = lexeme.end_pos();
cur_lexeme = p.poll_first();
if let Some(ref lexeme) = cur_lexeme {
while index < lexeme.begin_position() {
while index < lexeme.begin_pos() {
let curr_char = input.chars().nth(index).unwrap();
let cur_char_type = CharType::from(curr_char);
self.add_single_lexeme(&mut results, cur_char_type, index);
Expand All @@ -116,17 +116,20 @@ impl IKSegmenter {
cur_char_type: CharType,
index: usize,
) {
let mut lexeme_type = None;
match cur_char_type {
CharType::CHINESE => {
let single_char_lexeme = Lexeme::new(index..index + 1, LexemeType::CNCHAR);
results.push_back(single_char_lexeme);
lexeme_type = Some(LexemeType::CNCHAR);
}
CharType::OtherCjk => {
let single_char_lexeme = Lexeme::new(index..index + 1, LexemeType::OtherCJK);
results.push_back(single_char_lexeme);
lexeme_type = Some(LexemeType::OtherCJK);
}
_ => {}
}
if let Some(l_type) = lexeme_type {
let single_char_lexeme = Lexeme::new(index..index + 1, l_type);
results.push_back(single_char_lexeme);
}
}

fn compound(&mut self, results: &mut LinkedList<Lexeme>, result: &mut Lexeme) {
Expand Down
50 changes: 20 additions & 30 deletions src/core/letter_segmentor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,28 +70,24 @@ impl LetterSegmenter {
self.end = Some(cursor);
}
}
Some(_) => {
Some(start) => {
if CharType::ARABIC == curr_char_type || CharType::ENGLISH == curr_char_type {
self.end = Some(cursor);
} else if CharType::USELESS == curr_char_type && self.is_letter_connector(curr_char)
} else if CharType::USELESS == curr_char_type
&& self.is_letter_connector(&curr_char)
{
self.end = Some(cursor);
} else {
let new_lexeme = Lexeme::new(
(self.start.unwrap())..(self.end.unwrap() + 1),
LexemeType::LETTER,
);
let new_lexeme =
Lexeme::new(start..(self.end.unwrap() + 1), LexemeType::LETTER);
origin_lexemes.insert(new_lexeme);
self.reset_mix_state();
}
}
}
if let Some(index) = self.end {
if index == (char_count - 1) {
let new_lexeme = Lexeme::new(
(self.start.unwrap())..(self.end.unwrap() + 1),
LexemeType::LETTER,
);
let new_lexeme = Lexeme::new((self.start.unwrap())..index + 1, LexemeType::LETTER);
origin_lexemes.insert(new_lexeme);
self.reset_mix_state();
}
Expand All @@ -115,15 +111,13 @@ impl LetterSegmenter {
}
_ => {}
},
Some(_) => match curr_char_type {
Some(start) => match curr_char_type {
CharType::ENGLISH => {
self.english_end = Some(cursor);
}
_ => {
let new_lexeme = Lexeme::new(
(self.english_start.unwrap())..(self.english_end.unwrap() + 1),
LexemeType::ENGLISH,
);
let new_lexeme =
Lexeme::new(start..(self.english_end.unwrap() + 1), LexemeType::ENGLISH);
origin_lexemes.insert(new_lexeme);
self.reset_english_state();
}
Expand All @@ -133,7 +127,7 @@ impl LetterSegmenter {
if let Some(index) = self.english_end {
if index == (char_count - 1) {
let new_lexeme = Lexeme::new(
(self.english_start.unwrap())..(self.english_end.unwrap() + 1),
(self.english_start.unwrap())..index + 1,
LexemeType::ENGLISH,
);
origin_lexemes.insert(new_lexeme);
Expand All @@ -159,16 +153,14 @@ impl LetterSegmenter {
}
_ => {}
},
Some(_) => {
Some(start) => {
if CharType::ARABIC == curr_char_type {
self.arabic_end = Some(cursor);
} else if CharType::USELESS == curr_char_type && self.is_num_connector(curr_char) {
} else if CharType::USELESS == curr_char_type && self.is_num_connector(&curr_char) {
// do nothing
} else {
let new_lexeme = Lexeme::new(
(self.arabic_start.unwrap())..(self.arabic_end.unwrap() + 1),
LexemeType::ARABIC,
);
let new_lexeme =
Lexeme::new(start..(self.arabic_end.unwrap() + 1), LexemeType::ARABIC);
origin_lexemes.insert(new_lexeme);
self.reset_arabic_state();
}
Expand All @@ -177,10 +169,8 @@ impl LetterSegmenter {
let char_count = utf8_len(input);
if let Some(index) = self.arabic_end {
if index == (char_count - 1) {
let new_lexeme = Lexeme::new(
(self.arabic_start.unwrap())..(self.arabic_end.unwrap() + 1),
LexemeType::ARABIC,
);
let new_lexeme =
Lexeme::new((self.arabic_start.unwrap())..index + 1, LexemeType::ARABIC);
origin_lexemes.insert(new_lexeme);
self.reset_arabic_state();
}
Expand All @@ -201,11 +191,11 @@ impl LetterSegmenter {
self.arabic_end = None;
}

fn is_letter_connector(&self, input: char) -> bool {
LETTER_CONNECTOR.contains(&input)
fn is_letter_connector(&self, input: &char) -> bool {
LETTER_CONNECTOR.contains(input)
}

fn is_num_connector(&self, input: char) -> bool {
NUM_CONNECTOR.contains(&input)
fn is_num_connector(&self, input: &char) -> bool {
NUM_CONNECTOR.contains(input)
}
}
14 changes: 7 additions & 7 deletions src/core/lexeme.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,9 @@ impl PartialEq for Lexeme {

impl PartialOrd for Lexeme {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
return if self.begin_position() < other.begin_position() {
return if self.begin_pos() < other.begin_pos() {
Some(Ordering::Less)
} else if self.begin_position() == other.begin_position() {
} else if self.begin_pos() == other.begin_pos() {
if self.pos.len() > other.pos.len() {
Some(Ordering::Less)
} else if self.pos.len() == other.pos.len() {
Expand All @@ -86,15 +86,15 @@ impl Lexeme {
}
}

pub fn begin_position(&self) -> usize {
pub fn begin_pos(&self) -> usize {
self.offset + self.pos.start
}

pub fn end_position(&self) -> usize {
pub fn end_pos(&self) -> usize {
self.offset + self.pos.end
}

pub fn length(&self) -> usize {
pub fn len(&self) -> usize {
self.pos.len()
}

Expand All @@ -103,12 +103,12 @@ impl Lexeme {
}

pub fn parse_lexeme_text(&mut self, input: &str) {
let sub_text = utf8_slice(input, self.begin_position(), self.end_position());
let sub_text = utf8_slice(input, self.begin_pos(), self.end_pos());
self.lexeme_text = sub_text.to_string();
}

pub fn append(&mut self, l: &Lexeme, lexeme_type: LexemeType) -> bool {
if self.end_position() == l.begin_position() {
if self.end_pos() == l.begin_pos() {
self.pos.end = l.pos.end;
self.lexeme_type = lexeme_type;
return true;
Expand Down
Loading

0 comments on commit 8733601

Please sign in to comment.