Skip to content

Commit

Permalink
Updated test to fit UniDic segmentation. Whitespace changes.
Browse files (browse the repository at this point in the history)
  • Loading branch information
cmoen committed Jul 10, 2011
1 parent 77dd687 commit 1e3204f
Show file tree
Hide file tree
Showing 4 changed files with 13 additions and 11 deletions.
7 changes: 3 additions & 4 deletions src/main/java/org/atilika/kuromoji/Tokenizer.java
Expand Up @@ -82,7 +82,7 @@ protected Tokenizer(UserDictionary userDictionary, Mode mode, boolean split) {
*/
public List<Token> tokenize(String text) {

-if(!split) {
+if (!split) {
return doTokenize(0, text);
}

Expand Down Expand Up @@ -149,12 +149,11 @@ private List<Token> doTokenize(int offset, String sentence) {

ViterbiNode[][][] lattice = viterbi.build(sentence);
List<ViterbiNode> bestPath = viterbi.search(lattice);
-for(ViterbiNode node : bestPath) {
+for (ViterbiNode node : bestPath) {
int wordId = node.getWordId();
-if(node.getType() == Type.KNOWN && wordId == 0){ // Do not include BOS/EOS
+if (node.getType() == Type.KNOWN && wordId == 0){ // Do not include BOS/EOS
continue;
}

Token token = new Token(wordId, node.getSurfaceForm(), node.getType(), offset + node.getStartIndex(), dictionaryMap.get(node.getType())); // Pass different dictionary based on the type of node
result.add(token);
}
Expand Down
Expand Up @@ -74,8 +74,8 @@ public int lookup(String text) {
// Extract unknown word. Characters with the same character class are considered to be part of unknown word
int characterIdOfFirstCharacter = characterDefinition.lookup(text.charAt(0));
int length = 1;
-for(int i = 1; i < text.length(); i++) {
-if(characterIdOfFirstCharacter == characterDefinition.lookup(text.charAt(i))){
+for (int i = 1; i < text.length(); i++) {
+if (characterIdOfFirstCharacter == characterDefinition.lookup(text.charAt(i))){
length++;
} else {
break;
Expand Down Expand Up @@ -129,7 +129,6 @@ public static UnknownDictionary getInstance() throws IOException, ClassNotFoundE
dictionary.loadDictionary(loader.getResourceAsStream(FILENAME));
dictionary.loadTargetMap(loader.getResourceAsStream(TARGETMAP_FILENAME));
dictionary.loadCharDef(loader.getResourceAsStream(CHARDEF_FILENAME));

return dictionary;
}

Expand Down
10 changes: 8 additions & 2 deletions src/test/java/org/atilika/kuromoji/TokenizerTest.java
Expand Up @@ -40,9 +40,15 @@ public static void setUpBeforeClass() throws Exception {

@Test
public void testSegmentation() {
-String input = "ミシェル・クワンが優勝しました。スペースステーションに行きます。うたがわしい。";
+// Skip tests for Michelle Kwan -- UniDic segments Kwan as ク ワン
+// String input = "ミシェル・クワンが優勝しました。スペースステーションに行きます。うたがわしい。";
+// String[] surfaceForms = {
+// "ミシェル", "・", "クワン", "が", "優勝", "し", "まし", "た", "。",
+// "スペース", "ステーション", "に", "行き", "ます", "。",
+// "うたがわしい", "。"
+// };
+String input = "スペースステーションに行きます。うたがわしい。";
String[] surfaceForms = {
-"ミシェル", "・", "クワン", "が", "優勝", "し", "まし", "た", "。",
"スペース", "ステーション", "に", "行き", "ます", "。",
"うたがわしい", "。"
};
Expand Down
2 changes: 0 additions & 2 deletions src/test/java/org/atilika/kuromoji/trie/TrieTest.java
Expand Up @@ -17,14 +17,12 @@
package org.atilika.kuromoji.trie;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;

import org.atilika.kuromoji.trie.Trie.Node;
import org.junit.Test;


/**
* @author Masaru Hasegawa
* @author Christian Moen
Expand Down

0 comments on commit 1e3204f

Please sign in to comment.