Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP
Browse files

Cleaned up tests. Resolved connection cost work-around.

  • Loading branch information...
commit 40956c83c4a96646b15cf920942e8c06b1cb839c 1 parent f91f7d3
Christian Moen authored
View
10 README.txt
@@ -23,16 +23,6 @@ In order to build kuromoji from source, please do as follows:
and a jar file should also be available in the target directory
-Additional info
----------------
-
-The korumoji homepage is available on http://atilika.org for
-
-Further documentation is avilable on http://atilika.org/confluence
-
-Please file bugs using JIRA on http://atilika.org/jira
-
-
Contact us
----------
View
4 pom.xml
@@ -4,7 +4,7 @@
<groupId>org.atilika.kuromoji</groupId>
<artifactId>kuromoji</artifactId>
<packaging>jar</packaging>
- <version>0.7.6</version>
+ <version>0.7.7-SNAPSHOT</version>
<name>Kuromoji Japanese Morphological Analyzer</name>
<organization>
@@ -52,6 +52,7 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
+ <version>2.3.2</version>
<inherited>true</inherited>
<configuration>
<source>1.6</source>
@@ -76,6 +77,7 @@
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
+ <version>1.2.1</version>
<executions>
<execution>
<id>compile-dictionary</id>
View
2  src/main/java/org/atilika/kuromoji/Tokenizer.java
@@ -151,7 +151,7 @@ protected Tokenizer(UserDictionary userDictionary, Mode mode, boolean split) {
List<ViterbiNode> bestPath = viterbi.search(lattice);
for (ViterbiNode node : bestPath) {
int wordId = node.getWordId();
- if (node.getType() == Type.KNOWN && wordId == 0){ // Do not include BOS/EOS
+ if (node.getType() == Type.KNOWN && wordId == -1) { // Do not include BOS/EOS
continue;
}
Token token = new Token(wordId, node.getSurfaceForm(), node.getType(), offset + node.getStartIndex(), dictionaryMap.get(node.getType())); // Pass different dictionary based on the type of node
View
9 src/main/java/org/atilika/kuromoji/dict/ConnectionCosts.java
@@ -51,14 +51,7 @@ public void add(int forwardId, int backwardId, int cost) {
}
public int get(int forwardId, int backwardId) {
- // FIXME: There seems to be something wrong with the double array trie in some rare
- // cases causing and IndexOutOfBoundsException. Use a guard as a temporary work-around
- // and return a high cost to advise Mr. Viterbi strongly to not use this transition
- if (backwardId < costs.length && forwardId < costs[backwardId].length ) {
- return costs[backwardId][forwardId];
- } else {
- return 50000;
- }
+ return costs[backwardId][forwardId];
}
public void write(String directoryname) throws IOException {
View
2  src/main/java/org/atilika/kuromoji/util/TokenInfoDictionaryBuilder.java
@@ -40,7 +40,7 @@
public class TokenInfoDictionaryBuilder {
/** Internal word id - incrementally assigned as entries are read and added. This will be byte offset of dictionary file */
- private int offset = 4; // Start from 4. First 4 bytes are used to store size of dictionary file.
+ private int offset = 0;
private TreeMap<Integer, String> dictionaryEntries; // wordId, surface form
View
4 src/main/java/org/atilika/kuromoji/viterbi/Viterbi.java
@@ -215,7 +215,7 @@ public Viterbi(DoubleArrayTrie trie,
int[] startSizeArr = new int[textLength + 2]; // array to keep ViterbiNode count in startIndexArr
int[] endSizeArr = new int[textLength + 2]; // array to keep ViterbiNode count in endIndexArr
- ViterbiNode bosNode = new ViterbiNode(0, BOS, 0, 0, 0, -1, Type.KNOWN);
+ ViterbiNode bosNode = new ViterbiNode(-1, BOS, 0, 0, 0, -1, Type.KNOWN);
addToArrays(bosNode, 0, 1, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
// Process user dictionary;
@@ -278,7 +278,7 @@ public Viterbi(DoubleArrayTrie trie,
}
}
- ViterbiNode eosNode = new ViterbiNode(0, EOS, 0, 0, 0, textLength + 1, Type.KNOWN);
+ ViterbiNode eosNode = new ViterbiNode(-1, EOS, 0, 0, 0, textLength + 1, Type.KNOWN);
addToArrays(eosNode, textLength + 1, 0, startIndexArr, endIndexArr, startSizeArr, endSizeArr); //Add EOS node to endIndexArr at index 0
ViterbiNode[][][] result = new ViterbiNode[][][]{startIndexArr, endIndexArr};
View
14 src/test/java/org/atilika/kuromoji/TokenizerTest.java
@@ -73,6 +73,20 @@ public void testReadings() {
}
@Test
+ public void testYabottai() {
+ List<Token> tokens = tokenizer.tokenize("やぼったい");
+ assertEquals(1, tokens.size());
+ assertEquals("やぼったい", tokens.get(0).getSurfaceForm());
+ }
+
+ @Test
+ public void testTsukitosha() {
+ List<Token> tokens = tokenizer.tokenize("突き通しゃ");
+ assertEquals(1, tokens.size());
+ assertEquals("突き通しゃ", tokens.get(0).getSurfaceForm());
+ }
+
+ @Test
public void testBocchan() throws IOException, InterruptedException {
LineNumberReader reader = new LineNumberReader(new InputStreamReader(
this.getClass().getClassLoader().getResourceAsStream("bocchan.utf-8.txt")));
View
218 src/test/java/org/atilika/kuromoji/dict/UnknownDictionaryTest.java
@@ -1,218 +0,0 @@
-/**
- * Copyright © 2010-2011 Atilika Inc. All rights reserved.
- *
- * Atilika Inc. licenses this file to you under the Apache License, Version
- * 2.0 (the "License"); you may not use this file except in compliance with
- * the License. A copy of the License is distributed with this work in the
- * LICENSE.txt file. You may also obtain a copy of the License from
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations
- * under the License.
- */
-package org.atilika.kuromoji.dict;
-
-import static org.junit.Assert.fail;
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-
-import org.atilika.kuromoji.dict.UnknownDictionary;
-import org.atilika.kuromoji.util.CSVUtil;
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-public class UnknownDictionaryTest {
- public static final String FILENAME = "unk-tokeninfo-dict.obj";
-
- @BeforeClass
- public static void setUpBeforeClass() throws Exception {
- }
-
- @Test
- public void testPutCharacterCategory() {
- UnknownDictionary unkDic = new UnknownDictionary(10 * 1024 * 1024);
-
- try{
- unkDic.putCharacterCategory(0, "DUMMY_NAME");
- fail();
- } catch(Exception e) {
-
- }
-
- try{
- unkDic.putCharacterCategory(-1, "KATAKANA");
- fail();
- } catch(Exception e) {
-
- }
-
- unkDic.putCharacterCategory(0, "DEFAULT");
- unkDic.putCharacterCategory(1, "GREEK");
- unkDic.putCharacterCategory(2, "HIRAGANA");
- unkDic.putCharacterCategory(3, "KATAKANA");
- unkDic.putCharacterCategory(4, "KANJI");
- }
-
- @Test
- public void testPut() {
- UnknownDictionary unkDic = new UnknownDictionary(10 * 1024 * 1024);
- try{
- unkDic.put(CSVUtil.parse("KANJI,1285,11426,名詞,一般,*,*,*,*,*"));
- fail();
- } catch(Exception e){
-
- }
-
- String entry1 = "KANJI,1285,1285,11426,名詞,一般,*,*,*,*,*";
- String entry2 = "ALPHA,1285,1285,13398,名詞,一般,*,*,*,*,*";
- String entry3 = "HIRAGANA,1285,1285,13069,名詞,一般,*,*,*,*,*";
-
- unkDic.putCharacterCategory(0, "KANJI");
- unkDic.putCharacterCategory(1, "ALPHA");
- unkDic.putCharacterCategory(2, "HIRAGANA");
-
- unkDic.put(CSVUtil.parse(entry1));
- unkDic.put(CSVUtil.parse(entry2));
- unkDic.put(CSVUtil.parse(entry3));
- }
-
-// @Test
-// public void testLookupForInvoke() throws IOException {
-// UnknownDictionary dictionary = createDictionary();
-// String notMatch1 = "あいうえお";
-// int resultNotMatch1 = dictionary.lookupForInvoke(notMatch1);
-// assertEquals(0, resultNotMatch1);
-//
-// String notMatch2 = "あイウエオ";
-// int resultNotMatch2 = dictionary.lookupForInvoke(notMatch2);
-// assertEquals(0, resultNotMatch2);
-//
-// String matchKatakana = "アイウエオ";
-// int resultMatchKatakana = dictionary.lookupForInvoke(matchKatakana);
-// assertEquals(5, resultMatchKatakana);
-//
-// String matchAlpha = "ABC";
-// int resultMatchAlpha = dictionary.lookupForInvoke(matchAlpha);
-// assertEquals(3, resultMatchAlpha);
-//
-// String matchKatakanaPartial = "アイウあいう";
-// int resultMatchKatakanaPartial = dictionary.lookupForInvoke(matchKatakanaPartial);
-// assertEquals(3, resultMatchKatakanaPartial);
-// }
-//
-// @Test
-// public void testLookupForNotInvoke() throws IOException {
-// UnknownDictionary dictionary = createDictionary();
-//
-// String matchHiragana1 = "あ";
-// int resultMatchHiragana1 = dictionary.lookupForNotInvoke(matchHiragana1);
-// assertEquals(1, resultMatchHiragana1);
-//
-// String matchHiragana2 = "あい";
-// int resultMatchHiragana2 = dictionary.lookupForNotInvoke(matchHiragana2);
-// assertEquals(2, resultMatchHiragana2);
-//
-// String matchKanji1 = "漢";
-// int resultMatchKanji1 = dictionary.lookupForNotInvoke(matchKanji1);
-// assertEquals(1, resultMatchKanji1);
-//
-// String matchKanji2 = "漢字";
-// int resultMatchKanji2 = dictionary.lookupForNotInvoke(matchKanji2);
-// assertEquals(1, resultMatchKanji2);
-//
-// String matchKanjiMix = "漢あ";
-// int resultMatchKanjiMix = dictionary.lookupForNotInvoke(matchKanjiMix);
-// assertEquals(1, resultMatchKanjiMix);
-//
-// String notMatch = "アイウ";
-// int resultNotMatch = dictionary.lookupForNotInvoke(notMatch);
-// assertEquals(0, resultNotMatch);
-//
-// String exception = "";
-// try{
-// dictionary.lookupForNotInvoke(exception);
-// fail();
-// } catch(Exception e){
-//
-// }
-// }
-
-// @Test
-// public void testLookupWordIds() throws IOException {
-// UnknownDictionary dictionary = createDictionary();
-// String hiragana = "あい";
-// int[] hiraganaResults = dictionary.lookupWordIds(hiragana);
-// assertEquals(7, hiraganaResults.length);
-//
-// String katakana = "アイ";
-// int[] katakanaResults = dictionary.lookupWordIds(katakana);
-// assertEquals(6, katakanaResults.length);
-//
-// String symbol = "!";
-// int[] symbolResults = dictionary.lookupWordIds(symbol);
-// assertEquals(1, symbolResults.length);
-// }
-
- private UnknownDictionary createDictionary() throws IOException {
- InputStream is = this.getClass().getClassLoader().getResourceAsStream("unk.def.utf-8");
- UnknownDictionary dictionary = new UnknownDictionary();
- BufferedReader reader = new BufferedReader(new InputStreamReader(is));
-
- String line = null;
- while((line = reader.readLine()) != null) {
- dictionary.put(CSVUtil.parse(line));
- }
- reader.close();
-
- is = this.getClass().getClassLoader().getResourceAsStream("char.def.utf-8");
- reader = new BufferedReader(new InputStreamReader(is));
-
- line = null;
- while ((line = reader.readLine()) != null) {
- line = line.replaceAll("^\\s", "");
- line = line.replaceAll("\\s*#.*", "");
- line = line.replaceAll("\\s+", " ");
-
- // Skip empty line or comment line
- if(line.length() == 0) {
- continue;
- }
-
- if(line.startsWith("0x")) { // Category mapping
- String[] values = line.split(" ", 2); // Split only first space
-
- if(!values[0].contains("..")) {
- int cp = Integer.decode(values[0]).intValue();
- dictionary.putCharacterCategory(cp, values[1]);
- } else {
- String[] codePoints = values[0].split("\\.\\.");
- int cpFrom = Integer.decode(codePoints[0]).intValue();
- int cpTo = Integer.decode(codePoints[1]).intValue();
-
- for(int i = cpFrom; i <= cpTo; i++){
- dictionary.putCharacterCategory(i, values[1]);
- }
- }
- } else { // Invoke definition
- String[] values = line.split(" "); // Consecutive space is merged above
- String characterClassName = values[0];
- int invoke = Integer.parseInt(values[1]);
- int group = Integer.parseInt(values[2]);
- int length = Integer.parseInt(values[3]);
- dictionary.putInvokeDefinition(characterClassName, invoke, group, length);
- }
-
- }
-
- reader.close();
-
- return dictionary;
- }
-}
View
7 src/test/java/org/atilika/kuromoji/dict/UserDictionaryTest.java
@@ -16,12 +16,13 @@
*/
package org.atilika.kuromoji.dict;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+
import java.io.IOException;
-import org.atilika.kuromoji.dict.UserDictionary;
-import org.junit.BeforeClass;
import org.junit.Test;
-import static org.junit.Assert.*;
/**
* @author Masaru Hasegawa
View
6 src/test/java/org/atilika/kuromoji/trie/DoubleArrayTrieTest.java
@@ -65,10 +65,7 @@ public void writeTest() throws IOException {
}
assertTrue(dir.length() > 0);
-
}
-
-
@Test
public void lookupTest() throws IOException {
@@ -92,7 +89,6 @@ public void lookupTest() throws IOException {
assertTrue(doubleArrayTrie.lookup("abc") > 0);
assertTrue(doubleArrayTrie.lookup("あいう") > 0);
assertTrue(doubleArrayTrie.lookup("xyz") < 0);
-
}
private Trie getTrie() {
@@ -103,6 +99,4 @@ private Trie getTrie() {
trie.add("あいう");
return trie;
}
-
-
}
View
61 src/test/java/org/atilika/kuromoji/viterbi/ViterbiTest.java
@@ -1,61 +0,0 @@
-/**
- * Copyright © 2010-2011 Atilika Inc. All rights reserved.
- *
- * Atilika Inc. licenses this file to you under the Apache License, Version
- * 2.0 (the "License"); you may not use this file except in compliance with
- * the License. A copy of the License is distributed with this work in the
- * LICENSE.txt file. You may also obtain a copy of the License from
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations
- * under the License.
- */
-package org.atilika.kuromoji.viterbi;
-
-import java.io.IOException;
-
-import org.atilika.kuromoji.dict.ConnectionCosts;
-import org.atilika.kuromoji.dict.TokenInfoDictionary;
-import org.junit.Test;
-
-/**
- * @author Masaru Hasegawa
- * @author Christian Moen
- */
-public class ViterbiTest {
-
- private static Viterbi viterbi;
-
- private static TokenInfoDictionary dictionary;
-
- private static ConnectionCosts costs;
-
- @Test
- public void dummyTest() throws IOException, ClassNotFoundException{
-// System.out.print("reading tokeninfo dict...");
-// long dictStart = System.currentTimeMillis();
-//// TokenInfoDictionary dictionary = TokenInfoDictionary.read(getClass().getClassLoader().getResourceAsStream(TokenInfoDictionary.FILENAME));
-// System.out.println("done in " + (System.currentTimeMillis() - dictStart) + " ms");
-//
-// System.out.print("reading connection costs...");
-// long costStart = System.currentTimeMillis();
-// ConnectionCosts costs = ConnectionCosts.read(getClass().getClassLoader().getResourceAsStream(ConnectionCosts.FILENAME));
-// System.out.println("done in " + (System.currentTimeMillis() - costStart) + " ms");
-//
-// GraphvizFormatter formatter = new GraphvizFormatter(dictionary, costs);
-//
-// Viterbi viterbi = new Tokenizer().initialize(null, Mode.EXTENDED);
-// ViterbiNode[][][] graph = viterbi.build("ピタゴラスイッチ");
-//
-// File viterbiDebug = File.createTempFile("debug-viterbi-", ".gv");
-// System.out.println("Writing to output file " + viterbiDebug.getCanonicalPath());
-// PrintWriter outputStream = new PrintWriter(new FileOutputStream(viterbiDebug));
-//
-// outputStream.println(formatter.format(graph[0], graph[1], viterbi.search(graph)));
-// outputStream.close();
- }
-}
Please sign in to comment.
Something went wrong with that request. Please try again.