Permalink
Browse files

Made source dictionary encoding configurable.

  • Loading branch information...
1 parent 4a88891 commit 872caf361439abcc88a778a71ca0d189c8e9fd06 masaruh committed Mar 1, 2011
View
@@ -85,6 +85,7 @@
<arguments>
<argument>dictionary/mecab-ipadic-2.7.0-20070801</argument>
<argument>target/classes</argument>
+ <argument>euc-jp</argument>
<argument>false</argument>
</arguments>
</configuration>
@@ -36,9 +36,9 @@ public DictionaryBuilder() {
}
- public void build(String inputDirname, String outputDirname, boolean normalizeEntry) throws IOException {
+ public void build(String inputDirname, String outputDirname, String encoding, boolean normalizeEntry) throws IOException {
System.out.println("building tokeninfo dict...");
- TokenInfoDictionaryBuilder tokenInfoBuilder = new TokenInfoDictionaryBuilder(normalizeEntry);
+ TokenInfoDictionaryBuilder tokenInfoBuilder = new TokenInfoDictionaryBuilder(encoding, normalizeEntry);
TokenInfoDictionary tokenInfoDictionary = tokenInfoBuilder.build(inputDirname);
System.out.print(" building double array trie...");
@@ -62,7 +62,7 @@ public void build(String inputDirname, String outputDirname, boolean normalizeEn
System.out.println("done");
System.out.print("building unknown word dict...");
- UnknownDictionaryBuilder unkBuilder = new UnknownDictionaryBuilder();
+ UnknownDictionaryBuilder unkBuilder = new UnknownDictionaryBuilder(encoding);
UnknownDictionary unkDictionary = unkBuilder.build(inputDirname);
unkDictionary.write(outputDirname);
System.out.println("done");
@@ -76,8 +76,8 @@ public void build(String inputDirname, String outputDirname, boolean normalizeEn
public static void main(String[] args) throws IOException, ClassNotFoundException {
DictionaryBuilder builder = new DictionaryBuilder();
- boolean normalizeEntry = Boolean.parseBoolean(args[2]);
- builder.build(args[0], args[1], normalizeEntry);
+ boolean normalizeEntry = Boolean.parseBoolean(args[3]);
+ builder.build(args[0], args[1], args[2], normalizeEntry);
}
}
@@ -37,20 +37,20 @@
* @author Christian Moen
*/
public class TokenInfoDictionaryBuilder {
- private static final String DEFAULT_DICTIONARY_ENCODING = "euc-jp";
-
/** Internal word id - incrementally assigned as entries are read and added. This will be the byte offset in the dictionary file. */
private int offset = 4; // Start from 4. First 4 bytes are used to store size of dictionary file.
private TreeMap<Integer, String> dictionaryEntries; // wordId, surface form
+
+ private String encoding = "euc-jp";
private boolean normalizeEntry = false;
public TokenInfoDictionaryBuilder() {
- this(false);
}
- public TokenInfoDictionaryBuilder(boolean normalizeEntry) {
+ public TokenInfoDictionaryBuilder(String encoding, boolean normalizeEntry) {
+ this.encoding = encoding;
dictionaryEntries = new TreeMap<Integer, String>();
this.normalizeEntry = normalizeEntry;
}
@@ -74,7 +74,7 @@ public TokenInfoDictionary buildDictionary(List<File> csvFiles) throws IOExcepti
for(File file : csvFiles){
FileInputStream inputStream = new FileInputStream(file);
- InputStreamReader streamReader = new InputStreamReader(inputStream, DEFAULT_DICTIONARY_ENCODING);
+ InputStreamReader streamReader = new InputStreamReader(inputStream, encoding);
BufferedReader reader = new BufferedReader(streamReader);
String line = null;
@@ -30,15 +30,18 @@
* @author Christian Moen
*/
public class UnknownDictionaryBuilder {
-
- private static final String DEFAULT_DICTIONARY_ENCODING = "euc-jp";
-
private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,5,5,-32768,-,*,*,*,*,*,*";
+ private String encoding = "euc-jp";
+
public UnknownDictionaryBuilder() {
}
+ public UnknownDictionaryBuilder(String encoding) {
+ this.encoding = encoding;
+ }
+
public UnknownDictionary build(String dirname) throws IOException {
UnknownDictionary unkDictionary = null;
unkDictionary = readDictionaryFile(dirname + File.separator + "unk.def"); //Should be only one file
@@ -48,7 +51,7 @@ public UnknownDictionary build(String dirname) throws IOException {
public UnknownDictionary readDictionaryFile(String filename)
throws IOException {
- return readDictionaryFile(filename, DEFAULT_DICTIONARY_ENCODING);
+ return readDictionaryFile(filename, encoding);
}
public UnknownDictionary readDictionaryFile(String filename, String encoding)
@@ -71,7 +74,7 @@ public UnknownDictionary readDictionaryFile(String filename, String encoding)
public void readCharacterDefinition(String filename, UnknownDictionary dictionary) throws IOException {
FileInputStream inputStream = new FileInputStream(filename);
- InputStreamReader streamReader = new InputStreamReader(inputStream, DEFAULT_DICTIONARY_ENCODING);
+ InputStreamReader streamReader = new InputStreamReader(inputStream, encoding);
LineNumberReader lineReader = new LineNumberReader(streamReader);
String line = null;

0 comments on commit 872caf3

Please sign in to comment.