diff --git a/docs/gitbook/SUMMARY.md b/docs/gitbook/SUMMARY.md index 32b0150fc..b9d4e8a1b 100644 --- a/docs/gitbook/SUMMARY.md +++ b/docs/gitbook/SUMMARY.md @@ -49,7 +49,7 @@ * [List of generic Hivemall functions](misc/generic_funcs.md) * [Efficient Top-K query processing](misc/topk.md) -* [English/Japanese Text Tokenizer](misc/tokenizer.md) +* [Text Tokenizer](misc/tokenizer.md) ## Part III - Feature Engineering diff --git a/docs/gitbook/misc/tokenizer.md b/docs/gitbook/misc/tokenizer.md index 47f07e009..99f281df5 100644 --- a/docs/gitbook/misc/tokenizer.md +++ b/docs/gitbook/misc/tokenizer.md @@ -24,9 +24,9 @@ Hivemall provides simple English text tokenizer UDF that has following syntax: tokenize(text input, optional boolean toLowerCase = false) ``` -# Tokenizer for Japanese Texts +# Tokenizer for Non-English Texts -Hivemall-NLP module provides a Japanese text tokenizer UDF using [Kuromoji](https://github.com/atilika/kuromoji). +Hivemall-NLP module provides non-English text tokenizer UDFs, as follows. First of all, you need to issue the following DDLs to use the NLP module. Note NLP module is not included in [hivemall-with-dependencies.jar](https://github.com/myui/hivemall/releases). @@ -34,6 +34,10 @@ First of all, you need to issue the following DDLs to use the NLP module. Note N > source /tmp/[define-additional.hive](https://github.com/myui/hivemall/releases); +## Japanese Tokenizer + +Japanese text tokenizer UDF uses [Kuromoji](https://github.com/atilika/kuromoji). 
+ The signature of the UDF is as follows: ```sql tokenize_ja(text input, optional const text mode = "normal", optional const array stopWords, optional const array stopTags) ``` @@ -46,4 +50,21 @@ select tokenize_ja("kuromojiを使った分かち書きのテストです。第 ``` > ["kuromoji","使う","分かち書き","テスト","第","二","引数","normal","search","extended","指定","デフォルト","normal","モード"] -For detailed APIs, please refer Javadoc of [JapaneseAnalyzer](https://lucene.apache.org/core/5_3_1/analyzers-kuromoji/org/apache/lucene/analysis/ja/JapaneseAnalyzer.html) as well. \ No newline at end of file +For detailed APIs, please refer to the Javadoc of [JapaneseAnalyzer](https://lucene.apache.org/core/5_3_1/analyzers-kuromoji/org/apache/lucene/analysis/ja/JapaneseAnalyzer.html) as well. + +## Chinese Tokenizer + +Chinese text tokenizer UDF uses [SmartChineseAnalyzer](http://lucene.apache.org/core/5_3_1/analyzers-smartcn/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.html). + +The signature of the UDF is as follows: +```sql +tokenize_cn(string line, optional const array stopWords) +``` + +Its basic usage is as follows: +```sql +select tokenize_cn("Smartcn为Apache2.0协议的开源中文分词系统,Java语言编写,修改的中科院计算所ICTCLAS分词系统。"); +``` +> [smartcn, 为, apach, 2, 0, 协议, 的, 开源, 中文, 分词, 系统, java, 语言, 编写, 修改, 的, 中科院, 计算, 所, ictcla, 分词, 系统] + +For detailed APIs, please refer to the Javadoc of [SmartChineseAnalyzer](http://lucene.apache.org/core/5_3_1/analyzers-smartcn/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.html) as well. 
\ No newline at end of file diff --git a/nlp/pom.xml b/nlp/pom.xml index b6ea409bb..021cd6d76 100644 --- a/nlp/pom.xml +++ b/nlp/pom.xml @@ -117,6 +117,12 @@ 5.3.1 compile + + org.apache.lucene + lucene-analyzers-smartcn + 5.3.1 + compile + @@ -171,6 +177,7 @@ io.github.myui:hivemall-core org.apache.lucene:lucene-analyzers-kuromoji + org.apache.lucene:lucene-analyzers-smartcn org.apache.lucene:lucene-analyzers-common org.apache.lucene:lucene-core @@ -182,6 +189,12 @@ ** + + org.apache.lucene:lucene-analyzers-smartcn + + ** + + org.apache.lucene:lucene-analyzers-common diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java b/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java new file mode 100644 index 000000000..a016c7e36 --- /dev/null +++ b/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java @@ -0,0 +1,138 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package hivemall.nlp.tokenizer; + +import hivemall.utils.hadoop.HiveUtils; +import hivemall.utils.io.IOUtils; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import javax.annotation.Nonnull; + +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.UDFType; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.io.Text; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.util.CharArraySet; + +@Description(name = "tokenize_cn", value = "_FUNC_(String line [, const list stopWords])" + + " - returns tokenized strings in array") +@UDFType(deterministic = true, stateful = false) +public final class SmartcnUDF extends GenericUDF { + + private String[] _stopWordsArray; + + private transient SmartChineseAnalyzer _analyzer; + + @Override + public ObjectInspector initialize(ObjectInspector[] arguments) + throws UDFArgumentException { + final int arglen = arguments.length; + if (arglen < 1 || arglen > 2) { + throw new UDFArgumentException( + "Invalid number of arguments for `tokenize_cn`: " + arglen); + } + + this._stopWordsArray = (arglen >= 2) ? 
HiveUtils + .getConstStringArray(arguments[1]) : null; + this._analyzer = null; + + return ObjectInspectorFactory + .getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableStringObjectInspector); + } + + @Override + public List evaluate(DeferredObject[] arguments) throws HiveException { + SmartChineseAnalyzer analyzer = _analyzer; + if (analyzer == null) { + CharArraySet stopwords = stopWords(_stopWordsArray); + analyzer = new SmartChineseAnalyzer(stopwords); + this._analyzer = analyzer; + } + + Object arg0 = arguments[0].get(); + if (arg0 == null) { + return null; + } + String line = arg0.toString(); + + final List results = new ArrayList(32); + TokenStream stream = null; + try { + stream = analyzer.tokenStream("", line); + if (stream != null) { + analyzeTokens(stream, results); + } + } catch (IOException e) { + IOUtils.closeQuietly(analyzer); + throw new HiveException(e); + } finally { + IOUtils.closeQuietly(stream); + } + return results; + } + + @Override + public void close() throws IOException { + IOUtils.closeQuietly(_analyzer); + } + + @Nonnull + private static CharArraySet stopWords(@Nonnull final String[] array) + throws UDFArgumentException { + if (array == null) { + return SmartChineseAnalyzer.getDefaultStopSet(); + } + if (array.length == 0) { + return CharArraySet.EMPTY_SET; + } + CharArraySet results = new CharArraySet(Arrays.asList(array), /* ignoreCase */ + true); + return results; + } + + private static void analyzeTokens(@Nonnull TokenStream stream, + @Nonnull List results) throws IOException { + // instantiate an attribute placeholder once + CharTermAttribute termAttr = stream + .getAttribute(CharTermAttribute.class); + stream.reset(); + + while (stream.incrementToken()) { + String term = termAttr.toString(); + results.add(new Text(term)); + } + } + + @Override + public String getDisplayString(String[] children) { + return "tokenize_cn(" + Arrays.toString(children) + ')'; + } +} diff --git 
a/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java b/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java index 005e689fe..7bbaed7b8 100644 --- a/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java +++ b/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java @@ -40,7 +40,7 @@ public class KuromojiUDFTest { @Test - public void testOneArgment() throws UDFArgumentException, IOException { + public void testOneArgument() throws UDFArgumentException, IOException { GenericUDF udf = new KuromojiUDF(); ObjectInspector[] argOIs = new ObjectInspector[1]; // line @@ -50,7 +50,7 @@ public void testOneArgment() throws UDFArgumentException, IOException { } @Test - public void testTwoArgment() throws UDFArgumentException, IOException { + public void testTwoArgument() throws UDFArgumentException, IOException { GenericUDF udf = new KuromojiUDF(); ObjectInspector[] argOIs = new ObjectInspector[2]; // line @@ -94,7 +94,7 @@ public void testInvalidMode() throws UDFArgumentException, IOException { } @Test - public void testThreeArgment() throws UDFArgumentException, IOException { + public void testThreeArgument() throws UDFArgumentException, IOException { GenericUDF udf = new KuromojiUDF(); ObjectInspector[] argOIs = new ObjectInspector[3]; // line @@ -112,7 +112,7 @@ public void testThreeArgment() throws UDFArgumentException, IOException { } @Test - public void testFourArgment() throws UDFArgumentException, IOException { + public void testFourArgument() throws UDFArgumentException, IOException { GenericUDF udf = new KuromojiUDF(); ObjectInspector[] argOIs = new ObjectInspector[4]; // line @@ -133,7 +133,7 @@ public void testFourArgment() throws UDFArgumentException, IOException { } @Test - public void testEvalauteOneRow() throws IOException, HiveException { + public void testEvaluateOneRow() throws IOException, HiveException { KuromojiUDF udf = new KuromojiUDF(); ObjectInspector[] argOIs = new ObjectInspector[1]; // line @@ -156,7 +156,7 @@ public 
void prepare(int arg) throws HiveException {} } @Test - public void testEvalauteTwoRows() throws IOException, HiveException { + public void testEvaluateTwoRows() throws IOException, HiveException { KuromojiUDF udf = new KuromojiUDF(); ObjectInspector[] argOIs = new ObjectInspector[1]; // line diff --git a/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java b/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java new file mode 100644 index 000000000..67c2283ab --- /dev/null +++ b/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package hivemall.nlp.tokenizer; + +import java.io.IOException; +import java.util.List; + +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.io.Text; +import org.junit.Assert; +import org.junit.Test; + +public class SmartcnUDFTest { + + @Test + public void testOneArgument() throws UDFArgumentException, IOException { + GenericUDF udf = new SmartcnUDF(); + ObjectInspector[] argOIs = new ObjectInspector[1]; + // line + argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector; + udf.initialize(argOIs); + udf.close(); + } + + @Test + public void testTwoArgument() throws UDFArgumentException, IOException { + GenericUDF udf = new SmartcnUDF(); + ObjectInspector[] argOIs = new ObjectInspector[2]; + // line + argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector; + // stopWords + argOIs[1] = ObjectInspectorFactory + .getStandardConstantListObjectInspector( + PrimitiveObjectInspectorFactory.javaStringObjectInspector, + null); + udf.initialize(argOIs); + udf.close(); + } + + @Test + public void testEvaluateOneRow() throws IOException, HiveException { + SmartcnUDF udf = new SmartcnUDF(); + ObjectInspector[] argOIs = new ObjectInspector[1]; + // line + argOIs[0] = PrimitiveObjectInspectorFactory.writableStringObjectInspector; + udf.initialize(argOIs); + + DeferredObject[] args = new DeferredObject[1]; + args[0] = new DeferredObject() { + public Text get() throws HiveException { + return new Text( + "Smartcn为Apache2.0协议的开源中文分词系统,Java语言编写,修改的中科院计算所ICTCLAS分词系统。"); + } + + @Override + 
public void prepare(int arg) throws HiveException { + } + }; + List tokens = udf.evaluate(args); + Assert.assertNotNull(tokens); + udf.close(); + } +} diff --git a/resources/ddl/define-additional.hive b/resources/ddl/define-additional.hive index 7bbfcf4ef..af5cf824a 100644 --- a/resources/ddl/define-additional.hive +++ b/resources/ddl/define-additional.hive @@ -9,6 +9,9 @@ drop temporary function if exists tokenize_ja; create temporary function tokenize_ja as 'hivemall.nlp.tokenizer.KuromojiUDF'; +drop temporary function if exists tokenize_cn; +create temporary function tokenize_cn as 'hivemall.nlp.tokenizer.SmartcnUDF'; + ------------------------------ -- XGBoost related features -- ------------------------------ diff --git a/resources/ddl/define-udfs.td.hql b/resources/ddl/define-udfs.td.hql index 1d11d1a5c..953a6ac7d 100644 --- a/resources/ddl/define-udfs.td.hql +++ b/resources/ddl/define-udfs.td.hql @@ -176,6 +176,7 @@ create temporary function train_regression as 'hivemall.regression.GeneralRegres -- NLP features create temporary function tokenize_ja as 'hivemall.nlp.tokenizer.KuromojiUDF'; +create temporary function tokenize_cn as 'hivemall.nlp.tokenizer.SmartcnUDF'; -- Backward compatibilities create temporary function concat_array as 'hivemall.tools.array.ArrayConcatUDF';