Merge #91: [HIVEMALL-122] Added tokenize_cn UDF
takuti committed Jul 1, 2017
2 parents 9876d06 + efc3a6d commit ec6d945
Showing 8 changed files with 271 additions and 10 deletions.
2 changes: 1 addition & 1 deletion docs/gitbook/SUMMARY.md
@@ -49,7 +49,7 @@

* [List of generic Hivemall functions](misc/generic_funcs.md)
* [Efficient Top-K query processing](misc/topk.md)
* [English/Japanese Text Tokenizer](misc/tokenizer.md)
* [Text Tokenizer](misc/tokenizer.md)

## Part III - Feature Engineering

27 changes: 24 additions & 3 deletions docs/gitbook/misc/tokenizer.md
@@ -24,16 +24,20 @@
Hivemall provides a simple English text tokenizer UDF that has the following syntax:
tokenize(text input, optional boolean toLowerCase = false)
```
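
For example, a minimal usage sketch (the sentence is arbitrary and the output shown is illustrative):

```sql
select tokenize("Hello, world! This is a test.", true);
```
> ["hello","world","this","is","a","test"]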

# Tokenizer for Japanese Texts
# Tokenizer for Non-English Texts

Hivemall-NLP module provides a Japanese text tokenizer UDF using [Kuromoji](https://github.com/atilika/kuromoji).
Hivemall-NLP module provides several non-English text tokenizer UDFs, as described below.

First of all, you need to issue the following DDLs to use the NLP module. Note that the NLP module is not included in [hivemall-with-dependencies.jar](https://github.com/myui/hivemall/releases).

> add jar /tmp/[hivemall-nlp-xxx-with-dependencies.jar](https://github.com/myui/hivemall/releases);
> source /tmp/[define-additional.hive](https://github.com/myui/hivemall/releases);
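
Once the module is loaded, you can check that the tokenizer functions are registered. A hedged sketch (the exact pattern syntax of `SHOW FUNCTIONS` varies across Hive versions):

```sql
show functions "tokenize_.*";
```
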
## Japanese Tokenizer

The Japanese text tokenizer UDF uses [Kuromoji](https://github.com/atilika/kuromoji).

The signature of the UDF is as follows:
```sql
tokenize_ja(text input, optional const text mode = "normal", optional const array<string> stopWords, optional const array<string> stopTags)
@@ -46,4 +50,21 @@
select tokenize_ja("kuromojiを使った分かち書きのテストです。第二引数にはnormal/search/extendedを指定できます。デフォルトはnormalモードです。");
```
> ["kuromoji","使う","分かち書き","テスト","第","二","引数","normal","search","extended","指定","デフォルト","normal","モード"]
For detailed APIs, please refer to the Javadoc of [JapaneseAnalyzer](https://lucene.apache.org/core/5_3_1/analyzers-kuromoji/org/apache/lucene/analysis/ja/JapaneseAnalyzer.html) as well.
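
A hedged sketch of the optional stop-word and stop-tag arguments (the stop word and part-of-speech tag below are arbitrary examples):

```sql
select tokenize_ja("kuromojiを使った分かち書きのテストです。", "normal", array("テスト"), array("助詞-格助詞-一般"));
```

This would tokenize in `normal` mode while dropping the literal token テスト and any token tagged 助詞-格助詞-一般 (case-marking particle).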

## Chinese Tokenizer

The Chinese text tokenizer UDF uses [SmartChineseAnalyzer](http://lucene.apache.org/core/5_3_1/analyzers-smartcn/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.html).

The signature of the UDF is as follows:
```sql
tokenize_cn(string line, optional const array<string> stopWords)
```

Its basic usage is as follows:
```sql
select tokenize_cn("Smartcn为Apache2.0协议的开源中文分词系统,Java语言编写,修改的中科院计算所ICTCLAS分词系统。");
```
> [smartcn, 为, apach, 2, 0, 协议, 的, 开源, 中文, 分词, 系统, java, 语言, 编写, 修改, 的, 中科院, 计算, 所, ictcla, 分词, 系统]
For detailed APIs, please refer to the Javadoc of [SmartChineseAnalyzer](http://lucene.apache.org/core/5_3_1/analyzers-smartcn/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.html) as well.
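
A custom stop-word list can be passed as the optional second argument. A minimal sketch (the stop words below are arbitrary examples):

```sql
select tokenize_cn("Smartcn为Apache2.0协议的开源中文分词系统", array("的", "为"));
```

This would drop the function words 为 and 的 from the token list.
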
13 changes: 13 additions & 0 deletions nlp/pom.xml
@@ -117,6 +117,12 @@
<version>5.3.1</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-smartcn</artifactId>
<version>5.3.1</version>
<scope>compile</scope>
</dependency>

<!-- test scope -->
<dependency>
@@ -171,6 +177,7 @@
<includes>
<include>io.github.myui:hivemall-core</include>
<include>org.apache.lucene:lucene-analyzers-kuromoji</include>
<include>org.apache.lucene:lucene-analyzers-smartcn</include>
<include>org.apache.lucene:lucene-analyzers-common</include>
<include>org.apache.lucene:lucene-core</include>
</includes>
@@ -182,6 +189,12 @@
<include>**</include>
</includes>
</filter>
<filter>
<artifact>org.apache.lucene:lucene-analyzers-smartcn</artifact>
<includes>
<include>**</include>
</includes>
</filter>
<filter>
<artifact>org.apache.lucene:lucene-analyzers-common</artifact>
<includes>
138 changes: 138 additions & 0 deletions nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java
@@ -0,0 +1,138 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package hivemall.nlp.tokenizer;

import hivemall.utils.hadoop.HiveUtils;
import hivemall.utils.io.IOUtils;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.UDFType;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.Text;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;

@Description(name = "tokenize_cn", value = "_FUNC_(String line [, const list<string> stopWords])"
+ " - returns tokenized strings in array<string>")
@UDFType(deterministic = true, stateful = false)
public final class SmartcnUDF extends GenericUDF {

private String[] _stopWordsArray;

private transient SmartChineseAnalyzer _analyzer;

@Override
public ObjectInspector initialize(ObjectInspector[] arguments)
throws UDFArgumentException {
final int arglen = arguments.length;
if (arglen < 1 || arglen > 2) {
throw new UDFArgumentException(
"Invalid number of arguments for `tokenize_cn`: " + arglen);
}

this._stopWordsArray = (arglen >= 2) ? HiveUtils
.getConstStringArray(arguments[1]) : null;
this._analyzer = null;

return ObjectInspectorFactory
.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
}

@Override
public List<Text> evaluate(DeferredObject[] arguments) throws HiveException {
SmartChineseAnalyzer analyzer = _analyzer;
if (analyzer == null) {
CharArraySet stopwords = stopWords(_stopWordsArray);
analyzer = new SmartChineseAnalyzer(stopwords);
this._analyzer = analyzer;
}

Object arg0 = arguments[0].get();
if (arg0 == null) {
return null;
}
String line = arg0.toString();

final List<Text> results = new ArrayList<Text>(32);
TokenStream stream = null;
try {
stream = analyzer.tokenStream("", line);
if (stream != null) {
analyzeTokens(stream, results);
}
} catch (IOException e) {
IOUtils.closeQuietly(analyzer);
throw new HiveException(e);
} finally {
IOUtils.closeQuietly(stream);
}
return results;
}

@Override
public void close() throws IOException {
IOUtils.closeQuietly(_analyzer);
}

@Nonnull
private static CharArraySet stopWords(@Nullable final String[] array)
throws UDFArgumentException {
if (array == null) {
return SmartChineseAnalyzer.getDefaultStopSet();
}
if (array.length == 0) {
return CharArraySet.EMPTY_SET;
}
CharArraySet results = new CharArraySet(Arrays.asList(array), /* ignoreCase */ true);
return results;
}

private static void analyzeTokens(@Nonnull TokenStream stream,
@Nonnull List<Text> results) throws IOException {
// instantiate an attribute placeholder once
CharTermAttribute termAttr = stream
.getAttribute(CharTermAttribute.class);
stream.reset();

while (stream.incrementToken()) {
String term = termAttr.toString();
results.add(new Text(term));
}
}

@Override
public String getDisplayString(String[] children) {
return "tokenize_cn(" + Arrays.toString(children) + ')';
}
}
12 changes: 6 additions & 6 deletions nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java
@@ -40,7 +40,7 @@
public class KuromojiUDFTest {

@Test
public void testOneArgment() throws UDFArgumentException, IOException {
public void testOneArgument() throws UDFArgumentException, IOException {
GenericUDF udf = new KuromojiUDF();
ObjectInspector[] argOIs = new ObjectInspector[1];
// line
@@ -50,7 +50,7 @@ public void testOneArgment() throws UDFArgumentException, IOException {
}

@Test
public void testTwoArgment() throws UDFArgumentException, IOException {
public void testTwoArgument() throws UDFArgumentException, IOException {
GenericUDF udf = new KuromojiUDF();
ObjectInspector[] argOIs = new ObjectInspector[2];
// line
@@ -94,7 +94,7 @@ public void testInvalidMode() throws UDFArgumentException, IOException {
}

@Test
public void testThreeArgment() throws UDFArgumentException, IOException {
public void testThreeArgument() throws UDFArgumentException, IOException {
GenericUDF udf = new KuromojiUDF();
ObjectInspector[] argOIs = new ObjectInspector[3];
// line
@@ -112,7 +112,7 @@ public void testThreeArgment() throws UDFArgumentException, IOException {
}

@Test
public void testFourArgment() throws UDFArgumentException, IOException {
public void testFourArgument() throws UDFArgumentException, IOException {
GenericUDF udf = new KuromojiUDF();
ObjectInspector[] argOIs = new ObjectInspector[4];
// line
@@ -133,7 +133,7 @@ public void testFourArgment() throws UDFArgumentException, IOException {
}

@Test
public void testEvalauteOneRow() throws IOException, HiveException {
public void testEvaluateOneRow() throws IOException, HiveException {
KuromojiUDF udf = new KuromojiUDF();
ObjectInspector[] argOIs = new ObjectInspector[1];
// line
@@ -156,7 +156,7 @@ public void prepare(int arg) throws HiveException {}
}

@Test
public void testEvalauteTwoRows() throws IOException, HiveException {
public void testEvaluateTwoRows() throws IOException, HiveException {
KuromojiUDF udf = new KuromojiUDF();
ObjectInspector[] argOIs = new ObjectInspector[1];
// line
85 changes: 85 additions & 0 deletions nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java
@@ -0,0 +1,85 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package hivemall.nlp.tokenizer;

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.Text;
import org.junit.Assert;
import org.junit.Test;

public class SmartcnUDFTest {

@Test
public void testOneArgument() throws UDFArgumentException, IOException {
GenericUDF udf = new SmartcnUDF();
ObjectInspector[] argOIs = new ObjectInspector[1];
// line
argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
udf.initialize(argOIs);
udf.close();
}

@Test
public void testTwoArgument() throws UDFArgumentException, IOException {
GenericUDF udf = new SmartcnUDF();
ObjectInspector[] argOIs = new ObjectInspector[2];
// line
argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
// stopWords
argOIs[1] = ObjectInspectorFactory
.getStandardConstantListObjectInspector(
PrimitiveObjectInspectorFactory.javaStringObjectInspector,
null);
udf.initialize(argOIs);
udf.close();
}

@Test
public void testEvaluateOneRow() throws IOException, HiveException {
SmartcnUDF udf = new SmartcnUDF();
ObjectInspector[] argOIs = new ObjectInspector[1];
// line
argOIs[0] = PrimitiveObjectInspectorFactory.writableStringObjectInspector;
udf.initialize(argOIs);

DeferredObject[] args = new DeferredObject[1];
args[0] = new DeferredObject() {
@Override
public Text get() throws HiveException {
return new Text(
"Smartcn为Apache2.0协议的开源中文分词系统,Java语言编写,修改的中科院计算所ICTCLAS分词系统。");
}

@Override
public void prepare(int arg) throws HiveException {
}
};
List<Text> tokens = udf.evaluate(args);
Assert.assertNotNull(tokens);
udf.close();
}
}
3 changes: 3 additions & 0 deletions resources/ddl/define-additional.hive
@@ -9,6 +9,9 @@
drop temporary function if exists tokenize_ja;
create temporary function tokenize_ja as 'hivemall.nlp.tokenizer.KuromojiUDF';

drop temporary function if exists tokenize_cn;
create temporary function tokenize_cn as 'hivemall.nlp.tokenizer.SmartcnUDF';

------------------------------
-- XGBoost related features --
------------------------------
1 change: 1 addition & 0 deletions resources/ddl/define-udfs.td.hql
@@ -177,6 +177,7 @@ create temporary function tree_export as 'hivemall.smile.tools.TreeExportUDF';

-- NLP features
create temporary function tokenize_ja as 'hivemall.nlp.tokenizer.KuromojiUDF';
create temporary function tokenize_cn as 'hivemall.nlp.tokenizer.SmartcnUDF';

-- Backward compatibilities
create temporary function concat_array as 'hivemall.tools.array.ArrayConcatUDF';
