Merge #91: [HIVEMALL-122] Added tokenize_cn UDF
takuti committed Jul 1, 2017
2 parents 9876d06 + efc3a6d commit ec6d945
Showing 8 changed files with 271 additions and 10 deletions.
2 changes: 1 addition & 1 deletion docs/gitbook/SUMMARY.md
@@ -49,7 +49,7 @@

* [List of generic Hivemall functions](misc/generic_funcs.md)
* [Efficient Top-K query processing](misc/topk.md)
* [English/Japanese Text Tokenizer](misc/tokenizer.md)
* [Text Tokenizer](misc/tokenizer.md)

## Part III - Feature Engineering

27 changes: 24 additions & 3 deletions docs/gitbook/misc/tokenizer.md
@@ -24,16 +24,20 @@
Hivemall provides a simple English text tokenizer UDF that has the following syntax:
tokenize(text input, optional boolean toLowerCase = false)
```
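
For example, a minimal usage sketch (the sentence is arbitrary and the output shown is illustrative):

```sql
select tokenize("Hello, world! This is a test.", true);
```
> ["hello","world","this","is","a","test"]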

# Tokenizer for Japanese Texts
# Tokenizer for Non-English Texts

Hivemall-NLP module provides a Japanese text tokenizer UDF using [Kuromoji](https://github.com/atilika/kuromoji).
Hivemall-NLP module provides several non-English text tokenizer UDFs, as described below.

First of all, you need to issue the following DDLs to use the NLP module. Note that the NLP module is not included in [hivemall-with-dependencies.jar](https://github.com/myui/hivemall/releases).

> add jar /tmp/[hivemall-nlp-xxx-with-dependencies.jar](https://github.com/myui/hivemall/releases);
> source /tmp/[define-additional.hive](https://github.com/myui/hivemall/releases);
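
Once the module is loaded, you can check that the tokenizer functions are registered. A hedged sketch (the exact pattern syntax of `SHOW FUNCTIONS` varies across Hive versions):

```sql
show functions "tokenize_.*";
```
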
## Japanese Tokenizer

The Japanese text tokenizer UDF uses [Kuromoji](https://github.com/atilika/kuromoji).

The signature of the UDF is as follows:
```sql
tokenize_ja(text input, optional const text mode = "normal", optional const array<string> stopWords, optional const array<string> stopTags)
@@ -46,4 +50,21 @@
select tokenize_ja("kuromojiを使った分かち書きのテストです。第二引数にはnormal/search/extendedを指定できます。デフォルトはnormalモードです。");
```
> ["kuromoji","使う","分かち書き","テスト","第","二","引数","normal","search","extended","指定","デフォルト","normal","モード"]
For detailed APIs, please refer to the Javadoc of [JapaneseAnalyzer](https://lucene.apache.org/core/5_3_1/analyzers-kuromoji/org/apache/lucene/analysis/ja/JapaneseAnalyzer.html) as well.
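
A hedged sketch of the optional stop-word and stop-tag arguments (the stop word and part-of-speech tag below are arbitrary examples):

```sql
select tokenize_ja("kuromojiを使った分かち書きのテストです。", "normal", array("テスト"), array("助詞-格助詞-一般"));
```

This would tokenize in `normal` mode while dropping the literal token テスト and any token tagged 助詞-格助詞-一般 (case-marking particle).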

## Chinese Tokenizer

The Chinese text tokenizer UDF uses [SmartChineseAnalyzer](http://lucene.apache.org/core/5_3_1/analyzers-smartcn/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.html).

The signature of the UDF is as follows:
```sql
tokenize_cn(string line, optional const array<string> stopWords)
```

Its basic usage is as follows:
```sql
select tokenize_cn("Smartcn为Apache2.0协议的开源中文分词系统,Java语言编写,修改的中科院计算所ICTCLAS分词系统。");
```
> [smartcn, 为, apach, 2, 0, 协议, 的, 开源, 中文, 分词, 系统, java, 语言, 编写, 修改, 的, 中科院, 计算, 所, ictcla, 分词, 系统]
For detailed APIs, please refer to the Javadoc of [SmartChineseAnalyzer](http://lucene.apache.org/core/5_3_1/analyzers-smartcn/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.html) as well.
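
A custom stop-word list can be passed as the optional second argument. A minimal sketch (the stop words below are arbitrary examples):

```sql
select tokenize_cn("Smartcn为Apache2.0协议的开源中文分词系统", array("的", "为"));
```

This would drop the function words 为 and 的 from the token list.
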
13 changes: 13 additions & 0 deletions nlp/pom.xml
@@ -117,6 +117,12 @@
<version>5.3.1</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-smartcn</artifactId>
<version>5.3.1</version>
<scope>compile</scope>
</dependency>

<!-- test scope -->
<dependency>
@@ -171,6 +177,7 @@
<includes>
<include>io.github.myui:hivemall-core</include>
<include>org.apache.lucene:lucene-analyzers-kuromoji</include>
<include>org.apache.lucene:lucene-analyzers-smartcn</include>
<include>org.apache.lucene:lucene-analyzers-common</include>
<include>org.apache.lucene:lucene-core</include>
</includes>
@@ -182,6 +189,12 @@
<include>**</include>
</includes>
</filter>
<filter>
<artifact>org.apache.lucene:lucene-analyzers-smartcn</artifact>
<includes>
<include>**</include>
</includes>
</filter>
<filter>
<artifact>org.apache.lucene:lucene-analyzers-common</artifact>
<includes>
138 changes: 138 additions & 0 deletions nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java
@@ -0,0 +1,138 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package hivemall.nlp.tokenizer;

import hivemall.utils.hadoop.HiveUtils;
import hivemall.utils.io.IOUtils;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.UDFType;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.Text;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;

@Description(name = "tokenize_cn", value = "_FUNC_(String line [, const list<string> stopWords])"
+ " - returns tokenized strings in array<string>")
@UDFType(deterministic = true, stateful = false)
public final class SmartcnUDF extends GenericUDF {

private String[] _stopWordsArray;

private transient SmartChineseAnalyzer _analyzer;

@Override
public ObjectInspector initialize(ObjectInspector[] arguments)
throws UDFArgumentException {
final int arglen = arguments.length;
if (arglen < 1 || arglen > 2) {
throw new UDFArgumentException(
"Invalid number of arguments for `tokenize_cn`: " + arglen);
}

this._stopWordsArray = (arglen >= 2) ? HiveUtils
.getConstStringArray(arguments[1]) : null;
this._analyzer = null;

return ObjectInspectorFactory
.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
}

@Override
public List<Text> evaluate(DeferredObject[] arguments) throws HiveException {
SmartChineseAnalyzer analyzer = _analyzer;
if (analyzer == null) {
CharArraySet stopwords = stopWords(_stopWordsArray);
analyzer = new SmartChineseAnalyzer(stopwords);
this._analyzer = analyzer;
}

Object arg0 = arguments[0].get();
if (arg0 == null) {
return null;
}
String line = arg0.toString();

final List<Text> results = new ArrayList<Text>(32);
TokenStream stream = null;
try {
stream = analyzer.tokenStream("", line);
if (stream != null) {
analyzeTokens(stream, results);
}
} catch (IOException e) {
IOUtils.closeQuietly(analyzer);
throw new HiveException(e);
} finally {
IOUtils.closeQuietly(stream);
}
return results;
}

@Override
public void close() throws IOException {
IOUtils.closeQuietly(_analyzer);
}

@Nonnull
private static CharArraySet stopWords(@Nullable final String[] array)
throws UDFArgumentException {
if (array == null) {
return SmartChineseAnalyzer.getDefaultStopSet();
}
if (array.length == 0) {
return CharArraySet.EMPTY_SET;
}
CharArraySet results = new CharArraySet(Arrays.asList(array), /* ignoreCase */ true);
return results;
}

private static void analyzeTokens(@Nonnull TokenStream stream,
@Nonnull List<Text> results) throws IOException {
// instantiate an attribute placeholder once
CharTermAttribute termAttr = stream
.getAttribute(CharTermAttribute.class);
stream.reset();

while (stream.incrementToken()) {
String term = termAttr.toString();
results.add(new Text(term));
}
}

@Override
public String getDisplayString(String[] children) {
return "tokenize_cn(" + Arrays.toString(children) + ')';
}
}
12 changes: 6 additions & 6 deletions nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java
@@ -40,7 +40,7 @@
public class KuromojiUDFTest {

@Test
public void testOneArgment() throws UDFArgumentException, IOException {
public void testOneArgument() throws UDFArgumentException, IOException {
GenericUDF udf = new KuromojiUDF();
ObjectInspector[] argOIs = new ObjectInspector[1];
// line
@@ -50,7 +50,7 @@ public void testOneArgment() throws UDFArgumentException, IOException {
}

@Test
public void testTwoArgment() throws UDFArgumentException, IOException {
public void testTwoArgument() throws UDFArgumentException, IOException {
GenericUDF udf = new KuromojiUDF();
ObjectInspector[] argOIs = new ObjectInspector[2];
// line
@@ -94,7 +94,7 @@ public void testInvalidMode() throws UDFArgumentException, IOException {
}

@Test
public void testThreeArgment() throws UDFArgumentException, IOException {
public void testThreeArgument() throws UDFArgumentException, IOException {
GenericUDF udf = new KuromojiUDF();
ObjectInspector[] argOIs = new ObjectInspector[3];
// line
@@ -112,7 +112,7 @@ public void testThreeArgment() throws UDFArgumentException, IOException {
}

@Test
public void testFourArgment() throws UDFArgumentException, IOException {
public void testFourArgument() throws UDFArgumentException, IOException {
GenericUDF udf = new KuromojiUDF();
ObjectInspector[] argOIs = new ObjectInspector[4];
// line
@@ -133,7 +133,7 @@ public void testFourArgment() throws UDFArgumentException, IOException {
}

@Test
public void testEvalauteOneRow() throws IOException, HiveException {
public void testEvaluateOneRow() throws IOException, HiveException {
KuromojiUDF udf = new KuromojiUDF();
ObjectInspector[] argOIs = new ObjectInspector[1];
// line
@@ -156,7 +156,7 @@ public void prepare(int arg) throws HiveException {}
}

@Test
public void testEvalauteTwoRows() throws IOException, HiveException {
public void testEvaluateTwoRows() throws IOException, HiveException {
KuromojiUDF udf = new KuromojiUDF();
ObjectInspector[] argOIs = new ObjectInspector[1];
// line
85 changes: 85 additions & 0 deletions nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java
@@ -0,0 +1,85 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package hivemall.nlp.tokenizer;

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.Text;
import org.junit.Assert;
import org.junit.Test;

public class SmartcnUDFTest {

@Test
public void testOneArgument() throws UDFArgumentException, IOException {
GenericUDF udf = new SmartcnUDF();
ObjectInspector[] argOIs = new ObjectInspector[1];
// line
argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
udf.initialize(argOIs);
udf.close();
}

@Test
public void testTwoArgument() throws UDFArgumentException, IOException {
GenericUDF udf = new SmartcnUDF();
ObjectInspector[] argOIs = new ObjectInspector[2];
// line
argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
// stopWords
argOIs[1] = ObjectInspectorFactory
.getStandardConstantListObjectInspector(
PrimitiveObjectInspectorFactory.javaStringObjectInspector,
null);
udf.initialize(argOIs);
udf.close();
}

@Test
public void testEvaluateOneRow() throws IOException, HiveException {
SmartcnUDF udf = new SmartcnUDF();
ObjectInspector[] argOIs = new ObjectInspector[1];
// line
argOIs[0] = PrimitiveObjectInspectorFactory.writableStringObjectInspector;
udf.initialize(argOIs);

DeferredObject[] args = new DeferredObject[1];
args[0] = new DeferredObject() {
@Override
public Text get() throws HiveException {
return new Text(
"Smartcn为Apache2.0协议的开源中文分词系统,Java语言编写,修改的中科院计算所ICTCLAS分词系统。");
}

@Override
public void prepare(int arg) throws HiveException {
}
};
List<Text> tokens = udf.evaluate(args);
Assert.assertNotNull(tokens);
udf.close();
}
}
3 changes: 3 additions & 0 deletions resources/ddl/define-additional.hive
@@ -9,6 +9,9 @@
drop temporary function if exists tokenize_ja;
create temporary function tokenize_ja as 'hivemall.nlp.tokenizer.KuromojiUDF';

drop temporary function if exists tokenize_cn;
create temporary function tokenize_cn as 'hivemall.nlp.tokenizer.SmartcnUDF';

------------------------------
-- XGBoost related features --
------------------------------
1 change: 1 addition & 0 deletions resources/ddl/define-udfs.td.hql
@@ -177,6 +177,7 @@ create temporary function tree_export as 'hivemall.smile.tools.TreeExportUDF';

-- NLP features
create temporary function tokenize_ja as 'hivemall.nlp.tokenizer.KuromojiUDF';
create temporary function tokenize_cn as 'hivemall.nlp.tokenizer.SmartcnUDF';

-- Backward compatibilities
create temporary function concat_array as 'hivemall.tools.array.ArrayConcatUDF';
