This repository has been archived by the owner on Sep 20, 2022. It is now read-only.

[HIVEMALL-122] Added tokenize_cn UDF #91

Merged
merged 4 commits into from Jul 1, 2017
2 changes: 1 addition & 1 deletion docs/gitbook/SUMMARY.md
@@ -49,7 +49,7 @@

* [List of generic Hivemall functions](misc/generic_funcs.md)
* [Efficient Top-K query processing](misc/topk.md)
* [English/Japanese Text Tokenizer](misc/tokenizer.md)
* [Text Tokenizer](misc/tokenizer.md)

## Part III - Feature Engineering

27 changes: 24 additions & 3 deletions docs/gitbook/misc/tokenizer.md
@@ -24,16 +24,20 @@
Hivemall provides a simple English text tokenizer UDF with the following syntax:
```sql
tokenize(text input, optional boolean toLowerCase = false)
```
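
For example, a query like the following (an illustrative sketch; the exact delimiting rules are those of the simple tokenizer) splits the input on punctuation and whitespace and, with `toLowerCase = true`, lowercases the tokens, returning roughly:

```sql
select tokenize("Hello, world! This is a test.", true);
```
> ["hello","world","this","is","a","test"]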

# Tokenizer for Japanese Texts
# Tokenizer for Non-English Texts

Hivemall-NLP module provides a Japanese text tokenizer UDF using [Kuromoji](https://github.com/atilika/kuromoji).
The Hivemall-NLP module provides the following tokenizer UDFs for non-English text.

First, you need to issue the following DDLs to use the NLP module. Note that the NLP module is not included in [hivemall-with-dependencies.jar](https://github.com/myui/hivemall/releases).

> add jar /tmp/[hivemall-nlp-xxx-with-dependencies.jar](https://github.com/myui/hivemall/releases);

> source /tmp/[define-additional.hive](https://github.com/myui/hivemall/releases);

## Japanese Tokenizer

The Japanese text tokenizer UDF uses [Kuromoji](https://github.com/atilika/kuromoji).

The signature of the UDF is as follows:
```sql
tokenize_ja(text input, optional const text mode = "normal", optional const array<string> stopWords, optional const array<string> stopTags)
@@ -46,4 +50,21 @@
```sql
select tokenize_ja("kuromojiを使った分かち書きのテストです。第二引数にはnormal/search/extendedを指定できます。デフォルトではnormalモードです。");
```
> ["kuromoji","使う","分かち書き","テスト","第","二","引数","normal","search","extended","指定","デフォルト","normal","モード"]

For detailed APIs, please also refer to the Javadoc of [JapaneseAnalyzer](https://lucene.apache.org/core/5_3_1/analyzers-kuromoji/org/apache/lucene/analysis/ja/JapaneseAnalyzer.html).

## Chinese Tokenizer

The Chinese text tokenizer UDF uses [SmartChineseAnalyzer](http://lucene.apache.org/core/5_3_1/analyzers-smartcn/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.html).

The signature of the UDF is as follows:
```sql
tokenize_cn(string line, optional const array<string> stopWords)
```

Its basic usage is as follows:
```sql
select tokenize_cn("Smartcn为Apache2.0协议的开源中文分词系统,Java语言编写,修改的中科院计算所ICTCLAS分词系统。");
```
> [smartcn, 为, apach, 2, 0, 协议, 的, 开源, 中文, 分词, 系统, java, 语言, 编写, 修改, 的, 中科院, 计算, 所, ictcla, 分词, 系统]
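
A custom stop-word list can also be passed as a constant array in the second argument (a hypothetical invocation; tokens contained in the list are dropped from the result):

```sql
select tokenize_cn("Smartcn为Apache2.0协议的开源中文分词系统", array("的", "为"));
```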

For detailed APIs, please also refer to the Javadoc of [SmartChineseAnalyzer](http://lucene.apache.org/core/5_3_1/analyzers-smartcn/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.html).
13 changes: 13 additions & 0 deletions nlp/pom.xml
@@ -117,6 +117,12 @@
<version>5.3.1</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-smartcn</artifactId>
<version>5.3.1</version>
<scope>compile</scope>
</dependency>

<!-- test scope -->
<dependency>
@@ -171,6 +177,7 @@
<includes>
<include>io.github.myui:hivemall-core</include>
<include>org.apache.lucene:lucene-analyzers-kuromoji</include>
<include>org.apache.lucene:lucene-analyzers-smartcn</include>
<include>org.apache.lucene:lucene-analyzers-common</include>
<include>org.apache.lucene:lucene-core</include>
</includes>
@@ -182,6 +189,12 @@
<include>**</include>
</includes>
</filter>
<filter>
<artifact>org.apache.lucene:lucene-analyzers-smartcn</artifact>
<includes>
<include>**</include>
</includes>
</filter>
<filter>
<artifact>org.apache.lucene:lucene-analyzers-common</artifact>
<includes>
138 changes: 138 additions & 0 deletions nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java
@@ -0,0 +1,138 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package hivemall.nlp.tokenizer;

import hivemall.utils.hadoop.HiveUtils;
import hivemall.utils.io.IOUtils;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.UDFType;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.Text;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;

@Description(name = "tokenize_cn", value = "_FUNC_(String line [, const list<string> stopWords])"
+ " - returns tokenized strings in array<string>")
@UDFType(deterministic = true, stateful = false)
public final class SmartcnUDF extends GenericUDF {

private String[] _stopWordsArray;

private transient SmartChineseAnalyzer _analyzer;

@Override
public ObjectInspector initialize(ObjectInspector[] arguments)
throws UDFArgumentException {
final int arglen = arguments.length;
if (arglen < 1 || arglen > 2) {
throw new UDFArgumentException(
"Invalid number of arguments for `tokenize_cn`: " + arglen);
}

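// the optional second argument must be a constant array of stop words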
this._stopWordsArray = (arglen >= 2) ? HiveUtils
.getConstStringArray(arguments[1]) : null;
this._analyzer = null;

return ObjectInspectorFactory
.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
}

@Override
public List<Text> evaluate(DeferredObject[] arguments) throws HiveException {
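// lazily instantiate the analyzer so that each task creates it at most once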
SmartChineseAnalyzer analyzer = _analyzer;
if (analyzer == null) {
CharArraySet stopwords = stopWords(_stopWordsArray);
analyzer = new SmartChineseAnalyzer(stopwords);
this._analyzer = analyzer;
}

Object arg0 = arguments[0].get();
if (arg0 == null) {
return null;
}
String line = arg0.toString();

final List<Text> results = new ArrayList<Text>(32);
TokenStream stream = null;
try {
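// SmartChineseAnalyzer ignores the field name, so an empty string is passed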
stream = analyzer.tokenStream("", line);
if (stream != null) {
analyzeTokens(stream, results);
}
} catch (IOException e) {
IOUtils.closeQuietly(analyzer);
throw new HiveException(e);
} finally {
IOUtils.closeQuietly(stream);
}
return results;
}

@Override
public void close() throws IOException {
IOUtils.closeQuietly(_analyzer);
}

@Nonnull
private static CharArraySet stopWords(@Nullable final String[] array) {
if (array == null) {
return SmartChineseAnalyzer.getDefaultStopSet();
}
if (array.length == 0) {
return CharArraySet.EMPTY_SET;
}
return new CharArraySet(Arrays.asList(array), /* ignoreCase */ true);
}

private static void analyzeTokens(@Nonnull TokenStream stream,
@Nonnull List<Text> results) throws IOException {
// instantiate an attribute placeholder once
CharTermAttribute termAttr = stream
.getAttribute(CharTermAttribute.class);
stream.reset();

while (stream.incrementToken()) {
String term = termAttr.toString();
results.add(new Text(term));
}
}

@Override
public String getDisplayString(String[] children) {
return "tokenize_cn(" + Arrays.toString(children) + ')';
}
}
12 changes: 6 additions & 6 deletions nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java
@@ -40,7 +40,7 @@
public class KuromojiUDFTest {

@Test
public void testOneArgment() throws UDFArgumentException, IOException {
public void testOneArgument() throws UDFArgumentException, IOException {
GenericUDF udf = new KuromojiUDF();
ObjectInspector[] argOIs = new ObjectInspector[1];
// line
@@ -50,7 +50,7 @@ public void testOneArgment() throws UDFArgumentException, IOException {
}

@Test
public void testTwoArgment() throws UDFArgumentException, IOException {
public void testTwoArgument() throws UDFArgumentException, IOException {
GenericUDF udf = new KuromojiUDF();
ObjectInspector[] argOIs = new ObjectInspector[2];
// line
@@ -94,7 +94,7 @@ public void testInvalidMode() throws UDFArgumentException, IOException {
}

@Test
public void testThreeArgment() throws UDFArgumentException, IOException {
public void testThreeArgument() throws UDFArgumentException, IOException {
GenericUDF udf = new KuromojiUDF();
ObjectInspector[] argOIs = new ObjectInspector[3];
// line
@@ -112,7 +112,7 @@ public void testThreeArgment() throws UDFArgumentException, IOException {
}

@Test
public void testFourArgment() throws UDFArgumentException, IOException {
public void testFourArgument() throws UDFArgumentException, IOException {
GenericUDF udf = new KuromojiUDF();
ObjectInspector[] argOIs = new ObjectInspector[4];
// line
@@ -133,7 +133,7 @@ public void testFourArgment() throws UDFArgumentException, IOException {
}

@Test
public void testEvalauteOneRow() throws IOException, HiveException {
public void testEvaluateOneRow() throws IOException, HiveException {
KuromojiUDF udf = new KuromojiUDF();
ObjectInspector[] argOIs = new ObjectInspector[1];
// line
@@ -156,7 +156,7 @@ public void prepare(int arg) throws HiveException {}
}

@Test
public void testEvalauteTwoRows() throws IOException, HiveException {
public void testEvaluateTwoRows() throws IOException, HiveException {
KuromojiUDF udf = new KuromojiUDF();
ObjectInspector[] argOIs = new ObjectInspector[1];
// line
85 changes: 85 additions & 0 deletions nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java
@@ -0,0 +1,85 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package hivemall.nlp.tokenizer;

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.Text;
import org.junit.Assert;
import org.junit.Test;

public class SmartcnUDFTest {

@Test
public void testOneArgument() throws UDFArgumentException, IOException {
GenericUDF udf = new SmartcnUDF();
ObjectInspector[] argOIs = new ObjectInspector[1];
// line
argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
udf.initialize(argOIs);
udf.close();
}

@Test
public void testTwoArgument() throws UDFArgumentException, IOException {
GenericUDF udf = new SmartcnUDF();
ObjectInspector[] argOIs = new ObjectInspector[2];
// line
argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
// stopWords
argOIs[1] = ObjectInspectorFactory
.getStandardConstantListObjectInspector(
PrimitiveObjectInspectorFactory.javaStringObjectInspector,
null);
udf.initialize(argOIs);
udf.close();
}

@Test
public void testEvaluateOneRow() throws IOException, HiveException {
SmartcnUDF udf = new SmartcnUDF();
ObjectInspector[] argOIs = new ObjectInspector[1];
// line
argOIs[0] = PrimitiveObjectInspectorFactory.writableStringObjectInspector;
udf.initialize(argOIs);

DeferredObject[] args = new DeferredObject[1];
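// a minimal DeferredObject stub that always returns the same input row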
args[0] = new DeferredObject() {
public Text get() throws HiveException {
return new Text(
"Smartcn为Apache2.0协议的开源中文分词系统,Java语言编写,修改的中科院计算所ICTCLAS分词系统。");
}

@Override
public void prepare(int arg) throws HiveException {
}
};
List<Text> tokens = udf.evaluate(args);
Assert.assertNotNull(tokens);
udf.close();
}
}
3 changes: 3 additions & 0 deletions resources/ddl/define-additional.hive
@@ -9,6 +9,9 @@
drop temporary function if exists tokenize_ja;
create temporary function tokenize_ja as 'hivemall.nlp.tokenizer.KuromojiUDF';

drop temporary function if exists tokenize_cn;
create temporary function tokenize_cn as 'hivemall.nlp.tokenizer.SmartcnUDF';
A project member commented:

Could you also update resources/ddl/define-udfs.td.hql?

$ grep -r 'tokenize_ja' resources/ddl
resources/ddl/define-additional.hive:drop temporary function if exists tokenize_ja;
resources/ddl/define-additional.hive:create temporary function tokenize_ja as 'hivemall.nlp.tokenizer.KuromojiUDF';
resources/ddl/define-udfs.td.hql:create temporary function tokenize_ja as 'hivemall.nlp.tokenizer.KuromojiUDF';


------------------------------
-- XGBoost related features --
------------------------------
1 change: 1 addition & 0 deletions resources/ddl/define-udfs.td.hql
@@ -176,6 +176,7 @@ create temporary function train_regression as 'hivemall.regression.GeneralRegres

-- NLP features
create temporary function tokenize_ja as 'hivemall.nlp.tokenizer.KuromojiUDF';
create temporary function tokenize_cn as 'hivemall.nlp.tokenizer.SmartcnUDF';

-- Backward compatibilities
create temporary function concat_array as 'hivemall.tools.array.ArrayConcatUDF';