Skip to content
This repository has been archived by the owner on Sep 20, 2022. It is now read-only.

Commit

Permalink
Close #110: [HIVEMALL-142] Implement SingularizeUDF
Browse files Browse the repository at this point in the history
  • Loading branch information
takuti authored and myui committed Sep 13, 2017
1 parent 8639810 commit 5e1d0d0
Show file tree
Hide file tree
Showing 8 changed files with 292 additions and 8 deletions.
173 changes: 173 additions & 0 deletions core/src/main/java/hivemall/tools/text/SingularizeUDF.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package hivemall.tools.text;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.hive.ql.udf.UDFType;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.annotation.Nullable;

import hivemall.utils.lang.StringUtils;

/**
 * Hive UDF that returns the singular form of an English word.
 *
 * <p>
 * The conversion is attempted in the following order, and the first step that applies wins:
 * <ol>
 * <li>{@code null} and the empty string are returned as-is</li>
 * <li>uninflected/uncountable words (e.g., "news") are returned as-is</li>
 * <li>hyphenated compounds whose second chunk is a preposition (e.g., "mothers-in-law") have
 * only their head chunk singularized</li>
 * <li>a trailing apostrophe is turned into a singular possessive (e.g., "dogs'" to "dog's")</li>
 * <li>irregular plurals (e.g., "children") are looked up in a dictionary</li>
 * <li>an ordered list of regular-expression rules is tried</li>
 * </ol>
 *
 * <p>
 * Dictionary lookups (uninflected words, prepositions, irregular plurals) are case-sensitive
 * against lower-case entries, while the regular-expression rules are case-insensitive. Callers
 * that want consistent results should lower-case the input first, e.g.,
 * {@code singularize(lower(word))}.
 *
 * @see <a
 *      href="https://github.com/sundrio/sundrio/blob/95c2b11f7b842bdaa04f61e8e338aea60fb38f70/codegen/src/main/java/io/sundr/codegen/functions/Singularize.java">sundrio
 *      Singularize.java</a>
 * @see <a
 *      href="https://github.com/clips/pattern/blob/3eef00481a4555331cf9a099308910d977f6fc22/pattern/text/en/inflect.py#L445-L623">pattern
 *      inflect.py</a>
 */
@Description(name = "singularize",
        value = "_FUNC_(string word) - Returns singular form of a given English word")
@UDFType(deterministic = true, stateful = false)
public final class SingularizeUDF extends UDF {

    // NOTE: the two arrays below MUST stay sorted in ascending (i.e., alphabetical) order
    // because they are searched with Arrays.binarySearch.

    // prepositions used to detect compound words like "plural-preposition-something"
    private static final String[] prepositions = new String[] {"about", "above", "across",
            "after", "among", "around", "at", "athwart", "before", "behind", "below", "beneath",
            "beside", "besides", "between", "betwixt", "beyond", "but", "by", "during", "except",
            "for", "from", "in", "into", "near", "of", "off", "on", "onto", "out", "over",
            "since", "till", "to", "under", "until", "unto", "upon", "with"};

    // uninflected or uncountable words
    private static final String[] unchanged = new String[] {"advice", "bison", "bread", "bream",
            "breeches", "britches", "butter", "carp", "chassis", "cheese", "christmas",
            "clippers", "cod", "contretemps", "corps", "debris", "diabetes", "djinn", "eland",
            "electricity", "elk", "equipment", "flounder", "fruit", "furniture", "gallows",
            "garbage", "georgia", "graffiti", "gravel", "happiness", "headquarters", "herpes",
            "high-jinks", "homework", "information", "innings", "jackanapes", "ketchup",
            "knowledge", "love", "luggage", "mackerel", "mathematics", "mayonnaise", "measles",
            "meat", "mews", "mumps", "mustard", "news", "pincers", "pliers", "proceedings",
            "progress", "rabies", "research", "rice", "salmon", "sand", "scissors", "series",
            "shears", "software", "species", "swine", "swiss", "trout", "tuna", "understanding",
            "water", "whiting", "wildebeest"};

    // irregular plural => singular
    private static final Map<String, String> irregular = new HashMap<String, String>();
    static {
        irregular.put("atlantes", "atlas");
        irregular.put("atlases", "atlas");
        irregular.put("axes", "axe");
        irregular.put("beeves", "beef");
        irregular.put("brethren", "brother");
        irregular.put("children", "child");
        irregular.put("corpora", "corpus");
        irregular.put("corpuses", "corpus");
        irregular.put("ephemerides", "ephemeris");
        irregular.put("feet", "foot");
        irregular.put("ganglia", "ganglion");
        irregular.put("geese", "goose");
        irregular.put("genera", "genus");
        irregular.put("genii", "genie");
        // NOTE: "graffiti" is also listed in `unchanged`, which is consulted first,
        // so this entry is currently never reached.
        irregular.put("graffiti", "graffito");
        irregular.put("helves", "helve");
        irregular.put("kine", "cow");
        irregular.put("leaves", "leaf");
        irregular.put("loaves", "loaf");
        irregular.put("men", "man");
        irregular.put("mongooses", "mongoose");
        irregular.put("monies", "money");
        irregular.put("moves", "move");
        irregular.put("mythoi", "mythos");
        irregular.put("numena", "numen");
        irregular.put("occipita", "occiput");
        irregular.put("octopodes", "octopus");
        irregular.put("opera", "opus");
        irregular.put("opuses", "opus");
        irregular.put("our", "my");
        irregular.put("oxen", "ox");
        irregular.put("penes", "penis");
        irregular.put("penises", "penis");
        irregular.put("people", "person");
        irregular.put("sexes", "sex");
        irregular.put("soliloquies", "soliloquy");
        irregular.put("teeth", "tooth");
        irregular.put("testes", "testis");
        irregular.put("trilbys", "trilby");
        irregular.put("turves", "turf");
        irregular.put("zoa", "zoon");
    }

    // regexp1, replacement1, regexp2, replacement2, ...
    // Rules are tried from top to bottom and the first matching one is applied.
    private static final List<String> rules = Arrays.asList(
        // must precede the "(b)a...ses" rule below; otherwise "databases" => "databasis"
        "(database)s$", "$1",
        "(quiz)zes$", "$1",
        "(matr)ices$", "$1ix",
        "(vert|ind)ices$", "$1ex",
        "^(ox)en", "$1",
        "(alias|status)$", "$1",
        "(alias|status)es$", "$1",
        "(octop|vir)us$", "$1us",
        "(octop|vir)i$", "$1us",
        "(cris|ax|test)es$", "$1is",
        "(cris|ax|test)is$", "$1is",
        "(shoe)s$", "$1",
        "(o)es$", "$1",
        "(bus)es$", "$1",
        "([m|l])ice$", "$1ouse",
        "(x|ch|ss|sh)es$", "$1",
        "(m)ovies$", "$1ovie",
        "(s)eries$", "$1eries",
        "([^aeiouy]|qu)ies$", "$1y",
        "([lr])ves$", "$1f",
        "(tive)s$", "$1",
        "(hive)s$", "$1",
        "([^f])ves$", "$1fe",
        "(^analy)sis$", "$1sis",
        "(^analy)ses$", "$1sis",
        // "$1sis" (not "$1$2sis"): group 2 is the inner "(a)" of "(a)naly", so appending it
        // produced e.g. "psychoanalyses" => "psychoanalyasis"
        "((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$", "$1sis",
        "([ti])a$", "$1um",
        "(n)ews$", "$1ews",
        "(s|si|u)s$", "$1s",
        "s$", "");

    // Rule regexps compiled once at class-load time instead of on every evaluate() call.
    // Pattern instances are immutable; a fresh Matcher is created per invocation.
    private static final Pattern[] rulePatterns;
    private static final String[] ruleReplacements;
    static {
        final int numRules = rules.size() / 2;
        rulePatterns = new Pattern[numRules];
        ruleReplacements = new String[numRules];
        for (int i = 0; i < numRules; i++) {
            rulePatterns[i] = Pattern.compile(rules.get(2 * i), Pattern.CASE_INSENSITIVE);
            ruleReplacements[i] = rules.get(2 * i + 1);
        }
    }

    /**
     * UDF entry point.
     *
     * @param word an English word, possibly in plural form
     * @return the singular form of the word; {@code null} if the given word is {@code null}
     */
    @Nullable
    public String evaluate(@Nullable String word) {
        return singularize(word);
    }

    /**
     * Converts the given word into its singular form. Words that match none of the
     * dictionaries or rules are returned unchanged.
     *
     * @param word an English word, possibly in plural form
     * @return the singular form of the word; {@code null} if the given word is {@code null}
     */
    @Nullable
    private static String singularize(@Nullable String word) {
        if (word == null) {
            return null;
        }

        if (word.isEmpty()) {
            return word;
        }

        if (Arrays.binarySearch(unchanged, word) >= 0) {
            return word;
        }

        if (word.contains("-")) { // compound words (e.g., mothers-in-law)
            // copy into a mutable list since the head chunk is removed below
            final List<String> chunks = new ArrayList<>(Arrays.asList(word.split("-")));

            if ((chunks.size() > 1) && (Arrays.binarySearch(prepositions, chunks.get(1)) >= 0)) {
                // only the head is inflected; "-preposition-something" is kept as-is
                final String head = chunks.remove(0);
                return singularize(head) + "-" + StringUtils.join(chunks, "-");
            }
        }

        if (word.endsWith("'")) { // dogs' => dog's
            return singularize(word.substring(0, word.length() - 1)) + "'s";
        }

        final String singular = irregular.get(word);
        if (singular != null) {
            return singular;
        }

        for (int i = 0; i < rulePatterns.length; i++) {
            final Matcher matcher = rulePatterns[i].matcher(word);
            if (matcher.find()) {
                return matcher.replaceAll(ruleReplacements[i]);
            }
        }

        return word;
    }

}
38 changes: 30 additions & 8 deletions core/src/main/java/hivemall/utils/lang/StringUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ public final class StringUtils {

private StringUtils() {}

public static byte[] getBytes(final String s) {
@Nonnull
public static byte[] getBytes(@Nonnull final String s) {
final int len = s.length();
final byte[] b = new byte[len * 2];
for (int i = 0; i < len; i++) {
Expand All @@ -37,11 +38,13 @@ public static byte[] getBytes(final String s) {
return b;
}

public static String toString(byte[] b) {
@Nonnull
public static String toString(@Nonnull final byte[] b) {
return toString(b, 0, b.length);
}

public static String toString(byte[] b, int off, int len) {
@Nonnull
public static String toString(@Nonnull final byte[] b, final int off, final int len) {
final int clen = len >>> 1;
final char[] c = new char[clen];
for (int i = 0; i < clen; i++) {
Expand All @@ -53,11 +56,11 @@ public static String toString(byte[] b, int off, int len) {

/**
* Checks whether the String a valid Java number. this code is ported from jakarta commons lang.
*
*
* @link http://jakarta.apache.org/commons/lang/apidocs/org/apache/commons/lang
* /math/NumberUtils.html
*/
public static boolean isNumber(final String str) {
public static boolean isNumber(@Nullable final String str) {
if (str == null || str.length() == 0) {
return false;
}
Expand Down Expand Up @@ -97,7 +100,7 @@ public static boolean isNumber(final String str) {

} else if (chars[i] == '.') {
if (hasDecPoint || hasExp) {
// two decimal points or dec in exponent
// two decimal points or dec in exponent
return false;
}
hasDecPoint = true;
Expand Down Expand Up @@ -170,6 +173,7 @@ public static void clear(@Nonnull final StringBuilder buf) {
buf.setLength(0);
}

@Nonnull
public static String concat(@Nonnull final List<String> list, @Nonnull final String sep) {
final StringBuilder buf = new StringBuilder(128);
for (String s : list) {
Expand All @@ -182,11 +186,29 @@ public static String concat(@Nonnull final List<String> list, @Nonnull final Str
return buf.toString();
}

public static String[] split(final String str, final char separatorChar) {
/**
 * Joins the elements of the given list into a single string, placing the separator between
 * consecutive elements. A {@code null} element contributes no text, but the separators
 * around it are still emitted.
 *
 * @param list elements to join
 * @param sep separator inserted between each pair of adjacent elements
 * @return the joined string; an empty string if the list is empty
 */
@Nonnull
public static String join(@Nonnull final List<String> list, @Nonnull final String sep) {
    final StringBuilder joined = new StringBuilder(128);
    boolean head = true;
    for (final String element : list) {
        if (head) {
            // no separator in front of the very first element
            head = false;
        } else {
            joined.append(sep);
        }

        if (element != null) {
            joined.append(element);
        }
    }
    return joined.toString();
}

@Nullable
public static String[] split(@Nullable final String str, final char separatorChar) {
return split(str, separatorChar, false);
}

public static String[] split(final String str, final char separatorChar,
@Nullable
public static String[] split(@Nullable final String str, final char separatorChar,
final boolean preserveAllTokens) {
if (str == null) {
return null;
Expand Down
71 changes: 71 additions & 0 deletions core/src/test/java/hivemall/tools/text/SingularizeUDFTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package hivemall.tools.text;

import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;

/**
 * Unit tests for {@link SingularizeUDF}; each test exercises one branch of the
 * singularization logic through the public {@code evaluate} entry point.
 */
public class SingularizeUDFTest {

    private SingularizeUDF udf;

    @Before
    public void setUp() {
        udf = new SingularizeUDF();
    }

    /** Asserts that the UDF maps {@code input} to {@code expected}. */
    private void assertSingular(String expected, String input) {
        final String actual = udf.evaluate(input);
        Assert.assertEquals(expected, actual);
    }

    @Test
    public void testNull() {
        // null in, null out
        Assert.assertNull(udf.evaluate(null));
    }

    @Test
    public void testEmpty() {
        assertSingular("", "");
    }

    @Test
    public void testUnchanged() {
        // uninflected word is returned as-is
        assertSingular("christmas", "christmas");
    }

    @Test
    public void testCompound() {
        // only the head of a "plural-preposition-something" compound is singularized
        assertSingular("mother-in-law", "mothers-in-law");
    }

    @Test
    public void testTailSingleQuote() {
        // plural possessive becomes singular possessive
        assertSingular("dog's", "dogs'");
    }

    @Test
    public void testIrregular() {
        assertSingular("child", "children");
    }

    @Test
    public void testRule() {
        // regular-expression based rules
        assertSingular("apple", "apples");
        assertSingular("bus", "buses");
        assertSingular("candy", "candies");
    }

}
8 changes: 8 additions & 0 deletions docs/gitbook/misc/generic_funcs.md
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,14 @@ The compression level must be in range [-1,9]

- `is_stopword(string word)` - Returns whether the given word is an English stopword

- `singularize(string word)` - Returns singular form of a given English word

```sql
select singularize(lower("Apples"));

> "apple"
```

- `tokenize(string englishText [, boolean toLowerCase])` - Returns words in array<string>

- `tokenize_ja(String line [, const string mode = "normal", const list<string> stopWords, const list<string> stopTags])` - returns tokenized strings in array<string>. Refer [this article](../misc/tokenizer.html) for detail.
Expand Down
3 changes: 3 additions & 0 deletions resources/ddl/define-all-as-permanent.hive
Original file line number Diff line number Diff line change
Expand Up @@ -538,6 +538,9 @@ CREATE FUNCTION tokenize as 'hivemall.tools.text.TokenizeUDF' USING JAR '${hivem
DROP FUNCTION IF EXISTS is_stopword;
CREATE FUNCTION is_stopword as 'hivemall.tools.text.StopwordUDF' USING JAR '${hivemall_jar}';

DROP FUNCTION IF EXISTS singularize;
CREATE FUNCTION singularize as 'hivemall.tools.text.SingularizeUDF' USING JAR '${hivemall_jar}';

DROP FUNCTION IF EXISTS split_words;
CREATE FUNCTION split_words as 'hivemall.tools.text.SplitWordsUDF' USING JAR '${hivemall_jar}';

Expand Down
3 changes: 3 additions & 0 deletions resources/ddl/define-all.hive
Original file line number Diff line number Diff line change
Expand Up @@ -530,6 +530,9 @@ create temporary function tokenize as 'hivemall.tools.text.TokenizeUDF';
drop temporary function if exists is_stopword;
create temporary function is_stopword as 'hivemall.tools.text.StopwordUDF';

drop temporary function if exists singularize;
create temporary function singularize as 'hivemall.tools.text.SingularizeUDF';

drop temporary function if exists split_words;
create temporary function split_words as 'hivemall.tools.text.SplitWordsUDF';

Expand Down
3 changes: 3 additions & 0 deletions resources/ddl/define-all.spark
Original file line number Diff line number Diff line change
Expand Up @@ -514,6 +514,9 @@ sqlContext.sql("CREATE TEMPORARY FUNCTION tokenize AS 'hivemall.tools.text.Token
sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS is_stopword")
sqlContext.sql("CREATE TEMPORARY FUNCTION is_stopword AS 'hivemall.tools.text.StopwordUDF'")

sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS singularize")
sqlContext.sql("CREATE TEMPORARY FUNCTION singularize AS 'hivemall.tools.text.SingularizeUDF'")

sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS split_words")
sqlContext.sql("CREATE TEMPORARY FUNCTION split_words AS 'hivemall.tools.text.SplitWordsUDF'")

Expand Down
1 change: 1 addition & 0 deletions resources/ddl/define-udfs.td.hql
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,7 @@ create temporary function train_ffm as 'hivemall.fm.FieldAwareFactorizationMachi
create temporary function ffm_predict as 'hivemall.fm.FFMPredictGenericUDAF';
create temporary function add_field_indicies as 'hivemall.ftvec.trans.AddFieldIndicesUDF';
create temporary function to_ordered_list as 'hivemall.tools.list.UDAFToOrderedList';
create temporary function singularize as 'hivemall.tools.text.SingularizeUDF';

-- NLP features
create temporary function tokenize_ja as 'hivemall.nlp.tokenizer.KuromojiUDF';
Expand Down

0 comments on commit 5e1d0d0

Please sign in to comment.