Skip to content

Commit

Permalink
[SPARK-8271][SQL]string function: soundex
Browse files Browse the repository at this point in the history
This PR adds the SQL function soundex(); see https://issues.apache.org/jira/browse/HIVE-9738

It's based on #7115, thanks to HuJiayin.

Author: HuJiayin <jiayin.hu@intel.com>
Author: Davies Liu <davies@databricks.com>

Closes #7812 from davies/soundex and squashes the following commits:

fa75941 [Davies Liu] Merge branch 'master' of github.com:apache/spark into soundex
a4bd6d8 [Davies Liu] fix soundex
2538908 [HuJiayin] add codegen soundex
d15d329 [HuJiayin] add back ut
ded1a14 [HuJiayin] Merge branch 'master' of https://github.com/apache/spark
e2dec2c [HuJiayin] support soundex rebase code
  • Loading branch information
hujy authored and rxin committed Jul 31, 2015
1 parent 3fc0cb9 commit 4d5a6e7
Show file tree
Hide file tree
Showing 8 changed files with 180 additions and 0 deletions.
17 changes: 17 additions & 0 deletions python/pyspark/sql/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@
'year', 'quarter', 'month', 'hour', 'minute', 'second',
'dayofmonth', 'dayofyear', 'weekofyear']

# Export the soundex() string function defined further down in this module.
__all__ += ['soundex']


def _create_function(name, doc=""):
""" Create a function for aggregator by name"""
Expand Down Expand Up @@ -922,6 +924,7 @@ def trunc(date, format):
def size(col):
"""
Collection function: returns the length of the array or map stored in the column.
:param col: name of column or expression
>>> df = sqlContext.createDataFrame([([1, 2, 3],),([1],),([],)], ['data'])
Expand All @@ -932,6 +935,20 @@ def size(col):
return Column(sc._jvm.functions.size(_to_java_column(col)))


@since(1.5)
@ignore_unicode_prefix
def soundex(col):
    """
    Returns the SoundEx encoding for a string

    :param col: name of column or expression

    >>> df = sqlContext.createDataFrame([("Peters",),("Uhrbach",)], ['name'])
    >>> df.select(soundex(df.name).alias("soundex")).collect()
    [Row(soundex=u'P362'), Row(soundex=u'U612')]
    """
    sc = SparkContext._active_spark_context
    # Delegate to the JVM-side `soundex` function (not `size`, which would return a
    # collection length rather than the encoded string shown in the doctest above).
    return Column(sc._jvm.functions.soundex(_to_java_column(col)))


class UserDefinedFunction(object):
"""
User defined function in Python
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@ object FunctionRegistry {
expression[StringRepeat]("repeat"),
expression[StringReverse]("reverse"),
expression[StringTrimRight]("rtrim"),
expression[SoundEx]("soundex"),
expression[StringSpace]("space"),
expression[StringSplit]("split"),
expression[Substring]("substr"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -718,6 +718,22 @@ case class Levenshtein(left: Expression, right: Expression) extends BinaryExpres
}
}

/**
 * An expression that returns the soundex code of its string child.
 *
 * Both the interpreted path and the generated-code path delegate to `UTF8String.soundex()`.
 */
case class SoundEx(child: Expression) extends UnaryExpression with ExpectsInputTypes {

  override def inputTypes: Seq[DataType] = Seq(StringType)

  override def dataType: DataType = StringType

  override def nullSafeEval(input: Any): Any = {
    val str = input.asInstanceOf[UTF8String]
    str.soundex()
  }

  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String =
    defineCodeGen(ctx, ev, value => s"$value.soundex()")
}

/**
* Returns the numeric value of the first character of str.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -347,6 +347,34 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
// scalastyle:on
}

test("soundex unit test") {
  // Evaluates SoundEx over each (input, expected) pair, in the order given.
  def verifyAll(cases: (String, String)*): Unit = cases.foreach {
    case (input, expected) => checkEvaluation(SoundEx(Literal(input)), expected)
  }

  verifyAll(
    "ZIN" -> "Z500",
    "SU" -> "S000",
    "" -> "")

  // A null input yields a null result.
  checkEvaluation(SoundEx(Literal.create(null, StringType)), null)

  // scalastyle:off
  // non ascii characters are not allowed in the code, so we disable the scalastyle here.
  verifyAll(
    "测试" -> "测试",
    "Tschüss" -> "T220")
  // scalastyle:on

  // Same check as the others, but evaluated against an explicit input row.
  checkEvaluation(SoundEx(Literal("zZ")), "Z000", create_row("s8"))

  verifyAll(
    "RAGSSEEESSSVEEWE" -> "R221",
    "Ashcraft" -> "A261",
    "Aswcraft" -> "A261",
    "Tymczak" -> "T522",
    "Pfister" -> "P236",
    "Miller" -> "M460",
    "Peterson" -> "P362",
    "Peters" -> "P362",
    "Auerbach" -> "A612",
    "Uhrbach" -> "U612",
    "Moskowitz" -> "M232",
    "Moskovitz" -> "M213",
    "relyheewsgeessg" -> "R422",
    "!!" -> "!!")
}

test("TRIM/LTRIM/RTRIM") {
val s = 'a.string.at(0)
checkEvaluation(StringTrim(Literal(" aa ")), "aa", create_row(" abdef "))
Expand Down
8 changes: 8 additions & 0 deletions sql/core/src/main/scala/org/apache/spark/sql/functions.scala
Original file line number Diff line number Diff line change
Expand Up @@ -1902,6 +1902,14 @@ object functions {
StringRepeat(str.expr, lit(n).expr)
}

/**
 * Returns the soundex code for the specified expression.
 *
 * @group string_funcs
 * @since 1.5.0
 */
def soundex(e: Column): Column = SoundEx(e.expr)

/**
* Splits str around pattern (pattern is a regular expression).
* NOTE: pattern is a string represent the regular expression.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,15 @@ class StringFunctionsSuite extends QueryTest {
Row("aa123cc"))
}

test("soundex function") {
  val names = Seq(("MARY", "SU")).toDF("l", "r")
  val expected = Row("M600", "S000")

  // Through the DataFrame function API.
  checkAnswer(names.select(soundex($"l"), soundex($"r")), expected)

  // Through SQL expression strings, spelled with mixed case.
  checkAnswer(names.selectExpr("SoundEx(l)", "SoundEx(r)"), expected)
}

test("string instr function") {
val df = Seq(("aaads", "aa", "zz")).toDF("a", "b", "c")

Expand Down
53 changes: 53 additions & 0 deletions unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
Original file line number Diff line number Diff line change
Expand Up @@ -680,4 +680,57 @@ public int hashCode() {
}
return result;
}

/**
 * Soundex mapping table: one code byte per letter 'A'..'Z', indexed by (upper-case letter - 'A').
 * '1'..'6' are the digit codes that soundex() emits; '0' marks letters that are never emitted
 * but still reset the previously seen code; '7' (H and W) marks letters that soundex() skips
 * without touching the previously seen code.
 */
private static final byte[] US_ENGLISH_MAPPING = {'0', '1', '2', '3', '0', '1', '2', '7',
'0', '2', '2', '4', '5', '5', '0', '1', '2', '6', '2', '3', '0', '1', '7', '2', '0', '2'};

/**
 * Computes the Soundex code of this string: the upper-cased first letter followed by three
 * code digits, padded with '0'. Soundex groups names that sound alike, and can also serve as a
 * general purpose scheme for finding words with similar phonemes.
 * https://en.wikipedia.org/wiki/Soundex
 *
 * An empty string encodes to the empty string, and a string whose first byte is not an ASCII
 * letter is returned unchanged.
 */
public UTF8String soundex() {
  if (numBytes == 0) {
    return EMPTY_UTF8;
  }

  byte head = getByte(0);
  if (head >= 'a' && head <= 'z') {
    // fold ASCII lower case to upper case
    head -= 32;
  } else if (!(head >= 'A' && head <= 'Z')) {
    // the code can only start with a letter; hand back the input untouched
    return this;
  }

  byte[] encoded = {head, '0', '0', '0'};
  int filled = 1;
  byte previous = US_ENGLISH_MAPPING[head - 'A'];

  // stop as soon as all three digit slots after the leading letter are filled
  for (int pos = 1; pos < numBytes && filled < encoded.length; pos++) {
    byte current = getByte(pos);
    if (current >= 'a' && current <= 'z') {
      current -= 32;
    } else if (!(current >= 'A' && current <= 'Z')) {
      // a non-letter byte separates codes: an equal code after it is emitted again
      previous = '0';
      continue;
    }

    byte code = US_ENGLISH_MAPPING[current - 'A'];
    if (code == '7') {
      // skipped entirely; the previously seen code is left as it was
      continue;
    }
    if (code != '0' && code != previous) {
      encoded[filled++] = code;
    }
    previous = code;
  }
  return UTF8String.fromBytes(encoded);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -341,4 +341,52 @@ public void createBlankString() {
assertEquals(fromString(" "), blankString(3));
assertEquals(fromString(""), blankString(0));
}

@Test
public void soundex() {
  // Arguments follow the JUnit (expected, actual) order so that failure messages report the
  // expected and actual values correctly.

  // Names that sound alike share a code.
  assertEquals(fromString("R163"), fromString("Robert").soundex());
  assertEquals(fromString("R163"), fromString("Rupert").soundex());
  assertEquals(fromString("R150"), fromString("Rubin").soundex());
  assertEquals(fromString("A261"), fromString("Ashcraft").soundex());
  assertEquals(fromString("A261"), fromString("Ashcroft").soundex());
  assertEquals(fromString("B620"), fromString("Burroughs").soundex());
  assertEquals(fromString("B620"), fromString("Burrows").soundex());
  assertEquals(fromString("E251"), fromString("Ekzampul").soundex());
  assertEquals(fromString("E251"), fromString("Example").soundex());
  assertEquals(fromString("E460"), fromString("Ellery").soundex());
  assertEquals(fromString("E460"), fromString("Euler").soundex());
  assertEquals(fromString("G200"), fromString("Ghosh").soundex());
  assertEquals(fromString("G200"), fromString("Gauss").soundex());
  assertEquals(fromString("G362"), fromString("Gutierrez").soundex());
  assertEquals(fromString("H416"), fromString("Heilbronn").soundex());
  assertEquals(fromString("H416"), fromString("Hilbert").soundex());
  assertEquals(fromString("J250"), fromString("Jackson").soundex());
  assertEquals(fromString("K530"), fromString("Kant").soundex());
  assertEquals(fromString("K530"), fromString("Knuth").soundex());
  assertEquals(fromString("L000"), fromString("Lee").soundex());
  assertEquals(fromString("L222"), fromString("Lukasiewicz").soundex());
  assertEquals(fromString("L222"), fromString("Lissajous").soundex());
  assertEquals(fromString("L300"), fromString("Ladd").soundex());
  assertEquals(fromString("L300"), fromString("Lloyd").soundex());
  assertEquals(fromString("M220"), fromString("Moses").soundex());
  assertEquals(fromString("O600"), fromString("O'Hara").soundex());
  assertEquals(fromString("P236"), fromString("Pfister").soundex());
  assertEquals(fromString("S532"), fromString("Soundex").soundex());
  assertEquals(fromString("S532"), fromString("Sownteks").soundex());
  assertEquals(fromString("T522"), fromString("Tymczak").soundex());
  assertEquals(fromString("V532"), fromString("VanDeusen").soundex());
  assertEquals(fromString("W252"), fromString("Washington").soundex());
  assertEquals(fromString("W350"), fromString("Wheaton").soundex());

  // Short inputs are padded with '0' up to four characters.
  assertEquals(fromString("A000"), fromString("a").soundex());
  assertEquals(fromString("A100"), fromString("ab").soundex());
  assertEquals(fromString("A120"), fromString("abc").soundex());
  assertEquals(fromString("A123"), fromString("abcd").soundex());

  // Empty input and input that does not start with an ASCII letter come back unchanged.
  assertEquals(fromString(""), fromString("").soundex());
  assertEquals(fromString("123"), fromString("123").soundex());
  assertEquals(fromString("世界千世"), fromString("世界千世").soundex());
}
}

0 comments on commit 4d5a6e7

Please sign in to comment.