apache · zhichao-li · Jul 20, 2015 · Jul 21, 2015 · Jul 22, 2015 · Jul 22, 2015
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -177,6 +177,7 @@ object FunctionRegistry {
     expression[StringSplit]("split"),
     expression[Substring]("substr"),
     expression[Substring]("substring"),
+    expression[Substring_index]("substring_index"),
     expression[StringTrim]("trim"),
     expression[UnBase64]("unbase64"),
     expression[Upper]("ucase"),

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -21,6 +21,8 @@ import java.text.DecimalFormat
 import java.util.Locale
 import java.util.regex.{MatchResult, Pattern}
 
+import org.apache.commons.lang.StringUtils
+
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.UnresolvedException
 import org.apache.spark.sql.catalyst.expressions.codegen._
@@ -355,6 +357,92 @@ case class StringInstr(str: Expression, substr: Expression)
   }
 }
 
+/**
+ * Returns the substring from string str before count occurrences of the delimiter delim.
+ * If count is positive, everything to the left of the final delimiter (counting from left) is
+ * returned. If count is negative, everything to the right of the final delimiter (counting
+ * from the right) is returned. The search for delim is case-sensitive.
+ */
+case class Substring_index(strExpr: Expression, delimExpr: Expression, countExpr: Expression)
+  extends Expression with ImplicitCastInputTypes with CodegenFallback {
+
+  override def dataType: DataType = StringType
+  override def inputTypes: Seq[DataType] = Seq(StringType, StringType, IntegerType)
+  override def nullable: Boolean = strExpr.nullable || delimExpr.nullable || countExpr.nullable
+  override def children: Seq[Expression] = Seq(strExpr, delimExpr, countExpr)
+  override def prettyName: String = "substring_index"
+  override def toString: String = s"substring_index($strExpr, $delimExpr, $countExpr)"
+
+  /** Evaluates all three children; the result is null if any of them evaluates to null. */
+  override def eval(input: InternalRow): Any = {
+    val str = strExpr.eval(input)
+    val delim = delimExpr.eval(input)
+    val count = countExpr.eval(input)
+    if (str == null || delim == null || count == null) {
+      null
+    } else {
+      subStrIndex(
+        str.asInstanceOf[UTF8String],
+        delim.asInstanceOf[UTF8String],
+        count.asInstanceOf[Int])
+    }
+  }
+
+  /** Character index of the ordinal-th occurrence of searchStr, counted from the right. */
+  private def lastOrdinalIndexOf(str: UTF8String, searchStr: UTF8String, ordinal: Int): Int =
+    ordinalIndexOf(str, searchStr, ordinal, lastIndex = true)
+
+  /**
+   * Character index of the ordinal-th occurrence of searchStr in str, scanning from the left, or
+   * from the right when lastIndex is set. Returns -1 if there is no such occurrence.
+   */
+  private def ordinalIndexOf(
+      str: UTF8String, searchStr: UTF8String, ordinal: Int, lastIndex: Boolean = false): Int = {
+    if (str == null || searchStr == null || ordinal <= 0) {
+      -1
+    } else if (searchStr.numBytes() == 0) {
+      if (lastIndex) str.numChars() else 0
+    } else {
+      // The forward scan starts at -1 so that its first probe is position 0; starting at 0
+      // would skip a delimiter that sits at the very beginning of str.
+      var index = if (lastIndex) str.numChars() else -1
+      var found = 0
+      do {
+        // continue right before (or right after) the previous hit
+        index = if (lastIndex) {
+          str.lastIndexOf(searchStr, index - 1)
+        } else {
+          str.indexOf(searchStr, index + 1)
+        }
+        found += 1
+      } while (index >= 0 && found < ordinal)
+      index
+    }
+  }
+
+  /**
+   * Applies the substring_index rule to non-null inputs (eval has already handled nulls); an
+   * empty str, an empty delim or a count of zero produce the empty string.
+   */
+  private def subStrIndex(strUtf8: UTF8String, delimUtf8: UTF8String, count: Int): UTF8String = {
+    if (strUtf8.numBytes() == 0 || delimUtf8.numBytes() == 0 || count == 0) {
+      UTF8String.fromString("")
+    } else if (count > 0) {
+      // keep what precedes the count-th delimiter, or all of str when there are fewer
+      val idx = ordinalIndexOf(strUtf8, delimUtf8, count)
+      if (idx != -1) strUtf8.substring(0, idx) else strUtf8
+    } else {
+      // keep what follows the count-th delimiter from the right, or all of str
+      val idx = lastOrdinalIndexOf(strUtf8, delimUtf8, -count)
+      if (idx != -1) {
+        strUtf8.substring(idx + delimUtf8.numChars(), strUtf8.numChars())
+      } else {
+        strUtf8
+      }
+    }
+  }
+}
+
 /**
  * A function that returns the position of the first occurrence of substr
  * in given string after position pos.

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -1777,8 +1777,31 @@ object functions {
   def instr(str: Column, substring: String): Column = StringInstr(str.expr, lit(substring).expr)
 
   /**
-   * Locate the position of the first occurrence of substr in a string column.
+   * Returns the substring from string str before count occurrences of the delimiter delim.
+   * If count is positive, everything to the left of the final delimiter (counting from left) is
+   * returned. If count is negative, everything to the right of the final delimiter (counting
+   * from the right) is returned. The search for delim is case-sensitive.
    *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def substring_index(str: String, delim: String, count: Int): Column =
+    substring_index(Column(str), delim, count)
+
+  /**
+   * Returns the substring from string str before count occurrences of the delimiter delim.
+   * If count is positive, everything to the left of the final delimiter (counting from left) is
+   * returned. If count is negative, everything to the right of the final delimiter (counting
+   * from the right) is returned. The search for delim is case-sensitive.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def substring_index(str: Column, delim: String, count: Int): Column =
+    Substring_index(str.expr, lit(delim).expr, lit(count).expr)
+
+  /**
+   * Locate the position of the first occurrence of substr.
    * NOTE: The position is not zero based, but 1 based index, returns 0 if substr
    * could not be found in str.
    *

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -156,6 +156,63 @@ class StringFunctionsSuite extends QueryTest {
       Row(1))
   }
 
+  test("string substring_index function") {
+    val df = Seq(("www.apache.org", ".", "zz")).toDF("a", "b", "c")
+    checkAnswer(
+      df.select(substring_index($"a", ".", 3)),
+      Row("www.apache.org"))
+    checkAnswer(
+      df.select(substring_index($"a", ".", 2)),
+      Row("www.apache"))
+    checkAnswer(
+      df.select(substring_index($"a", ".", 1)),
+      Row("www"))
+    checkAnswer(
+      df.select(substring_index($"a", ".", 0)),
+      Row(""))
+    checkAnswer(
+      df.select(substring_index(lit("www.apache.org"), ".", -1)),
+      Row("org"))
+    checkAnswer(
+      df.select(substring_index(lit("www.apache.org"), ".", -2)),
+      Row("apache.org"))
+    checkAnswer(
+      df.select(substring_index(lit("www.apache.org"), ".", -3)),
+      Row("www.apache.org"))
+    // an empty str yields an empty result
+    checkAnswer(
+      df.select(substring_index(lit(""), ".", 1)),
+      Row(""))
+    // an empty delim yields an empty result
+    checkAnswer(
+      df.select(substring_index(lit("www.apache.org"), "", 1)),
+      Row(""))
+    // delim does not occur in str, so str comes back unchanged
+    checkAnswer(
+      df.select(substring_index(lit("www.apache.org"), "#", 1)),
+      Row("www.apache.org"))
+    // delim is 2 chars long, for a positive and a negative count
+    checkAnswer(
+      df.select(substring_index(lit("www||apache||org"), "||", 2)),
+      Row("www||apache"))
+    checkAnswer(
+      df.select(substring_index(lit("www||apache||org"), "||", -2)),
+      Row("apache||org"))
+    // a null str or a null delim yields null
+    checkAnswer(
+      df.select(substring_index(lit(null), "||", 2)),
+      Row(null))
+    checkAnswer(
+      df.select(substring_index(lit("www.apache.org"), null, 2)),
+      Row(null))
+    // non-ASCII chars, evaluated through the SQL expression parser
+    // scalastyle:off
+    checkAnswer(
+      df.selectExpr("""substring_index("大千世界大千世界", "千", 2)"""),
+      Row("大千世界大"))
+    // scalastyle:on
+  }
+
   test("string locate function") {
     val df = Seq(("aaads", "aa", "zz", 1)).toDF("a", "b", "c", "d")
 

diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -352,6 +352,69 @@ public int indexOf(UTF8String v, int start) {
     return -1;
   }
 
+  private enum ByteType {FIRSTBYTE, MIDBYTE, SINGLEBYTECHAR} // UTF-8 byte roles
+
+  /** Classifies a UTF-8 byte by its top two bits: 11 = first byte, 10 = middle byte. */
+  private ByteType checkByteType(byte b) {
+    int firstTwoBits = (b >>> 6) & 0x03;
+    if (firstTwoBits == 3) {
+      return ByteType.FIRSTBYTE;
+    } else if (firstTwoBits == 2) {
+      return ByteType.MIDBYTE;
+    }
+    return ByteType.SINGLEBYTECHAR;
+  }
+
+  /**
+   * Walks backwards from bytePos to the byte that opens the code point containing it.
+   * @param bytePos position of any byte that belongs to the code point
+   * @return position of the first byte of that code point; throws if no such byte is found
+   */
+  private int firstOfCurrentCodePoint(int bytePos) {
+    for (int pos = bytePos; pos >= 0; pos--) {
+      ByteType type = checkByteType(getByte(pos));
+      // a first byte or a single-byte character marks the beginning of a code point
+      if (type == ByteType.FIRSTBYTE || type == ByteType.SINGLEBYTECHAR) {
+        return pos;
+      }
+    }
+    throw new RuntimeException("Invalid utf8 string");
+  }
+
+  /** Byte position where the character at startCodePoint ends; -1 if that index is negative. */
+  private int indexEnd(int startCodePoint) {
+    int bytePos = numBytes - 1;
+    // an index at or past the last character leaves bytePos on the last byte of the string
+    for (int charPos = numChars() - 1; bytePos >= 0 && charPos > startCodePoint; charPos--) {
+      bytePos = firstOfCurrentCodePoint(bytePos) - 1;
+    }
+    return bytePos;
+  }
+
+  /**
+   * Finds the last occurrence of v that ends at or before the character at startCodePoint.
+   * @return character index where the match starts, 0 for an empty v, -1 if there is none
+   */
+  public int lastIndexOf(UTF8String v, int startCodePoint) {
+    if (v.numBytes == 0) {
+      return 0;
+    }
+    // clamp so that a start beyond the last character still yields a correct index
+    int matchEndChar = Math.min(startCodePoint, numChars() - 1);
+    // -1 for an empty string or a negative start, which skips the loop below
+    int matchEndByte = indexEnd(matchEndChar);
+    int vNumChars = v.numChars();
+    while (matchEndByte - v.numBytes + 1 >= 0) {
+      if (ByteArrayMethods.arrayEquals(
+          base, offset + matchEndByte - v.numBytes + 1, v.base, v.offset, v.numBytes)) {
+        return matchEndChar - vNumChars + 1;
+      }
+      matchEndByte = firstOfCurrentCodePoint(matchEndByte) - 1;
+      matchEndChar--;
+    }
+    return -1;
+  }
+
   /**
    * Returns str, right-padded with pad to a length of len
    * For example:

diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -221,6 +221,22 @@ public void indexOf() {
     assertEquals(3, fromString("数据砖头").indexOf(fromString("头"), 0));
   }
 
+  @Test
+  public void lastIndexOf() {
+    assertEquals(0, fromString("").lastIndexOf(fromString(""), 0)); // empty needle gives 0
+    assertEquals(-1, fromString("").lastIndexOf(fromString("l"), 0));
+    assertEquals(0, fromString("hello").lastIndexOf(fromString(""), 0)); // empty needle gives 0
+    assertEquals(-1, fromString("hello").lastIndexOf(fromString("l"), 0));
+    assertEquals(3, fromString("hello").lastIndexOf(fromString("l"), 3)); // hit at the start pos
+    assertEquals(-1, fromString("hello").lastIndexOf(fromString("a"), 4));
+    assertEquals(2, fromString("hello").lastIndexOf(fromString("ll"), 4));
+    assertEquals(-1, fromString("hello").lastIndexOf(fromString("ll"), 0));
+    assertEquals(5, fromString("数据砖头数据砖头").lastIndexOf(fromString("据砖"), 7)); // character index
+    assertEquals(0, fromString("数据砖头").lastIndexOf(fromString("数"), 3));
+    assertEquals(0, fromString("数据砖头").lastIndexOf(fromString("数"), 0));
+    assertEquals(3, fromString("数据砖头").lastIndexOf(fromString("头"), 3));
+  }
+
   @Test
   public void reverse() {
     assertEquals(fromString("olleh"), fromString("hello").reverse());