From c4136107717076eac00bb8d2cbb02f31a76e37fa Mon Sep 17 00:00:00 2001 From: Sylvain Zimmer Date: Mon, 1 Aug 2016 22:38:37 -0400 Subject: [PATCH 1/2] Investigate switching to java.net.URI for SPARK-16826 --- .../expressions/stringExpressions.scala | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index fc13845a7f6c2..10006fd1cf4b7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.catalyst.expressions -import java.net.{MalformedURLException, URL} +import java.net.{MalformedURLException, URI} import java.text.{BreakIterator, DecimalFormat, DecimalFormatSymbols} import java.util.{HashMap, Locale, Map => JMap} import java.util.regex.Pattern @@ -749,25 +749,25 @@ case class ParseUrl(children: Seq[Expression]) Pattern.compile(REGEXPREFIX + key.toString + REGEXSUBFIX) } - private def getUrl(url: UTF8String): URL = { + private def getUrl(url: UTF8String): URI = { try { - new URL(url.toString) + new URI(url.toString) } catch { case e: MalformedURLException => null } } - private def getExtractPartFunc(partToExtract: UTF8String): URL => String = { + private def getExtractPartFunc(partToExtract: UTF8String): URI => String = { partToExtract match { case HOST => _.getHost case PATH => _.getPath case QUERY => _.getQuery - case REF => _.getRef - case PROTOCOL => _.getProtocol - case FILE => _.getFile + case REF => _.getRawFragment + case PROTOCOL => _.getScheme + case FILE => _.toURL().getFile case AUTHORITY => _.getAuthority case USERINFO => _.getUserInfo - case _ => (url: URL) => null + case _ => (url: URI) => null } } @@ -780,7 +780,7 @@ case class ParseUrl(children: Seq[Expression]) } } - private def extractFromUrl(url: URL, partToExtract: UTF8String): UTF8String = { + private def extractFromUrl(url: URI, partToExtract: UTF8String): UTF8String = { if (cachedExtractPartFunc ne null) { UTF8String.fromString(cachedExtractPartFunc.apply(url)) } else { From 4ff8200714cc919ac99b681c4e452640e5936a71 Mon Sep 17 00:00:00 2001 From: Sylvain Zimmer Date: Wed, 3 Aug 2016 20:06:10 -0400 Subject: [PATCH 2/2] Add new tests for PARSE_URL() and fully switch to java.net.URI --- .../expressions/stringExpressions.scala | 33 +++++++++++---- .../spark/sql/StringFunctionsSuite.scala | 40 ++++++++++++++++--- 2 files changed, 61 insertions(+), 12 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 10006fd1cf4b7..a8c23a8b0c536 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.catalyst.expressions -import java.net.{MalformedURLException, URI} +import java.net.{URI, URISyntaxException} import java.text.{BreakIterator, DecimalFormat, DecimalFormatSymbols} import java.util.{HashMap, Locale, Map => JMap} import java.util.regex.Pattern @@ -753,20 +753,39 @@ case class ParseUrl(children: Seq[Expression]) try { new URI(url.toString) } catch { - case e: MalformedURLException => null + case e: URISyntaxException => null } } private def getExtractPartFunc(partToExtract: UTF8String): URI => String = { + + // partToExtract match { + // case HOST => _.toURL().getHost + // case PATH => _.toURL().getPath + // case QUERY => _.toURL().getQuery + // case REF => _.toURL().getRef + // case PROTOCOL => _.toURL().getProtocol + // case FILE => _.toURL().getFile + // case AUTHORITY => _.toURL().getAuthority + // case USERINFO => _.toURL().getUserInfo + // case _ => (url: URI) => null + // } + partToExtract match { case HOST => _.getHost - case PATH => _.getPath - case QUERY => _.getQuery + case PATH => _.getRawPath + case QUERY => _.getRawQuery case REF => _.getRawFragment case PROTOCOL => _.getScheme - case FILE => _.toURL().getFile - case AUTHORITY => _.getAuthority - case USERINFO => _.getUserInfo + case FILE => + (url: URI) => + if (url.getRawQuery ne null) { + url.getRawPath + "?" + url.getRawQuery + } else { + url.getRawPath + } + case AUTHORITY => _.getRawAuthority + case USERINFO => _.getRawUserInfo case _ => (url: URI) => null } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala index 524926e1e9b66..57ca5d9c4d7cf 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala @@ -229,18 +229,48 @@ class StringFunctionsSuite extends QueryTest with SharedSQLContext { } test("string parse_url function") { - val df = Seq[String](("http://userinfo@spark.apache.org/path?query=1#Ref")) - .toDF("url") - checkAnswer( - df.selectExpr( + def testUrl(url: String, expected: Row) { + checkAnswer(Seq[String]((url)).toDF("url").selectExpr( "parse_url(url, 'HOST')", "parse_url(url, 'PATH')", "parse_url(url, 'QUERY')", "parse_url(url, 'REF')", "parse_url(url, 'PROTOCOL')", "parse_url(url, 'FILE')", "parse_url(url, 'AUTHORITY')", "parse_url(url, 'USERINFO')", - "parse_url(url, 'QUERY', 'query')"), + "parse_url(url, 'QUERY', 'query')"), expected) + } + + testUrl( + "http://userinfo@spark.apache.org/path?query=1#Ref", Row("spark.apache.org", "/path", "query=1", "Ref", "http", "/path?query=1", "userinfo@spark.apache.org", "userinfo", "1")) + + testUrl( + "https://use%20r:pas%20s@example.com/dir%20/pa%20th.HTML?query=x%20y&q2=2#Ref%20two", + Row("example.com", "/dir%20/pa%20th.HTML", "query=x%20y&q2=2", "Ref%20two", + "https", "/dir%20/pa%20th.HTML?query=x%20y&q2=2", "use%20r:pas%20s@example.com", + "use%20r:pas%20s", "x%20y")) + + testUrl( + "http://user:pass@host", + Row("host", "", null, null, "http", "", "user:pass@host", "user:pass", null)) + + testUrl( + "http://user:pass@host/", + Row("host", "/", null, null, "http", "/", "user:pass@host", "user:pass", null)) + + testUrl( + "http://user:pass@host/?#", + Row("host", "/", "", "", "http", "/?", "user:pass@host", "user:pass", null)) + + testUrl( + "http://user:pass@host/file;param?query;p2", + Row("host", "/file;param", "query;p2", null, "http", "/file;param?query;p2", + "user:pass@host", "user:pass", null)) + + testUrl( + "inva lid://user:pass@host/file;param?query;p2", + Row(null, null, null, null, null, null, null, null, null)) + } test("string repeat function") {