apache · TaoZex · Jan 25, 2023 · Jan 25, 2023 · Jan 25, 2023 · Jan 26, 2023
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java
@@ -344,6 +344,7 @@ public final class FunctionRegistry {
 
     system.registerGenericUDF("encode", GenericUDFEncode.class);
     system.registerGenericUDF("decode", GenericUDFDecode.class);
+    system.registerGenericUDF("convertCharset", GenericUDFConvertCharset.class);
 
     system.registerGenericUDF("upper", GenericUDFUpper.class);
     system.registerGenericUDF("lower", GenericUDFLower.class);

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFConvertCharset.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFConvertCharset.java
@@ -0,0 +1,173 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.udf.generic;
+
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.CodingErrorAction;
+
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;
+
+/**
+ * Generic UDF {@code convertCharset(str, fromCharset, toCharset)}: encodes the string to bytes with
+ * {@code fromCharset} and decodes those bytes back into a string with {@code toCharset}. A NULL in
+ * any of the three arguments yields NULL.
+ *
+ * <p>NOTE(review): constant charset arguments use coders configured with
+ * {@link CodingErrorAction#REPORT}, whereas per-row charset names go through
+ * {@link Charset#encode(String)} / {@link Charset#decode(ByteBuffer)}, which substitute a replacement
+ * for bad input instead of failing. Confirm that this difference is intended.
+ */
+@Description(name = "convertCharset",
+    value = "_FUNC_(str, str, str) - Converts the first argument from the second argument character set to the third argument character set",
+    extended = "Possible options for the character set are 'US-ASCII', 'ISO-8859-1',\n"
+        + "'UTF-8', 'UTF-16BE', 'UTF-16LE', and 'UTF-16'. If either argument\n"
+        + "is null, the result will also be null")
+public class GenericUDFConvertCharset extends GenericUDF {
+  /** Reusable encoder; set only when the source charset argument is a non-null constant. */
+  private transient CharsetEncoder encoder = null;
+  /** Reusable decoder; set only when the target charset argument is a non-null constant. */
+  private transient CharsetDecoder decoder = null;
+  private transient PrimitiveObjectInspector stringOI = null;
+  private transient PrimitiveObjectInspector fromCharsetOI = null;
+  private transient PrimitiveObjectInspector toCharsetOI = null;
+
+  /**
+   * Checks that exactly three string-group arguments are supplied and, for constant charset
+   * arguments, builds the coder once up front.
+   *
+   * @param arguments object inspectors for the value, the source charset and the target charset
+   * @return the Java string object inspector describing the result
+   * @throws UDFArgumentException if the argument count or an argument type is wrong
+   */
+  @Override
+  public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
+    if (arguments.length != 3) {
+      throw new UDFArgumentLengthException("ConvertCharset() requires exactly three arguments");
+    }
+
+    checkInputArgument(arguments, 0);
+    stringOI = (PrimitiveObjectInspector) arguments[0];
+
+    checkInputArgument(arguments, 1);
+    fromCharsetOI = (PrimitiveObjectInspector) arguments[1];
+
+    checkInputArgument(arguments, 2);
+    toCharsetOI = (PrimitiveObjectInspector) arguments[2];
+
+    // A constant charset lets the coder be built once. A constant NULL leaves the coder unset, so
+    // evaluate() falls back to the per-row lookup and returns NULL.
+    String fromName = constantCharsetName(fromCharsetOI);
+    if (fromName != null) {
+      encoder = Charset.forName(fromName).newEncoder().onMalformedInput(CodingErrorAction.REPORT)
+          .onUnmappableCharacter(CodingErrorAction.REPORT);
+    }
+
+    String toName = constantCharsetName(toCharsetOI);
+    if (toName != null) {
+      decoder = Charset.forName(toName).newDecoder().onMalformedInput(CodingErrorAction.REPORT)
+          .onUnmappableCharacter(CodingErrorAction.REPORT);
+    }
+
+    return PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+  }
+
+  /**
+   * Converts one row.
+   *
+   * @param arguments deferred value, source charset name and target charset name
+   * @return the converted string, or null when any of the three arguments is null
+   * @throws HiveException if an argument cannot be read or a constant-charset coder rejects the input
+   */
+  @Override
+  public Object evaluate(DeferredObject[] arguments) throws HiveException {
+    String value = PrimitiveObjectInspectorUtils.getString(arguments[0].get(), stringOI);
+    if (value == null) {
+      return null;
+    }
+
+    // Resolve per-row charset names before converting: Charset.forName(null) throws
+    // IllegalArgumentException, whereas a NULL charset argument must give a NULL result.
+    String fromName = null;
+    if (encoder == null) {
+      fromName = PrimitiveObjectInspectorUtils.getString(arguments[1].get(), fromCharsetOI);
+      if (fromName == null) {
+        return null;
+      }
+    }
+    String toName = null;
+    if (decoder == null) {
+      toName = PrimitiveObjectInspectorUtils.getString(arguments[2].get(), toCharsetOI);
+      if (toName == null) {
+        return null;
+      }
+    }
+
+    try {
+      ByteBuffer encoded =
+          encoder != null ? encoder.encode(CharBuffer.wrap(value)) : Charset.forName(fromName).encode(value);
+      CharBuffer decoded = decoder != null ? decoder.decode(encoded) : Charset.forName(toName).decode(encoded);
+      return decoded.toString();
+    } catch (CharacterCodingException e) {
+      throw new HiveException(e);
+    }
+  }
+
+  @Override
+  public String getDisplayString(String[] children) {
+    assert (children.length == 3);
+    return getStandardDisplayString("convertCharset", children, ",");
+  }
+
+  /**
+   * Returns the charset name carried by a constant object inspector, or null when the inspector is
+   * not constant or its constant value is NULL.
+   */
+  private static String constantCharsetName(PrimitiveObjectInspector oi) {
+    if (!(oi instanceof ConstantObjectInspector)) {
+      return null;
+    }
+    // Read the constant through the same helper as per-row values so both paths agree on the name.
+    Object constant = ((ConstantObjectInspector) oi).getWritableConstantValue();
+    return PrimitiveObjectInspectorUtils.getString(constant, oi);
+  }
+
+  /** Rejects an argument that is not a primitive of the string group. */
+  private void checkInputArgument(ObjectInspector[] arguments, int index) throws UDFArgumentTypeException {
+    if (arguments[index].getCategory() != ObjectInspector.Category.PRIMITIVE
+        || PrimitiveObjectInspectorUtils.PrimitiveGrouping.STRING_GROUP
+        != PrimitiveObjectInspectorUtils.getPrimitiveGrouping(
+        ((PrimitiveObjectInspector) arguments[index]).getPrimitiveCategory())) {
+      throw new UDFArgumentTypeException(index, "The argument to ConvertCharset() must be a string/varchar");
+    }
+  }
+}
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFConvertCharset.java b/ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFConvertCharset.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.udf.generic;
+
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.junit.Test;
+
+import java.io.UnsupportedEncodingException;
+
+import static org.junit.Assert.assertEquals;
+
+/** Checks {@code convertCharset} against the JDK's own encode/decode for every charset pair. */
+public class TestGenericUDFConvertCharset {
+  @Test
+  public void testConvertCharset() throws UnsupportedEncodingException, HiveException {
+    String[] names = { "US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", "UTF-16LE", "UTF-16" };
+    for (int from = 0; from < names.length; from++) {
+      for (int to = 0; to < names.length; to++) {
+        verifyConvertCharset("A sample string", names[from], names[to]);
+      }
+    }
+  }
+
+  /** Runs the UDF with plain Java string inspectors and compares with getBytes/new String. */
+  public void verifyConvertCharset(String string, String fromCharsetName, String toCharsetName)
+      throws UnsupportedEncodingException, HiveException {
+    // Reference result: encode with the source charset, decode with the target charset.
+    String expected = new String(string.getBytes(fromCharsetName), toCharsetName);
+
+    ObjectInspector stringOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+    GenericUDFConvertCharset udf = new GenericUDFConvertCharset();
+    udf.initialize(new ObjectInspector[] { stringOI, stringOI, stringOI });
+
+    GenericUDF.DeferredObject[] row = {
+        new GenericUDF.DeferredJavaObject(string),
+        new GenericUDF.DeferredJavaObject(fromCharsetName),
+        new GenericUDF.DeferredJavaObject(toCharsetName) };
+    String actual = (String) udf.evaluate(row);
+
+    String message = "ConvertCharset failed from CharSet " + fromCharsetName + " to CharSet " + toCharsetName;
+    assertEquals(message, expected, actual);
+  }
+}
diff --git a/ql/src/test/queries/clientpositive/udf_convert_charset.q b/ql/src/test/queries/clientpositive/udf_convert_charset.q
@@ -0,0 +1,9 @@
+DESCRIBE FUNCTION convertCharset;
+DESC FUNCTION EXTENDED convertCharset;
+
+explain select convertCharset('TestConvertCharset1', 'UTF-8', 'US-ASCII');
+
+select
+convertCharset('TestConvertCharset1', 'UTF-8', 'US-ASCII'),
+convertCharset('TestConvertCharset2', cast('UTF-8' as varchar(10)), 'US-ASCII'),
+convertCharset('TestConvertCharset3', cast('UTF-8' as char(5)), 'US-ASCII');
diff --git a/ql/src/test/results/clientpositive/llap/show_functions.q.out b/ql/src/test/results/clientpositive/llap/show_functions.q.out
@@ -82,6 +82,7 @@ concat
 concat_ws
 context_ngrams
 conv
+convertcharset
 corr
 cos
 cosh
@@ -528,6 +529,7 @@ concat
 concat_ws
 context_ngrams
 conv
+convertcharset
 corr
 cos
 cosh
@@ -700,6 +702,7 @@ concat
 concat_ws
 context_ngrams
 conv
+convertcharset
 corr
 cos
 cosh

diff --git a/ql/src/test/results/clientpositive/llap/udf_convert_charset.q.out b/ql/src/test/results/clientpositive/llap/udf_convert_charset.q.out
@@ -0,0 +1,55 @@
+PREHOOK: query: DESCRIBE FUNCTION convertCharset
+PREHOOK: type: DESCFUNCTION
+POSTHOOK: query: DESCRIBE FUNCTION convertCharset
+POSTHOOK: type: DESCFUNCTION
+convertCharset(str, str, str) - Converts the first argument from the second argument character set to the third argument character set
+PREHOOK: query: DESC FUNCTION EXTENDED convertCharset
+PREHOOK: type: DESCFUNCTION
+POSTHOOK: query: DESC FUNCTION EXTENDED convertCharset
+POSTHOOK: type: DESCFUNCTION
+convertCharset(str, str, str) - Converts the first argument from the second argument character set to the third argument character set
+Synonyms: convertcharset
+Possible options for the character set are 'US-ASCII', 'ISO-8859-1',
+'UTF-8', 'UTF-16BE', 'UTF-16LE', and 'UTF-16'. If either argument
+is null, the result will also be null
+Function class:org.apache.hadoop.hive.ql.udf.generic.GenericUDFConvertCharset
+Function type:BUILTIN
+PREHOOK: query: explain select convertCharset('TestConvertCharset1', 'UTF-8', 'US-ASCII')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+#### A masked pattern was here ####
+POSTHOOK: query: explain select convertCharset('TestConvertCharset1', 'UTF-8', 'US-ASCII')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        TableScan
+          alias: _dummy_table
+          Row Limit Per Split: 1
+          Select Operator
+            expressions: 'TestConvertCharset1' (type: string)
+            outputColumnNames: _col0
+            ListSink
+
+PREHOOK: query: select
+convertCharset('TestConvertCharset1', 'UTF-8', 'US-ASCII'),
+convertCharset('TestConvertCharset2', cast('UTF-8' as varchar(10)), 'US-ASCII'),
+convertCharset('TestConvertCharset3', cast('UTF-8' as char(5)), 'US-ASCII')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+#### A masked pattern was here ####
+POSTHOOK: query: select
+convertCharset('TestConvertCharset1', 'UTF-8', 'US-ASCII'),
+convertCharset('TestConvertCharset2', cast('UTF-8' as varchar(10)), 'US-ASCII'),
+convertCharset('TestConvertCharset3', cast('UTF-8' as char(5)), 'US-ASCII')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+#### A masked pattern was here ####
+TestConvertCharset1	TestConvertCharset2	TestConvertCharset3