From 50d3ee44ff19f5b5a867dd4c075009a7e1be1a38 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Fri, 13 May 2016 02:09:30 -0700 Subject: [PATCH] [SPARK-15244][PYSPARK] Type of column name created with createDataFrame is not consistent. --- python/pyspark/sql/session.py | 2 ++ python/pyspark/sql/tests.py | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py index ae314359d512a..0781b442cbcd8 100644 --- a/python/pyspark/sql/session.py +++ b/python/pyspark/sql/session.py @@ -465,6 +465,8 @@ def prepare(obj): return (obj, ) schema = StructType().add("value", datatype) else: + if isinstance(schema, list): + schema = [x.encode('utf-8') if not isinstance(x, str) else x for x in schema] prepare = lambda obj: obj if isinstance(data, RDD): diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 0c73f58c3b246..0977c43a398da 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -228,6 +228,13 @@ def test_duplicated_column_names(self): self.assertRaises(AnalysisException, lambda: df.select(df.c).first()) self.assertRaises(AnalysisException, lambda: df.select(df["c"]).first()) + def test_column_name_encoding(self): + """Ensure that created columns have `str` type consistently.""" + columns = self.spark.createDataFrame([('Alice', 1)], ['name', u'age']).columns + self.assertEqual(columns, ['name', 'age']) + self.assertTrue(isinstance(columns[0], str)) + self.assertTrue(isinstance(columns[1], str)) + def test_explode(self): from pyspark.sql.functions import explode d = [Row(a=1, intlist=[1, 2, 3], mapfield={"a": "b"})]