From 6cd98c1878a9c5c6475ed5974643021ab27862a7 Mon Sep 17 00:00:00 2001
From: 0x0FFF
Date: Wed, 2 Sep 2015 13:36:36 -0700
Subject: [PATCH] [SPARK-10417] [SQL] Iterating through Column results in
 infinite loop

`pyspark.sql.column.Column` object has `__getitem__` method, which makes it
iterable for Python. In fact it has `__getitem__` to address the case when
the column might be a list or dict, for you to be able to access certain
element of it in DF API. The ability to iterate over it is just a side
effect that might cause confusion for the people getting familiar with
Spark DF (as you might iterate this way on Pandas DF for instance)

Issue reproduction:
```
df = sqlContext.jsonRDD(sc.parallelize(['{"name": "El Magnifico"}']))
for i in df["name"]: print i
```

Author: 0x0FFF

Closes #8574 from 0x0FFF/SPARK-10417.
---
 python/pyspark/sql/column.py | 3 +++
 python/pyspark/sql/tests.py  | 9 +++++++++
 2 files changed, 12 insertions(+)

diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py
index 0948f9b27cd38..56e75e8caee88 100644
--- a/python/pyspark/sql/column.py
+++ b/python/pyspark/sql/column.py
@@ -226,6 +226,9 @@ def __getattr__(self, item):
             raise AttributeError(item)
         return self.getField(item)
 
+    def __iter__(self):
+        raise TypeError("Column is not iterable")
+
     # string methods
     rlike = _bin_op("rlike")
     like = _bin_op("like")
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index fc778631d93a3..eb449e8679fa0 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -1066,6 +1066,15 @@ def test_with_column_with_existing_name(self):
         keys = self.df.withColumn("key", self.df.key).select("key").collect()
         self.assertEqual([r.key for r in keys], list(range(100)))
 
+    # regression test for SPARK-10417
+    def test_column_iterator(self):
+
+        def foo():
+            for x in self.df.key:
+                break
+
+        self.assertRaises(TypeError, foo)
+
 
 class HiveContextSQLTests(ReusedPySparkTestCase):