From fdf3ddb28253d34f9c29f52fc9e26584363837b8 Mon Sep 17 00:00:00 2001
From: Michael Patterson
Date: Fri, 24 Feb 2017 15:39:06 -0800
Subject: [PATCH 1/8] Added documentation for like, rlike, startswith, and endswith

---
 python/pyspark/sql/column.py | 36 ++++++++++++++++++++++++++++++++----
 1 file changed, 32 insertions(+), 4 deletions(-)

diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py
index ec05c18d4f062..62e14bb21bfca 100644
--- a/python/pyspark/sql/column.py
+++ b/python/pyspark/sql/column.py
@@ -250,11 +250,39 @@ def __iter__(self):
         raise TypeError("Column is not iterable")
 
     # string methods
+    _rlike_doc = """ Return a Boolean :class:`Column` based on a regex match.\n
+    :param other: an extended regex expression\n
+
+    >>> df.filter( df.name.rlike('ice$') ).collect()
+    [Row(name=u'Alice', age=1)]
+    """
+    _like_doc = """ Return a Boolean :class:`Column` based on a SQL LIKE match.\n
+    :param other: a SQL LIKE pattern\n
+    See :func:`pyspark.sql.Column.rlike` for a regex version\n
+
+    >>> df.filter( df.name.like('Al%') ).collect()
+    [Row(name=u'Alice', age=1)]
+    """
+    _startswith_doc = ''' Return a Boolean :class:`Column` based on a string match.\n
+    :param other: string at start of line (do not use a regex `^`)\n
+    >>> df.filter(df.name.startswith('Al')).collect()
+    [Row(name=u'Alice', age=1)]
+    >>> df.filter(df.name.startswith('^Al')).collect()
+    []
+    '''
+    _endswith_doc = ''' Return a Boolean :class:`Column` based on matching end of string.\n
+    :param other: string at end of line (do not use a regex `$`)\n
+    >>> df.filter(df.name.endswith('ice')).collect()
+    [Row(name=u'Alice', age=1)]
+    >>> df.filter(df.name.endswith('ice$')).collect()
+    []
+    '''
+
     contains = _bin_op("contains")
-    rlike = _bin_op("rlike")
-    like = _bin_op("like")
-    startswith = _bin_op("startsWith")
-    endswith = _bin_op("endsWith")
+    rlike = _bin_op("rlike", _rlike_doc)
+    like = _bin_op("like", _like_doc)
+    startswith = _bin_op("startsWith", _startswith_doc)
+    endswith = _bin_op("endsWith", _endswith_doc)
 
     @ignore_unicode_prefix
     @since(1.3)
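A note on the mechanism: the two-argument calls above work because `_bin_op` accepts an optional `doc` string and attaches it to the method it generates. The helper, defined near the top of column.py, looks roughly like this (paraphrased here for context; it is not part of the diff):

    def _bin_op(name, doc="binary operator"):
        """Create an operator method that forwards to the named method
        on the underlying JVM Column."""
        def _(self, other):
            # Unwrap a Column into its Java counterpart; literals pass through.
            jc = other._jc if isinstance(other, Column) else other
            njc = getattr(self._jc, name)(jc)
            return Column(njc)
        _.__doc__ = doc
        return _

Because the documentation is supplied as plain data rather than written under a `def`, everything doctest and Sphinx see comes from these `_*_doc` constants, which is what the commits below keep refining.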
From c22c0d9e3bb46be5265a832079cc951cfc738f10 Mon Sep 17 00:00:00 2001
From: Michael Patterson
Date: Fri, 31 Mar 2017 11:02:10 -0700
Subject: [PATCH 2/8] Fixed formatting issues: indents and block quotes

---
 python/pyspark/sql/column.py | 46 +++++++++++++++++++-----------------
 1 file changed, 24 insertions(+), 22 deletions(-)

diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py
index 62e14bb21bfca..27c0a8f0a9f5f 100644
--- a/python/pyspark/sql/column.py
+++ b/python/pyspark/sql/column.py
@@ -251,32 +251,34 @@ def __iter__(self):
 
     # string methods
     _rlike_doc = """ Return a Boolean :class:`Column` based on a regex match.\n
-    :param other: an extended regex expression\n
+        :param other: an extended regex expression
 
-    >>> df.filter( df.name.rlike('ice$') ).collect()
-    [Row(name=u'Alice', age=1)]
-    """
+        >>> df.filter( df.name.rlike('ice$') ).collect()
+        [Row(name=u'Alice', age=1)]
+        """
     _like_doc = """ Return a Boolean :class:`Column` based on a SQL LIKE match.\n
-    :param other: a SQL LIKE pattern\n
-    See :func:`pyspark.sql.Column.rlike` for a regex version\n
+        :param other: a SQL LIKE pattern\n
+        See :func:`pyspark.sql.Column.rlike` for a regex version
 
-    >>> df.filter( df.name.like('Al%') ).collect()
-    [Row(name=u'Alice', age=1)]
+        >>> df.filter( df.name.like('Al%') ).collect()
+        [Row(name=u'Alice', age=1)]
     """
-    _startswith_doc = ''' Return a Boolean :class:`Column` based on a string match.\n
-    :param other: string at start of line (do not use a regex `^`)\n
-    >>> df.filter(df.name.startswith('Al')).collect()
-    [Row(name=u'Alice', age=1)]
-    >>> df.filter(df.name.startswith('^Al')).collect()
-    []
-    '''
-    _endswith_doc = ''' Return a Boolean :class:`Column` based on matching end of string.\n
-    :param other: string at end of line (do not use a regex `$`)\n
-    >>> df.filter(df.name.endswith('ice')).collect()
-    [Row(name=u'Alice', age=1)]
-    >>> df.filter(df.name.endswith('ice$')).collect()
-    []
-    '''
+    _startswith_doc = """ Return a Boolean :class:`Column` based on a string match.\n
+        :param other: string at start of line (do not use a regex `^`)
+
+        >>> df.filter(df.name.startswith('Al')).collect()
+        [Row(name=u'Alice', age=1)]
+        >>> df.filter(df.name.startswith('^Al')).collect()
+        []
+        """
+    _endswith_doc = """ Return a Boolean :class:`Column` based on matching end of string.\n
+        :param other: string at end of line (do not use a regex `$`)
+
+        >>> df.filter(df.name.endswith('ice')).collect()
+        [Row(name=u'Alice', age=1)]
+        >>> df.filter(df.name.endswith('ice$')).collect()
+        []
+        """
 
     contains = _bin_op("contains")
     rlike = _bin_op("rlike", _rlike_doc)
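Note that several lines still carry a trailing `\n` escape inherited from the first commit. Inside a regular (non-raw) triple-quoted string that is a literal newline, so each affected line is followed by an extra blank line in the attached docstring, which is what produced the stray block quotes in the rendered API docs; the remaining escapes are cleaned out in PATCH 4 and PATCH 5. A quick way to see exactly what a given revision attaches is to print the generated method's docstring from a pyspark shell:

    from pyspark.sql import Column

    # The generated methods carry the _*_doc constants verbatim, so any
    # leftover blank lines or escapes show up directly in the output.
    print(Column.rlike.__doc__)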
From aaa365cc5d2a8a49b922187b5f996b2473011087 Mon Sep 17 00:00:00 2001
From: Michael Patterson
Date: Fri, 31 Mar 2017 13:04:56 -0700
Subject: [PATCH 3/8] Added docs for isNull and isNotNull

---
 python/pyspark/sql/column.py | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py
index 27c0a8f0a9f5f..a3943760a97c0 100644
--- a/python/pyspark/sql/column.py
+++ b/python/pyspark/sql/column.py
@@ -258,7 +258,7 @@ def __iter__(self):
         """
     _like_doc = """ Return a Boolean :class:`Column` based on a SQL LIKE match.\n
         :param other: a SQL LIKE pattern\n
-        See :func:`pyspark.sql.Column.rlike` for a regex version
+        See :func:`rlike` for a regex version
 
         >>> df.filter( df.name.like('Al%') ).collect()
         [Row(name=u'Alice', age=1)]
@@ -333,8 +333,25 @@ def isin(self, *cols):
     desc = _unary_op("desc", "Returns a sort expression based on the"
                      " descending order of the given column name.")
 
-    isNull = _unary_op("isNull", "True if the current expression is null.")
-    isNotNull = _unary_op("isNotNull", "True if the current expression is not null.")
+    _isNull_doc = ''' True if the current expression is null. Often combined with
+        :func:`DataFrame.filter` to select rows with null values.
+
+        >>> df2.collect()
+        [Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)]
+        >>> df2.filter( df2.height.isNull ).collect()
+        [Row(name=u'Alice', height=None)]
+        '''
+    _isNotNull_doc = ''' True if the current expression is not null. Often combined with
+        :func:`DataFrame.filter` to select rows with non-null values.
+
+        >>> df2.collect()
+        [Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)]
+        >>> df2.filter( df2.height.isNotNull ).collect()
+        [Row(name=u'Tom', height=80)]
+        '''
+
+    isNull = _unary_op("isNull", _isNull_doc )
+    isNotNull = _unary_op("isNotNull", _isNotNull_doc)
 
     @since(1.3)
     def alias(self, *alias, **kwargs):
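`_unary_op` is the zero-argument counterpart of `_bin_op`, again taking the docstring as data; roughly (paraphrased, not part of the diff):

    def _unary_op(name, doc="unary operator"):
        """Create a method that calls the named no-argument JVM Column method."""
        def _(self):
            jc = getattr(self._jc, name)()
            return Column(jc)
        _.__doc__ = doc
        return _

One consequence worth flagging: the result is an ordinary method, so the examples above that write `df2.height.isNull` without parentheses evaluate to the bound method rather than a Column, and `filter` would reject them. PATCH 7 adds the missing calls.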
From 83dc3d5a5e95920f83a57c1375776545a6291071 Mon Sep 17 00:00:00 2001
From: Michael Patterson
Date: Mon, 3 Apr 2017 10:35:03 -0700
Subject: [PATCH 4/8] Tuned whitespace

---
 python/pyspark/sql/column.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py
index a3943760a97c0..af14cec1d602f 100644
--- a/python/pyspark/sql/column.py
+++ b/python/pyspark/sql/column.py
@@ -250,17 +250,17 @@ def __iter__(self):
         raise TypeError("Column is not iterable")
 
     # string methods
-    _rlike_doc = """ Return a Boolean :class:`Column` based on a regex match.\n
+    _rlike_doc = """ Return a Boolean :class:`Column` based on a regex match.
         :param other: an extended regex expression
 
-        >>> df.filter( df.name.rlike('ice$') ).collect()
+        >>> df.filter(df.name.rlike('ice$')).collect()
         [Row(name=u'Alice', age=1)]
         """
-    _like_doc = """ Return a Boolean :class:`Column` based on a SQL LIKE match.\n
+    _like_doc = """ Return a Boolean :class:`Column` based on a SQL LIKE match.
         :param other: a SQL LIKE pattern\n
         See :func:`rlike` for a regex version
 
-        >>> df.filter( df.name.like('Al%') ).collect()
+        >>> df.filter(df.name.like('Al%')).collect()
         [Row(name=u'Alice', age=1)]
     """
     _startswith_doc = """ Return a Boolean :class:`Column` based on a string match.\n
@@ -333,24 +333,24 @@ def isin(self, *cols):
     desc = _unary_op("desc", "Returns a sort expression based on the"
                      " descending order of the given column name.")
 
-    _isNull_doc = ''' True if the current expression is null. Often combined with
+    _isNull_doc = """ True if the current expression is null. Often combined with
         :func:`DataFrame.filter` to select rows with null values.
 
         >>> df2.collect()
        [Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)]
-        >>> df2.filter( df2.height.isNull ).collect()
+        >>> df2.filter(df2.height.isNull).collect()
         [Row(name=u'Alice', height=None)]
-        '''
+        """
-    _isNotNull_doc = ''' True if the current expression is not null. Often combined with
+    _isNotNull_doc = """ True if the current expression is not null. Often combined with
         :func:`DataFrame.filter` to select rows with non-null values.
 
         >>> df2.collect()
         [Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)]
-        >>> df2.filter( df2.height.isNotNull ).collect()
+        >>> df2.filter(df2.height.isNotNull).collect()
         [Row(name=u'Tom', height=80)]
-        '''
+        """
 
-    isNull = _unary_op("isNull", _isNull_doc )
+    isNull = _unary_op("isNull", _isNull_doc)
     isNotNull = _unary_op("isNotNull", _isNotNull_doc)
From 5afc442e33c7195fb0918773103cb939da8b1aee Mon Sep 17 00:00:00 2001
From: Michael Patterson
Date: Wed, 5 Apr 2017 11:05:56 -0700
Subject: [PATCH 5/8] Fixed formatting issues: parentheses, block quotes, and newline

---
 python/pyspark/sql/column.py | 101 +++++++++++++++++++----------------
 1 file changed, 56 insertions(+), 45 deletions(-)

diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py
index af14cec1d602f..7859c74805323 100644
--- a/python/pyspark/sql/column.py
+++ b/python/pyspark/sql/column.py
@@ -250,35 +250,44 @@ def __iter__(self):
         raise TypeError("Column is not iterable")
 
     # string methods
-    _rlike_doc = """ Return a Boolean :class:`Column` based on a regex match.
-        :param other: an extended regex expression
-
-        >>> df.filter(df.name.rlike('ice$')).collect()
-        [Row(name=u'Alice', age=1)]
-        """
-    _like_doc = """ Return a Boolean :class:`Column` based on a SQL LIKE match.
-        :param other: a SQL LIKE pattern\n
-        See :func:`rlike` for a regex version
-
-        >>> df.filter(df.name.like('Al%')).collect()
-        [Row(name=u'Alice', age=1)]
-    """
-    _startswith_doc = """ Return a Boolean :class:`Column` based on a string match.\n
-        :param other: string at start of line (do not use a regex `^`)
-
-        >>> df.filter(df.name.startswith('Al')).collect()
-        [Row(name=u'Alice', age=1)]
-        >>> df.filter(df.name.startswith('^Al')).collect()
-        []
-        """
-    _endswith_doc = """ Return a Boolean :class:`Column` based on matching end of string.\n
-        :param other: string at end of line (do not use a regex `$`)
-
-        >>> df.filter(df.name.endswith('ice')).collect()
-        [Row(name=u'Alice', age=1)]
-        >>> df.filter(df.name.endswith('ice$')).collect()
-        []
-        """
+    _rlike_doc = """
+        Return a Boolean :class:`Column` based on a regex match.
+
+        :param other: an extended regex expression
+
+        >>> df.filter(df.name.rlike('ice$')).collect()
+        [Row(name=u'Alice', age=1)]
+        """
+    _like_doc = """
+        Return a Boolean :class:`Column` based on a SQL LIKE match.
+
+        :param other: a SQL LIKE pattern
+
+        See :func:`rlike` for a regex version
+
+        >>> df.filter(df.name.like('Al%')).collect()
+        [Row(name=u'Alice', age=1)]
+        """
+    _startswith_doc = """
+        Return a Boolean :class:`Column` based on a string match.
+
+        :param other: string at start of line (do not use a regex `^`)
+
+        >>> df.filter(df.name.startswith('Al')).collect()
+        [Row(name=u'Alice', age=1)]
+        >>> df.filter(df.name.startswith('^Al')).collect()
+        []
+        """
+    _endswith_doc = """
+        Return a Boolean :class:`Column` based on matching end of string.
+
+        :param other: string at end of line (do not use a regex `$`)
+
+        >>> df.filter(df.name.endswith('ice')).collect()
+        [Row(name=u'Alice', age=1)]
+        >>> df.filter(df.name.endswith('ice$')).collect()
+        []
+        """
 
     contains = _bin_op("contains")
     rlike = _bin_op("rlike", _rlike_doc)
@@ -333,24 +342,26 @@ def isin(self, *cols):
     desc = _unary_op("desc", "Returns a sort expression based on the"
                      " descending order of the given column name.")
 
-    _isNull_doc = """ True if the current expression is null. Often combined with
-        :func:`DataFrame.filter` to select rows with null values.
-
-        >>> df2.collect()
-        [Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)]
-        >>> df2.filter(df2.height.isNull).collect()
-        [Row(name=u'Alice', height=None)]
-        """
-    _isNotNull_doc = """ True if the current expression is not null. Often combined with
-        :func:`DataFrame.filter` to select rows with non-null values.
-
-        >>> df2.collect()
-        [Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)]
-        >>> df2.filter(df2.height.isNotNull).collect()
-        [Row(name=u'Tom', height=80)]
-        """
+    _isNull_doc = """
+        True if the current expression is null. Often combined with
+        :func:`DataFrame.filter` to select rows with null values.
+
+        >>> df2.collect()
+        [Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)]
+        >>> df2.filter(df2.height.isNull).collect()
+        [Row(name=u'Alice', height=None)]
+        """
+    _isNotNull_doc = """
+        True if the current expression is not null. Often combined with
+        :func:`DataFrame.filter` to select rows with non-null values.
+
+        >>> df2.collect()
+        [Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)]
+        >>> df2.filter(df2.height.isNotNull).collect()
+        [Row(name=u'Tom', height=80)]
+        """
 
     isNull = _unary_op("isNull", _isNull_doc)
     isNotNull = _unary_op("isNotNull", _isNotNull_doc)
 
     @since(1.3)
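The next commit reshapes the expected outputs because doctest compares them as exact strings against the globals set up at the bottom of column.py: the shared `df` there has columns age and name with Alice's age equal to 2, and `Row` objects constructed from keyword arguments sort their fields alphabetically, so reprs print age before name. A rough sketch of that fixture, inferred from the expected outputs rather than copied from `_test()` (which builds it with an explicit schema):

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.master("local[2]").appName("column-doctests").getOrCreate()
    sc = spark.sparkContext

    # Mirrors the doctest expectations: age sorts before name, Alice is 2.
    df = sc.parallelize([(2, 'Alice'), (5, 'Bob')]).toDF(['age', 'name'])

    df.filter(df.name.rlike('ice$')).collect()
    # [Row(age=2, name=u'Alice')] on Python 2; no u prefix on Python 3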
From 0e3007dd372f0d3f2d249ff503f1a35a60098545 Mon Sep 17 00:00:00 2001
From: Michael Patterson
Date: Tue, 11 Apr 2017 11:20:48 -0700
Subject: [PATCH 6/8] Fixed documentation to match unit tests

---
 python/pyspark/sql/column.py | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py
index 7859c74805323..6b100c0dd48c5 100644
--- a/python/pyspark/sql/column.py
+++ b/python/pyspark/sql/column.py
@@ -256,7 +256,7 @@ def __iter__(self):
         :param other: an extended regex expression
 
         >>> df.filter(df.name.rlike('ice$')).collect()
-        [Row(name=u'Alice', age=1)]
+        [Row(age=2, name=u'Alice')]
         """
     _like_doc = """
         Return a Boolean :class:`Column` based on a SQL LIKE match.
@@ -266,7 +266,7 @@ def __iter__(self):
         See :func:`rlike` for a regex version
 
        >>> df.filter(df.name.like('Al%')).collect()
-        [Row(name=u'Alice', age=1)]
+        [Row(age=2, name=u'Alice')]
         """
     _startswith_doc = """
         Return a Boolean :class:`Column` based on a string match.
@@ -274,7 +274,7 @@ def __iter__(self):
         :param other: string at start of line (do not use a regex `^`)
 
         >>> df.filter(df.name.startswith('Al')).collect()
-        [Row(name=u'Alice', age=1)]
+        [Row(age=2, name=u'Alice')]
         >>> df.filter(df.name.startswith('^Al')).collect()
         []
         """
@@ -284,7 +284,7 @@ def __iter__(self):
         :param other: string at end of line (do not use a regex `$`)
 
         >>> df.filter(df.name.endswith('ice')).collect()
-        [Row(name=u'Alice', age=1)]
+        [Row(age=2, name=u'Alice')]
         >>> df.filter(df.name.endswith('ice$')).collect()
         []
         """
@@ -346,19 +346,17 @@ def isin(self, *cols):
         True if the current expression is null. Often combined with
         :func:`DataFrame.filter` to select rows with null values.
 
-        >>> df2.collect()
-        [Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)]
+        >>> df2 = sc.parallelize([Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)]).toDF()
         >>> df2.filter(df2.height.isNull).collect()
-        [Row(name=u'Alice', height=None)]
+        [Row(height=None, name=u'Alice')]
         """
     _isNotNull_doc = """
         True if the current expression is not null. Often combined with
         :func:`DataFrame.filter` to select rows with non-null values.
 
-        >>> df2.collect()
-        [Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)]
+        >>> df2 = sc.parallelize([Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)]).toDF()
         >>> df2.filter(df2.height.isNotNull).collect()
-        [Row(name=u'Tom', height=80)]
+        [Row(height=80, name=u'Tom')]
         """

From e785dbc9bd32a45d0e415f58ee8446cd03276b10 Mon Sep 17 00:00:00 2001
From: Michael Patterson
Date: Thu, 13 Apr 2017 16:51:09 -0700
Subject: [PATCH 7/8] Added imports and fixed functions

---
 python/pyspark/sql/column.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py
index 6b100c0dd48c5..e7c7aed796e50 100644
--- a/python/pyspark/sql/column.py
+++ b/python/pyspark/sql/column.py
@@ -346,16 +346,18 @@ def isin(self, *cols):
         True if the current expression is null. Often combined with
         :func:`DataFrame.filter` to select rows with null values.
 
+        >>> from pyspark.sql import Row
         >>> df2 = sc.parallelize([Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)]).toDF()
-        >>> df2.filter(df2.height.isNull).collect()
+        >>> df2.filter(df2.height.isNull()).collect()
         [Row(height=None, name=u'Alice')]
         """
     _isNotNull_doc = """
         True if the current expression is not null. Often combined with
         :func:`DataFrame.filter` to select rows with non-null values.
 
+        >>> from pyspark.sql import Row
        >>> df2 = sc.parallelize([Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)]).toDF()
-        >>> df2.filter(df2.height.isNotNull).collect()
+        >>> df2.filter(df2.height.isNotNull()).collect()
         [Row(height=80, name=u'Tom')]
         """

From b52765f5ef156862bd3cc4793a0d3fbd4d334449 Mon Sep 17 00:00:00 2001
From: Michael Patterson
Date: Thu, 20 Apr 2017 13:19:20 -0700
Subject: [PATCH 8/8] Added ignore_unicode_prefix to avoid string error

---
 python/pyspark/sql/column.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py
index e7c7aed796e50..46c1707cb6c37 100644
--- a/python/pyspark/sql/column.py
+++ b/python/pyspark/sql/column.py
@@ -290,10 +290,10 @@ def __iter__(self):
         """
 
     contains = _bin_op("contains")
-    rlike = _bin_op("rlike", _rlike_doc)
-    like = _bin_op("like", _like_doc)
-    startswith = _bin_op("startsWith", _startswith_doc)
-    endswith = _bin_op("endsWith", _endswith_doc)
+    rlike = ignore_unicode_prefix(_bin_op("rlike", _rlike_doc))
+    like = ignore_unicode_prefix(_bin_op("like", _like_doc))
+    startswith = ignore_unicode_prefix(_bin_op("startsWith", _startswith_doc))
+    endswith = ignore_unicode_prefix(_bin_op("endsWith", _endswith_doc))
 
     @ignore_unicode_prefix
     @since(1.3)
@@ -361,8 +361,8 @@ def isin(self, *cols):
         [Row(height=80, name=u'Tom')]
         """
 
-    isNull = _unary_op("isNull", _isNull_doc)
-    isNotNull = _unary_op("isNotNull", _isNotNull_doc)
+    isNull = ignore_unicode_prefix(_unary_op("isNull", _isNull_doc))
+    isNotNull = ignore_unicode_prefix(_unary_op("isNotNull", _isNotNull_doc))
 
     @since(1.3)
     def alias(self, *alias, **kwargs):
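For reference, the decorator applied in the final commit is defined in pyspark/rdd.py and rewrites a function's docstring at import time so that expected outputs written with `u'...'` literals still match under Python 3; roughly (paraphrased):

    import re
    import sys

    def ignore_unicode_prefix(f):
        """Strip u'' prefixes from a function's docstring on Python 3."""
        if sys.version >= '3':
            # Python 3 reprs have no u'' prefix, so drop it from the expected
            # doctest output; on Python 2 the docstring is left untouched.
            literal_re = re.compile(r"(\W|^)[uU](['])", re.UNICODE)
            f.__doc__ = literal_re.sub(r'\1\2', f.__doc__)
        return f

Because `rlike`, `like`, `startswith`, `endswith`, `isNull`, and `isNotNull` are built by `_bin_op`/`_unary_op` rather than `def` statements, the decorator has to be applied as a plain call around each assignment, which is what this commit does. column.py runs its own doctests through the `_test()` guard at the bottom of the file, so executing the module directly (with a pyspark environment available) exercises all of the new examples.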