Skip to content

Commit

Permalink
Add first/last ignoreNulls in python
Browse files Browse the repository at this point in the history
  • Loading branch information
hvanhovell committed Jan 31, 2016
1 parent f9cad44 commit b002d60
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 2 deletions.
16 changes: 14 additions & 2 deletions python/pyspark/sql/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,6 @@ def _():

'max': 'Aggregate function: returns the maximum value of the expression in a group.',
'min': 'Aggregate function: returns the minimum value of the expression in a group.',
'first': 'Aggregate function: returns the first value in a group.',
'last': 'Aggregate function: returns the last value in a group.',
'count': 'Aggregate function: returns the number of items in a group.',
'sum': 'Aggregate function: returns the sum of all values in the expression.',
'avg': 'Aggregate function: returns the average of the values in a group.',
Expand Down Expand Up @@ -277,6 +275,13 @@ def countDistinct(col, *cols):
jc = sc._jvm.functions.countDistinct(_to_java_column(col), _to_seq(sc, cols, _to_java_column))
return Column(jc)

@since(1.3)
def first(col, ignorenulls=False):
"""Aggregate function: returns the first value in a group.
"""
sc = SparkContext._active_spark_context
jc = sc._jvm.functions.first(_to_java_column(col), ignorenulls)
return Column(jc)

@since(1.6)
def input_file_name():
Expand Down Expand Up @@ -309,6 +314,13 @@ def isnull(col):
sc = SparkContext._active_spark_context
return Column(sc._jvm.functions.isnull(_to_java_column(col)))

@since(1.3)
def last(col, ignorenulls=False):
"""Aggregate function: returns the last value in a group.
"""
sc = SparkContext._active_spark_context
jc = sc._jvm.functions.last(_to_java_column(col), ignorenulls)
return Column(jc)

@since(1.6)
def monotonically_increasing_id():
Expand Down
10 changes: 10 additions & 0 deletions python/pyspark/sql/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -641,6 +641,16 @@ def test_aggregator(self):
self.assertTrue(95 < g.agg(functions.approxCountDistinct(df.key)).first()[0])
self.assertEqual(100, g.agg(functions.countDistinct(df.value)).first()[0])

def test_first_last_ignorenulls(self):
from pyspark.sql import functions
df = self.sqlCtx.range(0, 100)
df2 = df.select(functions.when(df.id % 3 == 0, None).otherwise(df.id).alias("id"))
df3 = df2.select(functions.first(df2.id, False).alias('a'),
functions.first(df2.id, True).alias('b'),
functions.last(df2.id, False).alias('c'),
functions.last(df2.id, True).alias('d'))
self.assertEqual([Row(a=None, b=1, c=None, d=98)], df3.collect())

def test_corr(self):
import math
df = self.sc.parallelize([Row(a=i, b=math.sqrt(i)) for i in range(10)]).toDF()
Expand Down

0 comments on commit b002d60

Please sign in to comment.