From 54b38250228ff08bf6c72b776e16056d7e1e0038 Mon Sep 17 00:00:00 2001 From: Ion Koutsouris <15728914+ion-elgreco@users.noreply.github.com> Date: Sun, 13 Oct 2024 13:01:50 +0200 Subject: [PATCH 1/2] feat: add head, tail methods --- python/datafusion/dataframe.py | 24 ++++++++++++++++++++++++ python/tests/test_dataframe.py | 22 ++++++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index c5ac0bb89..74a5aa40c 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -223,6 +223,30 @@ def limit(self, count: int, offset: int = 0) -> DataFrame: """ return DataFrame(self.df.limit(count, offset)) + def head(self, n: int) -> DataFrame: + """Return a new :py:class:`DataFrame` with a limited number of rows. + + Args: + n: Number of rows to take from the head of the DataFrame. + + Returns: + DataFrame after limiting. + """ + return DataFrame(self.df.limit(n, 0)) + + def tail(self, n: int) -> DataFrame: + """Return a new :py:class:`DataFrame` with a limited number of rows. + + Be aware this could be potentially expensive due to the size of the frame. + + Args: + n: Number of rows to take from the tail of the DataFrame. + + Returns: + DataFrame after limiting. + """ + return DataFrame(self.df.limit(n, max(0, self.count() - n))) + def collect(self) -> list[pa.RecordBatch]: """Execute this :py:class:`DataFrame` and collect results into memory. 
diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index e89c57159..2c376f905 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -190,6 +190,28 @@ def test_limit_with_offset(df): assert len(result.column(1)) == 1 +def test_head(df): + df = df.head(1) + + # execute and collect the first (and only) batch + result = df.collect()[0] + + assert result.column(0) == pa.array([1]) + assert result.column(1) == pa.array([4]) + assert result.column(2) == pa.array([8]) + + +def test_tail(df): + df = df.tail(1) + + # execute and collect the first (and only) batch + result = df.collect()[0] + + assert result.column(0) == pa.array([3]) + assert result.column(1) == pa.array([6]) + assert result.column(2) == pa.array([8]) + + def test_with_column(df): df = df.with_column("c", column("a") + column("b")) From 6c0cf5688fc80e7524b6a248dd58742f19101d07 Mon Sep 17 00:00:00 2001 From: Ion Koutsouris <15728914+ion-elgreco@users.noreply.github.com> Date: Sun, 13 Oct 2024 16:59:52 +0200 Subject: [PATCH 2/2] chore: add default head/tail --- python/datafusion/dataframe.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 74a5aa40c..bafc21d55 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -223,7 +223,7 @@ def limit(self, count: int, offset: int = 0) -> DataFrame: """ return DataFrame(self.df.limit(count, offset)) - def head(self, n: int) -> DataFrame: + def head(self, n: int = 5) -> DataFrame: """Return a new :py:class:`DataFrame` with a limited number of rows. Args: @@ -234,10 +234,11 @@ def head(self, n: int) -> DataFrame: """ return DataFrame(self.df.limit(n, 0)) - def tail(self, n: int) -> DataFrame: + def tail(self, n: int = 5) -> DataFrame: """Return a new :py:class:`DataFrame` with a limited number of rows. - Be aware this could be potentially expensive due to the size of the frame. 
+ Be aware this could be potentially expensive since the number of rows in the + dataframe needs to be determined first. This is done by collecting it. Args: n: Number of rows to take from the tail of the DataFrame.