diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py
index c5ac0bb89..bafc21d55 100644
--- a/python/datafusion/dataframe.py
+++ b/python/datafusion/dataframe.py
@@ -223,6 +223,31 @@ def limit(self, count: int, offset: int = 0) -> DataFrame:
         """
         return DataFrame(self.df.limit(count, offset))
 
+    def head(self, n: int = 5) -> DataFrame:
+        """Return a new :py:class:`DataFrame` limited to the first ``n`` rows.
+
+        Args:
+            n: Number of rows to take from the head of the DataFrame.
+
+        Returns:
+            DataFrame containing at most the first ``n`` rows.
+        """
+        return DataFrame(self.df.limit(n, 0))
+
+    def tail(self, n: int = 5) -> DataFrame:
+        """Return a new :py:class:`DataFrame` limited to the last ``n`` rows.
+
+        Be aware that this can be expensive: the total row count must be known to
+        compute the offset, which is done by collecting the DataFrame.
+
+        Args:
+            n: Number of rows to take from the tail of the DataFrame.
+
+        Returns:
+            DataFrame containing the last ``n`` rows, or all rows if it has fewer.
+        """
+        return DataFrame(self.df.limit(n, max(0, self.count() - n)))
+
     def collect(self) -> list[pa.RecordBatch]:
         """Execute this :py:class:`DataFrame` and collect results into memory.
 
diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py
index e89c57159..2c376f905 100644
--- a/python/tests/test_dataframe.py
+++ b/python/tests/test_dataframe.py
@@ -190,6 +190,28 @@ def test_limit_with_offset(df):
     assert len(result.column(1)) == 1
 
 
+def test_head(df):
+    df = df.head(1)
+
+    # execute and collect the first (and only) batch
+    result = df.collect()[0]
+
+    assert result.column(0) == pa.array([1])
+    assert result.column(1) == pa.array([4])
+    assert result.column(2) == pa.array([8])
+
+
+def test_tail(df):
+    df = df.tail(1)
+
+    # execute and collect the first (and only) batch
+    result = df.collect()[0]
+
+    assert result.column(0) == pa.array([3])
+    assert result.column(1) == pa.array([6])
+    assert result.column(2) == pa.array([8])
+
+
 def test_with_column(df):
     df = df.with_column("c", column("a") + column("b"))
 