Skip to content

Commit

Permalink
[SPARK-41331][CONNECT][PYTHON] Add orderBy and drop_duplicates
Browse files Browse the repository at this point in the history
### What changes were proposed in this pull request?
Add `orderBy` (alias of `sort`) and `drop_duplicates` (alias of `dropDuplicates`) to the Spark Connect Python DataFrame API.

### Why are the changes needed?
To improve API parity between the Spark Connect DataFrame and the existing PySpark DataFrame.

### Does this PR introduce _any_ user-facing change?
Yes, this adds new APIs: `orderBy` and `drop_duplicates`.

### How was this patch tested?
Added test cases. Since both methods are only aliases (`orderBy` for `sort`, `drop_duplicates` for `dropDuplicates`), they are tested in plan-only mode.

Closes #38846 from zhengruifeng/connect_df_orderby_dropduplicate.

Authored-by: Ruifeng Zheng <ruifengz@apache.org>
Signed-off-by: Ruifeng Zheng <ruifengz@apache.org>
  • Loading branch information
zhengruifeng committed Nov 30, 2022
1 parent f65129d commit 92847d9
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 0 deletions.
4 changes: 4 additions & 0 deletions python/pyspark/sql/connect/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,8 @@ def dropDuplicates(self, subset: Optional[List[str]] = None) -> "DataFrame":
plan.Deduplicate(child=self._plan, column_names=subset), session=self._session
)

drop_duplicates = dropDuplicates

def distinct(self) -> "DataFrame":
"""Returns a new :class:`DataFrame` containing the distinct rows in this :class:`DataFrame`.
Expand Down Expand Up @@ -513,6 +515,8 @@ def sort(self, *cols: "ColumnOrName") -> "DataFrame":
plan.Sort(self._plan, columns=list(cols), is_global=True), session=self._session
)

orderBy = sort

def sortWithinPartitions(self, *cols: "ColumnOrName") -> "DataFrame":
"""Sort within each partition by a specific column"""
return DataFrame.withPlan(
Expand Down
14 changes: 14 additions & 0 deletions python/pyspark/sql/tests/connect/test_connect_plan_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,16 @@ def test_sort(self):
)
self.assertEqual(plan.root.sort.is_global, True)

plan = df.filter(df.col_name > 3).orderBy("col_a", "col_b")._plan.to_proto(self.connect)
self.assertEqual(
[
f.expression.unresolved_attribute.unparsed_identifier
for f in plan.root.sort.sort_fields
],
["col_a", "col_b"],
)
self.assertEqual(plan.root.sort.is_global, True)

plan = (
df.filter(df.col_name > 3)
.sortWithinPartitions("col_a", "col_b")
Expand Down Expand Up @@ -236,6 +246,10 @@ def test_deduplicate(self):
self.assertEqual(deduplicate_on_all_columns_plan.root.deduplicate.all_columns_as_keys, True)
self.assertEqual(len(deduplicate_on_all_columns_plan.root.deduplicate.column_names), 0)

deduplicate_on_all_columns_plan = df.drop_duplicates()._plan.to_proto(self.connect)
self.assertEqual(deduplicate_on_all_columns_plan.root.deduplicate.all_columns_as_keys, True)
self.assertEqual(len(deduplicate_on_all_columns_plan.root.deduplicate.column_names), 0)

deduplicate_on_subset_columns_plan = df.dropDuplicates(["name", "height"])._plan.to_proto(
self.connect
)
Expand Down

0 comments on commit 92847d9

Please sign in to comment.