[SPARK-41721][CONNECT][TESTS] Enable doctests in pyspark.sql.connect.catalog #39224

Closed
wants to merge 3 commits
dev/sparktestsupport/modules.py (1 addition, 1 deletion)
@@ -504,7 +504,7 @@ def __hash__(self):
source_file_regexes=["python/pyspark/sql/connect"],
python_test_goals=[
# doctests
# No doctests yet.
"pyspark.sql.connect.catalog",
# unittests
"pyspark.sql.tests.connect.test_connect_column_expressions",
"pyspark.sql.tests.connect.test_connect_plan_only",
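For context: modules listed under python_test_goals are what Spark's Python test runner picks up, so once pyspark.sql.connect.catalog is registered above, the new doctests should be runnable locally along these lines (a sketch, assuming the standard python/run-tests script and its --testnames flag):

    # Run the catalog doctests for the Spark Connect client (assumed invocation).
    python/run-tests --testnames pyspark.sql.connect.catalog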
python/pyspark/sql/catalog.py (55 additions, 50 deletions)
@@ -246,6 +246,8 @@ def getDatabase(self, dbName: str) -> Database:
locationUri=jdb.locationUri(),
)

# TODO(SPARK-41725): we don't have to `collect` for every `sql`, but
# Spark Connect requires it. We should remove these `collect` calls later.
def databaseExists(self, dbName: str) -> bool:
"""Check if the database with the specified name exists.

@@ -273,15 +275,15 @@ def databaseExists(self, dbName: str) -> bool:

>>> spark.catalog.databaseExists("test_new_database")
False
>>> _ = spark.sql("CREATE DATABASE test_new_database")
>>> _ = spark.sql("CREATE DATABASE test_new_database").collect()
Member Author commented:
Hmmm ... this is an orthogonal issue, but just to make sure we don't forget: I think we should make the `sql` method in Spark Connect analyze and execute such commands eagerly on every call. Otherwise, we can't make it compatible with the existing PySpark usage.

I changed the existing doctests here for now to make the tests pass, but ideally this should be changed back.
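A minimal sketch of the behavior difference described above (an editorial illustration, not part of this comment or the diff; assumes a SparkSession named spark on each API):

    # Classic PySpark: command statements such as CREATE/DROP DATABASE execute
    # eagerly inside spark.sql(), so a doctest can simply discard the result.
    _ = spark.sql("CREATE DATABASE test_new_database")

    # Spark Connect: spark.sql() only builds a plan on the client, and the server
    # runs nothing until an action is triggered, hence the explicit collect()
    # added throughout these doctests.
    _ = spark.sql("CREATE DATABASE test_new_database").collect()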

Contributor commented:

What about creating a JIRA ticket for this ("should change this back ideally")?

Member Author commented:

👌

>>> spark.catalog.databaseExists("test_new_database")
True

Using the fully qualified name with the catalog name.

>>> spark.catalog.databaseExists("spark_catalog.test_new_database")
True
>>> _ = spark.sql("DROP DATABASE test_new_database")
>>> _ = spark.sql("DROP DATABASE test_new_database").collect()
"""
return self._jcatalog.databaseExists(dbName)

@@ -370,8 +372,8 @@ def getTable(self, tableName: str) -> Table:

Examples
--------
>>> _ = spark.sql("DROP TABLE IF EXISTS tbl1")
>>> _ = spark.sql("CREATE TABLE tbl1 (name STRING, age INT) USING parquet")
>>> _ = spark.sql("DROP TABLE IF EXISTS tbl1").collect()
>>> _ = spark.sql("CREATE TABLE tbl1 (name STRING, age INT) USING parquet").collect()
>>> spark.catalog.getTable("tbl1")
Table(name='tbl1', catalog='spark_catalog', namespace=['default'], ...

@@ -381,14 +383,14 @@
Table(name='tbl1', catalog='spark_catalog', namespace=['default'], ...
>>> spark.catalog.getTable("spark_catalog.default.tbl1")
Table(name='tbl1', catalog='spark_catalog', namespace=['default'], ...
>>> _ = spark.sql("DROP TABLE tbl1")
>>> _ = spark.sql("DROP TABLE tbl1").collect()

Throw an analysis exception when the table does not exist.

>>> spark.catalog.getTable("tbl1")
>>> spark.catalog.getTable("tbl1") # doctest: +SKIP
Traceback (most recent call last):
...
pyspark.sql.utils.AnalysisException: ...
AnalysisException: ...
"""
jtable = self._jcatalog.getTable(tableName)
jnamespace = jtable.namespace()
@@ -532,7 +534,8 @@ def getFunction(self, functionName: str) -> Function:

Examples
--------
>>> func = spark.sql("CREATE FUNCTION my_func1 AS 'test.org.apache.spark.sql.MyDoubleAvg'")
>>> _ = spark.sql(
... "CREATE FUNCTION my_func1 AS 'test.org.apache.spark.sql.MyDoubleAvg'").collect()
>>> spark.catalog.getFunction("my_func1")
Function(name='my_func1', catalog='spark_catalog', namespace=['default'], ...

@@ -545,10 +548,10 @@

Throw an analysis exception when the function does not exist.

>>> spark.catalog.getFunction("my_func2")
>>> spark.catalog.getFunction("my_func2") # doctest: +SKIP
Traceback (most recent call last):
...
pyspark.sql.utils.AnalysisException: ...
AnalysisException: ...
"""
jfunction = self._jcatalog.getFunction(functionName)
jnamespace = jfunction.namespace()
@@ -599,11 +602,11 @@ def listColumns(self, tableName: str, dbName: Optional[str] = None) -> List[Colu

Examples
--------
>>> _ = spark.sql("DROP TABLE IF EXISTS tbl1")
>>> _ = spark.sql("CREATE TABLE tblA (name STRING, age INT) USING parquet")
>>> _ = spark.sql("DROP TABLE IF EXISTS tbl1").collect()
>>> _ = spark.sql("CREATE TABLE tblA (name STRING, age INT) USING parquet").collect()
>>> spark.catalog.listColumns("tblA")
[Column(name='name', description=None, dataType='string', nullable=True, ...
>>> _ = spark.sql("DROP TABLE tblA")
>>> _ = spark.sql("DROP TABLE tblA").collect()
"""
if dbName is None:
iter = self._jcatalog.listColumns(tableName).toLocalIterator()
@@ -664,8 +667,8 @@ def tableExists(self, tableName: str, dbName: Optional[str] = None) -> bool:

>>> spark.catalog.tableExists("unexisting_table")
False
>>> _ = spark.sql("DROP TABLE IF EXISTS tbl1")
>>> _ = spark.sql("CREATE TABLE tbl1 (name STRING, age INT) USING parquet")
>>> _ = spark.sql("DROP TABLE IF EXISTS tbl1").collect()
>>> _ = spark.sql("CREATE TABLE tbl1 (name STRING, age INT) USING parquet").collect()
>>> spark.catalog.tableExists("tbl1")
True

@@ -677,13 +680,13 @@
True
>>> spark.catalog.tableExists("tbl1", "default")
True
>>> _ = spark.sql("DROP TABLE tbl1")
>>> _ = spark.sql("DROP TABLE tbl1").collect()

Check if views exist:

>>> spark.catalog.tableExists("view1")
False
>>> _ = spark.sql("CREATE VIEW view1 AS SELECT 1")
>>> _ = spark.sql("CREATE VIEW view1 AS SELECT 1").collect()
>>> spark.catalog.tableExists("view1")
True

@@ -695,14 +698,14 @@
True
>>> spark.catalog.tableExists("view1", "default")
True
>>> _ = spark.sql("DROP VIEW view1")
>>> _ = spark.sql("DROP VIEW view1").collect()

Check if temporary views exist:

>>> _ = spark.sql("CREATE TEMPORARY VIEW view1 AS SELECT 1")
>>> _ = spark.sql("CREATE TEMPORARY VIEW view1 AS SELECT 1").collect()
>>> spark.catalog.tableExists("view1")
True
>>> df = spark.sql("DROP VIEW view1")
>>> df = spark.sql("DROP VIEW view1").collect()
>>> spark.catalog.tableExists("view1")
False
"""
@@ -803,15 +806,15 @@ def createTable(
Creating a managed table.

>>> _ = spark.catalog.createTable("tbl1", schema=spark.range(1).schema, source='parquet')
>>> _ = spark.sql("DROP TABLE tbl1")
>>> _ = spark.sql("DROP TABLE tbl1").collect()

Creating an external table.

>>> import tempfile
>>> with tempfile.TemporaryDirectory() as d:
... _ = spark.catalog.createTable(
... "tbl2", schema=spark.range(1).schema, path=d, source='parquet')
>>> _ = spark.sql("DROP TABLE tbl2")
>>> _ = spark.sql("DROP TABLE tbl2").collect()
"""
if path is not None:
options["path"] = path
@@ -864,7 +867,7 @@ def dropTempView(self, viewName: str) -> bool:

Throw an exception if the temporary view does not exist.

>>> spark.table("my_table")
>>> spark.table("my_table") # doctest: +SKIP
Traceback (most recent call last):
...
AnalysisException: ...
@@ -904,7 +907,7 @@ def dropGlobalTempView(self, viewName: str) -> bool:

Throw an exception if the global view does not exist.

>>> spark.table("global_temp.my_table")
>>> spark.table("global_temp.my_table") # doctest: +SKIP
Traceback (most recent call last):
...
AnalysisException: ...
@@ -945,8 +948,8 @@ def isCached(self, tableName: str) -> bool:

Examples
--------
>>> _ = spark.sql("DROP TABLE IF EXISTS tbl1")
>>> _ = spark.sql("CREATE TABLE tbl1 (name STRING, age INT) USING parquet")
>>> _ = spark.sql("DROP TABLE IF EXISTS tbl1").collect()
>>> _ = spark.sql("CREATE TABLE tbl1 (name STRING, age INT) USING parquet").collect()
>>> spark.catalog.cacheTable("tbl1")
>>> spark.catalog.isCached("tbl1")
True
@@ -956,14 +959,14 @@ def isCached(self, tableName: str) -> bool:
>>> spark.catalog.isCached("not_existing_table")
Traceback (most recent call last):
...
pyspark.sql.utils.AnalysisException: ...
AnalysisException: ...

Using the fully qualified name for the table.

>>> spark.catalog.isCached("spark_catalog.default.tbl1")
True
>>> spark.catalog.uncacheTable("tbl1")
>>> _ = spark.sql("DROP TABLE tbl1")
>>> _ = spark.sql("DROP TABLE tbl1").collect()
"""
return self._jcatalog.isCached(tableName)

@@ -982,22 +985,22 @@ def cacheTable(self, tableName: str) -> None:

Examples
--------
>>> _ = spark.sql("DROP TABLE IF EXISTS tbl1")
>>> _ = spark.sql("CREATE TABLE tbl1 (name STRING, age INT) USING parquet")
>>> _ = spark.sql("DROP TABLE IF EXISTS tbl1").collect()
>>> _ = spark.sql("CREATE TABLE tbl1 (name STRING, age INT) USING parquet").collect()
>>> spark.catalog.cacheTable("tbl1")

Throw an analysis exception when the table does not exist.

>>> spark.catalog.cacheTable("not_existing_table")
Traceback (most recent call last):
...
pyspark.sql.utils.AnalysisException: ...
AnalysisException: ...

Using the fully qualified name for the table.

>>> spark.catalog.cacheTable("spark_catalog.default.tbl1")
>>> spark.catalog.uncacheTable("tbl1")
>>> _ = spark.sql("DROP TABLE tbl1")
>>> _ = spark.sql("DROP TABLE tbl1").collect()
"""
self._jcatalog.cacheTable(tableName)

@@ -1016,8 +1019,8 @@ def uncacheTable(self, tableName: str) -> None:

Examples
--------
>>> _ = spark.sql("DROP TABLE IF EXISTS tbl1")
>>> _ = spark.sql("CREATE TABLE tbl1 (name STRING, age INT) USING parquet")
>>> _ = spark.sql("DROP TABLE IF EXISTS tbl1").collect()
>>> _ = spark.sql("CREATE TABLE tbl1 (name STRING, age INT) USING parquet").collect()
>>> spark.catalog.cacheTable("tbl1")
>>> spark.catalog.uncacheTable("tbl1")
>>> spark.catalog.isCached("tbl1")
@@ -1028,14 +1031,14 @@ def uncacheTable(self, tableName: str) -> None:
>>> spark.catalog.uncacheTable("not_existing_table") # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
...
pyspark.sql.utils.AnalysisException: ...
AnalysisException: ...

Using the fully qualified name for the table.

>>> spark.catalog.uncacheTable("spark_catalog.default.tbl1")
>>> spark.catalog.isCached("tbl1")
False
>>> _ = spark.sql("DROP TABLE tbl1")
>>> _ = spark.sql("DROP TABLE tbl1").collect()
"""
self._jcatalog.uncacheTable(tableName)

@@ -1049,12 +1052,12 @@ def clearCache(self) -> None:

Examples
--------
>>> _ = spark.sql("DROP TABLE IF EXISTS tbl1")
>>> _ = spark.sql("CREATE TABLE tbl1 (name STRING, age INT) USING parquet")
>>> _ = spark.sql("DROP TABLE IF EXISTS tbl1").collect()
>>> _ = spark.sql("CREATE TABLE tbl1 (name STRING, age INT) USING parquet").collect()
>>> spark.catalog.clearCache()
>>> spark.catalog.isCached("tbl1")
False
>>> _ = spark.sql("DROP TABLE tbl1")
>>> _ = spark.sql("DROP TABLE tbl1").collect()
"""
self._jcatalog.clearCache()

@@ -1080,9 +1083,10 @@ def refreshTable(self, tableName: str) -> None:

>>> import tempfile
>>> with tempfile.TemporaryDirectory() as d:
... _ = spark.sql("DROP TABLE IF EXISTS tbl1")
... _ = spark.sql("CREATE TABLE tbl1 (col STRING) USING TEXT LOCATION '{}'".format(d))
... _ = spark.sql("INSERT INTO tbl1 SELECT 'abc'")
... _ = spark.sql("DROP TABLE IF EXISTS tbl1").collect()
... _ = spark.sql(
... "CREATE TABLE tbl1 (col STRING) USING TEXT LOCATION '{}'".format(d)).collect()
... _ = spark.sql("INSERT INTO tbl1 SELECT 'abc'").collect()
... spark.catalog.cacheTable("tbl1")
... spark.table("tbl1").show()
+---+
@@ -1105,7 +1109,7 @@ def refreshTable(self, tableName: str) -> None:
Using the fully qualified name for the table.

>>> spark.catalog.refreshTable("spark_catalog.default.tbl1")
>>> _ = spark.sql("DROP TABLE tbl1")
>>> _ = spark.sql("DROP TABLE tbl1").collect()
"""
self._jcatalog.refreshTable(tableName)

@@ -1133,12 +1137,12 @@ def recoverPartitions(self, tableName: str) -> None:

>>> import tempfile
>>> with tempfile.TemporaryDirectory() as d:
... _ = spark.sql("DROP TABLE IF EXISTS tbl1")
... _ = spark.sql("DROP TABLE IF EXISTS tbl1").collect()
... spark.range(1).selectExpr(
... "id as key", "id as value").write.partitionBy("key").mode("overwrite").save(d)
... _ = spark.sql(
... "CREATE TABLE tbl1 (key LONG, value LONG)"
... "USING parquet OPTIONS (path '{}') PARTITIONED BY (key)".format(d))
... "USING parquet OPTIONS (path '{}') PARTITIONED BY (key)".format(d)).collect()
... spark.table("tbl1").show()
... spark.catalog.recoverPartitions("tbl1")
... spark.table("tbl1").show()
@@ -1151,7 +1155,7 @@ def recoverPartitions(self, tableName: str) -> None:
+-----+---+
| 0| 0|
+-----+---+
>>> _ = spark.sql("DROP TABLE tbl1")
>>> _ = spark.sql("DROP TABLE tbl1").collect()
"""
self._jcatalog.recoverPartitions(tableName)

Expand All @@ -1175,9 +1179,10 @@ def refreshByPath(self, path: str) -> None:

>>> import tempfile
>>> with tempfile.TemporaryDirectory() as d:
... _ = spark.sql("DROP TABLE IF EXISTS tbl1")
... _ = spark.sql("CREATE TABLE tbl1 (col STRING) USING TEXT LOCATION '{}'".format(d))
... _ = spark.sql("INSERT INTO tbl1 SELECT 'abc'")
... _ = spark.sql("DROP TABLE IF EXISTS tbl1").collect()
... _ = spark.sql(
... "CREATE TABLE tbl1 (col STRING) USING TEXT LOCATION '{}'".format(d)).collect()
... _ = spark.sql("INSERT INTO tbl1 SELECT 'abc'").collect()
... spark.catalog.cacheTable("tbl1")
... spark.table("tbl1").show()
+---+
Expand All @@ -1197,7 +1202,7 @@ def refreshByPath(self, path: str) -> None:
>>> spark.table("tbl1").count()
0

>>> _ = spark.sql("DROP TABLE tbl1")
>>> _ = spark.sql("DROP TABLE tbl1").collect()
"""
self._jcatalog.refreshByPath(path)
