Improve consistency between Connect and Classic
ianmcook committed May 21, 2024
1 parent 72abb12 commit acbe9ca
Showing 2 changed files with 11 additions and 20 deletions.
python/pyspark/sql/connect/session.py (11 changes: 4 additions & 7 deletions)
@@ -571,6 +571,8 @@ def createDataFrame(
         elif isinstance(data, pa.Table):
             prefer_timestamp_ntz = is_timestamp_ntz_preferred()
 
+            (timezone,) = self._client.get_configs("spark.sql.session.timeZone")
+
             # If no schema supplied by user then get the names of columns only
             if schema is None:
                 _cols = data.column_names
@@ -582,14 +584,9 @@
             if not isinstance(schema, StructType):
                 schema = from_arrow_schema(data.schema, prefer_timestamp_ntz=prefer_timestamp_ntz)
 
-            _table = data
-
-            (timezone,) = self._client.get_configs("spark.sql.session.timeZone")
-
             assert isinstance(schema, StructType)
-            arrow_schema = to_arrow_schema(schema, error_on_duplicated_field_names_in_struct=True)
             _table = (
-                _check_arrow_table_timestamps_localize(_table, schema, True, timezone)
-                .cast(arrow_schema)
+                _check_arrow_table_timestamps_localize(data, schema, True, timezone)
+                .cast(to_arrow_schema(schema, error_on_duplicated_field_names_in_struct=True))
                 .rename_columns(schema.names)
             )

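For context, a minimal caller-side sketch of the Connect path this diff reworks. It is an illustration only, not part of the commit: the name `spark` is assumed to be a Spark Connect session (for example via SparkSession.builder.remote), and the sample table is invented.

    # Minimal sketch, assuming a Spark Connect session bound to `spark`.
    import pyarrow as pa

    tbl = pa.table(
        {
            "id": [1, 2, 3],
            "ts": pa.array([0, 1, 2], type=pa.timestamp("us")),
        }
    )

    # With no schema argument, names and types are inferred from the Arrow
    # schema; timestamp columns are localized using the session time zone
    # fetched above from spark.sql.session.timeZone.
    df = spark.createDataFrame(tbl)
    df.printSchema()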
python/pyspark/sql/pandas/conversion.py (20 changes: 7 additions & 13 deletions)
@@ -236,7 +236,7 @@ def toArrow(self) -> "pa.Table":
         from pyspark.sql.pandas.utils import require_minimum_pyarrow_version
 
         require_minimum_pyarrow_version()
-        schema = to_arrow_schema(self.schema)
+        schema = to_arrow_schema(self.schema, error_on_duplicated_field_names_in_struct=True)
 
         import pyarrow as pa
 
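The `toArrow` hunk above passes `error_on_duplicated_field_names_in_struct=True` so that Classic, like Connect, errors out when a struct column contains duplicated field names. A minimal usage sketch, assuming a local SparkSession named `spark` (the sample data is invented):

    # Minimal sketch, assuming a local SparkSession named `spark`.
    import pyarrow as pa

    df = spark.createDataFrame([(1, "a"), (2, "b")], "id long, label string")
    tbl = df.toArrow()  # pyarrow.Table; Spark long maps to int64
    assert isinstance(tbl, pa.Table)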
@@ -763,11 +763,12 @@ def _create_from_arrow_table(
 
         require_minimum_pyarrow_version()
 
+        prefer_timestamp_ntz = is_timestamp_ntz_preferred()
+
         # Create the Spark schema from list of names passed in with Arrow types
         if isinstance(schema, (list, tuple)):
             table = table.rename_columns(schema)
             arrow_schema = table.schema
-            prefer_timestamp_ntz = is_timestamp_ntz_preferred()
             struct = StructType()
             for name, field in zip(schema, arrow_schema):
                 struct.add(
@@ -777,19 +778,12 @@
                 )
             schema = struct
 
-        if isinstance(schema, StructType):
-            pass
-        elif isinstance(schema, DataType):
-            raise PySparkTypeError(
-                error_class="UNSUPPORTED_DATA_TYPE_FOR_ARROW",
-                message_parameters={"data_type": str(schema)},
-            )
-        else:
-            prefer_timestamp_ntz = is_timestamp_ntz_preferred()
+        if not isinstance(schema, StructType):
             schema = from_arrow_schema(table.schema, prefer_timestamp_ntz=prefer_timestamp_ntz)
 
-        table = _check_arrow_table_timestamps_localize(table, schema, True, timezone)
-        table = table.cast(to_arrow_schema(schema, error_on_duplicated_field_names_in_struct=True))
+        table = _check_arrow_table_timestamps_localize(table, schema, True, timezone).cast(
+            to_arrow_schema(schema, error_on_duplicated_field_names_in_struct=True)
+        )
 
         # Chunk the Arrow Table into RecordBatches
         chunk_size = self._jconf.arrowMaxRecordsPerBatch()
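
Taken together, these hunks make Classic's `_create_from_arrow_table` mirror the Connect code path: compute `prefer_timestamp_ntz` once up front, fall back to `from_arrow_schema` whenever the caller did not supply a `StructType`, then localize and cast the table before chunking it. A minimal caller-side sketch, assuming a local SparkSession named `spark` (the sample table and schema are illustrative):

    # Minimal sketch, assuming a local SparkSession named `spark`.
    import pyarrow as pa
    from pyspark.sql.types import LongType, StringType, StructField, StructType

    tbl = pa.table({"id": [1, 2], "label": ["a", "b"]})
    schema = StructType(
        [StructField("id", LongType()), StructField("label", StringType())]
    )

    # An explicit StructType skips inference; the table is cast to the Arrow
    # schema derived from it before being chunked into record batches.
    df = spark.createDataFrame(tbl, schema=schema)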
