Improve consistency between Connect and Classic
ianmcook committed May 21, 2024
1 parent 72abb12 commit acbe9ca
Showing 2 changed files with 11 additions and 20 deletions.
python/pyspark/sql/connect/session.py (11 changes: 4 additions & 7 deletions)
@@ -571,6 +571,8 @@ def createDataFrame(
         elif isinstance(data, pa.Table):
             prefer_timestamp_ntz = is_timestamp_ntz_preferred()
 
+            (timezone,) = self._client.get_configs("spark.sql.session.timeZone")
+
             # If no schema supplied by user then get the names of columns only
             if schema is None:
                 _cols = data.column_names
@@ -582,14 +584,9 @@
             if not isinstance(schema, StructType):
                 schema = from_arrow_schema(data.schema, prefer_timestamp_ntz=prefer_timestamp_ntz)
 
-            _table = data
-
-            (timezone,) = self._client.get_configs("spark.sql.session.timeZone")
-
             assert isinstance(schema, StructType)
-            arrow_schema = to_arrow_schema(schema, error_on_duplicated_field_names_in_struct=True)
             _table = (
-                _check_arrow_table_timestamps_localize(_table, schema, True, timezone)
-                .cast(arrow_schema)
+                _check_arrow_table_timestamps_localize(data, schema, True, timezone)
+                .cast(to_arrow_schema(schema, error_on_duplicated_field_names_in_struct=True))
                 .rename_columns(schema.names)
             )

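For context, a minimal caller-side sketch of the Connect path this diff reworks. It is an illustration only, not part of the commit: the name `spark` is assumed to be a Spark Connect session (for example via SparkSession.builder.remote), and the sample table is invented.

    # Minimal sketch, assuming a Spark Connect session bound to `spark`.
    import pyarrow as pa

    tbl = pa.table(
        {
            "id": [1, 2, 3],
            "ts": pa.array([0, 1, 2], type=pa.timestamp("us")),
        }
    )

    # With no schema argument, names and types are inferred from the Arrow
    # schema; timestamp columns are localized using the session time zone
    # fetched above from spark.sql.session.timeZone.
    df = spark.createDataFrame(tbl)
    df.printSchema()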
python/pyspark/sql/pandas/conversion.py (20 changes: 7 additions & 13 deletions)
@@ -236,7 +236,7 @@ def toArrow(self) -> "pa.Table":
         from pyspark.sql.pandas.utils import require_minimum_pyarrow_version
 
         require_minimum_pyarrow_version()
-        schema = to_arrow_schema(self.schema)
+        schema = to_arrow_schema(self.schema, error_on_duplicated_field_names_in_struct=True)
 
         import pyarrow as pa
 
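The `toArrow` hunk above passes `error_on_duplicated_field_names_in_struct=True` so that Classic, like Connect, errors out when a struct column contains duplicated field names. A minimal usage sketch, assuming a local SparkSession named `spark` (the sample data is invented):

    # Minimal sketch, assuming a local SparkSession named `spark`.
    import pyarrow as pa

    df = spark.createDataFrame([(1, "a"), (2, "b")], "id long, label string")
    tbl = df.toArrow()  # pyarrow.Table; Spark long maps to int64
    assert isinstance(tbl, pa.Table)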
@@ -763,11 +763,12 @@ def _create_from_arrow_table(
 
         require_minimum_pyarrow_version()
 
+        prefer_timestamp_ntz = is_timestamp_ntz_preferred()
+
         # Create the Spark schema from list of names passed in with Arrow types
         if isinstance(schema, (list, tuple)):
             table = table.rename_columns(schema)
             arrow_schema = table.schema
-            prefer_timestamp_ntz = is_timestamp_ntz_preferred()
             struct = StructType()
             for name, field in zip(schema, arrow_schema):
                 struct.add(
@@ -777,19 +778,12 @@
                 )
             schema = struct
 
-        if isinstance(schema, StructType):
-            pass
-        elif isinstance(schema, DataType):
-            raise PySparkTypeError(
-                error_class="UNSUPPORTED_DATA_TYPE_FOR_ARROW",
-                message_parameters={"data_type": str(schema)},
-            )
-        else:
-            prefer_timestamp_ntz = is_timestamp_ntz_preferred()
+        if not isinstance(schema, StructType):
             schema = from_arrow_schema(table.schema, prefer_timestamp_ntz=prefer_timestamp_ntz)
 
-        table = _check_arrow_table_timestamps_localize(table, schema, True, timezone)
-        table = table.cast(to_arrow_schema(schema, error_on_duplicated_field_names_in_struct=True))
+        table = _check_arrow_table_timestamps_localize(table, schema, True, timezone).cast(
+            to_arrow_schema(schema, error_on_duplicated_field_names_in_struct=True)
+        )
 
         # Chunk the Arrow Table into RecordBatches
         chunk_size = self._jconf.arrowMaxRecordsPerBatch()
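
Taken together, these hunks make Classic's `_create_from_arrow_table` mirror the Connect code path: compute `prefer_timestamp_ntz` once up front, fall back to `from_arrow_schema` whenever the caller did not supply a `StructType`, then localize and cast the table before chunking it. A minimal caller-side sketch, assuming a local SparkSession named `spark` (the sample table and schema are illustrative):

    # Minimal sketch, assuming a local SparkSession named `spark`.
    import pyarrow as pa
    from pyspark.sql.types import LongType, StringType, StructField, StructType

    tbl = pa.table({"id": [1, 2], "label": ["a", "b"]})
    schema = StructType(
        [StructField("id", LongType()), StructField("label", StringType())]
    )

    # An explicit StructType skips inference; the table is cast to the Arrow
    # schema derived from it before being chunked into record batches.
    df = spark.createDataFrame(tbl, schema=schema)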
