4 changes: 2 additions & 2 deletions pyiceberg/avro/resolver.py
@@ -281,7 +281,7 @@ def struct(self, file_schema: StructType, record_struct: IcebergType | None, fil
record_struct_positions: Dict[int, int] = {field.field_id: pos for pos, field in enumerate(record_struct.fields)}
results: List[Tuple[int | None, Writer]] = []

for writer, file_field in zip(file_writers, file_schema.fields):
for writer, file_field in zip(file_writers, file_schema.fields, strict=True):
if file_field.field_id in record_struct_positions:
results.append((record_struct_positions[file_field.field_id], writer))
elif file_field.required:
@@ -408,7 +408,7 @@ def struct(self, struct: StructType, expected_struct: IcebergType | None, field_
# Check if we need to convert it to an Enum
result_reader if not (enum_type := self.read_enums.get(field.field_id)) else EnumReader(enum_type, result_reader),
)
for field, result_reader in zip(struct.fields, field_readers)
for field, result_reader in zip(struct.fields, field_readers, strict=True)
]

file_fields = {field.field_id for field in struct.fields}
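Every call site touched in this PR zips sequences that are expected to be equally long, and adding strict=True turns a silent truncation into an immediate error. A minimal sketch of the difference, independent of PyIceberg (the strict keyword exists since Python 3.10):

```python
writers = ["w1", "w2", "w3"]
fields = ["f1", "f2"]  # hypothetical mismatch, e.g. a field dropped from the schema

# Default zip stops at the shorter input and hides the problem.
assert list(zip(writers, fields)) == [("w1", "f1"), ("w2", "f2")]

# With strict=True the mismatch surfaces as a ValueError.
try:
    list(zip(writers, fields, strict=True))
except ValueError as exc:
    print(exc)  # zip() argument 2 is shorter than argument 1
```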
8 changes: 4 additions & 4 deletions pyiceberg/io/pyarrow.py
@@ -1898,7 +1898,7 @@ def struct(self, struct: StructType, struct_array: pa.Array | None, field_result
return None
field_arrays: List[pa.Array] = []
fields: List[pa.Field] = []
for field, field_array in zip(struct.fields, field_results):
for field, field_array in zip(struct.fields, field_results, strict=True):
if field_array is not None:
array = self._cast_if_needed(field, field_array)
field_arrays.append(array)
@@ -2840,7 +2840,7 @@ def _determine_partitions(spec: PartitionSpec, schema: Schema, arrow_table: pa.T
# to avoid conflicts
partition_fields = [f"_partition_{field.name}" for field in spec.fields]

for partition, name in zip(spec.fields, partition_fields):
for partition, name in zip(spec.fields, partition_fields, strict=True):
source_field = schema.find_field(partition.source_id)
full_field_name = schema.find_column_name(partition.source_id)
if full_field_name is None:
@@ -2854,7 +2854,7 @@ def _determine_partitions(spec: PartitionSpec, schema: Schema, arrow_table: pa.T
partition_key = PartitionKey(
field_values=[
PartitionFieldValue(field=field, value=unique_partition[name])
for field, name in zip(spec.fields, partition_fields)
for field, name in zip(spec.fields, partition_fields, strict=True)
],
partition_spec=spec,
schema=schema,
@@ -2868,7 +2868,7 @@ def _determine_partitions(spec: PartitionSpec, schema: Schema, arrow_table: pa.T
if unique_partition[partition_field_name] is not None
else pc.field(partition_field_name).is_null()
)
for field, partition_field_name in zip(spec.fields, partition_fields)
for field, partition_field_name in zip(spec.fields, partition_fields, strict=True)
],
)
)
4 changes: 2 additions & 2 deletions pyiceberg/partitioning.py
@@ -202,7 +202,7 @@ def compatible_with(self, other: PartitionSpec) -> bool:
this_field.source_id == that_field.source_id
and this_field.transform == that_field.transform
and this_field.name == that_field.name
for this_field, that_field in zip(self.fields, other.fields)
for this_field, that_field in zip(self.fields, other.fields, strict=True)
)

def partition_type(self, schema: Schema) -> StructType:
@@ -242,7 +242,7 @@ def partition_to_path(self, data: Record, schema: Schema) -> str:
value_strs.append(quote_plus(value_str, safe=""))
field_strs.append(quote_plus(partition_field.name, safe=""))

path = "/".join([field_str + "=" + value_str for field_str, value_str in zip(field_strs, value_strs)])
path = "/".join([field_str + "=" + value_str for field_str, value_str in zip(field_strs, value_strs, strict=True)])
return path


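In partition_to_path the zipped lists are appended to in the same loop, so they are always the same length; strict=True just encodes that invariant. A rough sketch of the path construction with made-up field names and pre-formatted values (the real method first runs values through the partition transforms):

```python
from urllib.parse import quote_plus

field_strs = [quote_plus(name, safe="") for name in ["event_date", "region"]]
value_strs = [quote_plus(value, safe="") for value in ["2024-01-01", "eu/west"]]

# "=" and "/" inside names or values are percent-encoded, keeping the path unambiguous.
path = "/".join(f + "=" + v for f, v in zip(field_strs, value_strs, strict=True))
print(path)  # event_date=2024-01-01/region=eu%2Fwest
```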
4 changes: 2 additions & 2 deletions pyiceberg/schema.py
@@ -125,7 +125,7 @@ def __eq__(self, other: Any) -> bool:
return False

identifier_field_ids_is_equal = self.identifier_field_ids == other.identifier_field_ids
schema_is_equal = all(lhs == rhs for lhs, rhs in zip(self.columns, other.columns))
schema_is_equal = all(lhs == rhs for lhs, rhs in zip(self.columns, other.columns, strict=True))

return identifier_field_ids_is_equal and schema_is_equal

@@ -1317,7 +1317,7 @@ def schema(self, schema: Schema, struct_result: Callable[[], StructType]) -> Sch
def struct(self, struct: StructType, field_results: List[Callable[[], IcebergType]]) -> StructType:
new_ids = [self._get_and_increment(field.field_id) for field in struct.fields]
new_fields = []
for field_id, field, field_type in zip(new_ids, struct.fields, field_results):
for field_id, field, field_type in zip(new_ids, struct.fields, field_results, strict=True):
new_fields.append(
NestedField(
field_id=field_id,
4 changes: 3 additions & 1 deletion pyiceberg/table/metadata.py
@@ -337,7 +337,9 @@ def _generate_snapshot_id() -> int:
"""
rnd_uuid = uuid.uuid4()
snapshot_id = int.from_bytes(
bytes(lhs ^ rhs for lhs, rhs in zip(rnd_uuid.bytes[0:8], rnd_uuid.bytes[8:16])), byteorder="little", signed=True
bytes(lhs ^ rhs for lhs, rhs in zip(rnd_uuid.bytes[0:8], rnd_uuid.bytes[8:16], strict=True)),
byteorder="little",
signed=True,
)
snapshot_id = snapshot_id if snapshot_id >= 0 else snapshot_id * -1

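_generate_snapshot_id XORs the two 8-byte halves of a random UUID, reads the result as a signed little-endian integer, and flips the sign if needed; both slices are always exactly 8 bytes, so strict=True can never trigger here. The same arithmetic as a standalone snippet:

```python
import uuid

rnd_uuid = uuid.uuid4()
# Fold the 16-byte UUID into 8 bytes by XOR-ing its two halves.
mixed = bytes(lhs ^ rhs for lhs, rhs in zip(rnd_uuid.bytes[0:8], rnd_uuid.bytes[8:16], strict=True))
snapshot_id = int.from_bytes(mixed, byteorder="little", signed=True)
# Snapshot ids are kept non-negative.
snapshot_id = snapshot_id if snapshot_id >= 0 else -snapshot_id
print(snapshot_id)
```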
2 changes: 1 addition & 1 deletion pyiceberg/table/name_mapping.py
@@ -155,7 +155,7 @@ def schema(self, schema: Schema, struct_result: List[MappedField]) -> List[Mappe
def struct(self, struct: StructType, field_results: List[List[MappedField]]) -> List[MappedField]:
return [
MappedField(field_id=field.field_id, names=[field.name], fields=result)
for field, result in zip(struct.fields, field_results)
for field, result in zip(struct.fields, field_results, strict=True)
]

def field(self, field: NestedField, field_result: List[MappedField]) -> List[MappedField]:
4 changes: 3 additions & 1 deletion pyiceberg/table/upsert_util.py
@@ -103,7 +103,9 @@ def get_rows_to_update(source_table: pa.Table, target_table: pa.Table, join_cols
# Step 4: Compare all rows using Python
to_update_indices = []
for source_idx, target_idx in zip(
matching_indices[SOURCE_INDEX_COLUMN_NAME].to_pylist(), matching_indices[TARGET_INDEX_COLUMN_NAME].to_pylist()
matching_indices[SOURCE_INDEX_COLUMN_NAME].to_pylist(),
matching_indices[TARGET_INDEX_COLUMN_NAME].to_pylist(),
strict=True,
):
source_row = source_table.slice(source_idx, 1)
target_row = target_table.slice(target_idx, 1)
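Here the two index columns come out of the same join result, so they are equally long by construction and strict=True only documents that. A toy version of the row-by-row comparison, with hand-written index lists standing in for the real matching_indices table:

```python
import pyarrow as pa

source_table = pa.table({"id": [1, 2, 3], "val": ["a", "b", "c"]})
target_table = pa.table({"id": [1, 2, 3], "val": ["a", "x", "c"]})

# Stand-ins for the join output: positions of matching rows in each table.
source_indices = [0, 1, 2]
target_indices = [0, 1, 2]

to_update_indices = []
for source_idx, target_idx in zip(source_indices, target_indices, strict=True):
    # Compare one-row slices; remember target rows whose values differ.
    if not source_table.slice(source_idx, 1).equals(target_table.slice(target_idx, 1)):
        to_update_indices.append(target_idx)

print(to_update_indices)  # [1]
```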
2 changes: 1 addition & 1 deletion pyiceberg/utils/lazydict.py
@@ -46,7 +46,7 @@ def __init__(self, contents: Sequence[Sequence[K | V]]):
def _build_dict(self) -> Dict[K, V]:
self._dict = {}
for item in self._contents:
self._dict.update(dict(zip(cast(Sequence[K], item[::2]), cast(Sequence[V], item[1::2]))))
self._dict.update(dict(zip(cast(Sequence[K], item[::2]), cast(Sequence[V], item[1::2]), strict=True)))

return self._dict

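LazyDict stores each chunk as an interleaved key/value sequence and rebuilds the mapping by zipping the even-index and odd-index slices. With strict=True a chunk of odd length now raises instead of silently dropping the trailing key; a minimal sketch assuming well-formed chunks:

```python
from typing import Dict

contents = [("a", 1, "b", 2), ("c", 3)]

result: Dict[str, int] = {}
for item in contents:
    # item[::2] holds the keys, item[1::2] the values; equal length when the chunk is well-formed.
    result.update(dict(zip(item[::2], item[1::2], strict=True)))

print(result)  # {'a': 1, 'b': 2, 'c': 3}
```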
10 changes: 9 additions & 1 deletion ruff.toml
@@ -57,7 +57,15 @@ select = [
"I", # isort
"UP", # pyupgrade
]
ignore = ["E501","E203","B024","B028","UP037", "UP035", "UP006", "B905"]
ignore = [
"E501",
"E203",
"B024",
"B028",
"UP037",
"UP035",
"UP006"
]

# Allow autofix for all enabled rules (when `--fix`) is provided.
fixable = ["ALL"]
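Dropping "B905" from the ignore list enables flake8-bugbear's zip-without-explicit-strict rule, which is what drives the strict=True additions above: ruff now flags any zip() call that does not spell out its strict behavior. Roughly what the rule accepts and rejects (illustrative snippet, not from the repo):

```python
keys = ["a", "b"]
values = [1, 2, 3]

pairs = list(zip(keys, values, strict=False))  # accepted: truncation is explicitly intended
print(pairs)  # [('a', 1), ('b', 2)]

# pairs = list(zip(keys, values))              # flagged by B905: strict= not given
# pairs = list(zip(keys, values, strict=True)) # accepted by B905, but raises ValueError for these inputs
```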
6 changes: 3 additions & 3 deletions tests/integration/test_add_files.py
@@ -713,7 +713,7 @@ def test_add_file_with_valid_nullability_diff(spark: SparkSession, session_catal
rhs = written_arrow_table.to_pandas()

for column in written_arrow_table.column_names:
for left, right in zip(lhs[column].to_list(), rhs[column].to_list()):
for left, right in zip(lhs[column].to_list(), rhs[column].to_list(), strict=True):
assert left == right


@@ -755,7 +755,7 @@ def test_add_files_with_valid_upcast(
rhs = written_arrow_table.to_pandas()

for column in written_arrow_table.column_names:
for left, right in zip(lhs[column].to_list(), rhs[column].to_list()):
for left, right in zip(lhs[column].to_list(), rhs[column].to_list(), strict=True):
if column == "map":
# Arrow returns a list of tuples, instead of a dict
right = dict(right)
@@ -802,7 +802,7 @@ def test_add_files_subset_of_schema(spark: SparkSession, session_catalo
rhs = written_arrow_table.to_pandas()

for column in written_arrow_table.column_names:
for left, right in zip(lhs[column].to_list(), rhs[column].to_list()):
for left, right in zip(lhs[column].to_list(), rhs[column].to_list(), strict=True):
assert left == right


18 changes: 9 additions & 9 deletions tests/integration/test_inspect_table.py
@@ -152,7 +152,7 @@ def _inspect_files_asserts(df: pa.Table, spark_df: DataFrame) -> None:
if column == "partition":
# Spark leaves out the partition if the table is unpartitioned
continue
for left, right in zip(lhs[column].to_list(), rhs[column].to_list()):
for left, right in zip(lhs[column].to_list(), rhs[column].to_list(), strict=True):
if isinstance(left, float) and math.isnan(left) and isinstance(right, float) and math.isnan(right):
# NaN != NaN in Python
continue
@@ -209,7 +209,7 @@ def _check_pyiceberg_df_equals_spark_df(df: pa.Table, spark_df: DataFrame) -> No
lhs = df.to_pandas().sort_values("last_updated_at")
rhs = spark_df.toPandas().sort_values("last_updated_at")
for column in df.column_names:
for left, right in zip(lhs[column].to_list(), rhs[column].to_list()):
for left, right in zip(lhs[column].to_list(), rhs[column].to_list(), strict=True):
assert left == right, f"Difference in column {column}: {left} != {right}"


@@ -284,7 +284,7 @@ def test_inspect_snapshots(
lhs = spark.table(f"{identifier}.snapshots").toPandas()
rhs = df.to_pandas()
for column in df.column_names:
for left, right in zip(lhs[column].to_list(), rhs[column].to_list()):
for left, right in zip(lhs[column].to_list(), rhs[column].to_list(), strict=True):
if column == "summary":
# Arrow returns a list of tuples, instead of a dict
right = dict(right)
@@ -332,7 +332,7 @@ def check_pyiceberg_df_equals_spark_df(df: pa.Table, spark_df: DataFrame) -> Non
assert len(lhs) == len(rhs)

for column in df.column_names:
for left, right in zip(lhs[column].to_list(), rhs[column].to_list()):
for left, right in zip(lhs[column].to_list(), rhs[column].to_list(), strict=True):
if column == "data_file":
for df_column in left.keys():
if df_column == "partition":
@@ -485,7 +485,7 @@ def test_inspect_refs(
lhs = spark.table(f"{identifier}.refs").toPandas()
rhs = df.to_pandas()
for column in df.column_names:
for left, right in zip(lhs[column].to_list(), rhs[column].to_list()):
for left, right in zip(lhs[column].to_list(), rhs[column].to_list(), strict=True):
if isinstance(left, float) and math.isnan(left) and isinstance(right, float) and math.isnan(right):
# NaN != NaN in Python
continue
@@ -535,7 +535,7 @@ def test_inspect_partitions_unpartitioned(
lhs = df.to_pandas()
rhs = spark.table(f"{identifier}.partitions").toPandas()
for column in df.column_names:
for left, right in zip(lhs[column].to_list(), rhs[column].to_list()):
for left, right in zip(lhs[column].to_list(), rhs[column].to_list(), strict=True):
assert left == right, f"Difference in column {column}: {left} != {right}"


@@ -755,7 +755,7 @@ def test_inspect_manifests(spark: SparkSession, session_catalog: Catalog, format
lhs = spark.table(f"{identifier}.manifests").toPandas()
rhs = df.to_pandas()
for column in df.column_names:
for left, right in zip(lhs[column].to_list(), rhs[column].to_list()):
for left, right in zip(lhs[column].to_list(), rhs[column].to_list(), strict=True):
assert left == right, f"Difference in column {column}: {left} != {right}"


@@ -793,7 +793,7 @@ def test_inspect_metadata_log_entries(
assert_frame_equal(left_before_last, right_before_last, check_dtype=False)
# compare the last row, except for the timestamp
for column in df.column_names:
for left, right in zip(left_last[column], right_last[column]):
for left, right in zip(left_last[column], right_last[column], strict=True):
if column == "timestamp":
continue
assert left == right, f"Difference in column {column}: {left} != {right}"
@@ -861,7 +861,7 @@ def test_inspect_history(spark: SparkSession, session_catalog: Catalog, format_v
lhs = spark.table(f"{identifier}.history").toPandas()
rhs = df.to_pandas()
for column in df.column_names:
for left, right in zip(lhs[column].to_list(), rhs[column].to_list()):
for left, right in zip(lhs[column].to_list(), rhs[column].to_list(), strict=True):
if isinstance(left, float) and math.isnan(left) and isinstance(right, float) and math.isnan(right):
# NaN != NaN in Python
continue
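Several of these test loops skip pairs where both sides are NaN, since NaN compares unequal to itself in Python; the strict zip only changes what happens if the two column lists somehow differ in length. The comparison idiom in isolation:

```python
import math

left_col = [1.0, float("nan"), 3.0]
right_col = [1.0, float("nan"), 3.0]

for left, right in zip(left_col, right_col, strict=True):
    if isinstance(left, float) and math.isnan(left) and isinstance(right, float) and math.isnan(right):
        continue  # NaN != NaN, so treat matching NaNs as equal
    assert left == right
```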
2 changes: 1 addition & 1 deletion tests/integration/test_partitioning_key.py
@@ -737,7 +737,7 @@ def test_partition_key(
) -> None:
field_values = [
PartitionFieldValue(field, field.transform.transform(TABLE_SCHEMA.find_field(field.source_id).field_type)(value))
for field, value in zip(partition_fields, partition_values)
for field, value in zip(partition_fields, partition_values, strict=True)
]
spec = PartitionSpec(*partition_fields)

2 changes: 1 addition & 1 deletion tests/integration/test_rest_manifest.py
@@ -38,7 +38,7 @@
# direct comparison with the dicts returned by fastavro
def todict(obj: Any, spec_keys: List[str]) -> Any:
if type(obj) is Record:
return {key: obj[pos] for key, pos in zip(spec_keys, range(len(obj)))}
return {key: obj[pos] for key, pos in zip(spec_keys, range(len(obj)), strict=True)}
if isinstance(obj, dict) or isinstance(obj, LazyDict):
data = []
for k, v in obj.items():
12 changes: 7 additions & 5 deletions tests/integration/test_writes/test_writes.py
@@ -759,7 +759,9 @@ def test_spark_writes_orc_pyiceberg_reads(spark: SparkSession, session_catalog:
]

# Verify PyIceberg results contain the expected data (appears twice due to create + append)
pyiceberg_data = list(zip(pyiceberg_df["id"], pyiceberg_df["name"], pyiceberg_df["age"], pyiceberg_df["is_active"]))
pyiceberg_data = list(
zip(pyiceberg_df["id"], pyiceberg_df["name"], pyiceberg_df["age"], pyiceberg_df["is_active"], strict=True)
)
assert pyiceberg_data == expected_data + expected_data # Data should appear twice

# Verify PyIceberg data types are correct
@@ -1170,7 +1172,7 @@ def test_inspect_snapshots(
lhs = spark.table(f"{identifier}.snapshots").toPandas()
rhs = df.to_pandas()
for column in df.column_names:
for left, right in zip(lhs[column].to_list(), rhs[column].to_list()):
for left, right in zip(lhs[column].to_list(), rhs[column].to_list(), strict=True):
if column == "summary":
# Arrow returns a list of tuples, instead of a dict
right = dict(right)
@@ -1466,7 +1468,7 @@ def test_table_write_schema_with_valid_nullability_diff(
rhs = written_arrow_table.to_pandas()

for column in written_arrow_table.column_names:
for left, right in zip(lhs[column].to_list(), rhs[column].to_list()):
for left, right in zip(lhs[column].to_list(), rhs[column].to_list(), strict=True):
assert left == right


@@ -1506,7 +1508,7 @@ def test_table_write_schema_with_valid_upcast(
rhs = written_arrow_table.to_pandas()

for column in written_arrow_table.column_names:
for left, right in zip(lhs[column].to_list(), rhs[column].to_list()):
for left, right in zip(lhs[column].to_list(), rhs[column].to_list(), strict=True):
if column == "map":
# Arrow returns a list of tuples, instead of a dict
right = dict(right)
@@ -1552,7 +1554,7 @@ def test_write_all_timestamp_precision(
rhs = written_arrow_table.to_pandas()

for column in written_arrow_table.column_names:
for left, right in zip(lhs[column].to_list(), rhs[column].to_list()):
for left, right in zip(lhs[column].to_list(), rhs[column].to_list(), strict=True):
if pd.isnull(left):
assert pd.isnull(right)
else: