From e53ad98192a1104ad3d23445f10b52ab2ff3f185 Mon Sep 17 00:00:00 2001 From: Alex Stephen Date: Tue, 11 Nov 2025 14:55:28 -0800 Subject: [PATCH] Remove linter rule --- pyiceberg/avro/codecs/__init__.py | 6 +- pyiceberg/avro/decoder.py | 13 +- pyiceberg/avro/file.py | 25 +-- pyiceberg/avro/reader.py | 16 +- pyiceberg/avro/resolver.py | 33 ++- pyiceberg/avro/writer.py | 9 +- pyiceberg/catalog/__init__.py | 53 +++-- pyiceberg/catalog/bigquery_metastore.py | 14 +- pyiceberg/catalog/dynamodb.py | 38 ++-- pyiceberg/catalog/glue.py | 26 +-- pyiceberg/catalog/hive.py | 31 ++- pyiceberg/catalog/noop.py | 13 +- pyiceberg/catalog/rest/__init__.py | 38 ++-- pyiceberg/catalog/rest/auth.py | 14 +- pyiceberg/catalog/rest/response.py | 6 +- pyiceberg/catalog/sql.py | 13 +- pyiceberg/cli/console.py | 6 +- pyiceberg/cli/output.py | 15 +- pyiceberg/expressions/__init__.py | 112 +++++----- pyiceberg/expressions/literals.py | 4 +- pyiceberg/expressions/visitors.py | 136 ++++++------ pyiceberg/io/__init__.py | 9 +- pyiceberg/io/fsspec.py | 10 +- pyiceberg/io/pyarrow.py | 197 +++++++++--------- pyiceberg/manifest.py | 52 +++-- pyiceberg/partitioning.py | 16 +- pyiceberg/schema.py | 143 +++++++------ pyiceberg/table/__init__.py | 111 +++++----- pyiceberg/table/inspect.py | 20 +- pyiceberg/table/metadata.py | 52 ++--- pyiceberg/table/name_mapping.py | 83 ++++---- pyiceberg/table/puffin.py | 18 +- pyiceberg/table/snapshots.py | 20 +- pyiceberg/table/sorting.py | 6 +- pyiceberg/table/statistics.py | 12 +- pyiceberg/table/update/__init__.py | 30 +-- pyiceberg/table/update/schema.py | 73 +++---- pyiceberg/table/update/snapshot.py | 86 ++++---- pyiceberg/table/update/sorting.py | 10 +- pyiceberg/table/update/spec.py | 30 +-- pyiceberg/table/update/statistics.py | 4 +- pyiceberg/table/update/validate.py | 6 +- pyiceberg/typedef.py | 24 +-- pyiceberg/types.py | 22 +- pyiceberg/utils/bin_packing.py | 11 +- pyiceberg/utils/config.py | 5 +- pyiceberg/utils/lazydict.py | 7 +- pyiceberg/utils/properties.py | 7 +- pyiceberg/utils/schema_conversion.py | 33 ++- pyiceberg/utils/singleton.py | 6 +- ruff.toml | 3 +- tests/avro/test_decoder.py | 4 +- tests/catalog/integration_test_dynamodb.py | 6 +- tests/catalog/integration_test_glue.py | 8 +- tests/catalog/test_dynamodb.py | 5 +- tests/catalog/test_glue.py | 5 +- tests/catalog/test_rest.py | 40 ++-- tests/conftest.py | 56 +++-- tests/expressions/test_literals.py | 11 +- tests/expressions/test_visitors.py | 64 +++--- tests/integration/test_catalog.py | 6 +- tests/integration/test_delete_count.py | 4 +- tests/integration/test_deletes.py | 4 +- tests/integration/test_partitioning_key.py | 6 +- tests/integration/test_rest_manifest.py | 4 +- .../test_writes/test_partitioned_writes.py | 4 +- tests/integration/test_writes/test_writes.py | 10 +- tests/integration/test_writes/utils.py | 4 +- tests/io/test_fsspec.py | 5 +- tests/io/test_pyarrow.py | 8 +- tests/io/test_pyarrow_stats.py | 19 +- tests/table/test_expire_snapshots.py | 3 +- tests/table/test_init.py | 8 +- tests/table/test_metadata.py | 38 ++-- tests/table/test_puffin.py | 3 +- tests/table/test_sorting.py | 4 +- tests/test_avro_sanitization.py | 16 +- tests/test_schema.py | 12 +- tests/test_serializers.py | 6 +- tests/test_types.py | 5 +- tests/utils/test_bin_packing.py | 9 +- tests/utils/test_concurrent.py | 4 +- tests/utils/test_config.py | 4 +- tests/utils/test_manifest.py | 3 +- tests/utils/test_schema_conversion.py | 8 +- 85 files changed, 1020 insertions(+), 1103 deletions(-) diff --git a/pyiceberg/avro/codecs/__init__.py 
b/pyiceberg/avro/codecs/__init__.py index d5d3a7c4e5..f33f25f1cd 100644 --- a/pyiceberg/avro/codecs/__init__.py +++ b/pyiceberg/avro/codecs/__init__.py @@ -26,7 +26,7 @@ from __future__ import annotations -from typing import Dict, Literal, Type +from typing import Literal from typing_extensions import TypeAlias @@ -40,7 +40,7 @@ AVRO_CODEC_KEY = "avro.codec" -KNOWN_CODECS: Dict[AvroCompressionCodec, Type[Codec] | None] = { +KNOWN_CODECS: dict[AvroCompressionCodec, type[Codec] | None] = { "null": None, "bzip2": BZip2Codec, "snappy": SnappyCodec, @@ -49,4 +49,4 @@ } # Map to convert the naming from Iceberg to Avro -CODEC_MAPPING_ICEBERG_TO_AVRO: Dict[str, str] = {"gzip": "deflate", "zstd": "zstandard"} +CODEC_MAPPING_ICEBERG_TO_AVRO: dict[str, str] = {"gzip": "deflate", "zstd": "zstandard"} diff --git a/pyiceberg/avro/decoder.py b/pyiceberg/avro/decoder.py index d30475acf1..e971d52d48 100644 --- a/pyiceberg/avro/decoder.py +++ b/pyiceberg/avro/decoder.py @@ -18,9 +18,6 @@ from abc import ABC, abstractmethod from io import SEEK_CUR from typing import ( - Dict, - List, - Tuple, cast, ) @@ -67,11 +64,11 @@ def read_int(self) -> int: datum = (n >> 1) ^ -(n & 1) return datum - def read_ints(self, n: int) -> Tuple[int, ...]: + def read_ints(self, n: int) -> tuple[int, ...]: """Read a list of integers.""" return tuple(self.read_int() for _ in range(n)) - def read_int_bytes_dict(self, n: int, dest: Dict[int, bytes]) -> None: + def read_int_bytes_dict(self, n: int, dest: dict[int, bytes]) -> None: """Read a dictionary of integers for keys and bytes for values into a destination dictionary.""" for _ in range(n): k = self.read_int() @@ -85,7 +82,7 @@ def read_float(self) -> float: The float is converted into a 32-bit integer using a method equivalent to Java's floatToIntBits and then encoded in little-endian format. """ - return float(cast(Tuple[float, ...], STRUCT_FLOAT.unpack(self.read(4)))[0]) + return float(cast(tuple[float, ...], STRUCT_FLOAT.unpack(self.read(4)))[0]) def read_double(self) -> float: """Read a value from the stream as a double. @@ -94,7 +91,7 @@ def read_double(self) -> float: The double is converted into a 64-bit integer using a method equivalent to Java's doubleToLongBits and then encoded in little-endian format. """ - return float(cast(Tuple[float, ...], STRUCT_DOUBLE.unpack(self.read(8)))[0]) + return float(cast(tuple[float, ...], STRUCT_DOUBLE.unpack(self.read(8)))[0]) def read_bytes(self) -> bytes: """Bytes are encoded as a long followed by that many bytes of data.""" @@ -152,7 +149,7 @@ def read(self, n: int) -> bytes: """Read n bytes.""" if n < 0: raise ValueError(f"Requested {n} bytes to read, expected positive integer.") - data: List[bytes] = [] + data: list[bytes] = [] n_remaining = n while n_remaining > 0: diff --git a/pyiceberg/avro/file.py b/pyiceberg/avro/file.py index 3b91d70d85..f1be9c3928 100644 --- a/pyiceberg/avro/file.py +++ b/pyiceberg/avro/file.py @@ -27,10 +27,7 @@ from types import TracebackType from typing import ( Callable, - Dict, Generic, - List, - Type, TypeVar, ) @@ -77,14 +74,14 @@ def magic(self) -> bytes: return self._data[0] @property - def meta(self) -> Dict[str, str]: + def meta(self) -> dict[str, str]: return self._data[1] @property def sync(self) -> bytes: return self._data[2] - def compression_codec(self) -> Type[Codec] | None: + def compression_codec(self) -> type[Codec] | None: """Get the file's compression codec algorithm from the file's metadata. 
In the case of a null codec, we return a None indicating that we @@ -146,8 +143,8 @@ class AvroFile(Generic[D]): ) input_file: InputFile read_schema: Schema | None - read_types: Dict[int, Callable[..., StructProtocol]] - read_enums: Dict[int, Callable[..., Enum]] + read_types: dict[int, Callable[..., StructProtocol]] + read_enums: dict[int, Callable[..., Enum]] header: AvroFileHeader schema: Schema reader: Reader @@ -159,8 +156,8 @@ def __init__( self, input_file: InputFile, read_schema: Schema | None = None, - read_types: Dict[int, Callable[..., StructProtocol]] = EMPTY_DICT, - read_enums: Dict[int, Callable[..., Enum]] = EMPTY_DICT, + read_types: dict[int, Callable[..., StructProtocol]] = EMPTY_DICT, + read_enums: dict[int, Callable[..., Enum]] = EMPTY_DICT, ) -> None: self.input_file = input_file self.read_schema = read_schema @@ -185,7 +182,7 @@ def __enter__(self) -> AvroFile[D]: return self - def __exit__(self, exctype: Type[BaseException] | None, excinst: BaseException | None, exctb: TracebackType | None) -> None: + def __exit__(self, exctype: type[BaseException] | None, excinst: BaseException | None, exctb: TracebackType | None) -> None: """Perform cleanup when exiting the scope of a 'with' statement.""" def __iter__(self) -> AvroFile[D]: @@ -240,7 +237,7 @@ def __init__( file_schema: Schema, schema_name: str, record_schema: Schema | None = None, - metadata: Dict[str, str] = EMPTY_DICT, + metadata: dict[str, str] = EMPTY_DICT, ) -> None: self.output_file = output_file self.file_schema = file_schema @@ -267,7 +264,7 @@ def __enter__(self) -> AvroOutputFile[D]: return self - def __exit__(self, exctype: Type[BaseException] | None, excinst: BaseException | None, exctb: TracebackType | None) -> None: + def __exit__(self, exctype: type[BaseException] | None, excinst: BaseException | None, exctb: TracebackType | None) -> None: """Perform cleanup when exiting the scope of a 'with' statement.""" self.output_stream.close() @@ -284,7 +281,7 @@ def _write_header(self) -> None: header = AvroFileHeader(MAGIC, meta, self.sync_bytes) construct_writer(META_SCHEMA).write(self.encoder, header) - def compression_codec(self) -> Type[Codec] | None: + def compression_codec(self) -> type[Codec] | None: """Get the file's compression codec algorithm from the file's metadata. In the case of a null codec, we return a None indicating that we @@ -302,7 +299,7 @@ def compression_codec(self) -> Type[Codec] | None: return KNOWN_CODECS[codec_name] # type: ignore - def write_block(self, objects: List[D]) -> None: + def write_block(self, objects: list[D]) -> None: in_memory = io.BytesIO() block_content_encoder = BinaryEncoder(output_stream=in_memory) for obj in objects: diff --git a/pyiceberg/avro/reader.py b/pyiceberg/avro/reader.py index 97c41be473..d33c054beb 100644 --- a/pyiceberg/avro/reader.py +++ b/pyiceberg/avro/reader.py @@ -33,9 +33,7 @@ from typing import ( Any, Callable, - List, Mapping, - Tuple, ) from uuid import UUID @@ -319,14 +317,14 @@ class StructReader(Reader): "_hash", "_max_pos", ) - field_readers: Tuple[Tuple[int | None, Reader], ...] + field_readers: tuple[tuple[int | None, Reader], ...] create_struct: Callable[..., StructProtocol] struct: StructType - field_reader_functions = Tuple[Tuple[str | None, int, Callable[[BinaryDecoder], Any] | None], ...] + field_reader_functions = tuple[tuple[str | None, int, Callable[[BinaryDecoder], Any] | None], ...] 
def __init__( self, - field_readers: Tuple[Tuple[int | None, Reader], ...], + field_readers: tuple[tuple[int | None, Reader], ...], create_struct: Callable[..., StructProtocol], struct: StructType, ) -> None: @@ -338,7 +336,7 @@ def __init__( if not isinstance(self.create_struct(), StructProtocol): raise ValueError(f"Incompatible with StructProtocol: {self.create_struct}") - reading_callbacks: List[Tuple[int | None, Callable[[BinaryDecoder], Any]]] = [] + reading_callbacks: list[tuple[int | None, Callable[[BinaryDecoder], Any]]] = [] max_pos = -1 for pos, field in field_readers: if pos is not None: @@ -394,8 +392,8 @@ def __init__(self, element: Reader) -> None: self._hash = hash(self.element) self._is_int_list = isinstance(self.element, IntegerReader) - def read(self, decoder: BinaryDecoder) -> List[Any]: - read_items: List[Any] = [] + def read(self, decoder: BinaryDecoder) -> list[Any]: + read_items: list[Any] = [] block_count = decoder.read_int() while block_count != 0: if block_count < 0: @@ -461,7 +459,7 @@ def _read_int_int(self, decoder: BinaryDecoder) -> Mapping[int, int]: if block_count == 0: return EMPTY_DICT - contents_array: List[Tuple[int, ...]] = [] + contents_array: list[tuple[int, ...]] = [] while block_count != 0: if block_count < 0: diff --git a/pyiceberg/avro/resolver.py b/pyiceberg/avro/resolver.py index 84805640eb..900f47dab6 100644 --- a/pyiceberg/avro/resolver.py +++ b/pyiceberg/avro/resolver.py @@ -18,9 +18,6 @@ from enum import Enum from typing import ( Callable, - Dict, - List, - Tuple, ) from pyiceberg.avro.decoder import BinaryDecoder @@ -114,7 +111,7 @@ def construct_reader( - file_schema: Schema | IcebergType, read_types: Dict[int, Callable[..., StructProtocol]] = EMPTY_DICT + file_schema: Schema | IcebergType, read_types: dict[int, Callable[..., StructProtocol]] = EMPTY_DICT ) -> Reader: """Construct a reader from a file schema. @@ -146,7 +143,7 @@ class ConstructWriter(SchemaVisitorPerPrimitiveType[Writer]): def schema(self, schema: Schema, struct_result: Writer) -> Writer: return struct_result - def struct(self, struct: StructType, field_results: List[Writer]) -> Writer: + def struct(self, struct: StructType, field_results: list[Writer]) -> Writer: return StructWriter(tuple((pos, result) for pos, result in enumerate(field_results))) def field(self, field: NestedField, field_result: Writer) -> Writer: @@ -234,8 +231,8 @@ def resolve_writer( def resolve_reader( file_schema: Schema | IcebergType, read_schema: Schema | IcebergType, - read_types: Dict[int, Callable[..., StructProtocol]] = EMPTY_DICT, - read_enums: Dict[int, Callable[..., Enum]] = EMPTY_DICT, + read_types: dict[int, Callable[..., StructProtocol]] = EMPTY_DICT, + read_enums: dict[int, Callable[..., Enum]] = EMPTY_DICT, ) -> Reader: """Resolve the file and read schema to produce a reader. 
@@ -274,12 +271,12 @@ class WriteSchemaResolver(PrimitiveWithPartnerVisitor[IcebergType, Writer]): def schema(self, file_schema: Schema, record_schema: IcebergType | None, result: Writer) -> Writer: return result - def struct(self, file_schema: StructType, record_struct: IcebergType | None, file_writers: List[Writer]) -> Writer: + def struct(self, file_schema: StructType, record_struct: IcebergType | None, file_writers: list[Writer]) -> Writer: if not isinstance(record_struct, StructType): raise ResolveError(f"File/write schema are not aligned for struct, got {record_struct}") - record_struct_positions: Dict[int, int] = {field.field_id: pos for pos, field in enumerate(record_struct.fields)} - results: List[Tuple[int | None, Writer]] = [] + record_struct_positions: dict[int, int] = {field.field_id: pos for pos, field in enumerate(record_struct.fields)} + results: list[tuple[int | None, Writer]] = [] for writer, file_field in zip(file_writers, file_schema.fields, strict=True): if file_field.field_id in record_struct_positions: @@ -367,14 +364,14 @@ def visit_unknown(self, unknown_type: UnknownType, partner: IcebergType | None) class ReadSchemaResolver(PrimitiveWithPartnerVisitor[IcebergType, Reader]): __slots__ = ("read_types", "read_enums", "context") - read_types: Dict[int, Callable[..., StructProtocol]] - read_enums: Dict[int, Callable[..., Enum]] - context: List[int] + read_types: dict[int, Callable[..., StructProtocol]] + read_enums: dict[int, Callable[..., Enum]] + context: list[int] def __init__( self, - read_types: Dict[int, Callable[..., StructProtocol]] = EMPTY_DICT, - read_enums: Dict[int, Callable[..., Enum]] = EMPTY_DICT, + read_types: dict[int, Callable[..., StructProtocol]] = EMPTY_DICT, + read_enums: dict[int, Callable[..., Enum]] = EMPTY_DICT, ) -> None: self.read_types = read_types self.read_enums = read_enums @@ -389,7 +386,7 @@ def before_field(self, field: NestedField, field_partner: NestedField | None) -> def after_field(self, field: NestedField, field_partner: NestedField | None) -> None: self.context.pop() - def struct(self, struct: StructType, expected_struct: IcebergType | None, field_readers: List[Reader]) -> Reader: + def struct(self, struct: StructType, expected_struct: IcebergType | None, field_readers: list[Reader]) -> Reader: read_struct_id = self.context[STRUCT_ROOT] if len(self.context) > 0 else STRUCT_ROOT struct_callable = self.read_types.get(read_struct_id, Record) @@ -399,10 +396,10 @@ def struct(self, struct: StructType, expected_struct: IcebergType | None, field_ if not isinstance(expected_struct, StructType): raise ResolveError(f"File/read schema are not aligned for struct, got {expected_struct}") - expected_positions: Dict[int, int] = {field.field_id: pos for pos, field in enumerate(expected_struct.fields)} + expected_positions: dict[int, int] = {field.field_id: pos for pos, field in enumerate(expected_struct.fields)} # first, add readers for the file fields that must be in order - results: List[Tuple[int | None, Reader]] = [ + results: list[tuple[int | None, Reader]] = [ ( expected_positions.get(field.field_id), # Check if we need to convert it to an Enum diff --git a/pyiceberg/avro/writer.py b/pyiceberg/avro/writer.py index ba66d3003c..f78d1a486e 100644 --- a/pyiceberg/avro/writer.py +++ b/pyiceberg/avro/writer.py @@ -28,9 +28,6 @@ from dataclasses import field as dataclassfield from typing import ( Any, - Dict, - List, - Tuple, ) from uuid import UUID @@ -186,7 +183,7 @@ def write(self, encoder: BinaryEncoder, val: Any) -> None: 
@dataclass(frozen=True) class StructWriter(Writer): - field_writers: Tuple[Tuple[int | None, Writer], ...] = dataclassfield() + field_writers: tuple[tuple[int | None, Writer], ...] = dataclassfield() def write(self, encoder: BinaryEncoder, val: Record) -> None: for pos, writer in self.field_writers: @@ -210,7 +207,7 @@ def __hash__(self) -> int: class ListWriter(Writer): element_writer: Writer - def write(self, encoder: BinaryEncoder, val: List[Any]) -> None: + def write(self, encoder: BinaryEncoder, val: list[Any]) -> None: encoder.write_int(len(val)) for v in val: self.element_writer.write(encoder, v) @@ -223,7 +220,7 @@ class MapWriter(Writer): key_writer: Writer value_writer: Writer - def write(self, encoder: BinaryEncoder, val: Dict[Any, Any]) -> None: + def write(self, encoder: BinaryEncoder, val: dict[Any, Any]) -> None: encoder.write_int(len(val)) for k, v in val.items(): self.key_writer.write(encoder, k) diff --git a/pyiceberg/catalog/__init__.py b/pyiceberg/catalog/__init__.py index 7e467cd6c2..b13d9294a0 100644 --- a/pyiceberg/catalog/__init__.py +++ b/pyiceberg/catalog/__init__.py @@ -28,11 +28,6 @@ TYPE_CHECKING, Any, Callable, - Dict, - List, - Set, - Tuple, - Type, cast, ) @@ -268,16 +263,16 @@ def load_catalog(name: str | None = None, **properties: str | None) -> Catalog: catalog_type = infer_catalog_type(name, conf) if catalog_type: - return AVAILABLE_CATALOGS[catalog_type](name, cast(Dict[str, str], conf)) + return AVAILABLE_CATALOGS[catalog_type](name, cast(dict[str, str], conf)) raise ValueError(f"Could not initialize catalog with the following properties: {properties}") -def list_catalogs() -> List[str]: +def list_catalogs() -> list[str]: return _ENV_CONFIG.get_known_catalogs() -def delete_files(io: FileIO, files_to_delete: Set[str], file_type: str) -> None: +def delete_files(io: FileIO, files_to_delete: set[str], file_type: str) -> None: """Delete files. Log warnings if failing to delete any file. @@ -294,7 +289,7 @@ def delete_files(io: FileIO, files_to_delete: Set[str], file_type: str) -> None: logger.warning(msg=f"Failed to delete {file_type} file {file}", exc_info=exc) -def delete_data_files(io: FileIO, manifests_to_delete: List[ManifestFile]) -> None: +def delete_data_files(io: FileIO, manifests_to_delete: list[ManifestFile]) -> None: """Delete data files linked to given manifests. Log warnings if failing to delete any file. @@ -331,9 +326,9 @@ def _import_catalog(name: str, catalog_impl: str, properties: Properties) -> Cat @dataclass class PropertiesUpdateSummary: - removed: List[str] - updated: List[str] - missing: List[str] + removed: list[str] + updated: list[str] + missing: list[str] class Catalog(ABC): @@ -531,7 +526,7 @@ def rename_table(self, from_identifier: str | Identifier, to_identifier: str | I @abstractmethod def commit_table( - self, table: Table, requirements: Tuple[TableRequirement, ...], updates: Tuple[TableUpdate, ...] + self, table: Table, requirements: tuple[TableRequirement, ...], updates: tuple[TableUpdate, ...] ) -> CommitTableResponse: """Commit updates to a table. @@ -586,7 +581,7 @@ def drop_namespace(self, namespace: str | Identifier) -> None: """ @abstractmethod - def list_tables(self, namespace: str | Identifier) -> List[Identifier]: + def list_tables(self, namespace: str | Identifier) -> list[Identifier]: """List tables under the given namespace in the catalog. 
Args: @@ -600,7 +595,7 @@ def list_tables(self, namespace: str | Identifier) -> List[Identifier]: """ @abstractmethod - def list_namespaces(self, namespace: str | Identifier = ()) -> List[Identifier]: + def list_namespaces(self, namespace: str | Identifier = ()) -> list[Identifier]: """List namespaces from the given namespace. If not given, list top-level namespaces from the catalog. Args: @@ -614,7 +609,7 @@ def list_namespaces(self, namespace: str | Identifier = ()) -> List[Identifier]: """ @abstractmethod - def list_views(self, namespace: str | Identifier) -> List[Identifier]: + def list_views(self, namespace: str | Identifier) -> list[Identifier]: """List views under the given namespace in the catalog. Args: @@ -643,7 +638,7 @@ def load_namespace_properties(self, namespace: str | Identifier) -> Properties: @abstractmethod def update_namespace_properties( - self, namespace: str | Identifier, removals: Set[str] | None = None, updates: Properties = EMPTY_DICT + self, namespace: str | Identifier, removals: set[str] | None = None, updates: Properties = EMPTY_DICT ) -> PropertiesUpdateSummary: """Remove provided property keys and updates properties for a namespace. @@ -707,7 +702,7 @@ def namespace_from(identifier: str | Identifier) -> Identifier: return Catalog.identifier_to_tuple(identifier)[:-1] @staticmethod - def namespace_to_string(identifier: str | Identifier, err: Type[ValueError] | Type[NoSuchNamespaceError] = ValueError) -> str: + def namespace_to_string(identifier: str | Identifier, err: type[ValueError] | type[NoSuchNamespaceError] = ValueError) -> str: """Transform a namespace identifier into a string. Args: @@ -729,7 +724,7 @@ def namespace_to_string(identifier: str | Identifier, err: Type[ValueError] | Ty @staticmethod def identifier_to_database( - identifier: str | Identifier, err: Type[ValueError] | Type[NoSuchNamespaceError] = ValueError + identifier: str | Identifier, err: type[ValueError] | type[NoSuchNamespaceError] = ValueError ) -> str: tuple_identifier = Catalog.identifier_to_tuple(identifier) if len(tuple_identifier) != 1: @@ -740,8 +735,8 @@ def identifier_to_database( @staticmethod def identifier_to_database_and_table( identifier: str | Identifier, - err: Type[ValueError] | Type[NoSuchTableError] | Type[NoSuchNamespaceError] = ValueError, - ) -> Tuple[str, str]: + err: type[ValueError] | type[NoSuchTableError] | type[NoSuchNamespaceError] = ValueError, + ) -> tuple[str, str]: tuple_identifier = Catalog.identifier_to_tuple(identifier) if len(tuple_identifier) != 2: raise err(f"Invalid path, hierarchical namespaces are not supported: {identifier}") @@ -852,7 +847,7 @@ def purge_table(self, identifier: str | Identifier) -> None: io = load_file_io(self.properties, table.metadata_location) metadata = table.metadata manifest_lists_to_delete = set() - manifests_to_delete: List[ManifestFile] = [] + manifests_to_delete: list[ManifestFile] = [] for snapshot in metadata.snapshots: manifests_to_delete += snapshot.manifests(io) manifest_lists_to_delete.add(snapshot.manifest_list) @@ -914,8 +909,8 @@ def _update_and_stage_table( self, current_table: Table | None, table_identifier: Identifier, - requirements: Tuple[TableRequirement, ...], - updates: Tuple[TableUpdate, ...], + requirements: tuple[TableRequirement, ...], + updates: tuple[TableUpdate, ...], ) -> StagedTable: for requirement in requirements: requirement.validate(current_table.metadata if current_table else None) @@ -940,13 +935,13 @@ def _update_and_stage_table( ) def _get_updated_props_and_update_summary( - self, 
current_properties: Properties, removals: Set[str] | None, updates: Properties - ) -> Tuple[PropertiesUpdateSummary, Properties]: + self, current_properties: Properties, removals: set[str] | None, updates: Properties + ) -> tuple[PropertiesUpdateSummary, Properties]: self._check_for_overlap(updates=updates, removals=removals) updated_properties = dict(current_properties) - removed: Set[str] = set() - updated: Set[str] = set() + removed: set[str] = set() + updated: set[str] = set() if removals: for key in removals: @@ -1028,7 +1023,7 @@ def _parse_metadata_version(metadata_location: str) -> int: return -1 @staticmethod - def _check_for_overlap(removals: Set[str] | None, updates: Properties) -> None: + def _check_for_overlap(removals: set[str] | None, updates: Properties) -> None: if updates and removals: overlap = set(removals) & set(updates.keys()) if overlap: diff --git a/pyiceberg/catalog/bigquery_metastore.py b/pyiceberg/catalog/bigquery_metastore.py index 4b1b922b41..b762c1047c 100644 --- a/pyiceberg/catalog/bigquery_metastore.py +++ b/pyiceberg/catalog/bigquery_metastore.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. import json -from typing import TYPE_CHECKING, Any, List, Set, Tuple, Union +from typing import TYPE_CHECKING, Any, Union from google.api_core.exceptions import NotFound from google.cloud.bigquery import Client, Dataset, DatasetReference, TableReference @@ -227,7 +227,7 @@ def drop_table(self, identifier: str | Identifier) -> None: raise NoSuchTableError(f"Table does not exist: {dataset_name}.{table_name}") from e def commit_table( - self, table: Table, requirements: Tuple[TableRequirement, ...], updates: Tuple[TableUpdate, ...] + self, table: Table, requirements: tuple[TableRequirement, ...], updates: tuple[TableUpdate, ...] ) -> CommitTableResponse: raise NotImplementedError @@ -244,9 +244,9 @@ def drop_namespace(self, namespace: str | Identifier) -> None: except NotFound as e: raise NoSuchNamespaceError(f"Namespace {namespace} does not exist.") from e - def list_tables(self, namespace: str | Identifier) -> List[Identifier]: + def list_tables(self, namespace: str | Identifier) -> list[Identifier]: database_name = self.identifier_to_database(namespace) - iceberg_tables: List[Identifier] = [] + iceberg_tables: list[Identifier] = [] try: dataset_ref = DatasetReference(project=self.project_id, dataset_id=database_name) # The list_tables method returns an iterator of TableListItem @@ -258,7 +258,7 @@ def list_tables(self, namespace: str | Identifier) -> List[Identifier]: raise NoSuchNamespaceError(f"Namespace (dataset) '{database_name}' not found.") from None return iceberg_tables - def list_namespaces(self, namespace: str | Identifier = ()) -> List[Identifier]: + def list_namespaces(self, namespace: str | Identifier = ()) -> list[Identifier]: # Since this catalog only supports one-level namespaces, it always returns an empty list unless # passed an empty namespace to list all namespaces within the catalog. 
if namespace: @@ -299,7 +299,7 @@ def register_table(self, identifier: str | Identifier, metadata_location: str) - return self.load_table(identifier=identifier) - def list_views(self, namespace: str | Identifier) -> List[Identifier]: + def list_views(self, namespace: str | Identifier) -> list[Identifier]: raise NotImplementedError def drop_view(self, identifier: str | Identifier) -> None: @@ -321,7 +321,7 @@ def load_namespace_properties(self, namespace: str | Identifier) -> Properties: return {} def update_namespace_properties( - self, namespace: str | Identifier, removals: Set[str] | None = None, updates: Properties = EMPTY_DICT + self, namespace: str | Identifier, removals: set[str] | None = None, updates: Properties = EMPTY_DICT ) -> PropertiesUpdateSummary: raise NotImplementedError diff --git a/pyiceberg/catalog/dynamodb.py b/pyiceberg/catalog/dynamodb.py index 59ce9f1b13..2d35b2c5e2 100644 --- a/pyiceberg/catalog/dynamodb.py +++ b/pyiceberg/catalog/dynamodb.py @@ -19,11 +19,7 @@ from typing import ( TYPE_CHECKING, Any, - Dict, - List, Optional, - Set, - Tuple, Union, ) @@ -228,7 +224,7 @@ def register_table(self, identifier: str | Identifier, metadata_location: str) - raise NotImplementedError def commit_table( - self, table: Table, requirements: Tuple[TableRequirement, ...], updates: Tuple[TableUpdate, ...] + self, table: Table, requirements: tuple[TableRequirement, ...], updates: tuple[TableUpdate, ...] ) -> CommitTableResponse: """Commit updates to a table. @@ -400,7 +396,7 @@ def drop_namespace(self, namespace: str | Identifier) -> None: except ConditionalCheckFailedException as e: raise NoSuchNamespaceError(f"Database does not exist: {database_name}") from e - def list_tables(self, namespace: str | Identifier) -> List[Identifier]: + def list_tables(self, namespace: str | Identifier) -> list[Identifier]: """List Iceberg tables under the given namespace in the catalog. Args: @@ -444,7 +440,7 @@ def list_tables(self, namespace: str | Identifier) -> List[Identifier]: return table_identifiers - def list_namespaces(self, namespace: str | Identifier = ()) -> List[Identifier]: + def list_namespaces(self, namespace: str | Identifier = ()) -> list[Identifier]: """List top-level namespaces from the catalog. We do not support hierarchical namespace. @@ -505,7 +501,7 @@ def load_namespace_properties(self, namespace: str | Identifier) -> Properties: return _get_namespace_properties(namespace_dict=namespace_dict) def update_namespace_properties( - self, namespace: str | Identifier, removals: Set[str] | None = None, updates: Properties = EMPTY_DICT + self, namespace: str | Identifier, removals: set[str] | None = None, updates: Properties = EMPTY_DICT ) -> PropertiesUpdateSummary: """ Remove or update provided property keys for a namespace. 
@@ -541,7 +537,7 @@ def update_namespace_properties( return properties_update_summary - def list_views(self, namespace: str | Identifier) -> List[Identifier]: + def list_views(self, namespace: str | Identifier) -> list[Identifier]: raise NotImplementedError def drop_view(self, identifier: str | Identifier) -> None: @@ -550,22 +546,22 @@ def drop_view(self, identifier: str | Identifier) -> None: def view_exists(self, identifier: str | Identifier) -> bool: raise NotImplementedError - def _get_iceberg_table_item(self, database_name: str, table_name: str) -> Dict[str, Any]: + def _get_iceberg_table_item(self, database_name: str, table_name: str) -> dict[str, Any]: try: return self._get_dynamo_item(identifier=f"{database_name}.{table_name}", namespace=database_name) except ValueError as e: raise NoSuchTableError(f"Table does not exist: {database_name}.{table_name}") from e - def _get_iceberg_namespace_item(self, database_name: str) -> Dict[str, Any]: + def _get_iceberg_namespace_item(self, database_name: str) -> dict[str, Any]: try: return self._get_dynamo_item(identifier=DYNAMODB_NAMESPACE, namespace=database_name) except ValueError as e: raise NoSuchNamespaceError(f"Namespace does not exist: {database_name}") from e - def _ensure_namespace_exists(self, database_name: str) -> Dict[str, Any]: + def _ensure_namespace_exists(self, database_name: str) -> dict[str, Any]: return self._get_iceberg_namespace_item(database_name) - def _get_dynamo_item(self, identifier: str, namespace: str) -> Dict[str, Any]: + def _get_dynamo_item(self, identifier: str, namespace: str) -> dict[str, Any]: try: response = self.dynamodb.get_item( TableName=self.dynamodb_table_name, @@ -592,7 +588,7 @@ def _get_dynamo_item(self, identifier: str, namespace: str) -> Dict[str, Any]: ) as e: raise GenericDynamoDbError(e.message) from e - def _put_dynamo_item(self, item: Dict[str, Any], condition_expression: str) -> None: + def _put_dynamo_item(self, item: dict[str, Any], condition_expression: str) -> None: try: self.dynamodb.put_item(TableName=self.dynamodb_table_name, Item=item, ConditionExpression=condition_expression) except self.dynamodb.exceptions.ConditionalCheckFailedException as e: @@ -635,7 +631,7 @@ def _delete_dynamo_item(self, namespace: str, identifier: str, condition_express ) as e: raise GenericDynamoDbError(e.message) from e - def _convert_dynamo_table_item_to_iceberg_table(self, dynamo_table_item: Dict[str, Any]) -> Table: + def _convert_dynamo_table_item_to_iceberg_table(self, dynamo_table_item: dict[str, Any]) -> Table: table_dict = _convert_dynamo_item_to_regular_dict(dynamo_table_item) for prop in [_add_property_prefix(prop) for prop in (TABLE_TYPE, METADATA_LOCATION)] + [ @@ -672,7 +668,7 @@ def _get_default_warehouse_location(self, database_name: str, table_name: str) - return self._get_hive_style_warehouse_location(database_name, table_name) -def _get_create_table_item(database_name: str, table_name: str, properties: Properties, metadata_location: str) -> Dict[str, Any]: +def _get_create_table_item(database_name: str, table_name: str, properties: Properties, metadata_location: str) -> dict[str, Any]: current_timestamp_ms = str(round(time() * 1000)) _dict = { DYNAMODB_COL_IDENTIFIER: { @@ -702,7 +698,7 @@ def _get_create_table_item(database_name: str, table_name: str, properties: Prop return _dict -def _get_rename_table_item(from_dynamo_table_item: Dict[str, Any], to_database_name: str, to_table_name: str) -> Dict[str, Any]: +def _get_rename_table_item(from_dynamo_table_item: dict[str, Any], 
to_database_name: str, to_table_name: str) -> dict[str, Any]: _dict = from_dynamo_table_item current_timestamp_ms = str(round(time() * 1000)) _dict[DYNAMODB_COL_IDENTIFIER]["S"] = f"{to_database_name}.{to_table_name}" @@ -712,7 +708,7 @@ def _get_rename_table_item(from_dynamo_table_item: Dict[str, Any], to_database_n return _dict -def _get_create_database_item(database_name: str, properties: Properties) -> Dict[str, Any]: +def _get_create_database_item(database_name: str, properties: Properties) -> dict[str, Any]: current_timestamp_ms = str(round(time() * 1000)) _dict = { DYNAMODB_COL_IDENTIFIER: { @@ -738,7 +734,7 @@ def _get_create_database_item(database_name: str, properties: Properties) -> Dic return _dict -def _get_update_database_item(namespace_item: Dict[str, Any], updated_properties: Properties) -> Dict[str, Any]: +def _get_update_database_item(namespace_item: dict[str, Any], updated_properties: Properties) -> dict[str, Any]: current_timestamp_ms = str(round(time() * 1000)) _dict = { @@ -802,11 +798,11 @@ def _get_update_database_item(namespace_item: Dict[str, Any], updated_properties ] -def _get_namespace_properties(namespace_dict: Dict[str, str]) -> Properties: +def _get_namespace_properties(namespace_dict: dict[str, str]) -> Properties: return {_remove_property_prefix(key): val for key, val in namespace_dict.items() if key.startswith(PROPERTY_KEY_PREFIX)} -def _convert_dynamo_item_to_regular_dict(dynamo_json: Dict[str, Any]) -> Dict[str, str]: +def _convert_dynamo_item_to_regular_dict(dynamo_json: dict[str, Any]) -> dict[str, str]: """Convert a dynamo json to a regular json. Example of a dynamo json: diff --git a/pyiceberg/catalog/glue.py b/pyiceberg/catalog/glue.py index 2474f0e784..7260b29447 100644 --- a/pyiceberg/catalog/glue.py +++ b/pyiceberg/catalog/glue.py @@ -19,11 +19,7 @@ from typing import ( TYPE_CHECKING, Any, - Dict, - List, Optional, - Set, - Tuple, Union, cast, ) @@ -178,7 +174,7 @@ class _IcebergSchemaToGlueType(SchemaVisitor[str]): def schema(self, schema: Schema, struct_result: str) -> str: return struct_result - def struct(self, struct: StructType, field_results: List[str]) -> str: + def struct(self, struct: StructType, field_results: list[str]) -> str: return f"struct<{','.join(field_results)}>" def field(self, field: NestedField, field_result: str) -> str: @@ -198,8 +194,8 @@ def primitive(self, primitive: PrimitiveType) -> str: return GLUE_PRIMITIVE_TYPES[primitive_type] -def _to_columns(metadata: TableMetadata) -> List["ColumnTypeDef"]: - results: Dict[str, ColumnTypeDef] = {} +def _to_columns(metadata: TableMetadata) -> list["ColumnTypeDef"]: + results: dict[str, ColumnTypeDef] = {} def _append_to_results(field: NestedField, is_current: bool) -> None: if field.name in results: @@ -305,7 +301,7 @@ def _register_glue_catalog_id_with_glue_client(glue: "GlueClient", glue_catalog_ """ event_system = glue.meta.events - def add_glue_catalog_id(params: Dict[str, str], **kwargs: Any) -> None: + def add_glue_catalog_id(params: dict[str, str], **kwargs: Any) -> None: if "CatalogId" not in params: params["CatalogId"] = glue_catalog_id @@ -487,7 +483,7 @@ def register_table(self, identifier: str | Identifier, metadata_location: str) - return self.load_table(identifier=identifier) def commit_table( - self, table: Table, requirements: Tuple[TableRequirement, ...], updates: Tuple[TableUpdate, ...] + self, table: Table, requirements: tuple[TableRequirement, ...], updates: tuple[TableUpdate, ...] ) -> CommitTableResponse: """Commit updates to a table. 
@@ -705,7 +701,7 @@ def drop_namespace(self, namespace: str | Identifier) -> None: ) self.glue.delete_database(Name=database_name) - def list_tables(self, namespace: str | Identifier) -> List[Identifier]: + def list_tables(self, namespace: str | Identifier) -> list[Identifier]: """List Iceberg tables under the given namespace in the catalog. Args: @@ -718,7 +714,7 @@ def list_tables(self, namespace: str | Identifier) -> List[Identifier]: NoSuchNamespaceError: If a namespace with the given name does not exist, or the identifier is invalid. """ database_name = self.identifier_to_database(namespace, NoSuchNamespaceError) - table_list: List[TableTypeDef] = [] + table_list: list[TableTypeDef] = [] next_token: str | None = None try: while True: @@ -736,7 +732,7 @@ def list_tables(self, namespace: str | Identifier) -> List[Identifier]: raise NoSuchNamespaceError(f"Database does not exist: {database_name}") from e return [(database_name, table["Name"]) for table in table_list if self.__is_iceberg_table(table)] - def list_namespaces(self, namespace: str | Identifier = ()) -> List[Identifier]: + def list_namespaces(self, namespace: str | Identifier = ()) -> list[Identifier]: """List namespaces from the given namespace. If not given, list top-level namespaces from the catalog. Returns: @@ -746,7 +742,7 @@ def list_namespaces(self, namespace: str | Identifier = ()) -> List[Identifier]: if namespace: return [] - database_list: List[DatabaseTypeDef] = [] + database_list: list[DatabaseTypeDef] = [] next_token: str | None = None while True: @@ -789,7 +785,7 @@ def load_namespace_properties(self, namespace: str | Identifier) -> Properties: return properties def update_namespace_properties( - self, namespace: str | Identifier, removals: Set[str] | None = None, updates: Properties = EMPTY_DICT + self, namespace: str | Identifier, removals: set[str] | None = None, updates: Properties = EMPTY_DICT ) -> PropertiesUpdateSummary: """Remove provided property keys and updates properties for a namespace. 
@@ -812,7 +808,7 @@ def update_namespace_properties( return properties_update_summary - def list_views(self, namespace: str | Identifier) -> List[Identifier]: + def list_views(self, namespace: str | Identifier) -> list[Identifier]: raise NotImplementedError def drop_view(self, identifier: str | Identifier) -> None: diff --git a/pyiceberg/catalog/hive.py b/pyiceberg/catalog/hive.py index a6f7131b06..e096470451 100644 --- a/pyiceberg/catalog/hive.py +++ b/pyiceberg/catalog/hive.py @@ -22,11 +22,6 @@ from typing import ( TYPE_CHECKING, Any, - Dict, - List, - Set, - Tuple, - Type, Union, ) from urllib.parse import urlparse @@ -148,7 +143,7 @@ class _HiveClient: """Helper class to nicely open and close the transport.""" _transport: TTransport - _ugi: List[str] | None + _ugi: list[str] | None def __init__( self, @@ -194,7 +189,7 @@ def __enter__(self) -> Client: self._transport.open() return self._client() # recreate the client - def __exit__(self, exctype: Type[BaseException] | None, excinst: BaseException | None, exctb: TracebackType | None) -> None: + def __exit__(self, exctype: type[BaseException] | None, excinst: BaseException | None, exctb: TracebackType | None) -> None: """Close transport if it was opened.""" if self._transport.isOpen(): self._transport.close() @@ -223,7 +218,7 @@ def _construct_hive_storage_descriptor(schema: Schema, location: str | None, hiv def _construct_parameters( metadata_location: str, previous_metadata_location: str | None = None, metadata_properties: Properties | None = None -) -> Dict[str, Any]: +) -> dict[str, Any]: properties = {PROP_EXTERNAL: "TRUE", PROP_TABLE_TYPE: "ICEBERG", PROP_METADATA_LOCATION: metadata_location} if previous_metadata_location: properties[PROP_PREVIOUS_METADATA_LOCATION] = previous_metadata_location @@ -276,7 +271,7 @@ def __init__(self, hive2_compatible: bool): def schema(self, schema: Schema, struct_result: str) -> str: return struct_result - def struct(self, struct: StructType, field_results: List[str]) -> str: + def struct(self, struct: StructType, field_results: list[str]) -> str: return f"struct<{','.join(field_results)}>" def field(self, field: NestedField, field_result: str) -> str: @@ -315,7 +310,7 @@ def __init__(self, name: str, **properties: str): ) @staticmethod - def _create_hive_client(properties: Dict[str, str]) -> _HiveClient: + def _create_hive_client(properties: dict[str, str]) -> _HiveClient: last_exception = None for uri in properties[URI].split(","): try: @@ -333,7 +328,7 @@ def _create_hive_client(properties: Dict[str, str]) -> _HiveClient: raise ValueError(f"Unable to connect to hive using uri: {properties[URI]}") def _convert_hive_into_iceberg(self, table: HiveTable) -> Table: - properties: Dict[str, str] = table.parameters + properties: dict[str, str] = table.parameters if TABLE_TYPE not in properties: raise NoSuchPropertyException( f"Property table_type missing, could not determine type: {table.dbName}.{table.tableName}" @@ -469,7 +464,7 @@ def register_table(self, identifier: str | Identifier, metadata_location: str) - return self._convert_hive_into_iceberg(hive_table) - def list_views(self, namespace: str | Identifier) -> List[Identifier]: + def list_views(self, namespace: str | Identifier) -> list[Identifier]: raise NotImplementedError def view_exists(self, identifier: str | Identifier) -> bool: @@ -505,7 +500,7 @@ def _do_wait_for_lock() -> LockResponse: return _do_wait_for_lock() def commit_table( - self, table: Table, requirements: Tuple[TableRequirement, ...], updates: Tuple[TableUpdate, ...] 
+ self, table: Table, requirements: tuple[TableRequirement, ...], updates: tuple[TableUpdate, ...] ) -> CommitTableResponse: """Commit updates to a table. @@ -715,7 +710,7 @@ def drop_namespace(self, namespace: str | Identifier) -> None: except MetaException as e: raise NoSuchNamespaceError(f"Database does not exists: {database_name}") from e - def list_tables(self, namespace: str | Identifier) -> List[Identifier]: + def list_tables(self, namespace: str | Identifier) -> list[Identifier]: """List Iceberg tables under the given namespace in the catalog. When the database doesn't exist, it will just return an empty list. @@ -739,7 +734,7 @@ def list_tables(self, namespace: str | Identifier) -> List[Identifier]: if table.parameters.get(TABLE_TYPE, "").lower() == ICEBERG ] - def list_namespaces(self, namespace: str | Identifier = ()) -> List[Identifier]: + def list_namespaces(self, namespace: str | Identifier = ()) -> list[Identifier]: """List namespaces from the given namespace. If not given, list top-level namespaces from the catalog. Returns: @@ -777,7 +772,7 @@ def load_namespace_properties(self, namespace: str | Identifier) -> Properties: raise NoSuchNamespaceError(f"Database does not exists: {database_name}") from e def update_namespace_properties( - self, namespace: str | Identifier, removals: Set[str] | None = None, updates: Properties = EMPTY_DICT + self, namespace: str | Identifier, removals: set[str] | None = None, updates: Properties = EMPTY_DICT ) -> PropertiesUpdateSummary: """Remove provided property keys and update properties for a namespace. @@ -799,8 +794,8 @@ def update_namespace_properties( except NoSuchObjectException as e: raise NoSuchNamespaceError(f"Database does not exists: {database_name}") from e - removed: Set[str] = set() - updated: Set[str] = set() + removed: set[str] = set() + updated: set[str] = set() if removals: for key in removals: diff --git a/pyiceberg/catalog/noop.py b/pyiceberg/catalog/noop.py index 08b71d90af..ac2423c198 100644 --- a/pyiceberg/catalog/noop.py +++ b/pyiceberg/catalog/noop.py @@ -16,9 +16,6 @@ # under the License. from typing import ( TYPE_CHECKING, - List, - Set, - Tuple, Union, ) @@ -95,7 +92,7 @@ def rename_table(self, from_identifier: str | Identifier, to_identifier: str | I raise NotImplementedError def commit_table( - self, table: Table, requirements: Tuple[TableRequirement, ...], updates: Tuple[TableUpdate, ...] + self, table: Table, requirements: tuple[TableRequirement, ...], updates: tuple[TableUpdate, ...] 
) -> CommitTableResponse: raise NotImplementedError @@ -105,21 +102,21 @@ def create_namespace(self, namespace: str | Identifier, properties: Properties = def drop_namespace(self, namespace: str | Identifier) -> None: raise NotImplementedError - def list_tables(self, namespace: str | Identifier) -> List[Identifier]: + def list_tables(self, namespace: str | Identifier) -> list[Identifier]: raise NotImplementedError - def list_namespaces(self, namespace: str | Identifier = ()) -> List[Identifier]: + def list_namespaces(self, namespace: str | Identifier = ()) -> list[Identifier]: raise NotImplementedError def load_namespace_properties(self, namespace: str | Identifier) -> Properties: raise NotImplementedError def update_namespace_properties( - self, namespace: str | Identifier, removals: Set[str] | None = None, updates: Properties = EMPTY_DICT + self, namespace: str | Identifier, removals: set[str] | None = None, updates: Properties = EMPTY_DICT ) -> PropertiesUpdateSummary: raise NotImplementedError - def list_views(self, namespace: str | Identifier) -> List[Identifier]: + def list_views(self, namespace: str | Identifier) -> list[Identifier]: raise NotImplementedError def view_exists(self, identifier: str | Identifier) -> bool: diff --git a/pyiceberg/catalog/rest/__init__.py b/pyiceberg/catalog/rest/__init__.py index e9571aa491..3b77fd47f0 100644 --- a/pyiceberg/catalog/rest/__init__.py +++ b/pyiceberg/catalog/rest/__init__.py @@ -18,10 +18,6 @@ from typing import ( TYPE_CHECKING, Any, - Dict, - List, - Set, - Tuple, Union, ) @@ -164,11 +160,11 @@ class CreateTableRequest(IcebergBaseModel): partition_spec: PartitionSpec | None = Field(alias="partition-spec") write_order: SortOrder | None = Field(alias="write-order") stage_create: bool = Field(alias="stage-create", default=False) - properties: Dict[str, str] = Field(default_factory=dict) + properties: dict[str, str] = Field(default_factory=dict) # validators @field_validator("properties", mode="before") - def transform_properties_dict_value_to_str(cls, properties: Properties) -> Dict[str, str]: + def transform_properties_dict_value_to_str(cls, properties: Properties) -> dict[str, str]: return transform_dict_value_to_str(properties) @@ -183,7 +179,7 @@ class ConfigResponse(IcebergBaseModel): class ListNamespaceResponse(IcebergBaseModel): - namespaces: List[Identifier] = Field() + namespaces: list[Identifier] = Field() class NamespaceResponse(IcebergBaseModel): @@ -192,9 +188,9 @@ class NamespaceResponse(IcebergBaseModel): class UpdateNamespacePropertiesResponse(IcebergBaseModel): - removed: List[str] = Field() - updated: List[str] = Field() - missing: List[str] = Field() + removed: list[str] = Field() + updated: list[str] = Field() + missing: list[str] = Field() class ListTableResponseEntry(IcebergBaseModel): @@ -208,11 +204,11 @@ class ListViewResponseEntry(IcebergBaseModel): class ListTablesResponse(IcebergBaseModel): - identifiers: List[ListTableResponseEntry] = Field() + identifiers: list[ListTableResponseEntry] = Field() class ListViewsResponse(IcebergBaseModel): - identifiers: List[ListViewResponseEntry] = Field() + identifiers: list[ListViewResponseEntry] = Field() class RestCatalog(Catalog): @@ -346,7 +342,7 @@ def _warn_oauth_tokens_deprecation(self) -> None: "endpoint is explicitly configured. 
See https://github.com/apache/iceberg/issues/10537", ) - def _extract_optional_oauth_params(self) -> Dict[str, str]: + def _extract_optional_oauth_params(self) -> dict[str, str]: optional_oauth_param = {SCOPE: self.properties.get(SCOPE) or CATALOG_SCOPE} set_of_optional_params = {AUDIENCE, RESOURCE} for param in set_of_optional_params: @@ -391,7 +387,7 @@ def _split_identifier_for_path( return {"namespace": NAMESPACE_SEPARATOR.join(identifier_tuple[:-1]), kind.value: identifier_tuple[-1]} - def _split_identifier_for_json(self, identifier: str | Identifier) -> Dict[str, Identifier | str]: + def _split_identifier_for_json(self, identifier: str | Identifier) -> dict[str, Identifier | str]: identifier_tuple = self._identifier_to_validated_tuple(identifier) return {"namespace": identifier_tuple[:-1], "name": identifier_tuple[-1]} @@ -447,7 +443,7 @@ def add_headers(self, request: PreparedRequest, **kwargs: Any) -> None: # pylin session.mount(self.uri, SigV4Adapter(**self.properties)) - def _response_to_table(self, identifier_tuple: Tuple[str, ...], table_response: TableResponse) -> Table: + def _response_to_table(self, identifier_tuple: tuple[str, ...], table_response: TableResponse) -> Table: return Table( identifier=identifier_tuple, metadata_location=table_response.metadata_location, # type: ignore @@ -459,7 +455,7 @@ def _response_to_table(self, identifier_tuple: Tuple[str, ...], table_response: config=table_response.config, ) - def _response_to_staged_table(self, identifier_tuple: Tuple[str, ...], table_response: TableResponse) -> StagedTable: + def _response_to_staged_table(self, identifier_tuple: tuple[str, ...], table_response: TableResponse) -> StagedTable: return StagedTable( identifier=identifier_tuple, metadata_location=table_response.metadata_location, # type: ignore @@ -602,7 +598,7 @@ def register_table(self, identifier: str | Identifier, metadata_location: str) - return self._response_to_table(self.identifier_to_tuple(identifier), table_response) @retry(**_RETRY_ARGS) - def list_tables(self, namespace: str | Identifier) -> List[Identifier]: + def list_tables(self, namespace: str | Identifier) -> list[Identifier]: namespace_tuple = self._check_valid_namespace_identifier(namespace) namespace_concat = NAMESPACE_SEPARATOR.join(namespace_tuple) response = self._session.get(self.url(Endpoints.list_tables, namespace=namespace_concat)) @@ -683,7 +679,7 @@ def _remove_catalog_name_from_table_request_identifier(self, table_request: Comm return table_request @retry(**_RETRY_ARGS) - def list_views(self, namespace: str | Identifier) -> List[Identifier]: + def list_views(self, namespace: str | Identifier) -> list[Identifier]: namespace_tuple = self._check_valid_namespace_identifier(namespace) namespace_concat = NAMESPACE_SEPARATOR.join(namespace_tuple) response = self._session.get(self.url(Endpoints.list_views, namespace=namespace_concat)) @@ -695,7 +691,7 @@ def list_views(self, namespace: str | Identifier) -> List[Identifier]: @retry(**_RETRY_ARGS) def commit_table( - self, table: Table, requirements: Tuple[TableRequirement, ...], updates: Tuple[TableUpdate, ...] + self, table: Table, requirements: tuple[TableRequirement, ...], updates: tuple[TableUpdate, ...] ) -> CommitTableResponse: """Commit updates to a table. 
@@ -760,7 +756,7 @@ def drop_namespace(self, namespace: str | Identifier) -> None: _handle_non_200_response(exc, {404: NoSuchNamespaceError, 409: NamespaceNotEmptyError}) @retry(**_RETRY_ARGS) - def list_namespaces(self, namespace: str | Identifier = ()) -> List[Identifier]: + def list_namespaces(self, namespace: str | Identifier = ()) -> list[Identifier]: namespace_tuple = self.identifier_to_tuple(namespace) response = self._session.get( self.url( @@ -790,7 +786,7 @@ def load_namespace_properties(self, namespace: str | Identifier) -> Properties: @retry(**_RETRY_ARGS) def update_namespace_properties( - self, namespace: str | Identifier, removals: Set[str] | None = None, updates: Properties = EMPTY_DICT + self, namespace: str | Identifier, removals: set[str] | None = None, updates: Properties = EMPTY_DICT ) -> PropertiesUpdateSummary: namespace_tuple = self._check_valid_namespace_identifier(namespace) namespace = NAMESPACE_SEPARATOR.join(namespace_tuple) diff --git a/pyiceberg/catalog/rest/auth.py b/pyiceberg/catalog/rest/auth.py index 7f56f6300b..3fdc837c19 100644 --- a/pyiceberg/catalog/rest/auth.py +++ b/pyiceberg/catalog/rest/auth.py @@ -22,7 +22,7 @@ import time from abc import ABC, abstractmethod from functools import cached_property -from typing import Any, Dict, List, Type +from typing import Any import requests from requests import HTTPError, PreparedRequest, Session @@ -76,7 +76,7 @@ class LegacyOAuth2AuthManager(AuthManager): _auth_url: str | None _token: str | None _credential: str | None - _optional_oauth_params: Dict[str, str] | None + _optional_oauth_params: dict[str, str] | None def __init__( self, @@ -84,7 +84,7 @@ def __init__( auth_url: str | None = None, credential: str | None = None, initial_token: str | None = None, - optional_oauth_params: Dict[str, str] | None = None, + optional_oauth_params: dict[str, str] | None = None, ): self._session = session self._auth_url = auth_url @@ -220,7 +220,7 @@ def auth_header(self) -> str: class GoogleAuthManager(AuthManager): """An auth manager that is responsible for handling Google credentials.""" - def __init__(self, credentials_path: str | None = None, scopes: List[str] | None = None): + def __init__(self, credentials_path: str | None = None, scopes: list[str] | None = None): """ Initialize GoogleAuthManager. @@ -280,10 +280,10 @@ def __call__(self, request: PreparedRequest) -> PreparedRequest: class AuthManagerFactory: - _registry: Dict[str, Type["AuthManager"]] = {} + _registry: dict[str, type["AuthManager"]] = {} @classmethod - def register(cls, name: str, auth_manager_class: Type["AuthManager"]) -> None: + def register(cls, name: str, auth_manager_class: type["AuthManager"]) -> None: """ Register a string name to a known AuthManager class. @@ -297,7 +297,7 @@ def register(cls, name: str, auth_manager_class: Type["AuthManager"]) -> None: cls._registry[name] = auth_manager_class @classmethod - def create(cls, class_or_name: str, config: Dict[str, Any]) -> AuthManager: + def create(cls, class_or_name: str, config: dict[str, Any]) -> AuthManager: """ Create an AuthManager by name or fully-qualified class path. diff --git a/pyiceberg/catalog/rest/response.py b/pyiceberg/catalog/rest/response.py index d28a7c3f71..157e4bfa16 100644 --- a/pyiceberg/catalog/rest/response.py +++ b/pyiceberg/catalog/rest/response.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
from json import JSONDecodeError -from typing import Dict, Literal, Type +from typing import Literal from pydantic import Field, ValidationError from requests import HTTPError @@ -60,8 +60,8 @@ class OAuthErrorResponse(IcebergBaseModel): error_uri: str | None = None -def _handle_non_200_response(exc: HTTPError, error_handler: Dict[int, Type[Exception]]) -> None: - exception: Type[Exception] +def _handle_non_200_response(exc: HTTPError, error_handler: dict[int, type[Exception]]) -> None: + exception: type[Exception] if exc.response is None: raise ValueError("Did not receive a response") diff --git a/pyiceberg/catalog/sql.py b/pyiceberg/catalog/sql.py index cefb22b95b..2b6fa74517 100644 --- a/pyiceberg/catalog/sql.py +++ b/pyiceberg/catalog/sql.py @@ -17,9 +17,6 @@ from typing import ( TYPE_CHECKING, - List, - Set, - Tuple, Union, ) @@ -402,7 +399,7 @@ def rename_table(self, from_identifier: str | Identifier, to_identifier: str | I return self.load_table(to_identifier) def commit_table( - self, table: Table, requirements: Tuple[TableRequirement, ...], updates: Tuple[TableUpdate, ...] + self, table: Table, requirements: tuple[TableRequirement, ...], updates: tuple[TableUpdate, ...] ) -> CommitTableResponse: """Commit updates to a table. @@ -583,7 +580,7 @@ def drop_namespace(self, namespace: str | Identifier) -> None: ) session.commit() - def list_tables(self, namespace: str | Identifier) -> List[Identifier]: + def list_tables(self, namespace: str | Identifier) -> list[Identifier]: """List tables under the given namespace in the catalog. Args: @@ -604,7 +601,7 @@ def list_tables(self, namespace: str | Identifier) -> List[Identifier]: result = session.scalars(stmt) return [(Catalog.identifier_to_tuple(table.table_namespace) + (table.table_name,)) for table in result] - def list_namespaces(self, namespace: str | Identifier = ()) -> List[Identifier]: + def list_namespaces(self, namespace: str | Identifier = ()) -> list[Identifier]: """List namespaces from the given namespace. If not given, list top-level namespaces from the catalog. Args: @@ -669,7 +666,7 @@ def load_namespace_properties(self, namespace: str | Identifier) -> Properties: return {props.property_key: props.property_value for props in result} def update_namespace_properties( - self, namespace: str | Identifier, removals: Set[str] | None = None, updates: Properties = EMPTY_DICT + self, namespace: str | Identifier, removals: set[str] | None = None, updates: Properties = EMPTY_DICT ) -> PropertiesUpdateSummary: """Remove provided property keys and update properties for a namespace. 
@@ -724,7 +721,7 @@ def update_namespace_properties( session.commit() return properties_update_summary - def list_views(self, namespace: str | Identifier) -> List[Identifier]: + def list_views(self, namespace: str | Identifier) -> list[Identifier]: raise NotImplementedError def view_exists(self, identifier: str | Identifier) -> bool: diff --git a/pyiceberg/cli/console.py b/pyiceberg/cli/console.py index f3adf830b2..e7774e3b2f 100644 --- a/pyiceberg/cli/console.py +++ b/pyiceberg/cli/console.py @@ -19,9 +19,7 @@ from typing import ( Any, Callable, - Dict, Literal, - Tuple, ) import click @@ -97,7 +95,7 @@ def run( ctx.exit(1) -def _catalog_and_output(ctx: Context) -> Tuple[Catalog, Output]: +def _catalog_and_output(ctx: Context) -> tuple[Catalog, Output]: """Small helper to set the types.""" return ctx.obj["catalog"], ctx.obj["output"] @@ -430,7 +428,7 @@ def list_refs(ctx: Context, identifier: str, type: str, verbose: bool) -> None: output.describe_refs(relevant_refs) -def _retention_properties(ref: SnapshotRef, table_properties: Dict[str, str]) -> Dict[str, str]: +def _retention_properties(ref: SnapshotRef, table_properties: dict[str, str]) -> dict[str, str]: retention_properties = {} if ref.snapshot_ref_type == SnapshotRefType.BRANCH: default_min_snapshots_to_keep = property_as_int( diff --git a/pyiceberg/cli/output.py b/pyiceberg/cli/output.py index b546877fac..332221008c 100644 --- a/pyiceberg/cli/output.py +++ b/pyiceberg/cli/output.py @@ -18,9 +18,6 @@ from abc import ABC, abstractmethod from typing import ( Any, - Dict, - List, - Tuple, ) from uuid import UUID @@ -43,7 +40,7 @@ class Output(ABC): def exception(self, ex: Exception) -> None: ... @abstractmethod - def identifiers(self, identifiers: List[Identifier]) -> None: ... + def identifiers(self, identifiers: list[Identifier]) -> None: ... @abstractmethod def describe_table(self, table: Table) -> None: ... @@ -70,7 +67,7 @@ def uuid(self, uuid: UUID | None) -> None: ... def version(self, version: str) -> None: ... @abstractmethod - def describe_refs(self, refs: List[Tuple[str, SnapshotRefType, Dict[str, str]]]) -> None: ... + def describe_refs(self, refs: list[tuple[str, SnapshotRefType, dict[str, str]]]) -> None: ... 
class ConsoleOutput(Output): @@ -91,7 +88,7 @@ def exception(self, ex: Exception) -> None: else: Console(stderr=True).print(ex) - def identifiers(self, identifiers: List[Identifier]) -> None: + def identifiers(self, identifiers: list[Identifier]) -> None: table = self._table for identifier in identifiers: table.add_row(".".join(identifier)) @@ -174,7 +171,7 @@ def uuid(self, uuid: UUID | None) -> None: def version(self, version: str) -> None: Console().print(version) - def describe_refs(self, ref_details: List[Tuple[str, SnapshotRefType, Dict[str, str]]]) -> None: + def describe_refs(self, ref_details: list[tuple[str, SnapshotRefType, dict[str, str]]]) -> None: refs_table = RichTable(title="Snapshot Refs") refs_table.add_column("Ref") refs_table.add_column("Type") @@ -202,7 +199,7 @@ def _out(self, d: Any) -> None: def exception(self, ex: Exception) -> None: self._out({"type": ex.__class__.__name__, "message": str(ex)}) - def identifiers(self, identifiers: List[Identifier]) -> None: + def identifiers(self, identifiers: list[Identifier]) -> None: self._out([".".join(identifier) for identifier in identifiers]) def describe_table(self, table: Table) -> None: @@ -240,7 +237,7 @@ def uuid(self, uuid: UUID | None) -> None: def version(self, version: str) -> None: self._out({"version": version}) - def describe_refs(self, refs: List[Tuple[str, SnapshotRefType, Dict[str, str]]]) -> None: + def describe_refs(self, refs: list[tuple[str, SnapshotRefType, dict[str, str]]]) -> None: self._out( [ {"name": name, "type": type, detail_key: detail_val} diff --git a/pyiceberg/expressions/__init__.py b/pyiceberg/expressions/__init__.py index 45486c3a37..20df6e548c 100644 --- a/pyiceberg/expressions/__init__.py +++ b/pyiceberg/expressions/__init__.py @@ -17,6 +17,7 @@ from __future__ import annotations +import builtins from abc import ABC, abstractmethod from functools import cached_property from typing import ( @@ -25,9 +26,6 @@ Generic, Iterable, Sequence, - Set, - Tuple, - Type, TypeVar, ) from typing import Literal as TypingLiteral @@ -55,7 +53,7 @@ def _to_unbound_term(term: str | UnboundTerm[Any]) -> UnboundTerm[Any]: return Reference(term) if isinstance(term, str) else term -def _to_literal_set(values: Iterable[L] | Iterable[Literal[L]]) -> Set[Literal[L]]: +def _to_literal_set(values: Iterable[L] | Iterable[Literal[L]]) -> set[Literal[L]]: return {_to_literal(v) for v in values} @@ -146,7 +144,7 @@ def bind(self, schema: Schema, case_sensitive: bool = True) -> B: ... @property @abstractmethod - def as_bound(self) -> Type[Bound]: ... + def as_bound(self) -> type[Bound]: ... 
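Why the new "import builtins" above: my reading is that several of the pydantic predicate models later in this file declare a field literally named type (for example SetPredicate's type: Literal["in", "not-in"]), which shadows the builtin inside those class bodies; their annotations therefore have to spell builtins.type[...], while classes without such a field can keep the bare type[...]. A minimal sketch of the shadowing problem, using made-up names:

import builtins
from typing import Literal


class Predicate:
    # A field named "type", like the one on the pydantic predicate models in
    # this file, shadows the builtin within the class body.
    type: Literal["in", "not-in"] = "in"

    @property
    def as_bound(self) -> builtins.type["Predicate"]:
        # Annotating this as "type[Predicate]" would make a type checker
        # resolve "type" to the field above; qualifying via builtins reaches
        # the real builtin.
        return Predicate


print(Predicate().as_bound)  # <class '__main__.Predicate'>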
class BoundTerm(Term[L], Bound, ABC): @@ -254,7 +252,7 @@ def name(self) -> str: return self.root @property - def as_bound(self) -> Type[BoundReference[L]]: + def as_bound(self) -> type[BoundReference[L]]: return BoundReference[L] @@ -296,7 +294,7 @@ def __invert__(self) -> BooleanExpression: # De Morgan's law: not (A and B) = (not A) or (not B) return Or(~self.left, ~self.right) - def __getnewargs__(self) -> Tuple[BooleanExpression, BooleanExpression]: + def __getnewargs__(self) -> tuple[BooleanExpression, BooleanExpression]: """Pickle the And class.""" return (self.left, self.right) @@ -344,7 +342,7 @@ def __invert__(self) -> BooleanExpression: # De Morgan's law: not (A or B) = (not A) and (not B) return And(~self.left, ~self.right) - def __getnewargs__(self) -> Tuple[BooleanExpression, BooleanExpression]: + def __getnewargs__(self) -> tuple[BooleanExpression, BooleanExpression]: """Pickle the Or class.""" return (self.left, self.right) @@ -386,7 +384,7 @@ def __invert__(self) -> BooleanExpression: """Transform the Expression into its negated version.""" return self.child - def __getnewargs__(self) -> Tuple[BooleanExpression]: + def __getnewargs__(self) -> tuple[BooleanExpression]: """Pickle the Not class.""" return (self.child,) @@ -441,7 +439,7 @@ def __eq__(self, other: Any) -> bool: @property @abstractmethod - def as_unbound(self) -> Type[UnboundPredicate[Any]]: ... + def as_unbound(self) -> type[UnboundPredicate[Any]]: ... class UnboundPredicate(Generic[L], Unbound[BooleanExpression], BooleanExpression, ABC): @@ -459,7 +457,7 @@ def bind(self, schema: Schema, case_sensitive: bool = True) -> BooleanExpression @property @abstractmethod - def as_bound(self) -> Type[BoundPredicate[L]]: ... + def as_bound(self) -> type[BoundPredicate[L]]: ... class UnaryPredicate(IcebergBaseModel, UnboundPredicate[Any], ABC): @@ -478,7 +476,7 @@ def __str__(self) -> str: def bind(self, schema: Schema, case_sensitive: bool = True) -> BoundUnaryPredicate[Any]: bound_term = self.term.bind(schema, case_sensitive) - return self.as_bound(bound_term) + return self.as_bound(bound_term) # type: ignore def __repr__(self) -> str: """Return the string representation of the UnaryPredicate class.""" @@ -486,7 +484,7 @@ def __repr__(self) -> str: @property @abstractmethod - def as_bound(self) -> Type[BoundUnaryPredicate[Any]]: ... + def as_bound(self) -> type[BoundUnaryPredicate[Any]]: ... # type: ignore class BoundUnaryPredicate(BoundPredicate[L], ABC): @@ -496,9 +494,9 @@ def __repr__(self) -> str: @property @abstractmethod - def as_unbound(self) -> Type[UnaryPredicate]: ... + def as_unbound(self) -> type[UnaryPredicate]: ... 
- def __getnewargs__(self) -> Tuple[BoundTerm[L]]: + def __getnewargs__(self) -> tuple[BoundTerm[L]]: """Pickle the BoundUnaryPredicate class.""" return (self.term,) @@ -514,7 +512,7 @@ def __invert__(self) -> BoundNotNull[L]: return BoundNotNull(self.term) @property - def as_unbound(self) -> Type[IsNull]: + def as_unbound(self) -> type[IsNull]: return IsNull @@ -529,7 +527,7 @@ def __invert__(self) -> BoundIsNull[L]: return BoundIsNull(self.term) @property - def as_unbound(self) -> Type[NotNull]: + def as_unbound(self) -> type[NotNull]: return NotNull @@ -541,7 +539,7 @@ def __invert__(self) -> NotNull: return NotNull(self.term) @property - def as_bound(self) -> Type[BoundIsNull[L]]: + def as_bound(self) -> builtins.type[BoundIsNull[L]]: return BoundIsNull[L] @@ -553,7 +551,7 @@ def __invert__(self) -> IsNull: return IsNull(self.term) @property - def as_bound(self) -> Type[BoundNotNull[L]]: + def as_bound(self) -> builtins.type[BoundNotNull[L]]: return BoundNotNull[L] @@ -569,7 +567,7 @@ def __invert__(self) -> BoundNotNaN[L]: return BoundNotNaN(self.term) @property - def as_unbound(self) -> Type[IsNaN]: + def as_unbound(self) -> type[IsNaN]: return IsNaN @@ -585,7 +583,7 @@ def __invert__(self) -> BoundIsNaN[L]: return BoundIsNaN(self.term) @property - def as_unbound(self) -> Type[NotNaN]: + def as_unbound(self) -> type[NotNaN]: return NotNaN @@ -597,7 +595,7 @@ def __invert__(self) -> NotNaN: return NotNaN(self.term) @property - def as_bound(self) -> Type[BoundIsNaN[L]]: + def as_bound(self) -> builtins.type[BoundIsNaN[L]]: return BoundIsNaN[L] @@ -609,7 +607,7 @@ def __invert__(self) -> IsNaN: return IsNaN(self.term) @property - def as_bound(self) -> Type[BoundNotNaN[L]]: + def as_bound(self) -> builtins.type[BoundNotNaN[L]]: return BoundNotNaN[L] @@ -617,7 +615,7 @@ class SetPredicate(IcebergBaseModel, UnboundPredicate[L], ABC): model_config = ConfigDict(arbitrary_types_allowed=True) type: TypingLiteral["in", "not-in"] = Field(default="in") - literals: Set[Literal[L]] = Field(alias="items") + literals: set[Literal[L]] = Field(alias="items") def __init__(self, term: str | UnboundTerm[Any], literals: Iterable[L] | Iterable[Literal[L]]): super().__init__(term=_to_unbound_term(term), items=_to_literal_set(literals)) # type: ignore @@ -640,26 +638,26 @@ def __eq__(self, other: Any) -> bool: """Return the equality of two instances of the SetPredicate class.""" return self.term == other.term and self.literals == other.literals if isinstance(other, self.__class__) else False - def __getnewargs__(self) -> Tuple[UnboundTerm[L], Set[Literal[L]]]: + def __getnewargs__(self) -> tuple[UnboundTerm[L], set[Literal[L]]]: """Pickle the SetPredicate class.""" return (self.term, self.literals) @property @abstractmethod - def as_bound(self) -> Type[BoundSetPredicate[L]]: + def as_bound(self) -> builtins.type[BoundSetPredicate[L]]: return BoundSetPredicate[L] class BoundSetPredicate(BoundPredicate[L], ABC): - literals: Set[Literal[L]] + literals: set[Literal[L]] - def __init__(self, term: BoundTerm[L], literals: Set[Literal[L]]): + def __init__(self, term: BoundTerm[L], literals: set[Literal[L]]): # Since we don't know the type of BoundPredicate[L], we have to ignore this one super().__init__(term) # type: ignore self.literals = _to_literal_set(literals) # pylint: disable=W0621 @cached_property - def value_set(self) -> Set[L]: + def value_set(self) -> set[L]: return {lit.value for lit in self.literals} def __str__(self) -> str: @@ -676,17 +674,17 @@ def __eq__(self, other: Any) -> bool: """Return the 
equality of two instances of the BoundSetPredicate class.""" return self.term == other.term and self.literals == other.literals if isinstance(other, self.__class__) else False - def __getnewargs__(self) -> Tuple[BoundTerm[L], Set[Literal[L]]]: + def __getnewargs__(self) -> tuple[BoundTerm[L], set[Literal[L]]]: """Pickle the BoundSetPredicate class.""" return (self.term, self.literals) @property @abstractmethod - def as_unbound(self) -> Type[SetPredicate[L]]: ... + def as_unbound(self) -> type[SetPredicate[L]]: ... class BoundIn(BoundSetPredicate[L]): - def __new__(cls, term: BoundTerm[L], literals: Set[Literal[L]]) -> BooleanExpression: # type: ignore # pylint: disable=W0221 + def __new__(cls, term: BoundTerm[L], literals: set[Literal[L]]) -> BooleanExpression: # type: ignore # pylint: disable=W0221 count = len(literals) if count == 0: return AlwaysFalse() @@ -704,7 +702,7 @@ def __eq__(self, other: Any) -> bool: return self.term == other.term and self.literals == other.literals if isinstance(other, self.__class__) else False @property - def as_unbound(self) -> Type[In[L]]: + def as_unbound(self) -> type[In[L]]: return In @@ -712,7 +710,7 @@ class BoundNotIn(BoundSetPredicate[L]): def __new__( # type: ignore # pylint: disable=W0221 cls, term: BoundTerm[L], - literals: Set[Literal[L]], + literals: set[Literal[L]], ) -> BooleanExpression: count = len(literals) if count == 0: @@ -727,7 +725,7 @@ def __invert__(self) -> BoundIn[L]: return BoundIn(self.term, self.literals) @property - def as_unbound(self) -> Type[NotIn[L]]: + def as_unbound(self) -> type[NotIn[L]]: return NotIn @@ -737,7 +735,7 @@ class In(SetPredicate[L]): def __new__( # type: ignore # pylint: disable=W0221 cls, term: str | UnboundTerm[Any], literals: Iterable[L] | Iterable[Literal[L]] ) -> BooleanExpression: - literals_set: Set[Literal[L]] = _to_literal_set(literals) + literals_set: set[Literal[L]] = _to_literal_set(literals) count = len(literals_set) if count == 0: return AlwaysFalse() @@ -751,7 +749,7 @@ def __invert__(self) -> NotIn[L]: return NotIn[L](self.term, self.literals) @property - def as_bound(self) -> Type[BoundIn[L]]: + def as_bound(self) -> builtins.type[BoundIn[L]]: return BoundIn[L] @@ -761,7 +759,7 @@ class NotIn(SetPredicate[L], ABC): def __new__( # type: ignore # pylint: disable=W0221 cls, term: str | UnboundTerm[Any], literals: Iterable[L] | Iterable[Literal[L]] ) -> BooleanExpression: - literals_set: Set[Literal[L]] = _to_literal_set(literals) + literals_set: set[Literal[L]] = _to_literal_set(literals) count = len(literals_set) if count == 0: return AlwaysTrue() @@ -775,7 +773,7 @@ def __invert__(self) -> In[L]: return In[L](self.term, self.literals) @property - def as_bound(self) -> Type[BoundNotIn[L]]: + def as_bound(self) -> builtins.type[BoundNotIn[L]]: return BoundNotIn[L] @@ -825,7 +823,7 @@ def __repr__(self) -> str: @property @abstractmethod - def as_bound(self) -> Type[BoundLiteralPredicate[L]]: ... + def as_bound(self) -> builtins.type[BoundLiteralPredicate[L]]: ... class BoundLiteralPredicate(BoundPredicate[L], ABC): @@ -848,7 +846,7 @@ def __repr__(self) -> str: @property @abstractmethod - def as_unbound(self) -> Type[LiteralPredicate[L]]: ... + def as_unbound(self) -> type[LiteralPredicate[L]]: ... 
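The __invert__ overloads in this file are what feed the De Morgan rewrites in pyiceberg.expressions.visitors. A small usage sketch; the exact printed repr is indicative only:

from pyiceberg.expressions import And, GreaterThan, In, Not
from pyiceberg.expressions.visitors import rewrite_not

# NOT (x > 5 AND y IN {1, 2}) gets pushed down through De Morgan's laws and
# the __invert__ overloads, giving roughly (x <= 5) OR (y NOT IN {1, 2}).
expr = Not(And(GreaterThan("x", 5), In("y", {1, 2})))
print(rewrite_not(expr))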
class BoundEqualTo(BoundLiteralPredicate[L]): @@ -857,7 +855,7 @@ def __invert__(self) -> BoundNotEqualTo[L]: return BoundNotEqualTo[L](self.term, self.literal) @property - def as_unbound(self) -> Type[EqualTo[L]]: + def as_unbound(self) -> type[EqualTo[L]]: return EqualTo @@ -867,7 +865,7 @@ def __invert__(self) -> BoundEqualTo[L]: return BoundEqualTo[L](self.term, self.literal) @property - def as_unbound(self) -> Type[NotEqualTo[L]]: + def as_unbound(self) -> type[NotEqualTo[L]]: return NotEqualTo @@ -877,7 +875,7 @@ def __invert__(self) -> BoundLessThan[L]: return BoundLessThan[L](self.term, self.literal) @property - def as_unbound(self) -> Type[GreaterThanOrEqual[L]]: + def as_unbound(self) -> type[GreaterThanOrEqual[L]]: return GreaterThanOrEqual[L] @@ -887,7 +885,7 @@ def __invert__(self) -> BoundLessThanOrEqual[L]: return BoundLessThanOrEqual(self.term, self.literal) @property - def as_unbound(self) -> Type[GreaterThan[L]]: + def as_unbound(self) -> type[GreaterThan[L]]: return GreaterThan[L] @@ -897,7 +895,7 @@ def __invert__(self) -> BoundGreaterThanOrEqual[L]: return BoundGreaterThanOrEqual[L](self.term, self.literal) @property - def as_unbound(self) -> Type[LessThan[L]]: + def as_unbound(self) -> type[LessThan[L]]: return LessThan[L] @@ -907,7 +905,7 @@ def __invert__(self) -> BoundGreaterThan[L]: return BoundGreaterThan[L](self.term, self.literal) @property - def as_unbound(self) -> Type[LessThanOrEqual[L]]: + def as_unbound(self) -> type[LessThanOrEqual[L]]: return LessThanOrEqual[L] @@ -917,7 +915,7 @@ def __invert__(self) -> BoundNotStartsWith[L]: return BoundNotStartsWith[L](self.term, self.literal) @property - def as_unbound(self) -> Type[StartsWith[L]]: + def as_unbound(self) -> type[StartsWith[L]]: return StartsWith[L] @@ -927,7 +925,7 @@ def __invert__(self) -> BoundStartsWith[L]: return BoundStartsWith[L](self.term, self.literal) @property - def as_unbound(self) -> Type[NotStartsWith[L]]: + def as_unbound(self) -> type[NotStartsWith[L]]: return NotStartsWith[L] @@ -939,7 +937,7 @@ def __invert__(self) -> NotEqualTo[L]: return NotEqualTo[L](self.term, self.literal) @property - def as_bound(self) -> Type[BoundEqualTo[L]]: + def as_bound(self) -> builtins.type[BoundEqualTo[L]]: return BoundEqualTo[L] @@ -951,7 +949,7 @@ def __invert__(self) -> EqualTo[L]: return EqualTo[L](self.term, self.literal) @property - def as_bound(self) -> Type[BoundNotEqualTo[L]]: + def as_bound(self) -> builtins.type[BoundNotEqualTo[L]]: return BoundNotEqualTo[L] @@ -963,7 +961,7 @@ def __invert__(self) -> GreaterThanOrEqual[L]: return GreaterThanOrEqual[L](self.term, self.literal) @property - def as_bound(self) -> Type[BoundLessThan[L]]: + def as_bound(self) -> builtins.type[BoundLessThan[L]]: return BoundLessThan[L] @@ -975,7 +973,7 @@ def __invert__(self) -> LessThan[L]: return LessThan[L](self.term, self.literal) @property - def as_bound(self) -> Type[BoundGreaterThanOrEqual[L]]: + def as_bound(self) -> builtins.type[BoundGreaterThanOrEqual[L]]: return BoundGreaterThanOrEqual[L] @@ -987,7 +985,7 @@ def __invert__(self) -> LessThanOrEqual[L]: return LessThanOrEqual[L](self.term, self.literal) @property - def as_bound(self) -> Type[BoundGreaterThan[L]]: + def as_bound(self) -> builtins.type[BoundGreaterThan[L]]: return BoundGreaterThan[L] @@ -999,7 +997,7 @@ def __invert__(self) -> GreaterThan[L]: return GreaterThan[L](self.term, self.literal) @property - def as_bound(self) -> Type[BoundLessThanOrEqual[L]]: + def as_bound(self) -> builtins.type[BoundLessThanOrEqual[L]]: return 
BoundLessThanOrEqual[L] @@ -1011,7 +1009,7 @@ def __invert__(self) -> NotStartsWith[L]: return NotStartsWith[L](self.term, self.literal) @property - def as_bound(self) -> Type[BoundStartsWith[L]]: + def as_bound(self) -> builtins.type[BoundStartsWith[L]]: return BoundStartsWith[L] @@ -1023,5 +1021,5 @@ def __invert__(self) -> StartsWith[L]: return StartsWith[L](self.term, self.literal) @property - def as_bound(self) -> Type[BoundNotStartsWith[L]]: + def as_bound(self) -> builtins.type[BoundNotStartsWith[L]]: return BoundNotStartsWith[L] diff --git a/pyiceberg/expressions/literals.py b/pyiceberg/expressions/literals.py index 0847f19c84..5bf70990b9 100644 --- a/pyiceberg/expressions/literals.py +++ b/pyiceberg/expressions/literals.py @@ -27,7 +27,7 @@ from decimal import ROUND_HALF_UP, Decimal from functools import singledispatchmethod from math import isnan -from typing import Any, Generic, Type +from typing import Any, Generic from uuid import UUID from pydantic import Field, model_serializer @@ -73,7 +73,7 @@ class Literal(IcebergRootModel[L], Generic[L], ABC): # type: ignore root: L = Field() - def __init__(self, value: L, value_type: Type[L], /, **data): # type: ignore + def __init__(self, value: L, value_type: type[L], /, **data): # type: ignore if value is None: raise TypeError("Invalid literal value: None") diff --git a/pyiceberg/expressions/visitors.py b/pyiceberg/expressions/visitors.py index ee8d1e930a..4c096f1215 100644 --- a/pyiceberg/expressions/visitors.py +++ b/pyiceberg/expressions/visitors.py @@ -20,12 +20,8 @@ from typing import ( Any, Callable, - Dict, Generic, - List, - Set, SupportsFloat, - Tuple, TypeVar, ) @@ -255,11 +251,11 @@ def visit_bound_predicate(self, predicate: BoundPredicate[L]) -> BooleanExpressi class BoundBooleanExpressionVisitor(BooleanExpressionVisitor[T], ABC): @abstractmethod - def visit_in(self, term: BoundTerm[L], literals: Set[L]) -> T: + def visit_in(self, term: BoundTerm[L], literals: set[L]) -> T: """Visit a bound In predicate.""" @abstractmethod - def visit_not_in(self, term: BoundTerm[L], literals: Set[L]) -> T: + def visit_not_in(self, term: BoundTerm[L], literals: set[L]) -> T: """Visit a bound NotIn predicate.""" @abstractmethod @@ -469,10 +465,10 @@ def eval(self, struct: StructProtocol) -> bool: self.struct = struct return visit(self.bound, self) - def visit_in(self, term: BoundTerm[L], literals: Set[L]) -> bool: + def visit_in(self, term: BoundTerm[L], literals: set[L]) -> bool: return term.eval(self.struct) in literals - def visit_not_in(self, term: BoundTerm[L], literals: Set[L]) -> bool: + def visit_not_in(self, term: BoundTerm[L], literals: set[L]) -> bool: return term.eval(self.struct) not in literals def visit_is_nan(self, term: BoundTerm[L]) -> bool: @@ -548,7 +544,7 @@ def _from_byte_buffer(field_type: IcebergType, val: bytes) -> Any: class _ManifestEvalVisitor(BoundBooleanExpressionVisitor[bool]): - partition_fields: List[PartitionFieldSummary] + partition_fields: list[PartitionFieldSummary] partition_filter: BooleanExpression def __init__(self, partition_struct_schema: Schema, partition_filter: BooleanExpression, case_sensitive: bool) -> None: @@ -562,7 +558,7 @@ def eval(self, manifest: ManifestFile) -> bool: # No partition information return ROWS_MIGHT_MATCH - def visit_in(self, term: BoundTerm[L], literals: Set[L]) -> bool: + def visit_in(self, term: BoundTerm[L], literals: set[L]) -> bool: pos = term.ref().accessor.position field = self.partition_fields[pos] @@ -584,7 +580,7 @@ def visit_in(self, term: BoundTerm[L], 
literals: Set[L]) -> bool: return ROWS_MIGHT_MATCH - def visit_not_in(self, term: BoundTerm[L], literals: Set[L]) -> bool: + def visit_not_in(self, term: BoundTerm[L], literals: set[L]) -> bool: # because the bounds are not necessarily a min or max value, this cannot be answered using # them. notIn(col, {X, ...}) with (X, Y) doesn't guarantee that X is a value in col. return ROWS_MIGHT_MATCH @@ -869,9 +865,9 @@ class _ColumnNameTranslator(BooleanExpressionVisitor[BooleanExpression]): file_schema: Schema case_sensitive: bool - projected_field_values: Dict[int, Any] + projected_field_values: dict[int, Any] - def __init__(self, file_schema: Schema, case_sensitive: bool, projected_field_values: Dict[int, Any] = EMPTY_DICT) -> None: + def __init__(self, file_schema: Schema, case_sensitive: bool, projected_field_values: dict[int, Any] = EMPTY_DICT) -> None: self.file_schema = file_schema self.case_sensitive = case_sensitive self.projected_field_values = projected_field_values @@ -935,53 +931,53 @@ def visit_bound_predicate(self, predicate: BoundPredicate[L]) -> BooleanExpressi def translate_column_names( - expr: BooleanExpression, file_schema: Schema, case_sensitive: bool = True, projected_field_values: Dict[int, Any] = EMPTY_DICT + expr: BooleanExpression, file_schema: Schema, case_sensitive: bool = True, projected_field_values: dict[int, Any] = EMPTY_DICT ) -> BooleanExpression: return visit(expr, _ColumnNameTranslator(file_schema, case_sensitive, projected_field_values)) -class _ExpressionFieldIDs(BooleanExpressionVisitor[Set[int]]): +class _ExpressionFieldIDs(BooleanExpressionVisitor[set[int]]): """Extracts the field IDs used in the BooleanExpression.""" - def visit_true(self) -> Set[int]: + def visit_true(self) -> set[int]: return set() - def visit_false(self) -> Set[int]: + def visit_false(self) -> set[int]: return set() - def visit_not(self, child_result: Set[int]) -> Set[int]: + def visit_not(self, child_result: set[int]) -> set[int]: return child_result - def visit_and(self, left_result: Set[int], right_result: Set[int]) -> Set[int]: + def visit_and(self, left_result: set[int], right_result: set[int]) -> set[int]: return left_result.union(right_result) - def visit_or(self, left_result: Set[int], right_result: Set[int]) -> Set[int]: + def visit_or(self, left_result: set[int], right_result: set[int]) -> set[int]: return left_result.union(right_result) - def visit_unbound_predicate(self, predicate: UnboundPredicate[L]) -> Set[int]: + def visit_unbound_predicate(self, predicate: UnboundPredicate[L]) -> set[int]: raise ValueError("Only works on bound records") - def visit_bound_predicate(self, predicate: BoundPredicate[L]) -> Set[int]: + def visit_bound_predicate(self, predicate: BoundPredicate[L]) -> set[int]: return {predicate.term.ref().field.field_id} -def extract_field_ids(expr: BooleanExpression) -> Set[int]: +def extract_field_ids(expr: BooleanExpression) -> set[int]: return visit(expr, _ExpressionFieldIDs()) -class _RewriteToDNF(BooleanExpressionVisitor[Tuple[BooleanExpression, ...]]): - def visit_true(self) -> Tuple[BooleanExpression, ...]: +class _RewriteToDNF(BooleanExpressionVisitor[tuple[BooleanExpression, ...]]): + def visit_true(self) -> tuple[BooleanExpression, ...]: return (AlwaysTrue(),) - def visit_false(self) -> Tuple[BooleanExpression, ...]: + def visit_false(self) -> tuple[BooleanExpression, ...]: return (AlwaysFalse(),) - def visit_not(self, child_result: Tuple[BooleanExpression, ...]) -> Tuple[BooleanExpression, ...]: + def visit_not(self, child_result: 
tuple[BooleanExpression, ...]) -> tuple[BooleanExpression, ...]: raise ValueError(f"Not expressions are not allowed: {child_result}") def visit_and( - self, left_result: Tuple[BooleanExpression, ...], right_result: Tuple[BooleanExpression, ...] - ) -> Tuple[BooleanExpression, ...]: + self, left_result: tuple[BooleanExpression, ...], right_result: tuple[BooleanExpression, ...] + ) -> tuple[BooleanExpression, ...]: # Distributive law: # ((P OR Q) AND (R OR S)) AND (((P AND R) OR (P AND S)) OR ((Q AND R) OR ((Q AND S))) # A AND (B OR C) = (A AND B) OR (A AND C) @@ -989,31 +985,31 @@ def visit_and( return tuple(And(le, re) for le in left_result for re in right_result) def visit_or( - self, left_result: Tuple[BooleanExpression, ...], right_result: Tuple[BooleanExpression, ...] - ) -> Tuple[BooleanExpression, ...]: + self, left_result: tuple[BooleanExpression, ...], right_result: tuple[BooleanExpression, ...] + ) -> tuple[BooleanExpression, ...]: return left_result + right_result - def visit_unbound_predicate(self, predicate: UnboundPredicate[L]) -> Tuple[BooleanExpression, ...]: + def visit_unbound_predicate(self, predicate: UnboundPredicate[L]) -> tuple[BooleanExpression, ...]: return (predicate,) - def visit_bound_predicate(self, predicate: BoundPredicate[L]) -> Tuple[BooleanExpression, ...]: + def visit_bound_predicate(self, predicate: BoundPredicate[L]) -> tuple[BooleanExpression, ...]: return (predicate,) -def rewrite_to_dnf(expr: BooleanExpression) -> Tuple[BooleanExpression, ...]: +def rewrite_to_dnf(expr: BooleanExpression) -> tuple[BooleanExpression, ...]: # Rewrites an arbitrary boolean expression to disjunctive normal form (DNF): # (A AND NOT(B) AND C) OR (NOT(D) AND E AND F) OR (G) expr_without_not = rewrite_not(expr) return visit(expr_without_not, _RewriteToDNF()) -class ExpressionToPlainFormat(BoundBooleanExpressionVisitor[List[Tuple[str, str, Any]]]): +class ExpressionToPlainFormat(BoundBooleanExpressionVisitor[list[tuple[str, str, Any]]]): cast_int_to_date: bool def __init__(self, cast_int_to_date: bool = False) -> None: self.cast_int_to_date = cast_int_to_date - def _cast_if_necessary(self, iceberg_type: IcebergType, literal: L | Set[L]) -> L | Set[L]: + def _cast_if_necessary(self, iceberg_type: IcebergType, literal: L | set[L]) -> L | set[L]: if self.cast_int_to_date: iceberg_type_class = type(iceberg_type) conversions = {TimestampType: micros_to_timestamp, TimestamptzType: micros_to_timestamptz} @@ -1025,73 +1021,73 @@ def _cast_if_necessary(self, iceberg_type: IcebergType, literal: L | Set[L]) -> return conversion_function(literal) # type: ignore return literal - def visit_in(self, term: BoundTerm[L], literals: Set[L]) -> List[Tuple[str, str, Any]]: + def visit_in(self, term: BoundTerm[L], literals: set[L]) -> list[tuple[str, str, Any]]: field = term.ref().field return [(term.ref().field.name, "in", self._cast_if_necessary(field.field_type, literals))] - def visit_not_in(self, term: BoundTerm[L], literals: Set[L]) -> List[Tuple[str, str, Any]]: + def visit_not_in(self, term: BoundTerm[L], literals: set[L]) -> list[tuple[str, str, Any]]: field = term.ref().field return [(field.name, "not in", self._cast_if_necessary(field.field_type, literals))] - def visit_is_nan(self, term: BoundTerm[L]) -> List[Tuple[str, str, Any]]: + def visit_is_nan(self, term: BoundTerm[L]) -> list[tuple[str, str, Any]]: return [(term.ref().field.name, "==", float("nan"))] - def visit_not_nan(self, term: BoundTerm[L]) -> List[Tuple[str, str, Any]]: + def visit_not_nan(self, term: BoundTerm[L]) -> 
list[tuple[str, str, Any]]: return [(term.ref().field.name, "!=", float("nan"))] - def visit_is_null(self, term: BoundTerm[L]) -> List[Tuple[str, str, Any]]: + def visit_is_null(self, term: BoundTerm[L]) -> list[tuple[str, str, Any]]: return [(term.ref().field.name, "==", None)] - def visit_not_null(self, term: BoundTerm[L]) -> List[Tuple[str, str, Any]]: + def visit_not_null(self, term: BoundTerm[L]) -> list[tuple[str, str, Any]]: return [(term.ref().field.name, "!=", None)] - def visit_equal(self, term: BoundTerm[L], literal: Literal[L]) -> List[Tuple[str, str, Any]]: + def visit_equal(self, term: BoundTerm[L], literal: Literal[L]) -> list[tuple[str, str, Any]]: return [(term.ref().field.name, "==", self._cast_if_necessary(term.ref().field.field_type, literal.value))] - def visit_not_equal(self, term: BoundTerm[L], literal: Literal[L]) -> List[Tuple[str, str, Any]]: + def visit_not_equal(self, term: BoundTerm[L], literal: Literal[L]) -> list[tuple[str, str, Any]]: return [(term.ref().field.name, "!=", self._cast_if_necessary(term.ref().field.field_type, literal.value))] - def visit_greater_than_or_equal(self, term: BoundTerm[L], literal: Literal[L]) -> List[Tuple[str, str, Any]]: + def visit_greater_than_or_equal(self, term: BoundTerm[L], literal: Literal[L]) -> list[tuple[str, str, Any]]: return [(term.ref().field.name, ">=", self._cast_if_necessary(term.ref().field.field_type, literal.value))] - def visit_greater_than(self, term: BoundTerm[L], literal: Literal[L]) -> List[Tuple[str, str, Any]]: + def visit_greater_than(self, term: BoundTerm[L], literal: Literal[L]) -> list[tuple[str, str, Any]]: return [(term.ref().field.name, ">", self._cast_if_necessary(term.ref().field.field_type, literal.value))] - def visit_less_than(self, term: BoundTerm[L], literal: Literal[L]) -> List[Tuple[str, str, Any]]: + def visit_less_than(self, term: BoundTerm[L], literal: Literal[L]) -> list[tuple[str, str, Any]]: return [(term.ref().field.name, "<", self._cast_if_necessary(term.ref().field.field_type, literal.value))] - def visit_less_than_or_equal(self, term: BoundTerm[L], literal: Literal[L]) -> List[Tuple[str, str, Any]]: + def visit_less_than_or_equal(self, term: BoundTerm[L], literal: Literal[L]) -> list[tuple[str, str, Any]]: return [(term.ref().field.name, "<=", self._cast_if_necessary(term.ref().field.field_type, literal.value))] - def visit_starts_with(self, term: BoundTerm[L], literal: Literal[L]) -> List[Tuple[str, str, Any]]: + def visit_starts_with(self, term: BoundTerm[L], literal: Literal[L]) -> list[tuple[str, str, Any]]: return [] - def visit_not_starts_with(self, term: BoundTerm[L], literal: Literal[L]) -> List[Tuple[str, str, Any]]: + def visit_not_starts_with(self, term: BoundTerm[L], literal: Literal[L]) -> list[tuple[str, str, Any]]: return [] - def visit_true(self) -> List[Tuple[str, str, Any]]: + def visit_true(self) -> list[tuple[str, str, Any]]: return [] # Not supported - def visit_false(self) -> List[Tuple[str, str, Any]]: + def visit_false(self) -> list[tuple[str, str, Any]]: raise ValueError("Not supported: AlwaysFalse") - def visit_not(self, child_result: List[Tuple[str, str, Any]]) -> List[Tuple[str, str, Any]]: + def visit_not(self, child_result: list[tuple[str, str, Any]]) -> list[tuple[str, str, Any]]: raise ValueError(f"Not allowed: {child_result}") def visit_and( - self, left_result: List[Tuple[str, str, Any]], right_result: List[Tuple[str, str, Any]] - ) -> List[Tuple[str, str, Any]]: + self, left_result: list[tuple[str, str, Any]], right_result: list[tuple[str, 
str, Any]] + ) -> list[tuple[str, str, Any]]: return left_result + right_result def visit_or( - self, left_result: List[Tuple[str, str, Any]], right_result: List[Tuple[str, str, Any]] - ) -> List[Tuple[str, str, Any]]: + self, left_result: list[tuple[str, str, Any]], right_result: list[tuple[str, str, Any]] + ) -> list[tuple[str, str, Any]]: raise ValueError(f"Not allowed: {left_result} || {right_result}") def expression_to_plain_format( - expressions: Tuple[BooleanExpression, ...], cast_int_to_datetime: bool = False -) -> List[List[Tuple[str, str, Any]]]: + expressions: tuple[BooleanExpression, ...], cast_int_to_datetime: bool = False +) -> list[list[tuple[str, str, Any]]]: """Format a Disjunctive Normal Form expression. These are the formats that the expression can be fed into: @@ -1117,11 +1113,11 @@ def expression_to_plain_format( class _MetricsEvaluator(BoundBooleanExpressionVisitor[bool], ABC): - value_counts: Dict[int, int] - null_counts: Dict[int, int] - nan_counts: Dict[int, int] - lower_bounds: Dict[int, bytes] - upper_bounds: Dict[int, bytes] + value_counts: dict[int, int] + null_counts: dict[int, int] + nan_counts: dict[int, int] + lower_bounds: dict[int, bytes] + upper_bounds: dict[int, bytes] def visit_true(self) -> bool: # all rows match @@ -1353,7 +1349,7 @@ def visit_equal(self, term: BoundTerm[L], literal: Literal[L]) -> bool: def visit_not_equal(self, term: BoundTerm[L], literal: Literal[L]) -> bool: return ROWS_MIGHT_MATCH - def visit_in(self, term: BoundTerm[L], literals: Set[L]) -> bool: + def visit_in(self, term: BoundTerm[L], literals: set[L]) -> bool: field = term.ref().field field_id = field.field_id @@ -1389,7 +1385,7 @@ def visit_in(self, term: BoundTerm[L], literals: Set[L]) -> bool: return ROWS_MIGHT_MATCH - def visit_not_in(self, term: BoundTerm[L], literals: Set[L]) -> bool: + def visit_not_in(self, term: BoundTerm[L], literals: set[L]) -> bool: # because the bounds are not necessarily a min or max value, this cannot be answered using # them. notIn(col, {X, ...}) with (X, Y) doesn't guarantee that X is a value in col. 
return ROWS_MIGHT_MATCH @@ -1678,7 +1674,7 @@ def visit_not_equal(self, term: BoundTerm[L], literal: Literal[L]) -> bool: return ROWS_MIGHT_NOT_MATCH - def visit_in(self, term: BoundTerm[L], literals: Set[L]) -> bool: + def visit_in(self, term: BoundTerm[L], literals: set[L]) -> bool: field_id = term.ref().field.field_id if self._can_contain_nulls(field_id) or self._can_contain_nans(field_id): @@ -1707,7 +1703,7 @@ def visit_in(self, term: BoundTerm[L], literals: Set[L]) -> bool: return ROWS_MIGHT_NOT_MATCH - def visit_not_in(self, term: BoundTerm[L], literals: Set[L]) -> bool: + def visit_not_in(self, term: BoundTerm[L], literals: set[L]) -> bool: field_id = term.ref().field.field_id if self._can_contain_nulls(field_id) or self._can_contain_nans(field_id): @@ -1865,13 +1861,13 @@ def visit_not_equal(self, term: BoundTerm[L], literal: Literal[L]) -> BooleanExp else: return self.visit_false() - def visit_in(self, term: BoundTerm[L], literals: Set[L]) -> BooleanExpression: + def visit_in(self, term: BoundTerm[L], literals: set[L]) -> BooleanExpression: if term.eval(self.struct) in literals: return self.visit_true() else: return self.visit_false() - def visit_not_in(self, term: BoundTerm[L], literals: Set[L]) -> BooleanExpression: + def visit_not_in(self, term: BoundTerm[L], literals: set[L]) -> BooleanExpression: if term.eval(self.struct) not in literals: return self.visit_true() else: diff --git a/pyiceberg/io/__init__.py b/pyiceberg/io/__init__.py index 981e03394f..c7109993c9 100644 --- a/pyiceberg/io/__init__.py +++ b/pyiceberg/io/__init__.py @@ -32,10 +32,7 @@ from io import SEEK_SET from types import TracebackType from typing import ( - Dict, - List, Protocol, - Type, runtime_checkable, ) from urllib.parse import urlparse @@ -126,7 +123,7 @@ def __enter__(self) -> InputStream: """Provide setup when opening an InputStream using a 'with' statement.""" @abstractmethod - def __exit__(self, exctype: Type[BaseException] | None, excinst: BaseException | None, exctb: TracebackType | None) -> None: + def __exit__(self, exctype: type[BaseException] | None, excinst: BaseException | None, exctb: TracebackType | None) -> None: """Perform cleanup when exiting the scope of a 'with' statement.""" @@ -149,7 +146,7 @@ def __enter__(self) -> OutputStream: """Provide setup when opening an OutputStream using a 'with' statement.""" @abstractmethod - def __exit__(self, exctype: Type[BaseException] | None, excinst: BaseException | None, exctb: TracebackType | None) -> None: + def __exit__(self, exctype: type[BaseException] | None, excinst: BaseException | None, exctb: TracebackType | None) -> None: """Perform cleanup when exiting the scope of a 'with' statement.""" @@ -298,7 +295,7 @@ def delete(self, location: str | InputFile | OutputFile) -> None: # Mappings from the Java FileIO impl to a Python one. The list is ordered by preference. # If an implementation isn't installed, it will fall back to the next one. 
-SCHEMA_TO_FILE_IO: Dict[str, List[str]] = { +SCHEMA_TO_FILE_IO: dict[str, list[str]] = { "s3": [ARROW_FILE_IO, FSSPEC_FILE_IO], "s3a": [ARROW_FILE_IO, FSSPEC_FILE_IO], "s3n": [ARROW_FILE_IO, FSSPEC_FILE_IO], diff --git a/pyiceberg/io/fsspec.py b/pyiceberg/io/fsspec.py index 8f2fcc4312..1eef9c1eee 100644 --- a/pyiceberg/io/fsspec.py +++ b/pyiceberg/io/fsspec.py @@ -28,8 +28,6 @@ TYPE_CHECKING, Any, Callable, - Dict, - Type, ) from urllib.parse import urlparse @@ -148,7 +146,7 @@ def __call__(self, request: "AWSRequest", **_: Any) -> None: request.url = response_json["uri"] -SIGNERS: Dict[str, Type[S3RequestSigner]] = {"S3V4RestSigner": S3V4RestSigner} +SIGNERS: dict[str, type[S3RequestSigner]] = {"S3V4RestSigner": S3V4RestSigner} def _file(_: Properties) -> LocalFileSystem: @@ -166,7 +164,7 @@ def _s3(properties: Properties) -> AbstractFileSystem: "region_name": get_first_property_value(properties, S3_REGION, AWS_REGION), } config_kwargs = {} - register_events: Dict[str, Callable[[AWSRequest], None]] = {} + register_events: dict[str, Callable[[AWSRequest], None]] = {} if signer := properties.get(S3_SIGNER): logger.info("Loading signer %s", signer) @@ -455,13 +453,13 @@ def _get_fs(self, scheme: str) -> AbstractFileSystem: raise ValueError(f"No registered filesystem for scheme: {scheme}") return self._scheme_to_fs[scheme](self.properties) - def __getstate__(self) -> Dict[str, Any]: + def __getstate__(self) -> dict[str, Any]: """Create a dictionary of the FsSpecFileIO fields used when pickling.""" fileio_copy = copy(self.__dict__) del fileio_copy["_thread_locals"] return fileio_copy - def __setstate__(self, state: Dict[str, Any]) -> None: + def __setstate__(self, state: dict[str, Any]) -> None: """Deserialize the state into a FsSpecFileIO instance.""" self.__dict__ = state self._thread_locals = threading.local() diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index 179e9e8928..cd19d43906 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -25,6 +25,7 @@ from __future__ import annotations +import builtins import fnmatch import functools import importlib @@ -44,13 +45,9 @@ TYPE_CHECKING, Any, Callable, - Dict, Generic, Iterable, Iterator, - List, - Set, - Tuple, TypeVar, cast, ) @@ -392,7 +389,7 @@ def __init__(self, properties: Properties = EMPTY_DICT): super().__init__(properties=properties) @staticmethod - def parse_location(location: str, properties: Properties = EMPTY_DICT) -> Tuple[str, str, str]: + def parse_location(location: str, properties: Properties = EMPTY_DICT) -> tuple[str, str, str]: """Return (scheme, netloc, path) for the given location. Uses DEFAULT_SCHEME and DEFAULT_NETLOC if scheme/netloc are missing. 
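For orientation, the (scheme, netloc, path) split described in parse_location's docstring can be pictured with a small urlparse-based sketch; the default values below are placeholders for illustration, not the module's actual constants:

from urllib.parse import urlparse

DEFAULT_SCHEME = "file"  # placeholder default, illustration only
DEFAULT_NETLOC = ""      # placeholder default, illustration only


def parse_location(location: str) -> tuple[str, str, str]:
    """Split a location into (scheme, netloc, path), falling back to defaults."""
    uri = urlparse(location)
    return uri.scheme or DEFAULT_SCHEME, uri.netloc or DEFAULT_NETLOC, uri.path


print(parse_location("s3://bucket/warehouse/db/table/metadata.json"))
# ('s3', 'bucket', '/warehouse/db/table/metadata.json')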
@@ -434,7 +431,7 @@ def _initialize_fs(self, scheme: str, netloc: str | None = None) -> FileSystem: def _initialize_oss_fs(self) -> FileSystem: from pyarrow.fs import S3FileSystem - client_kwargs: Dict[str, Any] = { + client_kwargs: dict[str, Any] = { "endpoint_override": self.properties.get(S3_ENDPOINT), "access_key": get_first_property_value(self.properties, S3_ACCESS_KEY_ID, AWS_ACCESS_KEY_ID), "secret_key": get_first_property_value(self.properties, S3_SECRET_ACCESS_KEY, AWS_SECRET_ACCESS_KEY), @@ -481,7 +478,7 @@ def _initialize_s3_fs(self, netloc: str | None) -> FileSystem: else: bucket_region = provided_region - client_kwargs: Dict[str, Any] = { + client_kwargs: dict[str, Any] = { "endpoint_override": self.properties.get(S3_ENDPOINT), "access_key": get_first_property_value(self.properties, S3_ACCESS_KEY_ID, AWS_ACCESS_KEY_ID), "secret_key": get_first_property_value(self.properties, S3_SECRET_ACCESS_KEY, AWS_SECRET_ACCESS_KEY), @@ -530,7 +527,7 @@ def _initialize_azure_fs(self) -> FileSystem: from pyarrow.fs import AzureFileSystem - client_kwargs: Dict[str, str] = {} + client_kwargs: dict[str, str] = {} if account_name := self.properties.get(ADLS_ACCOUNT_NAME): client_kwargs["account_name"] = account_name @@ -576,7 +573,7 @@ def _initialize_azure_fs(self) -> FileSystem: def _initialize_hdfs_fs(self, scheme: str, netloc: str | None) -> FileSystem: from pyarrow.fs import HadoopFileSystem - hdfs_kwargs: Dict[str, Any] = {} + hdfs_kwargs: dict[str, Any] = {} if netloc: return HadoopFileSystem.from_uri(f"{scheme}://{netloc}") if host := self.properties.get(HDFS_HOST): @@ -594,7 +591,7 @@ def _initialize_hdfs_fs(self, scheme: str, netloc: str | None) -> FileSystem: def _initialize_gcs_fs(self) -> FileSystem: from pyarrow.fs import GcsFileSystem - gcs_kwargs: Dict[str, Any] = {} + gcs_kwargs: dict[str, Any] = {} if access_token := self.properties.get(GCS_TOKEN): gcs_kwargs["access_token"] = access_token if expiration := self.properties.get(GCS_TOKEN_EXPIRES_AT_MS): @@ -674,13 +671,13 @@ def delete(self, location: str | InputFile | OutputFile) -> None: raise PermissionError(f"Cannot delete file, access denied: {location}") from e raise # pragma: no cover - If some other kind of OSError, raise the raw error - def __getstate__(self) -> Dict[str, Any]: + def __getstate__(self) -> dict[str, Any]: """Create a dictionary of the PyArrowFileIO fields used when pickling.""" fileio_copy = copy(self.__dict__) fileio_copy["fs_by_scheme"] = None return fileio_copy - def __setstate__(self, state: Dict[str, Any]) -> None: + def __setstate__(self, state: dict[str, Any]) -> None: """Deserialize the state into a PyArrowFileIO instance.""" self.__dict__ = state self.fs_by_scheme = lru_cache(self._initialize_fs) @@ -688,7 +685,7 @@ def __setstate__(self, state: Dict[str, Any]) -> None: def schema_to_pyarrow( schema: Schema | IcebergType, - metadata: Dict[bytes, bytes] = EMPTY_DICT, + metadata: dict[bytes, bytes] = EMPTY_DICT, include_field_ids: bool = True, file_format: FileFormat = FileFormat.PARQUET, ) -> pa.schema: @@ -696,10 +693,10 @@ def schema_to_pyarrow( class _ConvertToArrowSchema(SchemaVisitorPerPrimitiveType[pa.DataType]): - _metadata: Dict[bytes, bytes] + _metadata: dict[bytes, bytes] def __init__( - self, metadata: Dict[bytes, bytes] = EMPTY_DICT, include_field_ids: bool = True, file_format: FileFormat | None = None + self, metadata: dict[bytes, bytes] = EMPTY_DICT, include_field_ids: bool = True, file_format: FileFormat | None = None ) -> None: self._metadata = metadata self._include_field_ids = 
include_field_ids @@ -708,7 +705,7 @@ def __init__( def schema(self, _: Schema, struct_result: pa.StructType) -> pa.schema: return pa.schema(list(struct_result), metadata=self._metadata) - def struct(self, _: StructType, field_results: List[pa.DataType]) -> pa.DataType: + def struct(self, _: StructType, field_results: builtins.list[pa.DataType]) -> pa.DataType: return pa.struct(field_results) def field(self, field: NestedField, field_result: pa.DataType) -> pa.Field: @@ -820,7 +817,7 @@ class _ConvertToArrowExpression(BoundBooleanExpressionVisitor[pc.Expression]): def __init__(self, schema: Schema | None = None): self._schema = schema - def _get_field_name(self, term: BoundTerm[Any]) -> str | Tuple[str, ...]: + def _get_field_name(self, term: BoundTerm[Any]) -> str | tuple[str, ...]: """Get the field name or nested field path for a bound term. For nested struct fields, returns a tuple of field names (e.g., ("mazeMetadata", "run_id")). @@ -840,11 +837,11 @@ def _get_field_name(self, term: BoundTerm[Any]) -> str | Tuple[str, ...]: # Fallback to just the field name if schema is not available return term.ref().field.name - def visit_in(self, term: BoundTerm[Any], literals: Set[Any]) -> pc.Expression: + def visit_in(self, term: BoundTerm[Any], literals: set[Any]) -> pc.Expression: pyarrow_literals = pa.array(literals, type=schema_to_pyarrow(term.ref().field.field_type)) return pc.field(self._get_field_name(term)).isin(pyarrow_literals) - def visit_not_in(self, term: BoundTerm[Any], literals: Set[Any]) -> pc.Expression: + def visit_not_in(self, term: BoundTerm[Any], literals: set[Any]) -> pc.Expression: pyarrow_literals = pa.array(literals, type=schema_to_pyarrow(term.ref().field.field_type)) return ~pc.field(self._get_field_name(term)).isin(pyarrow_literals) @@ -941,11 +938,11 @@ def _handle_nan_unmentioned(self, term: BoundTerm[Any]) -> None: if term not in self.is_nan_or_not_bound_terms: self.nan_unmentioned_bound_terms.add(term) - def visit_in(self, term: BoundTerm[Any], literals: Set[Any]) -> None: + def visit_in(self, term: BoundTerm[Any], literals: set[Any]) -> None: self._handle_null_unmentioned(term) self._handle_nan_unmentioned(term) - def visit_not_in(self, term: BoundTerm[Any], literals: Set[Any]) -> None: + def visit_not_in(self, term: BoundTerm[Any], literals: set[Any]) -> None: self._handle_null_unmentioned(term) self._handle_nan_unmentioned(term) @@ -1043,10 +1040,10 @@ def _expression_to_complementary_pyarrow(expr: BooleanExpression, schema: Schema collector.collect(expr) # Convert the set of terms to a sorted list so that layout of the expression to build is deterministic. 
- null_unmentioned_bound_terms: List[BoundTerm[Any]] = sorted( + null_unmentioned_bound_terms: list[BoundTerm[Any]] = sorted( collector.null_unmentioned_bound_terms, key=lambda term: term.ref().field.name ) - nan_unmentioned_bound_terms: List[BoundTerm[Any]] = sorted( + nan_unmentioned_bound_terms: list[BoundTerm[Any]] = sorted( collector.nan_unmentioned_bound_terms, key=lambda term: term.ref().field.name ) @@ -1059,7 +1056,7 @@ def _expression_to_complementary_pyarrow(expr: BooleanExpression, schema: Schema @lru_cache -def _get_file_format(file_format: FileFormat, **kwargs: Dict[str, Any]) -> ds.FileFormat: +def _get_file_format(file_format: FileFormat, **kwargs: dict[str, Any]) -> ds.FileFormat: if file_format == FileFormat.PARQUET: return ds.ParquetFileFormat(**kwargs) elif file_format == FileFormat.ORC: @@ -1070,7 +1067,7 @@ def _get_file_format(file_format: FileFormat, **kwargs: Dict[str, Any]) -> ds.Fi raise ValueError(f"Unsupported file format: {file_format}") -def _read_deletes(io: FileIO, data_file: DataFile) -> Dict[str, pa.ChunkedArray]: +def _read_deletes(io: FileIO, data_file: DataFile) -> dict[str, pa.ChunkedArray]: if data_file.file_format == FileFormat.PARQUET: with io.new_input(data_file.file_path).open() as fi: delete_fragment = _get_file_format( @@ -1100,7 +1097,7 @@ def _read_deletes(io: FileIO, data_file: DataFile) -> Dict[str, pa.ChunkedArray] raise ValueError(f"Delete file format not supported: {data_file.file_format}") -def _combine_positional_deletes(positional_deletes: List[pa.ChunkedArray], start_index: int, end_index: int) -> pa.Array: +def _combine_positional_deletes(positional_deletes: list[pa.ChunkedArray], start_index: int, end_index: int) -> pa.Array: if len(positional_deletes) == 1: all_chunks = positional_deletes[0] else: @@ -1271,7 +1268,7 @@ def schema(self, schema: pa.Schema, struct_result: T) -> T: """Visit a schema.""" @abstractmethod - def struct(self, struct: pa.StructType, field_results: List[T]) -> T: + def struct(self, struct: pa.StructType, field_results: builtins.list[T]) -> T: """Visit a struct.""" @abstractmethod @@ -1309,7 +1306,7 @@ class _HasIds(PyArrowSchemaVisitor[bool]): def schema(self, schema: pa.Schema, struct_result: bool) -> bool: return struct_result - def struct(self, struct: pa.StructType, field_results: List[bool]) -> bool: + def struct(self, struct: pa.StructType, field_results: builtins.list[bool]) -> bool: return all(field_results) def field(self, field: pa.Field, field_result: bool) -> bool: @@ -1334,7 +1331,7 @@ def primitive(self, primitive: pa.DataType) -> bool: class _ConvertToIceberg(PyArrowSchemaVisitor[IcebergType | Schema]): """Converts PyArrowSchema to Iceberg Schema. 
Applies the IDs from name_mapping if provided.""" - _field_names: List[str] + _field_names: builtins.list[str] def __init__( self, downcast_ns_timestamp_to_us: bool = False, format_version: TableVersion = TableProperties.DEFAULT_FORMAT_VERSION @@ -1352,7 +1349,7 @@ def _field_id(self, field: pa.Field) -> int: def schema(self, schema: pa.Schema, struct_result: StructType) -> Schema: return Schema(*struct_result.fields) - def struct(self, struct: pa.StructType, field_results: List[NestedField]) -> StructType: + def struct(self, struct: pa.StructType, field_results: builtins.list[NestedField]) -> StructType: return StructType(*field_results) def field(self, field: pa.Field, field_result: IcebergType) -> NestedField: @@ -1472,7 +1469,7 @@ class _ConvertToLargeTypes(PyArrowSchemaVisitor[IcebergType | pa.Schema]): def schema(self, schema: pa.Schema, struct_result: pa.StructType) -> pa.Schema: return pa.schema(struct_result) - def struct(self, struct: pa.StructType, field_results: List[pa.Field]) -> pa.StructType: + def struct(self, struct: pa.StructType, field_results: builtins.list[pa.Field]) -> pa.StructType: return pa.struct(field_results) def field(self, field: pa.Field, field_result: pa.DataType) -> pa.Field: @@ -1496,7 +1493,7 @@ class _ConvertToSmallTypes(PyArrowSchemaVisitor[IcebergType | pa.Schema]): def schema(self, schema: pa.Schema, struct_result: pa.StructType) -> pa.Schema: return pa.schema(struct_result) - def struct(self, struct: pa.StructType, field_results: List[pa.Field]) -> pa.StructType: + def struct(self, struct: pa.StructType, field_results: builtins.list[pa.Field]) -> pa.StructType: return pa.struct(field_results) def field(self, field: pa.Field, field_result: pa.DataType) -> pa.Field: @@ -1536,8 +1533,8 @@ def _get_column_projection_values( projected_schema: Schema, table_schema: Schema, partition_spec: PartitionSpec | None, - file_project_field_ids: Set[int], -) -> Dict[int, Any]: + file_project_field_ids: set[int], +) -> dict[int, Any]: """Apply Column Projection rules to File Schema.""" project_schema_diff = projected_schema.field_ids.difference(file_project_field_ids) if len(project_schema_diff) == 0 or partition_spec is None: @@ -1562,8 +1559,8 @@ def _task_to_record_batches( bound_row_filter: BooleanExpression, projected_schema: Schema, table_schema: Schema, - projected_field_ids: Set[int], - positional_deletes: List[ChunkedArray] | None, + projected_field_ids: set[int], + positional_deletes: list[ChunkedArray] | None, case_sensitive: bool, name_mapping: NameMapping | None = None, partition_spec: PartitionSpec | None = None, @@ -1644,12 +1641,12 @@ def _task_to_record_batches( ) -def _read_all_delete_files(io: FileIO, tasks: Iterable[FileScanTask]) -> Dict[str, List[ChunkedArray]]: - deletes_per_file: Dict[str, List[ChunkedArray]] = {} +def _read_all_delete_files(io: FileIO, tasks: Iterable[FileScanTask]) -> dict[str, list[ChunkedArray]]: + deletes_per_file: dict[str, list[ChunkedArray]] = {} unique_deletes = set(itertools.chain.from_iterable([task.delete_files for task in tasks])) if len(unique_deletes) > 0: executor = ExecutorFactory.get_or_create() - deletes_per_files: Iterator[Dict[str, ChunkedArray]] = executor.map( + deletes_per_files: Iterator[dict[str, ChunkedArray]] = executor.map( lambda args: _read_deletes(*args), [(io, delete_file) for delete_file in unique_deletes], ) @@ -1700,7 +1697,7 @@ def __init__( self._downcast_ns_timestamp_to_us = Config().get_bool(DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE) @property - def _projected_field_ids(self) -> Set[int]: + 
def _projected_field_ids(self) -> set[int]: """Set of field IDs that should be projected from the data files.""" return { id @@ -1773,7 +1770,7 @@ def to_record_batches(self, tasks: Iterable[FileScanTask]) -> Iterator[pa.Record total_row_count = 0 executor = ExecutorFactory.get_or_create() - def batches_for_task(task: FileScanTask) -> List[pa.RecordBatch]: + def batches_for_task(task: FileScanTask) -> list[pa.RecordBatch]: # Materialize the iterator here to ensure execution happens within the executor. # Otherwise, the iterator would be lazily consumed later (in the main thread), # defeating the purpose of using executor.map. @@ -1797,7 +1794,7 @@ def batches_for_task(task: FileScanTask) -> List[pa.RecordBatch]: break def _record_batches_from_scan_tasks_and_deletes( - self, tasks: Iterable[FileScanTask], deletes_per_file: Dict[str, List[ChunkedArray]] + self, tasks: Iterable[FileScanTask], deletes_per_file: dict[str, list[ChunkedArray]] ) -> Iterator[pa.RecordBatch]: total_row_count = 0 for task in tasks: @@ -1833,7 +1830,7 @@ def _to_requested_schema( batch: pa.RecordBatch, downcast_ns_timestamp_to_us: bool = False, include_field_ids: bool = False, - projected_missing_fields: Dict[int, Any] = EMPTY_DICT, + projected_missing_fields: dict[int, Any] = EMPTY_DICT, ) -> pa.RecordBatch: # We could reuse some of these visitors struct_array = visit_with_partner( @@ -1852,7 +1849,7 @@ class ArrowProjectionVisitor(SchemaWithPartnerVisitor[pa.Array, pa.Array | None] _include_field_ids: bool _downcast_ns_timestamp_to_us: bool _use_large_types: bool | None - _projected_missing_fields: Dict[int, Any] + _projected_missing_fields: dict[int, Any] def __init__( self, @@ -1860,7 +1857,7 @@ def __init__( downcast_ns_timestamp_to_us: bool = False, include_field_ids: bool = False, use_large_types: bool | None = None, - projected_missing_fields: Dict[int, Any] = EMPTY_DICT, + projected_missing_fields: dict[int, Any] = EMPTY_DICT, ) -> None: self._file_schema = file_schema self._include_field_ids = include_field_ids @@ -1935,11 +1932,13 @@ def _construct_field(self, field: NestedField, arrow_type: pa.DataType) -> pa.Fi def schema(self, schema: Schema, schema_partner: pa.Array | None, struct_result: pa.Array | None) -> pa.Array | None: return struct_result - def struct(self, struct: StructType, struct_array: pa.Array | None, field_results: List[pa.Array | None]) -> pa.Array | None: + def struct( + self, struct: StructType, struct_array: pa.Array | None, field_results: builtins.list[pa.Array | None] + ) -> pa.Array | None: if struct_array is None: return None - field_arrays: List[pa.Array] = [] - fields: List[pa.Field] = [] + field_arrays: list[pa.Array] = [] + fields: list[pa.Field] = [] for field, field_array in zip(struct.fields, field_results, strict=True): if field_array is not None: array = self._cast_if_needed(field, field_array) @@ -2048,7 +2047,7 @@ class PrimitiveToPhysicalType(SchemaVisitorPerPrimitiveType[str]): def schema(self, schema: Schema, struct_result: str) -> str: raise ValueError(f"Expected primitive-type, got: {schema}") - def struct(self, struct: StructType, field_results: List[str]) -> str: + def struct(self, struct: StructType, field_results: builtins.list[str]) -> str: raise ValueError(f"Expected primitive-type, got: {struct}") def field(self, field: NestedField, field_result: str) -> str: @@ -2236,13 +2235,13 @@ class StatisticsCollector: column_name: str -class PyArrowStatisticsCollector(PreOrderSchemaVisitor[List[StatisticsCollector]]): +class 
PyArrowStatisticsCollector(PreOrderSchemaVisitor[list[StatisticsCollector]]): _field_id: int = 0 _schema: Schema - _properties: Dict[str, str] + _properties: dict[str, str] _default_mode: str - def __init__(self, schema: Schema, properties: Dict[str, str]): + def __init__(self, schema: Schema, properties: dict[str, str]): from pyiceberg.table import TableProperties self._schema = schema @@ -2251,35 +2250,41 @@ def __init__(self, schema: Schema, properties: Dict[str, str]): TableProperties.DEFAULT_WRITE_METRICS_MODE, TableProperties.DEFAULT_WRITE_METRICS_MODE_DEFAULT ) - def schema(self, schema: Schema, struct_result: Callable[[], List[StatisticsCollector]]) -> List[StatisticsCollector]: + def schema( + self, schema: Schema, struct_result: Callable[[], builtins.list[StatisticsCollector]] + ) -> builtins.list[StatisticsCollector]: return struct_result() def struct( - self, struct: StructType, field_results: List[Callable[[], List[StatisticsCollector]]] - ) -> List[StatisticsCollector]: + self, struct: StructType, field_results: builtins.list[Callable[[], builtins.list[StatisticsCollector]]] + ) -> builtins.list[StatisticsCollector]: return list(itertools.chain(*[result() for result in field_results])) - def field(self, field: NestedField, field_result: Callable[[], List[StatisticsCollector]]) -> List[StatisticsCollector]: + def field( + self, field: NestedField, field_result: Callable[[], builtins.list[StatisticsCollector]] + ) -> builtins.list[StatisticsCollector]: self._field_id = field.field_id return field_result() - def list(self, list_type: ListType, element_result: Callable[[], List[StatisticsCollector]]) -> List[StatisticsCollector]: + def list( + self, list_type: ListType, element_result: Callable[[], builtins.list[StatisticsCollector]] + ) -> builtins.list[StatisticsCollector]: self._field_id = list_type.element_id return element_result() def map( self, map_type: MapType, - key_result: Callable[[], List[StatisticsCollector]], - value_result: Callable[[], List[StatisticsCollector]], - ) -> List[StatisticsCollector]: + key_result: Callable[[], builtins.list[StatisticsCollector]], + value_result: Callable[[], builtins.list[StatisticsCollector]], + ) -> builtins.list[StatisticsCollector]: self._field_id = map_type.key_id k = key_result() self._field_id = map_type.value_id v = value_result() return k + v - def primitive(self, primitive: PrimitiveType) -> List[StatisticsCollector]: + def primitive(self, primitive: PrimitiveType) -> builtins.list[StatisticsCollector]: from pyiceberg.table import TableProperties column_name = self._schema.find_column_name(self._field_id) @@ -2308,8 +2313,8 @@ def primitive(self, primitive: PrimitiveType) -> List[StatisticsCollector]: def compute_statistics_plan( schema: Schema, - table_properties: Dict[str, str], -) -> Dict[int, StatisticsCollector]: + table_properties: dict[str, str], +) -> dict[int, StatisticsCollector]: """ Compute the statistics plan for all columns. 
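A hedged usage sketch for compute_statistics_plan as declared above; the metrics-mode property key is the commonly documented Iceberg name and is an assumption here, not taken from this patch:

from pyiceberg.io.pyarrow import compute_statistics_plan
from pyiceberg.schema import Schema
from pyiceberg.types import LongType, NestedField, StringType

schema = Schema(
    NestedField(field_id=1, name="id", field_type=LongType(), required=True),
    NestedField(field_id=2, name="payload", field_type=StringType(), required=False),
)
# Assumed property key controlling the default per-column metrics mode.
plan = compute_statistics_plan(schema, {"write.metadata.metrics.default": "truncate(16)"})
for field_id, collector in plan.items():
    print(field_id, collector.column_name)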
@@ -2325,7 +2330,7 @@ def compute_statistics_plan( used to set the mode for column metrics collection """ stats_cols = pre_order_visit(schema, PyArrowStatisticsCollector(schema, table_properties)) - result: Dict[int, StatisticsCollector] = {} + result: dict[int, StatisticsCollector] = {} for stats_col in stats_cols: result[stats_col.field_id] = stats_col return result @@ -2337,27 +2342,33 @@ class ID2ParquetPath: parquet_path: str -class ID2ParquetPathVisitor(PreOrderSchemaVisitor[List[ID2ParquetPath]]): +class ID2ParquetPathVisitor(PreOrderSchemaVisitor[list[ID2ParquetPath]]): _field_id: int = 0 - _path: List[str] + _path: builtins.list[str] def __init__(self) -> None: self._path = [] - def schema(self, schema: Schema, struct_result: Callable[[], List[ID2ParquetPath]]) -> List[ID2ParquetPath]: + def schema(self, schema: Schema, struct_result: Callable[[], builtins.list[ID2ParquetPath]]) -> builtins.list[ID2ParquetPath]: return struct_result() - def struct(self, struct: StructType, field_results: List[Callable[[], List[ID2ParquetPath]]]) -> List[ID2ParquetPath]: + def struct( + self, struct: StructType, field_results: builtins.list[Callable[[], builtins.list[ID2ParquetPath]]] + ) -> builtins.list[ID2ParquetPath]: return list(itertools.chain(*[result() for result in field_results])) - def field(self, field: NestedField, field_result: Callable[[], List[ID2ParquetPath]]) -> List[ID2ParquetPath]: + def field( + self, field: NestedField, field_result: Callable[[], builtins.list[ID2ParquetPath]] + ) -> builtins.list[ID2ParquetPath]: self._field_id = field.field_id self._path.append(field.name) result = field_result() self._path.pop() return result - def list(self, list_type: ListType, element_result: Callable[[], List[ID2ParquetPath]]) -> List[ID2ParquetPath]: + def list( + self, list_type: ListType, element_result: Callable[[], builtins.list[ID2ParquetPath]] + ) -> builtins.list[ID2ParquetPath]: self._field_id = list_type.element_id self._path.append("list.element") result = element_result() @@ -2367,9 +2378,9 @@ def list(self, list_type: ListType, element_result: Callable[[], List[ID2Parquet def map( self, map_type: MapType, - key_result: Callable[[], List[ID2ParquetPath]], - value_result: Callable[[], List[ID2ParquetPath]], - ) -> List[ID2ParquetPath]: + key_result: Callable[[], builtins.list[ID2ParquetPath]], + value_result: Callable[[], builtins.list[ID2ParquetPath]], + ) -> builtins.list[ID2ParquetPath]: self._field_id = map_type.key_id self._path.append("key_value.key") k = key_result() @@ -2380,13 +2391,13 @@ def map( self._path.pop() return k + v - def primitive(self, primitive: PrimitiveType) -> List[ID2ParquetPath]: + def primitive(self, primitive: PrimitiveType) -> builtins.list[ID2ParquetPath]: return [ID2ParquetPath(field_id=self._field_id, parquet_path=".".join(self._path))] def parquet_path_to_id_mapping( schema: Schema, -) -> Dict[str, int]: +) -> dict[str, int]: """ Compute the mapping of parquet column path to Iceberg ID. @@ -2397,7 +2408,7 @@ def parquet_path_to_id_mapping( Args: schema (pyiceberg.schema.Schema): The current table schema. 
""" - result: Dict[str, int] = {} + result: dict[str, int] = {} for pair in pre_order_visit(schema, ID2ParquetPathVisitor()): result[pair.parquet_path] = pair.field_id return result @@ -2406,12 +2417,12 @@ def parquet_path_to_id_mapping( @dataclass(frozen=True) class DataFileStatistics: record_count: int - column_sizes: Dict[int, int] - value_counts: Dict[int, int] - null_value_counts: Dict[int, int] - nan_value_counts: Dict[int, int] - column_aggregates: Dict[int, StatsAggregator] - split_offsets: List[int] + column_sizes: dict[int, int] + value_counts: dict[int, int] + null_value_counts: dict[int, int] + nan_value_counts: dict[int, int] + column_aggregates: dict[int, StatsAggregator] + split_offsets: list[int] def _partition_value(self, partition_field: PartitionField, schema: Schema) -> Any: if partition_field.source_id not in self.column_aggregates: @@ -2451,7 +2462,7 @@ def _partition_value(self, partition_field: PartitionField, schema: Schema) -> A def partition(self, partition_spec: PartitionSpec, schema: Schema) -> Record: return Record(*[self._partition_value(field, schema) for field in partition_spec.fields]) - def to_serialized_dict(self) -> Dict[str, Any]: + def to_serialized_dict(self) -> dict[str, Any]: lower_bounds = {} upper_bounds = {} @@ -2476,8 +2487,8 @@ def to_serialized_dict(self) -> Dict[str, Any]: def data_file_statistics_from_parquet_metadata( parquet_metadata: pq.FileMetaData, - stats_columns: Dict[int, StatisticsCollector], - parquet_column_mapping: Dict[str, int], + stats_columns: dict[int, StatisticsCollector], + parquet_column_mapping: dict[str, int], ) -> DataFileStatistics: """ Compute and return DataFileStatistics that includes the following. @@ -2496,16 +2507,16 @@ def data_file_statistics_from_parquet_metadata( set the mode for column metrics collection parquet_column_mapping (Dict[str, int]): The mapping of the parquet file name to the field ID """ - column_sizes: Dict[int, int] = {} - value_counts: Dict[int, int] = {} - split_offsets: List[int] = [] + column_sizes: dict[int, int] = {} + value_counts: dict[int, int] = {} + split_offsets: list[int] = [] - null_value_counts: Dict[int, int] = {} - nan_value_counts: Dict[int, int] = {} + null_value_counts: dict[int, int] = {} + nan_value_counts: dict[int, int] = {} col_aggs = {} - invalidate_col: Set[int] = set() + invalidate_col: set[int] = set() for r in range(parquet_metadata.num_row_groups): # References: # https://github.com/apache/iceberg/blob/fc381a81a1fdb8f51a0637ca27cd30673bd7aad3/parquet/src/main/java/org/apache/iceberg/parquet/ParquetUtil.java#L232 @@ -2665,7 +2676,7 @@ def write_parquet(task: WriteTask) -> DataFile: return iter(data_files) -def bin_pack_arrow_table(tbl: pa.Table, target_file_size: int) -> Iterator[List[pa.RecordBatch]]: +def bin_pack_arrow_table(tbl: pa.Table, target_file_size: int) -> Iterator[list[pa.RecordBatch]]: from pyiceberg.utils.bin_packing import PackingIterator avg_row_size_bytes = tbl.nbytes / tbl.num_rows @@ -2759,7 +2770,7 @@ def parquet_file_to_data_file(io: FileIO, table_metadata: TableMetadata, file_pa PYARROW_UNCOMPRESSED_CODEC = "none" -def _get_parquet_writer_kwargs(table_properties: Properties) -> Dict[str, Any]: +def _get_parquet_writer_kwargs(table_properties: Properties) -> dict[str, Any]: from pyiceberg.table import TableProperties for key_pattern in [ diff --git a/pyiceberg/manifest.py b/pyiceberg/manifest.py index 3c211f5e94..ab1e951100 100644 --- a/pyiceberg/manifest.py +++ b/pyiceberg/manifest.py @@ -24,12 +24,8 @@ from types import TracebackType from 
typing import ( Any, - Dict, Iterator, - List, Literal, - Tuple, - Type, ) from cachetools import LRUCache, cached @@ -111,7 +107,7 @@ def __repr__(self) -> str: return f"FileFormat.{self.name}" -DATA_FILE_TYPE: Dict[int, StructType] = { +DATA_FILE_TYPE: dict[int, StructType] = { 1: StructType( NestedField(field_id=100, name="file_path", field_type=StringType(), required=True, doc="Location URI with FS scheme"), NestedField( @@ -475,27 +471,27 @@ def file_size_in_bytes(self) -> int: return self._data[5] @property - def column_sizes(self) -> Dict[int, int]: + def column_sizes(self) -> dict[int, int]: return self._data[6] @property - def value_counts(self) -> Dict[int, int]: + def value_counts(self) -> dict[int, int]: return self._data[7] @property - def null_value_counts(self) -> Dict[int, int]: + def null_value_counts(self) -> dict[int, int]: return self._data[8] @property - def nan_value_counts(self) -> Dict[int, int]: + def nan_value_counts(self) -> dict[int, int]: return self._data[9] @property - def lower_bounds(self) -> Dict[int, bytes]: + def lower_bounds(self) -> dict[int, bytes]: return self._data[10] @property - def upper_bounds(self) -> Dict[int, bytes]: + def upper_bounds(self) -> dict[int, bytes]: return self._data[11] @property @@ -503,11 +499,11 @@ def key_metadata(self) -> bytes | None: return self._data[12] @property - def split_offsets(self) -> List[int] | None: + def split_offsets(self) -> list[int] | None: return self._data[13] @property - def equality_ids(self) -> List[int] | None: + def equality_ids(self) -> list[int] | None: return self._data[14] @property @@ -690,7 +686,7 @@ def update(self, value: Any) -> None: self._min = min(self._min, value) -def construct_partition_summaries(spec: PartitionSpec, schema: Schema, partitions: List[Record]) -> List[PartitionFieldSummary]: +def construct_partition_summaries(spec: PartitionSpec, schema: Schema, partitions: list[Record]) -> list[PartitionFieldSummary]: types = [field.field_type for field in spec.partition_type(schema).fields] field_stats = [PartitionFieldStats(field_type) for field_type in types] for partition_keys in partitions: @@ -702,7 +698,7 @@ def construct_partition_summaries(spec: PartitionSpec, schema: Schema, partition return [field.to_summary() for field in field_stats] -MANIFEST_LIST_FILE_SCHEMAS: Dict[int, Schema] = { +MANIFEST_LIST_FILE_SCHEMAS: dict[int, Schema] = { 1: Schema( NestedField(500, "manifest_path", StringType(), required=True, doc="Location URI with FS scheme"), NestedField(501, "manifest_length", LongType(), required=True), @@ -828,7 +824,7 @@ def deleted_rows_count(self) -> int | None: return self._data[12] @property - def partitions(self) -> List[PartitionFieldSummary] | None: + def partitions(self) -> list[PartitionFieldSummary] | None: return self._data[13] @property @@ -841,7 +837,7 @@ def has_added_files(self) -> bool: def has_existing_files(self) -> bool: return self.existing_files_count is None or self.existing_files_count > 0 - def fetch_manifest_entry(self, io: FileIO, discard_deleted: bool = True) -> List[ManifestEntry]: + def fetch_manifest_entry(self, io: FileIO, discard_deleted: bool = True) -> list[ManifestEntry]: """ Read the manifest entries from the manifest file. 
@@ -875,11 +871,11 @@ def __hash__(self) -> int: # Global cache for manifest lists -_manifest_cache: LRUCache[Any, Tuple[ManifestFile, ...]] = LRUCache(maxsize=128) +_manifest_cache: LRUCache[Any, tuple[ManifestFile, ...]] = LRUCache(maxsize=128) @cached(cache=_manifest_cache, key=lambda io, manifest_list: hashkey(manifest_list), lock=threading.RLock()) -def _manifests(io: FileIO, manifest_list: str) -> Tuple[ManifestFile, ...]: +def _manifests(io: FileIO, manifest_list: str) -> tuple[ManifestFile, ...]: """Read and cache manifests from the given manifest list, returning a tuple to prevent modification.""" file = io.new_input(manifest_list) return tuple(read_manifest_list(file)) @@ -957,7 +953,7 @@ class ManifestWriter(ABC): _deleted_files: int _deleted_rows: int _min_sequence_number: int | None - _partitions: List[Record] + _partitions: list[Record] _compression: AvroCompressionCodec def __init__( @@ -992,7 +988,7 @@ def __enter__(self) -> ManifestWriter: def __exit__( self, - exc_type: Type[BaseException] | None, + exc_type: type[BaseException] | None, exc_value: BaseException | None, traceback: TracebackType | None, ) -> None: @@ -1012,7 +1008,7 @@ def content(self) -> ManifestContent: ... def version(self) -> TableVersion: ... @property - def _meta(self) -> Dict[str, str]: + def _meta(self) -> dict[str, str]: return { "schema": self._schema.model_dump_json(), "partition-spec": to_json(self._spec.fields).decode("utf-8"), @@ -1167,7 +1163,7 @@ def version(self) -> TableVersion: return 2 @property - def _meta(self) -> Dict[str, str]: + def _meta(self) -> dict[str, str]: return { **super()._meta, "content": "data", @@ -1201,12 +1197,12 @@ def write_manifest( class ManifestListWriter(ABC): _format_version: TableVersion _output_file: OutputFile - _meta: Dict[str, str] - _manifest_files: List[ManifestFile] + _meta: dict[str, str] + _manifest_files: list[ManifestFile] _commit_snapshot_id: int _writer: AvroOutputFile[ManifestFile] - def __init__(self, format_version: TableVersion, output_file: OutputFile, meta: Dict[str, Any]): + def __init__(self, format_version: TableVersion, output_file: OutputFile, meta: dict[str, Any]): self._format_version = format_version self._output_file = output_file self._meta = meta @@ -1226,7 +1222,7 @@ def __enter__(self) -> ManifestListWriter: def __exit__( self, - exc_type: Type[BaseException] | None, + exc_type: type[BaseException] | None, exc_value: BaseException | None, traceback: TracebackType | None, ) -> None: @@ -1237,7 +1233,7 @@ def __exit__( @abstractmethod def prepare_manifest(self, manifest_file: ManifestFile) -> ManifestFile: ... 
- def add_manifests(self, manifest_files: List[ManifestFile]) -> ManifestListWriter: + def add_manifests(self, manifest_files: list[ManifestFile]) -> ManifestListWriter: self._writer.write_block([self.prepare_manifest(manifest_file) for manifest_file in manifest_files]) return self diff --git a/pyiceberg/partitioning.py b/pyiceberg/partitioning.py index bf8e4081fe..45d0dfd212 100644 --- a/pyiceberg/partitioning.py +++ b/pyiceberg/partitioning.py @@ -21,7 +21,7 @@ from dataclasses import dataclass from datetime import date, datetime, time from functools import cached_property, singledispatch -from typing import Annotated, Any, Dict, Generic, List, Set, Tuple, TypeVar +from typing import Annotated, Any, Generic, TypeVar from urllib.parse import quote_plus from pydantic import ( @@ -131,7 +131,7 @@ class PartitionSpec(IcebergBaseModel): """ spec_id: int = Field(alias="spec-id", default=INITIAL_PARTITION_SPEC_ID) - fields: Tuple[PartitionField, ...] = Field(default_factory=tuple) + fields: tuple[PartitionField, ...] = Field(default_factory=tuple) def __init__( self, @@ -181,15 +181,15 @@ def last_assigned_field_id(self) -> int: return PARTITION_FIELD_ID_START - 1 @cached_property - def source_id_to_fields_map(self) -> Dict[int, List[PartitionField]]: - source_id_to_fields_map: Dict[int, List[PartitionField]] = {} + def source_id_to_fields_map(self) -> dict[int, list[PartitionField]]: + source_id_to_fields_map: dict[int, list[PartitionField]] = {} for partition_field in self.fields: existing = source_id_to_fields_map.get(partition_field.source_id, []) existing.append(partition_field) source_id_to_fields_map[partition_field.source_id] = existing return source_id_to_fields_map - def fields_by_source_id(self, field_id: int) -> List[PartitionField]: + def fields_by_source_id(self, field_id: int) -> list[PartitionField]: return self.source_id_to_fields_map.get(field_id, []) def compatible_with(self, other: PartitionSpec) -> bool: @@ -254,7 +254,7 @@ def validate_partition_name( partition_transform: Transform[Any, Any], source_id: int, schema: Schema, - partition_names: Set[str], + partition_names: set[str], ) -> None: """Validate that a partition field name doesn't conflict with schema field names.""" try: @@ -372,7 +372,7 @@ def unknown(self, field_id: int, source_name: str, source_id: int, transform: st @singledispatch -def _visit(spec: PartitionSpec, schema: Schema, visitor: PartitionSpecVisitor[R]) -> List[R]: +def _visit(spec: PartitionSpec, schema: Schema, visitor: PartitionSpecVisitor[R]) -> list[R]: return [_visit_partition_field(schema, field, visitor) for field in spec.fields] @@ -412,7 +412,7 @@ class PartitionFieldValue: @dataclass(frozen=True) class PartitionKey: - field_values: List[PartitionFieldValue] + field_values: list[PartitionFieldValue] partition_spec: PartitionSpec schema: Schema diff --git a/pyiceberg/schema.py b/pyiceberg/schema.py index 7e99892504..5d622d2549 100644 --- a/pyiceberg/schema.py +++ b/pyiceberg/schema.py @@ -17,6 +17,7 @@ # pylint: disable=W0511 from __future__ import annotations +import builtins import itertools from abc import ABC, abstractmethod from dataclasses import dataclass @@ -25,12 +26,8 @@ TYPE_CHECKING, Any, Callable, - Dict, Generic, - List, Literal, - Set, - Tuple, TypeVar, ) @@ -89,11 +86,11 @@ class Schema(IcebergBaseModel): """ type: Literal["struct"] = "struct" - fields: Tuple[NestedField, ...] = Field(default_factory=tuple) + fields: tuple[NestedField, ...] 
= Field(default_factory=tuple) schema_id: int = Field(alias="schema-id", default=INITIAL_SCHEMA_ID) - identifier_field_ids: List[int] = Field(alias="identifier-field-ids", default_factory=list) + identifier_field_ids: list[int] = Field(alias="identifier-field-ids", default_factory=list) - _name_to_id: Dict[str, int] = PrivateAttr() + _name_to_id: dict[str, int] = PrivateAttr() def __init__(self, *fields: NestedField, **data: Any): if fields: @@ -138,12 +135,12 @@ def check_schema(self) -> Schema: return self @property - def columns(self) -> Tuple[NestedField, ...]: + def columns(self) -> tuple[NestedField, ...]: """A tuple of the top-level fields.""" return self.fields @cached_property - def _lazy_id_to_field(self) -> Dict[int, NestedField]: + def _lazy_id_to_field(self) -> dict[int, NestedField]: """Return an index of field ID to NestedField instance. This is calculated once when called for the first time. Subsequent calls to this method will use a cached index. @@ -151,7 +148,7 @@ def _lazy_id_to_field(self) -> Dict[int, NestedField]: return index_by_id(self) @cached_property - def _lazy_id_to_parent(self) -> Dict[int, int]: + def _lazy_id_to_parent(self) -> dict[int, int]: """Returns an index of field ID to parent field IDs. This is calculated once when called for the first time. Subsequent calls to this method will use a cached index. @@ -159,7 +156,7 @@ def _lazy_id_to_parent(self) -> Dict[int, int]: return _index_parents(self) @cached_property - def _lazy_name_to_id_lower(self) -> Dict[str, int]: + def _lazy_name_to_id_lower(self) -> dict[str, int]: """Return an index of lower-case field names to field IDs. This is calculated once when called for the first time. Subsequent calls to this method will use a cached index. @@ -167,7 +164,7 @@ def _lazy_name_to_id_lower(self) -> Dict[str, int]: return {name.lower(): field_id for name, field_id in self._name_to_id.items()} @cached_property - def _lazy_id_to_name(self) -> Dict[int, str]: + def _lazy_id_to_name(self) -> dict[int, str]: """Return an index of field ID to full name. This is calculated once when called for the first time. Subsequent calls to this method will use a cached index. @@ -175,7 +172,7 @@ def _lazy_id_to_name(self) -> Dict[int, str]: return index_name_by_id(self) @cached_property - def _lazy_id_to_accessor(self) -> Dict[int, Accessor]: + def _lazy_id_to_accessor(self) -> dict[int, Accessor]: """Return an index of field ID to accessor. This is calculated once when called for the first time. Subsequent calls to this method will use a cached index. @@ -257,7 +254,7 @@ def find_column_name(self, column_id: int) -> str | None: return self._lazy_id_to_name.get(column_id) @property - def column_names(self) -> List[str]: + def column_names(self) -> list[str]: """ Return a list of all the column names, including nested fields. @@ -285,7 +282,7 @@ def accessor_for_field(self, field_id: int) -> Accessor: return self._lazy_id_to_accessor[field_id] - def identifier_field_names(self) -> Set[str]: + def identifier_field_names(self) -> set[str]: """Return the names of the identifier fields. 
Returns: @@ -324,7 +321,7 @@ def select(self, *names: str, case_sensitive: bool = True) -> Schema: return prune_columns(self, ids) @property - def field_ids(self) -> Set[int]: + def field_ids(self) -> set[int]: """Return the IDs of the current schema.""" return set(self._name_to_id.values()) @@ -350,7 +347,7 @@ def _validate_identifier_field(self, field_id: int) -> None: # Check whether the nested field is in a chain of required struct fields # Exploring from root for better error message for list and map types parent_id = self._lazy_id_to_parent.get(field.field_id) - fields: List[int] = [] + fields: list[int] = [] while parent_id is not None: fields.append(parent_id) parent_id = self._lazy_id_to_parent.get(parent_id) @@ -417,7 +414,7 @@ def schema(self, schema: Schema, struct_result: T) -> T: """Visit a Schema.""" @abstractmethod - def struct(self, struct: StructType, field_results: List[T]) -> T: + def struct(self, struct: StructType, field_results: builtins.list[T]) -> T: """Visit a StructType.""" @abstractmethod @@ -443,7 +440,7 @@ def schema(self, schema: Schema, struct_result: Callable[[], T]) -> T: """Visit a Schema.""" @abstractmethod - def struct(self, struct: StructType, field_results: List[Callable[[], T]]) -> T: + def struct(self, struct: StructType, field_results: builtins.list[Callable[[], T]]) -> T: """Visit a StructType.""" @abstractmethod @@ -499,7 +496,7 @@ def schema(self, schema: Schema, schema_partner: P | None, struct_result: T) -> """Visit a schema with a partner.""" @abstractmethod - def struct(self, struct: StructType, struct_partner: P | None, field_results: List[T]) -> T: + def struct(self, struct: StructType, struct_partner: P | None, field_results: builtins.list[T]) -> T: """Visit a struct type with a partner.""" @abstractmethod @@ -979,41 +976,41 @@ def _(obj: PrimitiveType, visitor: PreOrderSchemaVisitor[T]) -> T: return visitor.primitive(obj) -class _IndexById(SchemaVisitor[Dict[int, NestedField]]): +class _IndexById(SchemaVisitor[dict[int, NestedField]]): """A schema visitor for generating a field ID to NestedField index.""" def __init__(self) -> None: - self._index: Dict[int, NestedField] = {} + self._index: dict[int, NestedField] = {} - def schema(self, schema: Schema, struct_result: Dict[int, NestedField]) -> Dict[int, NestedField]: + def schema(self, schema: Schema, struct_result: dict[int, NestedField]) -> dict[int, NestedField]: return self._index - def struct(self, struct: StructType, field_results: List[Dict[int, NestedField]]) -> Dict[int, NestedField]: + def struct(self, struct: StructType, field_results: builtins.list[dict[int, NestedField]]) -> dict[int, NestedField]: return self._index - def field(self, field: NestedField, field_result: Dict[int, NestedField]) -> Dict[int, NestedField]: + def field(self, field: NestedField, field_result: dict[int, NestedField]) -> dict[int, NestedField]: """Add the field ID to the index.""" self._index[field.field_id] = field return self._index - def list(self, list_type: ListType, element_result: Dict[int, NestedField]) -> Dict[int, NestedField]: + def list(self, list_type: ListType, element_result: dict[int, NestedField]) -> dict[int, NestedField]: """Add the list element ID to the index.""" self._index[list_type.element_field.field_id] = list_type.element_field return self._index def map( - self, map_type: MapType, key_result: Dict[int, NestedField], value_result: Dict[int, NestedField] - ) -> Dict[int, NestedField]: + self, map_type: MapType, key_result: dict[int, NestedField], value_result: dict[int, 
NestedField] + ) -> dict[int, NestedField]: """Add the key ID and value ID as individual items in the index.""" self._index[map_type.key_field.field_id] = map_type.key_field self._index[map_type.value_field.field_id] = map_type.value_field return self._index - def primitive(self, primitive: PrimitiveType) -> Dict[int, NestedField]: + def primitive(self, primitive: PrimitiveType) -> dict[int, NestedField]: return self._index -def index_by_id(schema_or_type: Schema | IcebergType) -> Dict[int, NestedField]: +def index_by_id(schema_or_type: Schema | IcebergType) -> dict[int, NestedField]: """Generate an index of field IDs to NestedField instances. Args: @@ -1025,10 +1022,10 @@ def index_by_id(schema_or_type: Schema | IcebergType) -> Dict[int, NestedField]: return visit(schema_or_type, _IndexById()) -class _IndexParents(SchemaVisitor[Dict[int, int]]): +class _IndexParents(SchemaVisitor[dict[int, int]]): def __init__(self) -> None: - self.id_to_parent: Dict[int, int] = {} - self.id_stack: List[int] = [] + self.id_to_parent: dict[int, int] = {} + self.id_stack: list[int] = [] def before_field(self, field: NestedField) -> None: self.id_stack.append(field.field_id) @@ -1036,10 +1033,10 @@ def before_field(self, field: NestedField) -> None: def after_field(self, field: NestedField) -> None: self.id_stack.pop() - def schema(self, schema: Schema, struct_result: Dict[int, int]) -> Dict[int, int]: + def schema(self, schema: Schema, struct_result: dict[int, int]) -> dict[int, int]: return self.id_to_parent - def struct(self, struct: StructType, field_results: List[Dict[int, int]]) -> Dict[int, int]: + def struct(self, struct: StructType, field_results: builtins.list[dict[int, int]]) -> dict[int, int]: for field in struct.fields: parent_id = self.id_stack[-1] if self.id_stack else None if parent_id is not None: @@ -1048,23 +1045,23 @@ def struct(self, struct: StructType, field_results: List[Dict[int, int]]) -> Dic return self.id_to_parent - def field(self, field: NestedField, field_result: Dict[int, int]) -> Dict[int, int]: + def field(self, field: NestedField, field_result: dict[int, int]) -> dict[int, int]: return self.id_to_parent - def list(self, list_type: ListType, element_result: Dict[int, int]) -> Dict[int, int]: + def list(self, list_type: ListType, element_result: dict[int, int]) -> dict[int, int]: self.id_to_parent[list_type.element_id] = self.id_stack[-1] return self.id_to_parent - def map(self, map_type: MapType, key_result: Dict[int, int], value_result: Dict[int, int]) -> Dict[int, int]: + def map(self, map_type: MapType, key_result: dict[int, int], value_result: dict[int, int]) -> dict[int, int]: self.id_to_parent[map_type.key_id] = self.id_stack[-1] self.id_to_parent[map_type.value_id] = self.id_stack[-1] return self.id_to_parent - def primitive(self, primitive: PrimitiveType) -> Dict[int, int]: + def primitive(self, primitive: PrimitiveType) -> dict[int, int]: return self.id_to_parent -def _index_parents(schema_or_type: Schema | IcebergType) -> Dict[int, int]: +def _index_parents(schema_or_type: Schema | IcebergType) -> dict[int, int]: """Generate an index of field IDs to their parent field IDs. 
Args: @@ -1076,15 +1073,15 @@ def _index_parents(schema_or_type: Schema | IcebergType) -> Dict[int, int]: return visit(schema_or_type, _IndexParents()) -class _IndexByName(SchemaVisitor[Dict[str, int]]): +class _IndexByName(SchemaVisitor[dict[str, int]]): """A schema visitor for generating a field name to field ID index.""" def __init__(self) -> None: - self._index: Dict[str, int] = {} - self._short_name_to_id: Dict[str, int] = {} - self._combined_index: Dict[str, int] = {} - self._field_names: List[str] = [] - self._short_field_names: List[str] = [] + self._index: dict[str, int] = {} + self._short_name_to_id: dict[str, int] = {} + self._combined_index: dict[str, int] = {} + self._field_names: list[str] = [] + self._short_field_names: list[str] = [] def before_map_value(self, value: NestedField) -> None: if not isinstance(value.field_type, StructType): @@ -1117,23 +1114,23 @@ def after_field(self, field: NestedField) -> None: self._field_names.pop() self._short_field_names.pop() - def schema(self, schema: Schema, struct_result: Dict[str, int]) -> Dict[str, int]: + def schema(self, schema: Schema, struct_result: dict[str, int]) -> dict[str, int]: return self._index - def struct(self, struct: StructType, field_results: List[Dict[str, int]]) -> Dict[str, int]: + def struct(self, struct: StructType, field_results: builtins.list[dict[str, int]]) -> dict[str, int]: return self._index - def field(self, field: NestedField, field_result: Dict[str, int]) -> Dict[str, int]: + def field(self, field: NestedField, field_result: dict[str, int]) -> dict[str, int]: """Add the field name to the index.""" self._add_field(field.name, field.field_id) return self._index - def list(self, list_type: ListType, element_result: Dict[str, int]) -> Dict[str, int]: + def list(self, list_type: ListType, element_result: dict[str, int]) -> dict[str, int]: """Add the list element name to the index.""" self._add_field(list_type.element_field.name, list_type.element_field.field_id) return self._index - def map(self, map_type: MapType, key_result: Dict[str, int], value_result: Dict[str, int]) -> Dict[str, int]: + def map(self, map_type: MapType, key_result: dict[str, int], value_result: dict[str, int]) -> dict[str, int]: """Add the key name and value name as individual items in the index.""" self._add_field(map_type.key_field.name, map_type.key_field.field_id) self._add_field(map_type.value_field.name, map_type.value_field.field_id) @@ -1162,10 +1159,10 @@ def _add_field(self, name: str, field_id: int) -> None: short_name = ".".join([".".join(self._short_field_names), name]) self._short_name_to_id[short_name] = field_id - def primitive(self, primitive: PrimitiveType) -> Dict[str, int]: + def primitive(self, primitive: PrimitiveType) -> dict[str, int]: return self._index - def by_name(self) -> Dict[str, int]: + def by_name(self) -> dict[str, int]: """Return an index of combined full and short names. Note: Only short names that do not conflict with full names are included. @@ -1174,13 +1171,13 @@ def by_name(self) -> Dict[str, int]: combined_index.update(self._index) return combined_index - def by_id(self) -> Dict[int, str]: + def by_id(self) -> dict[int, str]: """Return an index of ID to full names.""" id_to_full_name = {value: key for key, value in self._index.items()} return id_to_full_name -def index_by_name(schema_or_type: Schema | IcebergType) -> Dict[str, int]: +def index_by_name(schema_or_type: Schema | IcebergType) -> dict[str, int]: """Generate an index of field names to field IDs. 
Args: @@ -1197,7 +1194,7 @@ def index_by_name(schema_or_type: Schema | IcebergType) -> Dict[str, int]: return EMPTY_DICT -def index_name_by_id(schema_or_type: Schema | IcebergType) -> Dict[int, str]: +def index_name_by_id(schema_or_type: Schema | IcebergType) -> dict[int, str]: """Generate an index of field IDs full field names. Args: @@ -1214,7 +1211,7 @@ def index_name_by_id(schema_or_type: Schema | IcebergType) -> Dict[int, str]: Position = int -class _BuildPositionAccessors(SchemaVisitor[Dict[Position, Accessor]]): +class _BuildPositionAccessors(SchemaVisitor[dict[Position, Accessor]]): """A schema visitor for generating a field ID to accessor index. Example: @@ -1247,10 +1244,10 @@ class _BuildPositionAccessors(SchemaVisitor[Dict[Position, Accessor]]): True """ - def schema(self, schema: Schema, struct_result: Dict[Position, Accessor]) -> Dict[Position, Accessor]: + def schema(self, schema: Schema, struct_result: dict[Position, Accessor]) -> dict[Position, Accessor]: return struct_result - def struct(self, struct: StructType, field_results: List[Dict[Position, Accessor]]) -> Dict[Position, Accessor]: + def struct(self, struct: StructType, field_results: builtins.list[dict[Position, Accessor]]) -> dict[Position, Accessor]: result = {} for position, field in enumerate(struct.fields): @@ -1261,22 +1258,22 @@ def struct(self, struct: StructType, field_results: List[Dict[Position, Accessor return result - def field(self, field: NestedField, field_result: Dict[Position, Accessor]) -> Dict[Position, Accessor]: + def field(self, field: NestedField, field_result: dict[Position, Accessor]) -> dict[Position, Accessor]: return field_result - def list(self, list_type: ListType, element_result: Dict[Position, Accessor]) -> Dict[Position, Accessor]: + def list(self, list_type: ListType, element_result: dict[Position, Accessor]) -> dict[Position, Accessor]: return {} def map( - self, map_type: MapType, key_result: Dict[Position, Accessor], value_result: Dict[Position, Accessor] - ) -> Dict[Position, Accessor]: + self, map_type: MapType, key_result: dict[Position, Accessor], value_result: dict[Position, Accessor] + ) -> dict[Position, Accessor]: return {} - def primitive(self, primitive: PrimitiveType) -> Dict[Position, Accessor]: + def primitive(self, primitive: PrimitiveType) -> dict[Position, Accessor]: return {} -def build_position_accessors(schema_or_type: Schema | IcebergType) -> Dict[int, Accessor]: +def build_position_accessors(schema_or_type: Schema | IcebergType) -> dict[int, Accessor]: """Generate an index of field IDs to schema position accessors. 
Args: @@ -1296,7 +1293,7 @@ def assign_fresh_schema_ids(schema_or_type: Schema | IcebergType, next_id: Calla class _SetFreshIDs(PreOrderSchemaVisitor[IcebergType]): """Traverses the schema and assigns monotonically increasing ids.""" - old_id_to_new_id: Dict[int, int] + old_id_to_new_id: dict[int, int] def __init__(self, next_id_func: Callable[[], int] | None = None) -> None: self.old_id_to_new_id = {} @@ -1314,7 +1311,7 @@ def schema(self, schema: Schema, struct_result: Callable[[], StructType]) -> Sch identifier_field_ids=[self.old_id_to_new_id[field_id] for field_id in schema.identifier_field_ids], ) - def struct(self, struct: StructType, field_results: List[Callable[[], IcebergType]]) -> StructType: + def struct(self, struct: StructType, field_results: builtins.list[Callable[[], IcebergType]]) -> StructType: new_ids = [self._get_and_increment(field.field_id) for field in struct.fields] new_fields = [] for field_id, field, field_type in zip(new_ids, struct.fields, field_results, strict=True): @@ -1445,7 +1442,7 @@ def field(self, field: NestedField, field_result: IcebergType | None) -> Iceberg required=field.required, ) - def struct(self, struct: StructType, field_results: List[IcebergType | None]) -> IcebergType | None: + def struct(self, struct: StructType, field_results: builtins.list[IcebergType | None]) -> IcebergType | None: return StructType(*[field for field in field_results if field is not None]) def list(self, list_type: ListType, element_result: IcebergType | None) -> IcebergType | None: @@ -1464,7 +1461,7 @@ def primitive(self, primitive: PrimitiveType) -> IcebergType | None: return primitive -def prune_columns(schema: Schema, selected: Set[int], select_full_types: bool = True) -> Schema: +def prune_columns(schema: Schema, selected: set[int], select_full_types: bool = True) -> Schema: """Prunes a column by only selecting a set of field-ids. 
Args: @@ -1484,17 +1481,17 @@ def prune_columns(schema: Schema, selected: Set[int], select_full_types: bool = class _PruneColumnsVisitor(SchemaVisitor[IcebergType | None]): - selected: Set[int] + selected: set[int] select_full_types: bool - def __init__(self, selected: Set[int], select_full_types: bool): + def __init__(self, selected: set[int], select_full_types: bool): self.selected = selected self.select_full_types = select_full_types def schema(self, schema: Schema, struct_result: IcebergType | None) -> IcebergType | None: return struct_result - def struct(self, struct: StructType, field_results: List[IcebergType | None]) -> IcebergType | None: + def struct(self, struct: StructType, field_results: builtins.list[IcebergType | None]) -> IcebergType | None: fields = struct.fields selected_fields = [] same_type = True @@ -1781,7 +1778,7 @@ def schema(self, schema: Schema, struct_result: Callable[[], bool]) -> bool: raise ValueError(f"Mismatch in fields:\n{self.console.export_text()}") return result - def struct(self, struct: StructType, field_results: List[Callable[[], bool]]) -> bool: + def struct(self, struct: StructType, field_results: builtins.list[Callable[[], bool]]) -> bool: results = [result() for result in field_results] return all(results) diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 42ea40980b..56cab7618f 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -29,13 +29,8 @@ TYPE_CHECKING, Any, Callable, - Dict, Iterable, Iterator, - List, - Set, - Tuple, - Type, TypeVar, ) @@ -254,8 +249,8 @@ class TableProperties: class Transaction: _table: Table _autocommit: bool - _updates: Tuple[TableUpdate, ...] - _requirements: Tuple[TableRequirement, ...] + _updates: tuple[TableUpdate, ...] + _requirements: tuple[TableRequirement, ...] def __init__(self, table: Table, autocommit: bool = False): """Open a transaction to stage and commit changes to a table. @@ -277,12 +272,12 @@ def __enter__(self) -> Transaction: """Start a transaction to update the table.""" return self - def __exit__(self, exctype: Type[BaseException] | None, excinst: BaseException | None, exctb: TracebackType | None) -> None: + def __exit__(self, exctype: type[BaseException] | None, excinst: BaseException | None, exctb: TracebackType | None) -> None: """Close and commit the transaction if no exceptions have been raised.""" if exctype is None and excinst is None and exctb is None: self.commit_transaction() - def _apply(self, updates: Tuple[TableUpdate, ...], requirements: Tuple[TableRequirement, ...] = ()) -> Transaction: + def _apply(self, updates: tuple[TableUpdate, ...], requirements: tuple[TableRequirement, ...] = ()) -> Transaction: """Check if the requirements are met, and applies the updates to the metadata.""" for requirement in requirements: requirement.validate(self.table_metadata) @@ -377,7 +372,7 @@ def _set_ref_snapshot( return updates, requirements - def _build_partition_predicate(self, partition_records: Set[Record]) -> BooleanExpression: + def _build_partition_predicate(self, partition_records: set[Record]) -> BooleanExpression: """Build a filter predicate matching any of the input partition records. 
Args: @@ -404,7 +399,7 @@ def _build_partition_predicate(self, partition_records: Set[Record]) -> BooleanE return expr def _append_snapshot_producer( - self, snapshot_properties: Dict[str, str], branch: str | None = MAIN_BRANCH + self, snapshot_properties: dict[str, str], branch: str | None = MAIN_BRANCH ) -> _FastAppendFiles: """Determine the append type based on table properties. @@ -453,7 +448,7 @@ def update_sort_order(self, case_sensitive: bool = True) -> UpdateSortOrder: ) def update_snapshot( - self, snapshot_properties: Dict[str, str] = EMPTY_DICT, branch: str | None = MAIN_BRANCH + self, snapshot_properties: dict[str, str] = EMPTY_DICT, branch: str | None = MAIN_BRANCH ) -> UpdateSnapshot: """Create a new UpdateSnapshot to produce a new snapshot for the table. @@ -471,7 +466,7 @@ def update_statistics(self) -> UpdateStatistics: """ return UpdateStatistics(transaction=self) - def append(self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT, branch: str | None = MAIN_BRANCH) -> None: + def append(self, df: pa.Table, snapshot_properties: dict[str, str] = EMPTY_DICT, branch: str | None = MAIN_BRANCH) -> None: """ Shorthand API for appending a PyArrow table to a table transaction. @@ -510,7 +505,7 @@ def append(self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT, append_files.append_data_file(data_file) def dynamic_partition_overwrite( - self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT, branch: str | None = MAIN_BRANCH + self, df: pa.Table, snapshot_properties: dict[str, str] = EMPTY_DICT, branch: str | None = MAIN_BRANCH ) -> None: """ Shorthand for overwriting existing partitions with a PyArrow table. @@ -556,7 +551,7 @@ def dynamic_partition_overwrite( return append_snapshot_commit_uuid = uuid.uuid4() - data_files: List[DataFile] = list( + data_files: list[DataFile] = list( _dataframe_to_data_files( table_metadata=self._table.metadata, write_uuid=append_snapshot_commit_uuid, df=df, io=self._table.io ) @@ -575,7 +570,7 @@ def overwrite( self, df: pa.Table, overwrite_filter: BooleanExpression | str = ALWAYS_TRUE, - snapshot_properties: Dict[str, str] = EMPTY_DICT, + snapshot_properties: dict[str, str] = EMPTY_DICT, case_sensitive: bool = True, branch: str | None = MAIN_BRANCH, ) -> None: @@ -635,7 +630,7 @@ def overwrite( def delete( self, delete_filter: str | BooleanExpression, - snapshot_properties: Dict[str, str] = EMPTY_DICT, + snapshot_properties: dict[str, str] = EMPTY_DICT, case_sensitive: bool = True, branch: str | None = MAIN_BRANCH, ) -> None: @@ -684,7 +679,7 @@ def delete( commit_uuid = uuid.uuid4() counter = itertools.count(0) - replaced_files: List[Tuple[DataFile, List[DataFile]]] = [] + replaced_files: list[tuple[DataFile, list[DataFile]]] = [] # This will load the Parquet file into memory, including: # - Filter out the rows based on the delete filter # - Projecting it to the current schema @@ -736,7 +731,7 @@ def delete( def upsert( self, df: pa.Table, - join_cols: List[str] | None = None, + join_cols: list[str] | None = None, when_matched_update_all: bool = True, when_not_matched_insert_all: bool = True, case_sensitive: bool = True, @@ -879,8 +874,8 @@ def upsert( def add_files( self, - file_paths: List[str], - snapshot_properties: Dict[str, str] = EMPTY_DICT, + file_paths: list[str], + snapshot_properties: dict[str, str] = EMPTY_DICT, check_duplicate_files: bool = True, branch: str | None = MAIN_BRANCH, ) -> None: @@ -1025,10 +1020,10 @@ def commit_transaction(self) -> Table: return self._table -class 
Namespace(IcebergRootModel[List[str]]): +class Namespace(IcebergRootModel[list[str]]): """Reference to one or more levels of a namespace.""" - root: List[str] = Field( + root: list[str] = Field( ..., description="Reference to one or more levels of a namespace", ) @@ -1045,8 +1040,8 @@ class CommitTableRequest(IcebergBaseModel): """A pydantic BaseModel for a table commit request.""" identifier: TableIdentifier = Field() - requirements: Tuple[TableRequirement, ...] = Field(default_factory=tuple) - updates: Tuple[TableUpdate, ...] = Field(default_factory=tuple) + requirements: tuple[TableRequirement, ...] = Field(default_factory=tuple) + updates: tuple[TableUpdate, ...] = Field(default_factory=tuple) class CommitTableResponse(IcebergBaseModel): @@ -1064,7 +1059,7 @@ class Table: metadata_location: str = Field() io: FileIO catalog: Catalog - config: Dict[str, str] + config: dict[str, str] def __init__( self, @@ -1073,7 +1068,7 @@ def __init__( metadata_location: str, io: FileIO, catalog: Catalog, - config: Dict[str, str] = EMPTY_DICT, + config: dict[str, str] = EMPTY_DICT, ) -> None: self._identifier = identifier self.metadata = metadata @@ -1131,7 +1126,7 @@ def name(self) -> Identifier: def scan( self, row_filter: str | BooleanExpression = ALWAYS_TRUE, - selected_fields: Tuple[str, ...] = ("*",), + selected_fields: tuple[str, ...] = ("*",), case_sensitive: bool = True, snapshot_id: int | None = None, options: Properties = EMPTY_DICT, @@ -1185,7 +1180,7 @@ def schema(self) -> Schema: """Return the schema for this table.""" return next(schema for schema in self.metadata.schemas if schema.schema_id == self.metadata.current_schema_id) - def schemas(self) -> Dict[int, Schema]: + def schemas(self) -> dict[int, Schema]: """Return a dict of the schema of this table.""" return {schema.schema_id: schema for schema in self.metadata.schemas} @@ -1193,7 +1188,7 @@ def spec(self) -> PartitionSpec: """Return the partition spec of this table.""" return next(spec for spec in self.metadata.partition_specs if spec.spec_id == self.metadata.default_spec_id) - def specs(self) -> Dict[int, PartitionSpec]: + def specs(self) -> dict[int, PartitionSpec]: """Return a dict the partition specs this table.""" return {spec.spec_id: spec for spec in self.metadata.partition_specs} @@ -1203,7 +1198,7 @@ def sort_order(self) -> SortOrder: sort_order for sort_order in self.metadata.sort_orders if sort_order.order_id == self.metadata.default_sort_order_id ) - def sort_orders(self) -> Dict[int, SortOrder]: + def sort_orders(self) -> dict[int, SortOrder]: """Return a dict of the sort orders of this table.""" return {sort_order.order_id: sort_order for sort_order in self.metadata.sort_orders} @@ -1214,7 +1209,7 @@ def last_partition_id(self) -> int: return PARTITION_FIELD_ID_START - 1 @property - def properties(self) -> Dict[str, str]: + def properties(self) -> dict[str, str]: """Properties of the table.""" return self.metadata.properties @@ -1236,7 +1231,7 @@ def current_snapshot(self) -> Snapshot | None: return self.snapshot_by_id(self.metadata.current_snapshot_id) return None - def snapshots(self) -> List[Snapshot]: + def snapshots(self) -> list[Snapshot]: return self.metadata.snapshots def snapshot_by_id(self, snapshot_id: int) -> Snapshot | None: @@ -1261,7 +1256,7 @@ def snapshot_as_of_timestamp(self, timestamp_ms: int, inclusive: bool = True) -> return self.snapshot_by_id(log_entry.snapshot_id) return None - def history(self) -> List[SnapshotLogEntry]: + def history(self) -> list[SnapshotLogEntry]: """Get the snapshot 
history of this table.""" return self.metadata.snapshot_log @@ -1329,7 +1324,7 @@ def name_mapping(self) -> NameMapping | None: def upsert( self, df: pa.Table, - join_cols: List[str] | None = None, + join_cols: list[str] | None = None, when_matched_update_all: bool = True, when_not_matched_insert_all: bool = True, case_sensitive: bool = True, @@ -1380,7 +1375,7 @@ def upsert( branch=branch, ) - def append(self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT, branch: str | None = MAIN_BRANCH) -> None: + def append(self, df: pa.Table, snapshot_properties: dict[str, str] = EMPTY_DICT, branch: str | None = MAIN_BRANCH) -> None: """ Shorthand API for appending a PyArrow table to the table. @@ -1393,7 +1388,7 @@ def append(self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT, tx.append(df=df, snapshot_properties=snapshot_properties, branch=branch) def dynamic_partition_overwrite( - self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT, branch: str | None = MAIN_BRANCH + self, df: pa.Table, snapshot_properties: dict[str, str] = EMPTY_DICT, branch: str | None = MAIN_BRANCH ) -> None: """Shorthand for dynamic overwriting the table with a PyArrow table. @@ -1410,7 +1405,7 @@ def overwrite( self, df: pa.Table, overwrite_filter: BooleanExpression | str = ALWAYS_TRUE, - snapshot_properties: Dict[str, str] = EMPTY_DICT, + snapshot_properties: dict[str, str] = EMPTY_DICT, case_sensitive: bool = True, branch: str | None = MAIN_BRANCH, ) -> None: @@ -1443,7 +1438,7 @@ def overwrite( def delete( self, delete_filter: BooleanExpression | str = ALWAYS_TRUE, - snapshot_properties: Dict[str, str] = EMPTY_DICT, + snapshot_properties: dict[str, str] = EMPTY_DICT, case_sensitive: bool = True, branch: str | None = MAIN_BRANCH, ) -> None: @@ -1463,8 +1458,8 @@ def delete( def add_files( self, - file_paths: List[str], - snapshot_properties: Dict[str, str] = EMPTY_DICT, + file_paths: list[str], + snapshot_properties: dict[str, str] = EMPTY_DICT, check_duplicate_files: bool = True, branch: str | None = MAIN_BRANCH, ) -> None: @@ -1488,11 +1483,11 @@ def add_files( def update_spec(self, case_sensitive: bool = True) -> UpdateSpec: return UpdateSpec(Transaction(self, autocommit=True), case_sensitive=case_sensitive) - def refs(self) -> Dict[str, SnapshotRef]: + def refs(self) -> dict[str, SnapshotRef]: """Return the snapshot references in the table.""" return self.metadata.refs - def _do_commit(self, updates: Tuple[TableUpdate, ...], requirements: Tuple[TableRequirement, ...]) -> None: + def _do_commit(self, updates: tuple[TableUpdate, ...], requirements: tuple[TableRequirement, ...]) -> None: response = self.catalog.commit_table(self, requirements, updates) # https://github.com/apache/iceberg/blob/f6faa58/core/src/main/java/org/apache/iceberg/CatalogUtil.java#L527 @@ -1654,7 +1649,7 @@ def refresh(self) -> Table: def scan( self, row_filter: str | BooleanExpression = ALWAYS_TRUE, - selected_fields: Tuple[str, ...] = ("*",), + selected_fields: tuple[str, ...] = ("*",), case_sensitive: bool = True, snapshot_id: int | None = None, options: Properties = EMPTY_DICT, @@ -1686,7 +1681,7 @@ class TableScan(ABC): table_metadata: TableMetadata io: FileIO row_filter: BooleanExpression - selected_fields: Tuple[str, ...] + selected_fields: tuple[str, ...] case_sensitive: bool snapshot_id: int | None options: Properties @@ -1697,7 +1692,7 @@ def __init__( table_metadata: TableMetadata, io: FileIO, row_filter: str | BooleanExpression = ALWAYS_TRUE, - selected_fields: Tuple[str, ...] 
= ("*",), + selected_fields: tuple[str, ...] = ("*",), case_sensitive: bool = True, snapshot_id: int | None = None, options: Properties = EMPTY_DICT, @@ -1792,7 +1787,7 @@ class FileScanTask(ScanTask): """Task representing a data file and its corresponding delete files.""" file: DataFile - delete_files: Set[DataFile] + delete_files: set[DataFile] start: int length: int residual: BooleanExpression @@ -1800,7 +1795,7 @@ class FileScanTask(ScanTask): def __init__( self, data_file: DataFile, - delete_files: Set[DataFile] | None = None, + delete_files: set[DataFile] | None = None, start: int | None = None, length: int | None = None, residual: BooleanExpression = ALWAYS_TRUE, @@ -1817,7 +1812,7 @@ def _open_manifest( manifest: ManifestFile, partition_filter: Callable[[DataFile], bool], metrics_evaluator: Callable[[DataFile], bool], -) -> List[ManifestEntry]: +) -> list[ManifestEntry]: """Open a manifest file and return matching manifest entries. Returns: @@ -1830,7 +1825,7 @@ def _open_manifest( ] -def _min_sequence_number(manifests: List[ManifestFile]) -> int: +def _min_sequence_number(manifests: list[ManifestFile]) -> int: try: return min( manifest.min_sequence_number or INITIAL_SEQUENCE_NUMBER @@ -1842,7 +1837,7 @@ def _min_sequence_number(manifests: List[ManifestFile]) -> int: return INITIAL_SEQUENCE_NUMBER -def _match_deletes_to_data_file(data_entry: ManifestEntry, positional_delete_entries: SortedList[ManifestEntry]) -> Set[DataFile]: +def _match_deletes_to_data_file(data_entry: ManifestEntry, positional_delete_entries: SortedList[ManifestEntry]) -> set[DataFile]: """Check if the delete file is relevant for the data file. Using the column metrics to see if the filename is in the lower and upper bound. @@ -1939,7 +1934,7 @@ def _check_sequence_number(min_sequence_number: int, manifest: ManifestFile) -> and (manifest.sequence_number or INITIAL_SEQUENCE_NUMBER) >= min_sequence_number ) - def scan_plan_helper(self) -> Iterator[List[ManifestEntry]]: + def scan_plan_helper(self) -> Iterator[list[ManifestEntry]]: """Filter and return manifest entries based on partition and metrics evaluators. Returns: @@ -1952,7 +1947,7 @@ def scan_plan_helper(self) -> Iterator[List[ManifestEntry]]: # step 1: filter manifests using partition summaries # the filter depends on the partition spec used to write the manifest file, so create a cache of filters for each spec id - manifest_evaluators: Dict[int, Callable[[ManifestFile], bool]] = KeyDefaultDict(self._build_manifest_evaluator) + manifest_evaluators: dict[int, Callable[[ManifestFile], bool]] = KeyDefaultDict(self._build_manifest_evaluator) manifests = [ manifest_file @@ -1963,7 +1958,7 @@ def scan_plan_helper(self) -> Iterator[List[ManifestEntry]]: # step 2: filter the data files in each manifest # this filter depends on the partition spec used to write the manifest file - partition_evaluators: Dict[int, Callable[[DataFile], bool]] = KeyDefaultDict(self._build_partition_evaluator) + partition_evaluators: dict[int, Callable[[DataFile], bool]] = KeyDefaultDict(self._build_partition_evaluator) min_sequence_number = _min_sequence_number(manifests) @@ -1989,10 +1984,10 @@ def plan_files(self) -> Iterable[FileScanTask]: Returns: List of FileScanTasks that contain both data and delete files. 
""" - data_entries: List[ManifestEntry] = [] + data_entries: list[ManifestEntry] = [] positional_delete_entries = SortedList(key=lambda entry: entry.sequence_number or INITIAL_SEQUENCE_NUMBER) - residual_evaluators: Dict[int, Callable[[DataFile], ResidualEvaluator]] = KeyDefaultDict(self._build_residual_evaluator) + residual_evaluators: dict[int, Callable[[DataFile], ResidualEvaluator]] = KeyDefaultDict(self._build_residual_evaluator) for manifest_entry in chain.from_iterable(self.scan_plan_helper()): data_file = manifest_entry.data_file @@ -2138,7 +2133,7 @@ class WriteTask: write_uuid: uuid.UUID task_id: int schema: Schema - record_batches: List[pa.RecordBatch] + record_batches: list[pa.RecordBatch] sort_order_id: int | None = None partition_key: PartitionKey | None = None @@ -2148,7 +2143,7 @@ def generate_data_file_filename(self, extension: str) -> str: return f"00000-{self.task_id}-{self.write_uuid}.{extension}" -def _parquet_files_to_data_files(table_metadata: TableMetadata, file_paths: List[str], io: FileIO) -> Iterable[DataFile]: +def _parquet_files_to_data_files(table_metadata: TableMetadata, file_paths: list[str], io: FileIO) -> Iterable[DataFile]: """Convert a list files into DataFiles. Returns: diff --git a/pyiceberg/table/inspect.py b/pyiceberg/table/inspect.py index 45dc735a11..c64fcfbf12 100644 --- a/pyiceberg/table/inspect.py +++ b/pyiceberg/table/inspect.py @@ -18,7 +18,7 @@ import itertools from datetime import datetime, timezone -from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Set, Tuple +from typing import TYPE_CHECKING, Any, Iterator from pyiceberg.conversions import from_bytes from pyiceberg.expressions import AlwaysTrue, BooleanExpression @@ -308,7 +308,7 @@ def partitions( snapshot_id=snapshot.snapshot_id, ) - partitions_map: Dict[Tuple[str, Any], Any] = {} + partitions_map: dict[tuple[str, Any], Any] = {} for entry in itertools.chain.from_iterable(scan.scan_plan_helper()): partition = entry.data_file.partition @@ -327,9 +327,9 @@ def partitions( def _update_partitions_map_from_manifest_entry( self, - partitions_map: Dict[Tuple[str, Any], Any], + partitions_map: dict[tuple[str, Any], Any], file: DataFile, - partition_record_dict: Dict[str, Any], + partition_record_dict: dict[str, Any], snapshot: Snapshot | None, ) -> None: partition_record_key = _convert_to_hashable_type(partition_record_dict) @@ -409,8 +409,8 @@ def _generate_manifests_table(self, snapshot: Snapshot | None, is_all_manifests_ import pyarrow as pa def _partition_summaries_to_rows( - spec: PartitionSpec, partition_summaries: List[PartitionFieldSummary] - ) -> List[Dict[str, Any]]: + spec: PartitionSpec, partition_summaries: list[PartitionFieldSummary] + ) -> list[dict[str, Any]]: rows = [] for i, field_summary in enumerate(partition_summaries): field = spec.fields[i] @@ -492,7 +492,7 @@ def metadata_log_entries(self) -> pa.Table: ] ) - def metadata_log_entry_to_row(metadata_entry: MetadataLogEntry) -> Dict[str, Any]: + def metadata_log_entry_to_row(metadata_entry: MetadataLogEntry) -> dict[str, Any]: latest_snapshot = self.tbl.snapshot_as_of_timestamp(metadata_entry.timestamp_ms) return { "timestamp": metadata_entry.timestamp_ms, @@ -545,7 +545,7 @@ def history(self) -> pa.Table: return pa.Table.from_pylist(history, schema=history_schema) def _get_files_from_manifest( - self, manifest_list: ManifestFile, data_file_filter: Set[DataFileContent] | None = None + self, manifest_list: ManifestFile, data_file_filter: set[DataFileContent] | None = None ) -> pa.Table: import pyarrow as pa @@ 
-663,7 +663,7 @@ def _readable_metrics_struct(bound_type: PrimitiveType) -> pa.StructType: ) return files_schema - def _files(self, snapshot_id: int | None = None, data_file_filter: Set[DataFileContent] | None = None) -> pa.Table: + def _files(self, snapshot_id: int | None = None, data_file_filter: set[DataFileContent] | None = None) -> pa.Table: import pyarrow as pa if not snapshot_id and not self.tbl.metadata.current_snapshot(): @@ -702,7 +702,7 @@ def all_manifests(self) -> pa.Table: ) return pa.concat_tables(manifests_by_snapshots) - def _all_files(self, data_file_filter: Set[DataFileContent] | None = None) -> pa.Table: + def _all_files(self, data_file_filter: set[DataFileContent] | None = None) -> pa.Table: import pyarrow as pa snapshots = self.tbl.snapshots() diff --git a/pyiceberg/table/metadata.py b/pyiceberg/table/metadata.py index 201aaee511..8ae930375a 100644 --- a/pyiceberg/table/metadata.py +++ b/pyiceberg/table/metadata.py @@ -19,7 +19,7 @@ import datetime import uuid from copy import copy -from typing import Annotated, Any, Dict, List, Literal +from typing import Annotated, Any, Literal from pydantic import Field, field_serializer, field_validator, model_validator from pydantic import ValidationError as PydanticValidationError @@ -68,7 +68,7 @@ SUPPORTED_TABLE_FORMAT_VERSION = 2 -def cleanup_snapshot_id(data: Dict[str, Any]) -> Dict[str, Any]: +def cleanup_snapshot_id(data: dict[str, Any]) -> dict[str, Any]: """Run before validation.""" if CURRENT_SNAPSHOT_ID in data and data[CURRENT_SNAPSHOT_ID] == -1: # We treat -1 and None the same, by cleaning this up @@ -92,7 +92,7 @@ def check_partition_specs(table_metadata: TableMetadata) -> TableMetadata: """Check if the default-spec-id is present in partition-specs.""" default_spec_id = table_metadata.default_spec_id - partition_specs: List[PartitionSpec] = table_metadata.partition_specs + partition_specs: list[PartitionSpec] = table_metadata.partition_specs for spec in partition_specs: if spec.spec_id == default_spec_id: return table_metadata @@ -105,7 +105,7 @@ def check_sort_orders(table_metadata: TableMetadata) -> TableMetadata: default_sort_order_id: int = table_metadata.default_sort_order_id if default_sort_order_id != UNSORTED_SORT_ORDER_ID: - sort_orders: List[SortOrder] = table_metadata.sort_orders + sort_orders: list[SortOrder] = table_metadata.sort_orders for sort_order in sort_orders: if sort_order.order_id == default_sort_order_id: return table_metadata @@ -151,13 +151,13 @@ class TableMetadataCommonFields(IcebergBaseModel): This is used to ensure fields are always assigned an unused ID when evolving schemas.""" - schemas: List[Schema] = Field(default_factory=list) + schemas: list[Schema] = Field(default_factory=list) """A list of schemas, stored as objects with schema-id.""" current_schema_id: int = Field(alias="current-schema-id", default=DEFAULT_SCHEMA_ID) """ID of the table’s current schema.""" - partition_specs: List[PartitionSpec] = Field(alias="partition-specs", default_factory=list) + partition_specs: list[PartitionSpec] = Field(alias="partition-specs", default_factory=list) """A list of partition specs, stored as full partition spec objects.""" default_spec_id: int = Field(alias="default-spec-id", default=INITIAL_SPEC_ID) @@ -168,7 +168,7 @@ class TableMetadataCommonFields(IcebergBaseModel): partition specs for the table. 
This is used to ensure partition fields are always assigned an unused ID when evolving specs.""" - properties: Dict[str, str] = Field(default_factory=dict) + properties: dict[str, str] = Field(default_factory=dict) """A string to string map of table properties. This is used to control settings that affect reading and writing and is not intended to be used for arbitrary metadata. For example, commit.retry.num-retries @@ -177,13 +177,13 @@ class TableMetadataCommonFields(IcebergBaseModel): current_snapshot_id: int | None = Field(alias="current-snapshot-id", default=None) """ID of the current table snapshot.""" - snapshots: List[Snapshot] = Field(default_factory=list) + snapshots: list[Snapshot] = Field(default_factory=list) """A list of valid snapshots. Valid snapshots are snapshots for which all data files exist in the file system. A data file must not be deleted from the file system until the last snapshot in which it was listed is garbage collected.""" - snapshot_log: List[SnapshotLogEntry] = Field(alias="snapshot-log", default_factory=list) + snapshot_log: list[SnapshotLogEntry] = Field(alias="snapshot-log", default_factory=list) """A list (optional) of timestamp and snapshot ID pairs that encodes changes to the current snapshot for the table. Each time the current-snapshot-id is changed, a new entry should be added with the @@ -191,7 +191,7 @@ class TableMetadataCommonFields(IcebergBaseModel): expired from the list of valid snapshots, all entries before a snapshot that has expired should be removed.""" - metadata_log: List[MetadataLogEntry] = Field(alias="metadata-log", default_factory=list) + metadata_log: list[MetadataLogEntry] = Field(alias="metadata-log", default_factory=list) """A list (optional) of timestamp and metadata file location pairs that encodes changes to the previous metadata files for the table. Each time a new metadata file is created, a new entry of the previous metadata @@ -199,7 +199,7 @@ class TableMetadataCommonFields(IcebergBaseModel): remove oldest metadata log entries and keep a fixed-size log of the most recent entries after a commit.""" - sort_orders: List[SortOrder] = Field(alias="sort-orders", default_factory=list) + sort_orders: list[SortOrder] = Field(alias="sort-orders", default_factory=list) """A list of sort orders, stored as full sort order objects.""" default_sort_order_id: int = Field(alias="default-sort-order-id", default=UNSORTED_SORT_ORDER_ID) @@ -207,14 +207,14 @@ class TableMetadataCommonFields(IcebergBaseModel): writers, but is not used when reading because reads use the specs stored in manifest files.""" - refs: Dict[str, SnapshotRef] = Field(default_factory=dict) + refs: dict[str, SnapshotRef] = Field(default_factory=dict) """A map of snapshot references. The map keys are the unique snapshot reference names in the table, and the map values are snapshot reference objects. There is always a main branch reference pointing to the current-snapshot-id even if the refs map is null.""" - statistics: List[StatisticsFile] = Field(default_factory=list) + statistics: list[StatisticsFile] = Field(default_factory=list) """A optional list of table statistics files. Table statistics files are valid Puffin files. Statistics are informational. A reader can choose to ignore statistics @@ -222,7 +222,7 @@ class TableMetadataCommonFields(IcebergBaseModel): table correctly. 
A table can contain many statistics files associated with different table snapshots.""" - partition_statistics: List[PartitionStatisticsFile] = Field(alias="partition-statistics", default_factory=list) + partition_statistics: list[PartitionStatisticsFile] = Field(alias="partition-statistics", default_factory=list) """A optional list of partition statistics files. Partition statistics are not required for reading or planning and readers may ignore them. Each table snapshot may be associated @@ -232,7 +232,7 @@ class TableMetadataCommonFields(IcebergBaseModel): # validators @field_validator("properties", mode="before") - def transform_properties_dict_value_to_str(cls, properties: Properties) -> Dict[str, str]: + def transform_properties_dict_value_to_str(cls, properties: Properties) -> dict[str, str]: return transform_dict_value_to_str(properties) def snapshot_by_id(self, snapshot_id: int) -> Snapshot | None: @@ -258,7 +258,7 @@ def spec(self) -> PartitionSpec: """Return the partition spec of this table.""" return next(spec for spec in self.partition_specs if spec.spec_id == self.default_spec_id) - def specs(self) -> Dict[int, PartitionSpec]: + def specs(self) -> dict[int, PartitionSpec]: """Return a dict the partition specs this table.""" return {spec.spec_id: spec for spec in self.partition_specs} @@ -323,7 +323,7 @@ def serialize_current_snapshot_id(self, current_snapshot_id: int | None) -> int return current_snapshot_id @field_serializer("snapshots") - def serialize_snapshots(self, snapshots: List[Snapshot]) -> List[Snapshot]: + def serialize_snapshots(self, snapshots: list[Snapshot]) -> list[Snapshot]: # Snapshot field `sequence-number` should not be written for v1 metadata if self.format_version == 1: return [snapshot.model_copy(update={"sequence_number": None}) for snapshot in snapshots] @@ -361,7 +361,7 @@ class TableMetadataV1(TableMetadataCommonFields, IcebergBaseModel): # to the owner of the table. @model_validator(mode="before") - def cleanup_snapshot_id(cls, data: Dict[str, Any]) -> Dict[str, Any]: + def cleanup_snapshot_id(cls, data: dict[str, Any]) -> dict[str, Any]: return cleanup_snapshot_id(data) @model_validator(mode="after") @@ -369,7 +369,7 @@ def construct_refs(self) -> TableMetadataV1: return construct_refs(self) @model_validator(mode="before") - def set_v2_compatible_defaults(cls, data: Dict[str, Any]) -> Dict[str, Any]: + def set_v2_compatible_defaults(cls, data: dict[str, Any]) -> dict[str, Any]: """Set default values to be compatible with the format v2. Args: @@ -387,7 +387,7 @@ def set_v2_compatible_defaults(cls, data: Dict[str, Any]) -> Dict[str, Any]: return data @model_validator(mode="before") - def construct_schemas(cls, data: Dict[str, Any]) -> Dict[str, Any]: + def construct_schemas(cls, data: dict[str, Any]) -> dict[str, Any]: """Convert the schema into schemas. For V1 schemas is optional, and if they aren't set, we'll set them @@ -406,7 +406,7 @@ def construct_schemas(cls, data: Dict[str, Any]) -> Dict[str, Any]: return data @model_validator(mode="before") - def construct_partition_specs(cls, data: Dict[str, Any]) -> Dict[str, Any]: + def construct_partition_specs(cls, data: dict[str, Any]) -> dict[str, Any]: """Convert the partition_spec into partition_specs. 
For V1 partition_specs is optional, and if they aren't set, we'll set them @@ -441,7 +441,7 @@ def construct_partition_specs(cls, data: Dict[str, Any]) -> Dict[str, Any]: return data @model_validator(mode="before") - def set_sort_orders(cls, data: Dict[str, Any]) -> Dict[str, Any]: + def set_sort_orders(cls, data: dict[str, Any]) -> dict[str, Any]: """Set the sort_orders if not provided. For V1 sort_orders is optional, and if they aren't set, we'll set them @@ -470,7 +470,7 @@ def to_v2(self) -> TableMetadataV2: """The table’s current schema. (Deprecated: use schemas and current-schema-id instead).""" - partition_spec: List[Dict[str, Any]] = Field(alias="partition-spec", default_factory=list) + partition_spec: list[dict[str, Any]] = Field(alias="partition-spec", default_factory=list) """The table’s current partition spec, stored as only fields. Note that this is used by writers to partition data, but is not used when reading because reads use the specs stored in @@ -490,7 +490,7 @@ class TableMetadataV2(TableMetadataCommonFields, IcebergBaseModel): """ @model_validator(mode="before") - def cleanup_snapshot_id(cls, data: Dict[str, Any]) -> Dict[str, Any]: + def cleanup_snapshot_id(cls, data: dict[str, Any]) -> dict[str, Any]: return cleanup_snapshot_id(data) @model_validator(mode="after") @@ -534,7 +534,7 @@ class TableMetadataV3(TableMetadataCommonFields, IcebergBaseModel): """ @model_validator(mode="before") - def cleanup_snapshot_id(cls, data: Dict[str, Any]) -> Dict[str, Any]: + def cleanup_snapshot_id(cls, data: dict[str, Any]) -> dict[str, Any]: return cleanup_snapshot_id(data) @model_validator(mode="after") @@ -655,7 +655,7 @@ def parse_raw(data: str) -> TableMetadata: raise ValidationError(e) from e @staticmethod - def parse_obj(data: Dict[str, Any]) -> TableMetadata: + def parse_obj(data: dict[str, Any]) -> TableMetadata: if "format-version" not in data: raise ValidationError(f"Missing format-version in TableMetadata: {data}") format_version = data["format-version"] diff --git a/pyiceberg/table/name_mapping.py b/pyiceberg/table/name_mapping.py index 1216daa2a4..cce87f1468 100644 --- a/pyiceberg/table/name_mapping.py +++ b/pyiceberg/table/name_mapping.py @@ -23,10 +23,11 @@ from __future__ import annotations +import builtins from abc import ABC, abstractmethod from collections import ChainMap from functools import cached_property, singledispatch -from typing import Any, Dict, Generic, Iterator, List, TypeVar +from typing import Any, Generic, Iterator, TypeVar from pydantic import Field, conlist, field_validator, model_serializer @@ -37,8 +38,8 @@ class MappedField(IcebergBaseModel): field_id: int | None = Field(alias="field-id", default=None) - names: List[str] = conlist(str) - fields: List[MappedField] = Field(default_factory=list) + names: list[str] = conlist(str) + fields: list[MappedField] = Field(default_factory=list) @field_validator("fields", mode="before") @classmethod @@ -46,9 +47,9 @@ def convert_null_to_empty_List(cls, v: Any) -> Any: return v or [] @model_serializer - def ser_model(self) -> Dict[str, Any]: + def ser_model(self) -> dict[str, Any]: """Set custom serializer to leave out the field when it is empty.""" - serialized: Dict[str, Any] = {"names": self.names} + serialized: dict[str, Any] = {"names": self.names} if self.field_id is not None: serialized["field-id"] = self.field_id if len(self.fields) > 0: @@ -68,11 +69,11 @@ def __str__(self) -> str: return "([" + ", ".join(self.names) + "] -> " + field_id + fields_str + ")" -class 
NameMapping(IcebergRootModel[List[MappedField]]): - root: List[MappedField] +class NameMapping(IcebergRootModel[list[MappedField]]): + root: list[MappedField] @cached_property - def _field_by_name(self) -> Dict[str, MappedField]: + def _field_by_name(self) -> dict[str, MappedField]: return visit_name_mapping(self, _IndexByName()) def __len__(self) -> int: @@ -101,7 +102,7 @@ def mapping(self, nm: NameMapping, field_results: S) -> S: """Visit a NameMapping.""" @abstractmethod - def fields(self, struct: List[MappedField], field_results: List[T]) -> S: + def fields(self, struct: list[MappedField], field_results: list[T]) -> S: """Visit a List[MappedField].""" @abstractmethod @@ -109,15 +110,15 @@ def field(self, field: MappedField, field_result: S) -> T: """Visit a MappedField.""" -class _IndexByName(NameMappingVisitor[Dict[str, MappedField], Dict[str, MappedField]]): - def mapping(self, nm: NameMapping, field_results: Dict[str, MappedField]) -> Dict[str, MappedField]: +class _IndexByName(NameMappingVisitor[dict[str, MappedField], dict[str, MappedField]]): + def mapping(self, nm: NameMapping, field_results: dict[str, MappedField]) -> dict[str, MappedField]: return field_results - def fields(self, struct: List[MappedField], field_results: List[Dict[str, MappedField]]) -> Dict[str, MappedField]: + def fields(self, struct: list[MappedField], field_results: list[dict[str, MappedField]]) -> dict[str, MappedField]: return dict(ChainMap(*field_results)) - def field(self, field: MappedField, field_result: Dict[str, MappedField]) -> Dict[str, MappedField]: - result: Dict[str, MappedField] = { + def field(self, field: MappedField, field_result: dict[str, MappedField]) -> dict[str, MappedField]: + result: dict[str, MappedField] = { f"{field_name}.{key}": result_field for key, result_field in field_result.items() for field_name in field.names } @@ -128,7 +129,7 @@ def field(self, field: MappedField, field_result: Dict[str, MappedField]) -> Dic @singledispatch -def visit_name_mapping(obj: NameMapping | List[MappedField] | MappedField, visitor: NameMappingVisitor[S, T]) -> S: +def visit_name_mapping(obj: NameMapping | list[MappedField] | MappedField, visitor: NameMappingVisitor[S, T]) -> S: """Traverse the name mapping in post-order traversal.""" raise NotImplementedError(f"Cannot visit non-type: {obj}") @@ -139,7 +140,7 @@ def _(obj: NameMapping, visitor: NameMappingVisitor[S, T]) -> S: @visit_name_mapping.register(list) -def _(fields: List[MappedField], visitor: NameMappingVisitor[S, T]) -> S: +def _(fields: list[MappedField], visitor: NameMappingVisitor[S, T]) -> S: results = [visitor.field(field, visit_name_mapping(field.fields, visitor)) for field in fields] return visitor.fields(fields, results) @@ -148,42 +149,44 @@ def parse_mapping_from_json(mapping: str) -> NameMapping: return NameMapping.model_validate_json(mapping) -class _CreateMapping(SchemaVisitor[List[MappedField]]): - def schema(self, schema: Schema, struct_result: List[MappedField]) -> List[MappedField]: +class _CreateMapping(SchemaVisitor[list[MappedField]]): + def schema(self, schema: Schema, struct_result: builtins.list[MappedField]) -> builtins.list[MappedField]: return struct_result - def struct(self, struct: StructType, field_results: List[List[MappedField]]) -> List[MappedField]: + def struct(self, struct: StructType, field_results: builtins.list[builtins.list[MappedField]]) -> builtins.list[MappedField]: return [ MappedField(field_id=field.field_id, names=[field.name], fields=result) for field, result in zip(struct.fields, 
field_results, strict=True) ] - def field(self, field: NestedField, field_result: List[MappedField]) -> List[MappedField]: + def field(self, field: NestedField, field_result: builtins.list[MappedField]) -> builtins.list[MappedField]: return field_result - def list(self, list_type: ListType, element_result: List[MappedField]) -> List[MappedField]: + def list(self, list_type: ListType, element_result: builtins.list[MappedField]) -> builtins.list[MappedField]: return [MappedField(field_id=list_type.element_id, names=["element"], fields=element_result)] - def map(self, map_type: MapType, key_result: List[MappedField], value_result: List[MappedField]) -> List[MappedField]: + def map( + self, map_type: MapType, key_result: builtins.list[MappedField], value_result: builtins.list[MappedField] + ) -> builtins.list[MappedField]: return [ MappedField(field_id=map_type.key_id, names=["key"], fields=key_result), MappedField(field_id=map_type.value_id, names=["value"], fields=value_result), ] - def primitive(self, primitive: PrimitiveType) -> List[MappedField]: + def primitive(self, primitive: PrimitiveType) -> builtins.list[MappedField]: return [] -class _UpdateMapping(NameMappingVisitor[List[MappedField], MappedField]): - _updates: Dict[int, NestedField] - _adds: Dict[int, List[NestedField]] +class _UpdateMapping(NameMappingVisitor[list[MappedField], MappedField]): + _updates: dict[int, NestedField] + _adds: dict[int, list[NestedField]] - def __init__(self, updates: Dict[int, NestedField], adds: Dict[int, List[NestedField]]): + def __init__(self, updates: dict[int, NestedField], adds: dict[int, list[NestedField]]): self._updates = updates self._adds = adds @staticmethod - def _remove_reassigned_names(field: MappedField, assignments: Dict[str, int]) -> MappedField | None: + def _remove_reassigned_names(field: MappedField, assignments: dict[str, int]) -> MappedField | None: removed_names = set() for name in field.names: if (assigned_id := assignments.get(name)) and assigned_id != field.field_id: @@ -195,10 +198,10 @@ def _remove_reassigned_names(field: MappedField, assignments: Dict[str, int]) -> else: return None - def _add_new_fields(self, mapped_fields: List[MappedField], parent_id: int) -> List[MappedField]: + def _add_new_fields(self, mapped_fields: list[MappedField], parent_id: int) -> list[MappedField]: if fields_to_add := self._adds.get(parent_id): - fields: List[MappedField] = [] - new_fields: List[MappedField] = [] + fields: list[MappedField] = [] + new_fields: list[MappedField] = [] for add in fields_to_add: new_fields.append( @@ -215,11 +218,11 @@ def _add_new_fields(self, mapped_fields: List[MappedField], parent_id: int) -> L else: return mapped_fields - def mapping(self, nm: NameMapping, field_results: List[MappedField]) -> List[MappedField]: + def mapping(self, nm: NameMapping, field_results: list[MappedField]) -> list[MappedField]: return self._add_new_fields(field_results, -1) - def fields(self, struct: List[MappedField], field_results: List[MappedField]) -> List[MappedField]: - reassignments: Dict[str, int] = { + def fields(self, struct: list[MappedField], field_results: list[MappedField]) -> list[MappedField]: + reassignments: dict[str, int] = { update.name: update.field_id for f in field_results if f.field_id is not None and (update := self._updates.get(f.field_id)) @@ -230,7 +233,7 @@ def fields(self, struct: List[MappedField], field_results: List[MappedField]) -> if (updated_field := self._remove_reassigned_names(field, reassignments)) is not None ] - def field(self, field: 
MappedField, field_result: List[MappedField]) -> MappedField: + def field(self, field: MappedField, field_result: list[MappedField]) -> MappedField: if field.field_id is None: return field field_names = field.names @@ -244,7 +247,7 @@ def create_mapping_from_schema(schema: Schema) -> NameMapping: return NameMapping(visit(schema, _CreateMapping())) -def update_mapping(mapping: NameMapping, updates: Dict[int, NestedField], adds: Dict[int, List[NestedField]]) -> NameMapping: +def update_mapping(mapping: NameMapping, updates: dict[int, NestedField], adds: dict[int, list[NestedField]]) -> NameMapping: return NameMapping(visit_name_mapping(mapping, _UpdateMapping(updates, adds))) @@ -253,7 +256,7 @@ def schema_partner(self, partner: MappedField | None) -> MappedField | None: return partner def field_partner( - self, partner_struct: List[MappedField] | MappedField | None, _: int, field_name: str + self, partner_struct: list[MappedField] | MappedField | None, _: int, field_name: str ) -> MappedField | None: if partner_struct is not None: if isinstance(partner_struct, MappedField): @@ -288,7 +291,7 @@ def map_value_partner(self, partner_map: MappedField | None) -> MappedField | No class NameMappingProjectionVisitor(SchemaWithPartnerVisitor[MappedField, IcebergType]): - current_path: List[str] + current_path: builtins.list[str] def __init__(self) -> None: # For keeping track where we are in case when a field cannot be found @@ -321,7 +324,9 @@ def after_map_value(self, value: NestedField, value_partner: P | None) -> None: def schema(self, schema: Schema, schema_partner: MappedField | None, struct_result: StructType) -> IcebergType: return Schema(*struct_result.fields, schema_id=schema.schema_id) - def struct(self, struct: StructType, struct_partner: MappedField | None, field_results: List[NestedField]) -> IcebergType: + def struct( + self, struct: StructType, struct_partner: MappedField | None, field_results: builtins.list[NestedField] + ) -> IcebergType: return StructType(*field_results) def field(self, field: NestedField, field_partner: MappedField | None, field_result: IcebergType) -> IcebergType: diff --git a/pyiceberg/table/puffin.py b/pyiceberg/table/puffin.py index 326fe3e37a..917d387f45 100644 --- a/pyiceberg/table/puffin.py +++ b/pyiceberg/table/puffin.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
import math -from typing import TYPE_CHECKING, Dict, List, Literal +from typing import TYPE_CHECKING, Literal from pydantic import Field from pyroaring import BitMap, FrozenBitMap @@ -32,7 +32,7 @@ PROPERTY_REFERENCED_DATA_FILE = "referenced-data-file" -def _deserialize_bitmap(pl: bytes) -> List[BitMap]: +def _deserialize_bitmap(pl: bytes) -> list[BitMap]: number_of_bitmaps = int.from_bytes(pl[0:8], byteorder="little") pl = pl[8:] @@ -64,21 +64,21 @@ def _deserialize_bitmap(pl: bytes) -> List[BitMap]: class PuffinBlobMetadata(IcebergBaseModel): type: Literal["deletion-vector-v1"] = Field() - fields: List[int] = Field() + fields: list[int] = Field() snapshot_id: int = Field(alias="snapshot-id") sequence_number: int = Field(alias="sequence-number") offset: int = Field() length: int = Field() compression_codec: str | None = Field(alias="compression-codec", default=None) - properties: Dict[str, str] = Field(default_factory=dict) + properties: dict[str, str] = Field(default_factory=dict) class Footer(IcebergBaseModel): - blobs: List[PuffinBlobMetadata] = Field() - properties: Dict[str, str] = Field(default_factory=dict) + blobs: list[PuffinBlobMetadata] = Field() + properties: dict[str, str] = Field(default_factory=dict) -def _bitmaps_to_chunked_array(bitmaps: List[BitMap]) -> "pa.ChunkedArray": +def _bitmaps_to_chunked_array(bitmaps: list[BitMap]) -> "pa.ChunkedArray": import pyarrow as pa return pa.chunked_array([(key_pos << 32) + pos for pos in bitmap] for key_pos, bitmap in enumerate(bitmaps)) @@ -86,7 +86,7 @@ def _bitmaps_to_chunked_array(bitmaps: List[BitMap]) -> "pa.ChunkedArray": class PuffinFile: footer: Footer - _deletion_vectors: Dict[str, List[BitMap]] + _deletion_vectors: dict[str, list[BitMap]] def __init__(self, puffin: bytes) -> None: for magic_bytes in [puffin[:4], puffin[-4:]]: @@ -112,5 +112,5 @@ def __init__(self, puffin: bytes) -> None: for blob in self.footer.blobs } - def to_vector(self) -> Dict[str, "pa.ChunkedArray"]: + def to_vector(self) -> dict[str, "pa.ChunkedArray"]: return {path: _bitmaps_to_chunked_array(bitmaps) for path, bitmaps in self._deletion_vectors.items()} diff --git a/pyiceberg/table/snapshots.py b/pyiceberg/table/snapshots.py index bc76569211..20646b8160 100644 --- a/pyiceberg/table/snapshots.py +++ b/pyiceberg/table/snapshots.py @@ -20,7 +20,7 @@ import warnings from collections import defaultdict from enum import Enum -from typing import TYPE_CHECKING, Any, DefaultDict, Dict, Iterable, List, Mapping +from typing import TYPE_CHECKING, Any, Iterable, Mapping from pydantic import Field, PrivateAttr, model_serializer @@ -154,8 +154,8 @@ def remove_file(self, data_file: DataFile) -> None: else: raise ValueError(f"Unknown data file content: {data_file.content}") - def to_dict(self) -> Dict[str, str]: - properties: Dict[str, str] = {} + def to_dict(self) -> dict[str, str]: + properties: dict[str, str] = {} set_when_positive(properties, self.added_file_size, ADDED_FILE_SIZE) set_when_positive(properties, self.removed_file_size, REMOVED_FILE_SIZE) set_when_positive(properties, self.added_data_files, ADDED_DATA_FILES) @@ -183,7 +183,7 @@ class Summary(IcebergBaseModel, Mapping[str, str]): """ operation: Operation = Field() - _additional_properties: Dict[str, str] = PrivateAttr() + _additional_properties: dict[str, str] = PrivateAttr() def __init__(self, operation: Operation | None = None, **data: Any) -> None: if operation is None: @@ -212,14 +212,14 @@ def __len__(self) -> int: return 1 + len(self._additional_properties) @model_serializer - def 
ser_model(self) -> Dict[str, str]: + def ser_model(self) -> dict[str, str]: return { "operation": str(self.operation.value), **self._additional_properties, } @property - def additional_properties(self) -> Dict[str, str]: + def additional_properties(self) -> dict[str, str]: return self._additional_properties def __repr__(self) -> str: @@ -275,7 +275,7 @@ def __repr__(self) -> str: filtered_fields = [field for field in fields if field is not None] return f"Snapshot({', '.join(filtered_fields)})" - def manifests(self, io: FileIO) -> List[ManifestFile]: + def manifests(self, io: FileIO) -> list[ManifestFile]: """Return the manifests for the given snapshot.""" return list(_manifests(io, self.manifest_list)) @@ -292,7 +292,7 @@ class SnapshotLogEntry(IcebergBaseModel): class SnapshotSummaryCollector: metrics: UpdateMetrics - partition_metrics: DefaultDict[str, UpdateMetrics] + partition_metrics: defaultdict[str, UpdateMetrics] max_changed_partitions_for_summaries: int def __init__(self, partition_summary_limit: int = 0) -> None: @@ -324,7 +324,7 @@ def update_partition_metrics(self, partition_spec: PartitionSpec, file: DataFile else: partition_metrics.remove_file(file) - def build(self) -> Dict[str, str]: + def build(self) -> dict[str, str]: properties = self.metrics.to_dict() changed_partitions_size = len(self.partition_metrics) set_when_positive(properties, changed_partitions_size, CHANGED_PARTITION_COUNT_PROP) @@ -447,7 +447,7 @@ def _update_totals(total_property: str, added_property: str, removed_property: s return summary -def set_when_positive(properties: Dict[str, str], num: int, property_name: str) -> None: +def set_when_positive(properties: dict[str, str], num: int, property_name: str) -> None: if num > 0: properties[property_name] = str(num) diff --git a/pyiceberg/table/sorting.py b/pyiceberg/table/sorting.py index 8bd9a08176..4a8b85c3cd 100644 --- a/pyiceberg/table/sorting.py +++ b/pyiceberg/table/sorting.py @@ -16,7 +16,7 @@ # under the License. # pylint: disable=keyword-arg-before-vararg from enum import Enum -from typing import Annotated, Any, Callable, Dict, List +from typing import Annotated, Any, Callable from pydantic import ( BeforeValidator, @@ -88,7 +88,7 @@ def __init__( super().__init__(**data) @model_validator(mode="before") - def set_null_order(cls, values: Dict[str, Any]) -> Dict[str, Any]: + def set_null_order(cls, values: dict[str, Any]) -> dict[str, Any]: values["direction"] = values["direction"] if values.get("direction") else SortDirection.ASC if not values.get("null-order"): values["null-order"] = NullOrder.NULLS_FIRST if values["direction"] == SortDirection.ASC else NullOrder.NULLS_LAST @@ -144,7 +144,7 @@ class SortOrder(IcebergBaseModel): """ order_id: int = Field(alias="order-id", default=INITIAL_SORT_ORDER_ID) - fields: List[SortField] = Field(default_factory=list) + fields: list[SortField] = Field(default_factory=list) def __init__(self, *fields: SortField, **data: Any): if fields: diff --git a/pyiceberg/table/statistics.py b/pyiceberg/table/statistics.py index 25654d0c27..34185b980a 100644 --- a/pyiceberg/table/statistics.py +++ b/pyiceberg/table/statistics.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
-from typing import Dict, List, Literal +from typing import Literal from pydantic import Field @@ -25,8 +25,8 @@ class BlobMetadata(IcebergBaseModel): type: Literal["apache-datasketches-theta-v1", "deletion-vector-v1"] snapshot_id: int = Field(alias="snapshot-id") sequence_number: int = Field(alias="sequence-number") - fields: List[int] - properties: Dict[str, str] | None = None + fields: list[int] + properties: dict[str, str] | None = None class StatisticsCommonFields(IcebergBaseModel): @@ -40,7 +40,7 @@ class StatisticsCommonFields(IcebergBaseModel): class StatisticsFile(StatisticsCommonFields): file_footer_size_in_bytes: int = Field(alias="file-footer-size-in-bytes") key_metadata: str | None = Field(alias="key-metadata", default=None) - blob_metadata: List[BlobMetadata] = Field(alias="blob-metadata") + blob_metadata: list[BlobMetadata] = Field(alias="blob-metadata") class PartitionStatisticsFile(StatisticsCommonFields): @@ -48,7 +48,7 @@ class PartitionStatisticsFile(StatisticsCommonFields): def filter_statistics_by_snapshot_id( - statistics: List[StatisticsFile | PartitionStatisticsFile], + statistics: list[StatisticsFile | PartitionStatisticsFile], reject_snapshot_id: int, -) -> List[StatisticsFile | PartitionStatisticsFile]: +) -> list[StatisticsFile | PartitionStatisticsFile]: return [stat for stat in statistics if stat.snapshot_id != reject_snapshot_id] diff --git a/pyiceberg/table/update/__init__.py b/pyiceberg/table/update/__init__.py index 6533719b05..a79e2cb468 100644 --- a/pyiceberg/table/update/__init__.py +++ b/pyiceberg/table/update/__init__.py @@ -21,7 +21,7 @@ from abc import ABC, abstractmethod from datetime import datetime from functools import singledispatch -from typing import TYPE_CHECKING, Annotated, Any, Dict, Generic, List, Literal, Tuple, TypeVar, cast +from typing import TYPE_CHECKING, Annotated, Any, Generic, Literal, TypeVar, cast from pydantic import Field, field_validator, model_serializer, model_validator @@ -143,7 +143,7 @@ class SetSnapshotRefUpdate(IcebergBaseModel): class RemoveSnapshotsUpdate(IcebergBaseModel): action: Literal["remove-snapshots"] = Field(default="remove-snapshots") - snapshot_ids: List[int] = Field(alias="snapshot-ids") + snapshot_ids: list[int] = Field(alias="snapshot-ids") class RemoveSnapshotRefUpdate(IcebergBaseModel): @@ -158,16 +158,16 @@ class SetLocationUpdate(IcebergBaseModel): class SetPropertiesUpdate(IcebergBaseModel): action: Literal["set-properties"] = Field(default="set-properties") - updates: Dict[str, str] + updates: dict[str, str] @field_validator("updates", mode="before") - def transform_properties_dict_value_to_str(cls, properties: Properties) -> Dict[str, str]: + def transform_properties_dict_value_to_str(cls, properties: Properties) -> dict[str, str]: return transform_dict_value_to_str(properties) class RemovePropertiesUpdate(IcebergBaseModel): action: Literal["remove-properties"] = Field(default="remove-properties") - removals: List[str] + removals: list[str] class SetStatisticsUpdate(IcebergBaseModel): @@ -180,7 +180,7 @@ class SetStatisticsUpdate(IcebergBaseModel): ) @model_validator(mode="before") - def validate_snapshot_id(cls, data: Dict[str, Any]) -> Dict[str, Any]: + def validate_snapshot_id(cls, data: dict[str, Any]) -> dict[str, Any]: stats = cast(StatisticsFile, data["statistics"]) data["snapshot_id"] = stats.snapshot_id @@ -195,12 +195,12 @@ class RemoveStatisticsUpdate(IcebergBaseModel): class RemovePartitionSpecsUpdate(IcebergBaseModel): action: Literal["remove-partition-specs"] = 
Field(default="remove-partition-specs") - spec_ids: List[int] = Field(alias="spec-ids") + spec_ids: list[int] = Field(alias="spec-ids") class RemoveSchemasUpdate(IcebergBaseModel): action: Literal["remove-schemas"] = Field(default="remove-schemas") - schema_ids: List[int] = Field(alias="schema-ids") + schema_ids: list[int] = Field(alias="schema-ids") class SetPartitionStatisticsUpdate(IcebergBaseModel): @@ -240,7 +240,7 @@ class RemovePartitionStatisticsUpdate(IcebergBaseModel): class _TableMetadataUpdateContext: - _updates: List[TableUpdate] + _updates: list[TableUpdate] def __init__(self) -> None: self._updates = [] @@ -345,7 +345,7 @@ def _(update: RemovePropertiesUpdate, base_metadata: TableMetadata, context: _Ta @_apply_table_update.register(AddSchemaUpdate) def _(update: AddSchemaUpdate, base_metadata: TableMetadata, context: _TableMetadataUpdateContext) -> TableMetadata: - metadata_updates: Dict[str, Any] = { + metadata_updates: dict[str, Any] = { "last_column_id": max(base_metadata.last_column_id, update.schema_.highest_field_id), "schemas": base_metadata.schemas + [update.schema_], } @@ -381,7 +381,7 @@ def _(update: AddPartitionSpecUpdate, base_metadata: TableMetadata, context: _Ta if spec.spec_id == update.spec.spec_id and spec != update.spec: raise ValueError(f"Partition spec with id {spec.spec_id} already exists: {spec}") - metadata_updates: Dict[str, Any] = { + metadata_updates: dict[str, Any] = { "partition_specs": base_metadata.partition_specs + [update.spec], "last_partition_id": max( max([field.field_id for field in update.spec.fields], default=0), @@ -480,7 +480,7 @@ def _(update: SetSnapshotRefUpdate, base_metadata: TableMetadata, context: _Tabl if snapshot is None: raise ValueError(f"Cannot set {update.ref_name} to unknown snapshot {snapshot_ref.snapshot_id}") - metadata_updates: Dict[str, Any] = {} + metadata_updates: dict[str, Any] = {} if context.is_added_snapshot(snapshot_ref.snapshot_id): metadata_updates["last_updated_ms"] = snapshot.timestamp_ms @@ -672,7 +672,7 @@ def _( def update_table_metadata( base_metadata: TableMetadata, - updates: Tuple[TableUpdate, ...], + updates: tuple[TableUpdate, ...], enforce_validation: bool = False, metadata_location: str | None = None, ) -> TableMetadata: @@ -732,7 +732,7 @@ def _update_table_metadata_log(base_metadata: TableMetadata, metadata_location: if len(base_metadata.metadata_log) >= max_metadata_log_entries: # type: ignore remove_index = len(base_metadata.metadata_log) - max_metadata_log_entries + 1 # type: ignore previous_metadata_log = base_metadata.metadata_log[remove_index:] - metadata_updates: Dict[str, Any] = { + metadata_updates: dict[str, Any] = { "metadata_log": previous_metadata_log + [MetadataLogEntry(metadata_file=metadata_location, timestamp_ms=last_updated_ms)] } return base_metadata.model_copy(update=metadata_updates) @@ -899,4 +899,4 @@ def validate(self, base_metadata: TableMetadata | None) -> None: Field(discriminator="type"), ] -UpdatesAndRequirements = Tuple[Tuple[TableUpdate, ...], Tuple[TableRequirement, ...]] +UpdatesAndRequirements = tuple[tuple[TableUpdate, ...], tuple[TableRequirement, ...]] diff --git a/pyiceberg/table/update/schema.py b/pyiceberg/table/update/schema.py index 851d683866..c2d99f6980 100644 --- a/pyiceberg/table/update/schema.py +++ b/pyiceberg/table/update/schema.py @@ -16,11 +16,12 @@ # under the License. 
from __future__ import annotations +import builtins import itertools from copy import copy from dataclasses import dataclass from enum import Enum -from typing import TYPE_CHECKING, Any, Dict, List, Set, Tuple +from typing import TYPE_CHECKING, Any from pyiceberg.exceptions import ResolveError, ValidationError from pyiceberg.expressions import literal # type: ignore @@ -76,16 +77,16 @@ class _Move: class UpdateSchema(UpdateTableMetadata["UpdateSchema"]): _schema: Schema _last_column_id: itertools.count[int] - _identifier_field_names: Set[str] + _identifier_field_names: set[str] - _adds: Dict[int, List[NestedField]] = {} - _updates: Dict[int, NestedField] = {} - _deletes: Set[int] = set() - _moves: Dict[int, List[_Move]] = {} + _adds: dict[int, list[NestedField]] = {} + _updates: dict[int, NestedField] = {} + _deletes: set[int] = set() + _moves: dict[int, list[_Move]] = {} - _added_name_to_id: Dict[str, int] = {} + _added_name_to_id: dict[str, int] = {} # Part of https://github.com/apache/iceberg/pull/8393 - _id_to_parent: Dict[int, str] = {} + _id_to_parent: dict[int, str] = {} _allow_incompatible_changes: bool _case_sensitive: bool @@ -161,7 +162,7 @@ def union_by_name( def add_column( self, - path: str | Tuple[str, ...], + path: str | tuple[str, ...], field_type: IcebergType, doc: str | None = None, required: bool = False, @@ -257,7 +258,7 @@ def add_column( return self - def delete_column(self, path: str | Tuple[str, ...]) -> UpdateSchema: + def delete_column(self, path: str | tuple[str, ...]) -> UpdateSchema: """Delete a column from a table. Args: @@ -280,7 +281,7 @@ def delete_column(self, path: str | Tuple[str, ...]) -> UpdateSchema: return self - def set_default_value(self, path: str | Tuple[str, ...], default_value: L | None) -> UpdateSchema: + def set_default_value(self, path: str | tuple[str, ...], default_value: L | None) -> UpdateSchema: """Set the default value of a column. Args: @@ -293,7 +294,7 @@ def set_default_value(self, path: str | Tuple[str, ...], default_value: L | None return self - def rename_column(self, path_from: str | Tuple[str, ...], new_name: str) -> UpdateSchema: + def rename_column(self, path_from: str | tuple[str, ...], new_name: str) -> UpdateSchema: """Update the name of a column. Args: @@ -339,7 +340,7 @@ def rename_column(self, path_from: str | Tuple[str, ...], new_name: str) -> Upda return self - def make_column_optional(self, path: str | Tuple[str, ...]) -> UpdateSchema: + def make_column_optional(self, path: str | tuple[str, ...]) -> UpdateSchema: """Make a column optional. 
Args: @@ -354,7 +355,7 @@ def make_column_optional(self, path: str | Tuple[str, ...]) -> UpdateSchema: def set_identifier_fields(self, *fields: str) -> None: self._identifier_field_names = set(fields) - def _set_column_requirement(self, path: str | Tuple[str, ...], required: bool) -> None: + def _set_column_requirement(self, path: str | tuple[str, ...], required: bool) -> None: path = (path,) if isinstance(path, str) else path name = ".".join(path) @@ -391,7 +392,7 @@ def _set_column_requirement(self, path: str | Tuple[str, ...], required: bool) - write_default=field.write_default, ) - def _set_column_default_value(self, path: str | Tuple[str, ...], default_value: Any) -> None: + def _set_column_default_value(self, path: str | tuple[str, ...], default_value: Any) -> None: path = (path,) if isinstance(path, str) else path name = ".".join(path) @@ -437,7 +438,7 @@ def _set_column_default_value(self, path: str | Tuple[str, ...], default_value: def update_column( self, - path: str | Tuple[str, ...], + path: str | tuple[str, ...], field_type: IcebergType | None = None, required: bool | None = None, doc: str | None = None, @@ -534,7 +535,7 @@ def _move(self, move: _Move) -> None: self._moves[TABLE_ROOT_ID] = self._moves.get(TABLE_ROOT_ID, []) + [move] - def move_first(self, path: str | Tuple[str, ...]) -> UpdateSchema: + def move_first(self, path: str | tuple[str, ...]) -> UpdateSchema: """Move the field to the first position of the parent struct. Args: @@ -554,7 +555,7 @@ def move_first(self, path: str | Tuple[str, ...]) -> UpdateSchema: return self - def move_before(self, path: str | Tuple[str, ...], before_path: str | Tuple[str, ...]) -> UpdateSchema: + def move_before(self, path: str | tuple[str, ...], before_path: str | tuple[str, ...]) -> UpdateSchema: """Move the field to before another field. Args: @@ -588,7 +589,7 @@ def move_before(self, path: str | Tuple[str, ...], before_path: str | Tuple[str, return self - def move_after(self, path: str | Tuple[str, ...], after_name: str | Tuple[str, ...]) -> UpdateSchema: + def move_after(self, path: str | tuple[str, ...], after_name: str | tuple[str, ...]) -> UpdateSchema: """Move the field to after another field. Args: @@ -627,8 +628,8 @@ def _commit(self) -> UpdatesAndRequirements: (schema.schema_id for schema in self._transaction.table_metadata.schemas if schema == new_schema), None ) - requirements: Tuple[TableRequirement, ...] = () - updates: Tuple[TableUpdate, ...] = () + requirements: tuple[TableRequirement, ...] = () + updates: tuple[TableUpdate, ...] 
= () # Check if it is different current schema ID if existing_schema_id != self._schema.schema_id: @@ -694,17 +695,17 @@ def assign_new_column_id(self) -> int: class _ApplyChanges(SchemaVisitor[IcebergType | None]): - _adds: Dict[int, List[NestedField]] - _updates: Dict[int, NestedField] - _deletes: Set[int] - _moves: Dict[int, List[_Move]] + _adds: dict[int, builtins.list[NestedField]] + _updates: dict[int, NestedField] + _deletes: set[int] + _moves: dict[int, builtins.list[_Move]] def __init__( self, - adds: Dict[int, List[NestedField]], - updates: Dict[int, NestedField], - deletes: Set[int], - moves: Dict[int, List[_Move]], + adds: dict[int, builtins.list[NestedField]], + updates: dict[int, NestedField], + deletes: set[int], + moves: dict[int, builtins.list[_Move]], ) -> None: self._adds = adds self._updates = updates @@ -724,7 +725,7 @@ def schema(self, schema: Schema, struct_result: IcebergType | None) -> IcebergTy return struct_result - def struct(self, struct: StructType, field_results: List[IcebergType | None]) -> IcebergType | None: + def struct(self, struct: StructType, field_results: builtins.list[IcebergType | None]) -> IcebergType | None: has_changes = False new_fields = [] @@ -851,7 +852,7 @@ def __init__(self, update_schema: UpdateSchema, existing_schema: Schema, case_se def schema(self, schema: Schema, partner_id: int | None, struct_result: bool) -> bool: return struct_result - def struct(self, struct: StructType, partner_id: int | None, missing_positions: List[bool]) -> bool: + def struct(self, struct: StructType, partner_id: int | None, missing_positions: builtins.list[bool]) -> bool: if partner_id is None: return True @@ -873,7 +874,7 @@ def struct(self, struct: StructType, partner_id: int | None, missing_positions: def _add_column(self, parent_id: int, field: NestedField) -> None: if parent_name := self.existing_schema.find_column_name(parent_id): - path: Tuple[str, ...] = (parent_name, field.name) + path: tuple[str, ...] = (parent_name, field.name) else: path = (field.name,) @@ -997,12 +998,12 @@ def map_value_partner(self, partner_map_id: int | None) -> int | None: return None -def _add_fields(fields: Tuple[NestedField, ...], adds: List[NestedField] | None) -> Tuple[NestedField, ...]: +def _add_fields(fields: tuple[NestedField, ...], adds: list[NestedField] | None) -> tuple[NestedField, ...]: adds = adds or [] return fields + tuple(adds) -def _move_fields(fields: Tuple[NestedField, ...], moves: List[_Move]) -> Tuple[NestedField, ...]: +def _move_fields(fields: tuple[NestedField, ...], moves: list[_Move]) -> tuple[NestedField, ...]: reordered = list(copy(fields)) for move in moves: # Find the field that we're about to move @@ -1026,8 +1027,8 @@ def _move_fields(fields: Tuple[NestedField, ...], moves: List[_Move]) -> Tuple[N def _add_and_move_fields( - fields: Tuple[NestedField, ...], adds: List[NestedField], moves: List[_Move] -) -> Tuple[NestedField, ...] | None: + fields: tuple[NestedField, ...], adds: list[NestedField], moves: list[_Move] +) -> tuple[NestedField, ...] 
| None: if len(adds) > 0: # always apply adds first so that added fields can be moved added = _add_fields(fields, adds) diff --git a/pyiceberg/table/update/snapshot.py b/pyiceberg/table/update/snapshot.py index d59afbf68d..15fc82f72a 100644 --- a/pyiceberg/table/update/snapshot.py +++ b/pyiceberg/table/update/snapshot.py @@ -24,7 +24,7 @@ from concurrent.futures import Future from datetime import datetime from functools import cached_property -from typing import TYPE_CHECKING, Callable, Dict, Generic, List, Set, Tuple +from typing import TYPE_CHECKING, Callable, Generic from sortedcontainers import SortedList @@ -106,9 +106,9 @@ class _SnapshotProducer(UpdateTableMetadata[U], Generic[U]): _operation: Operation _snapshot_id: int _parent_snapshot_id: int | None - _added_data_files: List[DataFile] + _added_data_files: list[DataFile] _manifest_num_counter: itertools.count[int] - _deleted_data_files: Set[DataFile] + _deleted_data_files: set[DataFile] _compression: AvroCompressionCodec _target_branch: str | None @@ -118,7 +118,7 @@ def __init__( transaction: Transaction, io: FileIO, commit_uuid: uuid.UUID | None = None, - snapshot_properties: Dict[str, str] = EMPTY_DICT, + snapshot_properties: dict[str, str] = EMPTY_DICT, branch: str | None = MAIN_BRANCH, ) -> None: super().__init__(transaction) @@ -157,7 +157,7 @@ def delete_data_file(self, data_file: DataFile) -> _SnapshotProducer[U]: self._deleted_data_files.add(data_file) return self - def _calculate_added_rows(self, manifests: List[ManifestFile]) -> int: + def _calculate_added_rows(self, manifests: list[ManifestFile]) -> int: """Calculate the number of added rows from a list of manifest files.""" added_rows = 0 for manifest in manifests: @@ -171,17 +171,17 @@ def _calculate_added_rows(self, manifests: List[ManifestFile]) -> int: return added_rows @abstractmethod - def _deleted_entries(self) -> List[ManifestEntry]: ... + def _deleted_entries(self) -> list[ManifestEntry]: ... @abstractmethod - def _existing_manifests(self) -> List[ManifestFile]: ... + def _existing_manifests(self) -> list[ManifestFile]: ... 
- def _process_manifests(self, manifests: List[ManifestFile]) -> List[ManifestFile]: + def _process_manifests(self, manifests: list[ManifestFile]) -> list[ManifestFile]: """To perform any post-processing on the manifests before writing them to the new snapshot.""" return manifests - def _manifests(self) -> List[ManifestFile]: - def _write_added_manifest() -> List[ManifestFile]: + def _manifests(self) -> list[ManifestFile]: + def _write_added_manifest() -> list[ManifestFile]: if self._added_data_files: with write_manifest( format_version=self._transaction.table_metadata.format_version, @@ -205,12 +205,12 @@ def _write_added_manifest() -> List[ManifestFile]: else: return [] - def _write_delete_manifest() -> List[ManifestFile]: + def _write_delete_manifest() -> list[ManifestFile]: # Check if we need to mark the files as deleted deleted_entries = self._deleted_entries() if len(deleted_entries) > 0: deleted_manifests = [] - partition_groups: Dict[int, List[ManifestEntry]] = defaultdict(list) + partition_groups: dict[int, list[ManifestEntry]] = defaultdict(list) for deleted_entry in deleted_entries: partition_groups[deleted_entry.data_file.spec_id].append(deleted_entry) for spec_id, entries in partition_groups.items(): @@ -237,7 +237,7 @@ def _write_delete_manifest() -> List[ManifestFile]: return self._process_manifests(added_manifests.result() + delete_manifests.result() + existing_manifests.result()) - def _summary(self, snapshot_properties: Dict[str, str] = EMPTY_DICT) -> Summary: + def _summary(self, snapshot_properties: dict[str, str] = EMPTY_DICT) -> Summary: from pyiceberg.table import TableProperties # avoid copying metadata for each data file @@ -364,7 +364,7 @@ def new_manifest_output(self) -> OutputFile: file_path = location_provider.new_metadata_location(file_name) return self._io.new_output(file_path) - def fetch_manifest_entry(self, manifest: ManifestFile, discard_deleted: bool = True) -> List[ManifestEntry]: + def fetch_manifest_entry(self, manifest: ManifestFile, discard_deleted: bool = True) -> list[ManifestEntry]: return manifest.fetch_manifest_entry(io=self._io, discard_deleted=discard_deleted) @@ -388,7 +388,7 @@ def __init__( io: FileIO, branch: str | None = MAIN_BRANCH, commit_uuid: uuid.UUID | None = None, - snapshot_properties: Dict[str, str] = EMPTY_DICT, + snapshot_properties: dict[str, str] = EMPTY_DICT, ): super().__init__(operation, transaction, io, commit_uuid, snapshot_properties, branch) self._predicate = AlwaysFalse() @@ -421,7 +421,7 @@ def delete_by_predicate(self, predicate: BooleanExpression, case_sensitive: bool self._case_sensitive = case_sensitive @cached_property - def _compute_deletes(self) -> Tuple[List[ManifestFile], List[ManifestEntry], bool]: + def _compute_deletes(self) -> tuple[list[ManifestFile], list[ManifestEntry], bool]: """Computes all the delete operation and cache it when nothing changes. 
Returns: @@ -441,7 +441,7 @@ def _copy_with_new_status(entry: ManifestEntry, status: ManifestEntryStatus) -> data_file=entry.data_file, ) - manifest_evaluators: Dict[int, Callable[[ManifestFile], bool]] = KeyDefaultDict(self._build_manifest_evaluator) + manifest_evaluators: dict[int, Callable[[ManifestFile], bool]] = KeyDefaultDict(self._build_manifest_evaluator) strict_metrics_evaluator = _StrictMetricsEvaluator(schema, self._predicate, case_sensitive=self._case_sensitive).eval inclusive_metrics_evaluator = _InclusiveMetricsEvaluator( schema, self._predicate, case_sensitive=self._case_sensitive @@ -501,10 +501,10 @@ def _copy_with_new_status(entry: ManifestEntry, status: ManifestEntryStatus) -> return existing_manifests, total_deleted_entries, partial_rewrites_needed - def _existing_manifests(self) -> List[ManifestFile]: + def _existing_manifests(self) -> list[ManifestFile]: return self._compute_deletes[0] - def _deleted_entries(self) -> List[ManifestEntry]: + def _deleted_entries(self) -> list[ManifestEntry]: return self._compute_deletes[1] @property @@ -519,7 +519,7 @@ def files_affected(self) -> bool: class _FastAppendFiles(_SnapshotProducer["_FastAppendFiles"]): - def _existing_manifests(self) -> List[ManifestFile]: + def _existing_manifests(self) -> list[ManifestFile]: """To determine if there are any existing manifest files. A fast append will add another ManifestFile to the ManifestList. @@ -539,7 +539,7 @@ def _existing_manifests(self) -> List[ManifestFile]: return existing_manifests - def _deleted_entries(self) -> List[ManifestEntry]: + def _deleted_entries(self) -> list[ManifestEntry]: """To determine if we need to record any deleted manifest entries. In case of an append, nothing is deleted. @@ -559,7 +559,7 @@ def __init__( io: FileIO, branch: str | None = MAIN_BRANCH, commit_uuid: uuid.UUID | None = None, - snapshot_properties: Dict[str, str] = EMPTY_DICT, + snapshot_properties: dict[str, str] = EMPTY_DICT, ) -> None: from pyiceberg.table import TableProperties @@ -580,7 +580,7 @@ def __init__( TableProperties.MANIFEST_MERGE_ENABLED_DEFAULT, ) - def _process_manifests(self, manifests: List[ManifestFile]) -> List[ManifestFile]: + def _process_manifests(self, manifests: list[ManifestFile]) -> list[ManifestFile]: """To perform any post-processing on the manifests before writing them to the new snapshot. In _MergeAppendFiles, we merge manifests based on the target size and the minimum count to merge @@ -605,7 +605,7 @@ class _OverwriteFiles(_SnapshotProducer["_OverwriteFiles"]): Data and delete files were added and removed in a logical overwrite operation. """ - def _existing_manifests(self) -> List[ManifestFile]: + def _existing_manifests(self) -> list[ManifestFile]: """Determine if there are any existing manifest files.""" existing_files = [] @@ -641,7 +641,7 @@ def _existing_manifests(self) -> List[ManifestFile]: existing_files.append(writer.to_manifest_file()) return existing_files - def _deleted_entries(self) -> List[ManifestEntry]: + def _deleted_entries(self) -> list[ManifestEntry]: """To determine if we need to record any deleted entries. With a full overwrite all the entries are considered deleted. 
@@ -656,7 +656,7 @@ def _deleted_entries(self) -> List[ManifestEntry]: executor = ExecutorFactory.get_or_create() - def _get_entries(manifest: ManifestFile) -> List[ManifestEntry]: + def _get_entries(manifest: ManifestFile) -> list[ManifestEntry]: return [ ManifestEntry.from_args( status=ManifestEntryStatus.DELETED, @@ -679,14 +679,14 @@ class UpdateSnapshot: _transaction: Transaction _io: FileIO _branch: str | None - _snapshot_properties: Dict[str, str] + _snapshot_properties: dict[str, str] def __init__( self, transaction: Transaction, io: FileIO, branch: str | None = MAIN_BRANCH, - snapshot_properties: Dict[str, str] = EMPTY_DICT, + snapshot_properties: dict[str, str] = EMPTY_DICT, ) -> None: self._transaction = transaction self._io = io @@ -747,13 +747,13 @@ def __init__( self._merge_enabled = merge_enabled self._snapshot_producer = snapshot_producer - def _group_by_spec(self, manifests: List[ManifestFile]) -> Dict[int, List[ManifestFile]]: + def _group_by_spec(self, manifests: list[ManifestFile]) -> dict[int, list[ManifestFile]]: groups = defaultdict(list) for manifest in manifests: groups[manifest.partition_spec_id].append(manifest) return groups - def _create_manifest(self, spec_id: int, manifest_bin: List[ManifestFile]) -> ManifestFile: + def _create_manifest(self, spec_id: int, manifest_bin: list[ManifestFile]) -> ManifestFile: with self._snapshot_producer.new_manifest_writer(spec=self._snapshot_producer.spec(spec_id)) as writer: for manifest in manifest_bin: for entry in self._snapshot_producer.fetch_manifest_entry(manifest=manifest, discard_deleted=False): @@ -769,11 +769,11 @@ def _create_manifest(self, spec_id: int, manifest_bin: List[ManifestFile]) -> Ma return writer.to_manifest_file() - def _merge_group(self, first_manifest: ManifestFile, spec_id: int, manifests: List[ManifestFile]) -> List[ManifestFile]: + def _merge_group(self, first_manifest: ManifestFile, spec_id: int, manifests: list[ManifestFile]) -> list[ManifestFile]: packer: ListPacker[ManifestFile] = ListPacker(target_weight=self._target_size_bytes, lookback=1, largest_bin_first=False) - bins: List[List[ManifestFile]] = packer.pack_end(manifests, lambda m: m.manifest_length) + bins: list[list[ManifestFile]] = packer.pack_end(manifests, lambda m: m.manifest_length) - def merge_bin(manifest_bin: List[ManifestFile]) -> List[ManifestFile]: + def merge_bin(manifest_bin: list[ManifestFile]) -> list[ManifestFile]: output_manifests = [] if len(manifest_bin) == 1: output_manifests.append(manifest_bin[0]) @@ -792,15 +792,15 @@ def merge_bin(manifest_bin: List[ManifestFile]) -> List[ManifestFile]: # for consistent ordering, we need to maintain future order futures_index = {f: i for i, f in enumerate(futures)} - completed_futures: SortedList[Future[List[ManifestFile]]] = SortedList(iterable=[], key=lambda f: futures_index[f]) + completed_futures: SortedList[Future[list[ManifestFile]]] = SortedList(iterable=[], key=lambda f: futures_index[f]) for future in concurrent.futures.as_completed(futures): completed_futures.add(future) - bin_results: List[List[ManifestFile]] = [f.result() for f in completed_futures if f.result()] + bin_results: list[list[ManifestFile]] = [f.result() for f in completed_futures if f.result()] return [manifest for bin_result in bin_results for manifest in bin_result] - def merge_manifests(self, manifests: List[ManifestFile]) -> List[ManifestFile]: + def merge_manifests(self, manifests: list[ManifestFile]) -> list[ManifestFile]: if not self._merge_enabled or len(manifests) == 0: return manifests @@ -830,8 
+830,8 @@ class ManageSnapshots(UpdateTableMetadata["ManageSnapshots"]): ms.create_tag(snapshot_id1, "Tag_A").create_tag(snapshot_id2, "Tag_B") """ - _updates: Tuple[TableUpdate, ...] - _requirements: Tuple[TableRequirement, ...] + _updates: tuple[TableUpdate, ...] + _requirements: tuple[TableRequirement, ...] def __init__(self, transaction: Transaction) -> None: super().__init__(transaction) @@ -949,9 +949,9 @@ class ExpireSnapshots(UpdateTableMetadata["ExpireSnapshots"]): Pending changes are applied on commit. """ - _updates: Tuple[TableUpdate, ...] - _requirements: Tuple[TableRequirement, ...] - _snapshot_ids_to_expire: Set[int] + _updates: tuple[TableUpdate, ...] + _requirements: tuple[TableRequirement, ...] + _snapshot_ids_to_expire: set[int] def __init__(self, transaction: Transaction) -> None: super().__init__(transaction) @@ -976,7 +976,7 @@ def _commit(self) -> UpdatesAndRequirements: self._updates += (update,) return self._updates, self._requirements - def _get_protected_snapshot_ids(self) -> Set[int]: + def _get_protected_snapshot_ids(self) -> set[int]: """ Get the IDs of protected snapshots. @@ -1012,7 +1012,7 @@ def by_id(self, snapshot_id: int) -> ExpireSnapshots: return self - def by_ids(self, snapshot_ids: List[int]) -> ExpireSnapshots: + def by_ids(self, snapshot_ids: list[int]) -> ExpireSnapshots: """ Expire multiple snapshots by their IDs. diff --git a/pyiceberg/table/update/sorting.py b/pyiceberg/table/update/sorting.py index 7e931b1a33..e03d2264df 100644 --- a/pyiceberg/table/update/sorting.py +++ b/pyiceberg/table/update/sorting.py @@ -16,7 +16,7 @@ # under the License. from __future__ import annotations -from typing import TYPE_CHECKING, Any, List, Tuple +from typing import TYPE_CHECKING, Any from pyiceberg.table.sorting import INITIAL_SORT_ORDER_ID, UNSORTED_SORT_ORDER, NullOrder, SortDirection, SortField, SortOrder from pyiceberg.table.update import ( @@ -38,11 +38,11 @@ class UpdateSortOrder(UpdateTableMetadata["UpdateSortOrder"]): _transaction: Transaction _last_assigned_order_id: int | None _case_sensitive: bool - _fields: List[SortField] + _fields: list[SortField] def __init__(self, transaction: Transaction, case_sensitive: bool = True) -> None: super().__init__(transaction) - self._fields: List[SortField] = [] + self._fields: list[SortField] = [] self._case_sensitive: bool = case_sensitive self._last_assigned_order_id: int | None = None @@ -118,8 +118,8 @@ def _apply(self) -> SortOrder: def _commit(self) -> UpdatesAndRequirements: """Apply the pending changes and commit.""" new_sort_order = self._apply() - requirements: Tuple[TableRequirement, ...] = () - updates: Tuple[TableUpdate, ...] = () + requirements: tuple[TableRequirement, ...] = () + updates: tuple[TableUpdate, ...] = () if ( self._transaction.table_metadata.default_sort_order_id != new_sort_order.order_id diff --git a/pyiceberg/table/update/spec.py b/pyiceberg/table/update/spec.py index b1f5f83d8f..e060d6c261 100644 --- a/pyiceberg/table/update/spec.py +++ b/pyiceberg/table/update/spec.py @@ -16,7 +16,7 @@ # under the License. 
from __future__ import annotations -from typing import TYPE_CHECKING, Any, Dict, List, Set, Tuple +from typing import TYPE_CHECKING, Any from pyiceberg.expressions import ( Reference, @@ -48,15 +48,15 @@ class UpdateSpec(UpdateTableMetadata["UpdateSpec"]): _transaction: Transaction - _name_to_field: Dict[str, PartitionField] = {} - _name_to_added_field: Dict[str, PartitionField] = {} - _transform_to_field: Dict[Tuple[int, str], PartitionField] = {} - _transform_to_added_field: Dict[Tuple[int, str], PartitionField] = {} - _renames: Dict[str, str] = {} - _added_time_fields: Dict[int, PartitionField] = {} + _name_to_field: dict[str, PartitionField] = {} + _name_to_added_field: dict[str, PartitionField] = {} + _transform_to_field: dict[tuple[int, str], PartitionField] = {} + _transform_to_added_field: dict[tuple[int, str], PartitionField] = {} + _renames: dict[str, str] = {} + _added_time_fields: dict[int, PartitionField] = {} _case_sensitive: bool - _adds: List[PartitionField] - _deletes: Set[int] + _adds: list[PartitionField] + _deletes: set[int] _last_assigned_partition_id: int def __init__(self, transaction: Transaction, case_sensitive: bool = True) -> None: @@ -157,8 +157,8 @@ def rename_field(self, name: str, new_name: str) -> UpdateSpec: def _commit(self) -> UpdatesAndRequirements: new_spec = self._apply() - updates: Tuple[TableUpdate, ...] = () - requirements: Tuple[TableRequirement, ...] = () + updates: tuple[TableUpdate, ...] = () + requirements: tuple[TableRequirement, ...] = () if self._transaction.table_metadata.default_spec_id != new_spec.spec_id: if new_spec.spec_id not in self._transaction.table_metadata.specs(): @@ -180,7 +180,7 @@ def _commit(self) -> UpdatesAndRequirements: def _apply(self) -> PartitionSpec: def _check_and_add_partition_name( - schema: Schema, name: str, source_id: int, transform: Transform[Any, Any], partition_names: Set[str] + schema: Schema, name: str, source_id: int, transform: Transform[Any, Any], partition_names: set[str] ) -> None: from pyiceberg.partitioning import validate_partition_name @@ -188,13 +188,13 @@ def _check_and_add_partition_name( partition_names.add(name) def _add_new_field( - schema: Schema, source_id: int, field_id: int, name: str, transform: Transform[Any, Any], partition_names: Set[str] + schema: Schema, source_id: int, field_id: int, name: str, transform: Transform[Any, Any], partition_names: set[str] ) -> PartitionField: _check_and_add_partition_name(schema, name, source_id, transform, partition_names) return PartitionField(source_id, field_id, transform, name) partition_fields = [] - partition_names: Set[str] = set() + partition_names: set[str] = set() for field in self._transaction.table_metadata.spec().fields: if field.field_id not in self._deletes: renamed = self._renames.get(field.name) @@ -267,7 +267,7 @@ def _add_new_field( new_spec_id = spec.spec_id + 1 return PartitionSpec(*partition_fields, spec_id=new_spec_id) - def _partition_field(self, transform_key: Tuple[int, Transform[Any, Any]], name: str | None) -> PartitionField: + def _partition_field(self, transform_key: tuple[int, Transform[Any, Any]], name: str | None) -> PartitionField: if self._transaction.table_metadata.format_version == 2: source_id, transform = transform_key historical_fields = [] diff --git a/pyiceberg/table/update/statistics.py b/pyiceberg/table/update/statistics.py index 5ba712e13d..76fe2cb07b 100644 --- a/pyiceberg/table/update/statistics.py +++ b/pyiceberg/table/update/statistics.py @@ -14,7 +14,7 @@ # KIND, either express or implied. 
See the License for the # specific language governing permissions and limitations # under the License. -from typing import TYPE_CHECKING, Tuple +from typing import TYPE_CHECKING from pyiceberg.table.statistics import StatisticsFile from pyiceberg.table.update import ( @@ -47,7 +47,7 @@ class UpdateStatistics(UpdateTableMetadata["UpdateStatistics"]): update.remove_statistics(snapshot_id=2) """ - _updates: Tuple[TableUpdate, ...] = () + _updates: tuple[TableUpdate, ...] = () def __init__(self, transaction: "Transaction") -> None: super().__init__(transaction) diff --git a/pyiceberg/table/update/validate.py b/pyiceberg/table/update/validate.py index 4ef3bcf160..2586c83af9 100644 --- a/pyiceberg/table/update/validate.py +++ b/pyiceberg/table/update/validate.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -from typing import Iterator, Set +from typing import Iterator from pyiceberg.exceptions import ValidationException from pyiceberg.expressions import BooleanExpression @@ -25,8 +25,8 @@ from pyiceberg.table.snapshots import Operation, Snapshot, ancestors_between from pyiceberg.typedef import Record -VALIDATE_DATA_FILES_EXIST_OPERATIONS: Set[Operation] = {Operation.OVERWRITE, Operation.REPLACE, Operation.DELETE} -VALIDATE_ADDED_DATA_FILES_OPERATIONS: Set[Operation] = {Operation.APPEND, Operation.OVERWRITE} +VALIDATE_DATA_FILES_EXIST_OPERATIONS: set[Operation] = {Operation.OVERWRITE, Operation.REPLACE, Operation.DELETE} +VALIDATE_ADDED_DATA_FILES_OPERATIONS: set[Operation] = {Operation.APPEND, Operation.OVERWRITE} def _validation_history( diff --git a/pyiceberg/typedef.py b/pyiceberg/typedef.py index 3480adcb32..e4a9501687 100644 --- a/pyiceberg/typedef.py +++ b/pyiceberg/typedef.py @@ -23,13 +23,9 @@ TYPE_CHECKING, Any, Callable, - Dict, Generic, - List, Literal, Protocol, - Set, - Tuple, TypeVar, Union, runtime_checkable, @@ -43,7 +39,7 @@ from pyiceberg.types import StructType -class FrozenDict(Dict[Any, Any]): +class FrozenDict(dict[Any, Any]): def __setitem__(self, instance: Any, value: Any) -> None: """Assign a value to a FrozenDict.""" raise AttributeError("FrozenDict does not support assignment") @@ -61,7 +57,7 @@ def update(self, *args: Any, **kwargs: Any) -> None: # from https://stackoverflow.com/questions/2912231/is-there-a-clever-way-to-pass-the-key-to-defaultdicts-default-factory -class KeyDefaultDict(Dict[K, V]): +class KeyDefaultDict(dict[K, V]): def __init__(self, default_factory: Callable[[K], V]): super().__init__() self.default_factory = default_factory @@ -73,7 +69,7 @@ def __missing__(self, key: K) -> V: return val -Identifier = Tuple[str, ...] +Identifier = tuple[str, ...] """A tuple of strings representing a table identifier. Each string in the tuple represents a part of the table's unique path. 
For example, @@ -85,11 +81,11 @@ def __missing__(self, key: K) -> V: >>> identifier: Identifier = ("namespace", "table_name") """ -Properties = Dict[str, Any] +Properties = dict[str, Any] """A dictionary type for properties in PyIceberg.""" -RecursiveDict = Dict[str, Union[str, "RecursiveDict"]] +RecursiveDict = dict[str, Union[str, "RecursiveDict"]] """A recursive dictionary type for nested structures in PyIceberg.""" # Represents the literal value @@ -126,7 +122,7 @@ class IcebergBaseModel(BaseModel): model_config = ConfigDict(populate_by_name=True, frozen=True) - def _exclude_private_properties(self, exclude: Set[str] | None = None) -> Set[str]: + def _exclude_private_properties(self, exclude: set[str] | None = None) -> set[str]: # A small trick to exclude private properties. Properties are serialized by pydantic, # regardless if they start with an underscore. # This will look at the dict, and find the fields and exclude them @@ -135,14 +131,14 @@ def _exclude_private_properties(self, exclude: Set[str] | None = None) -> Set[st ) def model_dump( - self, exclude_none: bool = True, exclude: Set[str] | None = None, by_alias: bool = True, **kwargs: Any - ) -> Dict[str, Any]: + self, exclude_none: bool = True, exclude: set[str] | None = None, by_alias: bool = True, **kwargs: Any + ) -> dict[str, Any]: return super().model_dump( exclude_none=exclude_none, exclude=self._exclude_private_properties(exclude), by_alias=by_alias, **kwargs ) def model_dump_json( - self, exclude_none: bool = True, exclude: Set[str] | None = None, by_alias: bool = True, **kwargs: Any + self, exclude_none: bool = True, exclude: set[str] | None = None, by_alias: bool = True, **kwargs: Any ) -> str: return super().model_dump_json( exclude_none=exclude_none, exclude=self._exclude_private_properties(exclude), by_alias=by_alias, **kwargs @@ -172,7 +168,7 @@ class IcebergRootModel(RootModel[T], Generic[T]): class Record(StructProtocol): __slots__ = ("_data",) - _data: List[Any] + _data: list[Any] @classmethod def _bind(cls, struct: StructType, **arguments: Any) -> Self: diff --git a/pyiceberg/types.py b/pyiceberg/types.py index c22bee092f..742da00f57 100644 --- a/pyiceberg/types.py +++ b/pyiceberg/types.py @@ -38,9 +38,7 @@ Annotated, Any, ClassVar, - Dict, Literal, - Tuple, ) from pydantic import ( @@ -64,7 +62,7 @@ FIXED_PARSER = ParseNumberFromBrackets(FIXED) -def transform_dict_value_to_str(dict: Dict[str, Any]) -> Dict[str, str]: +def transform_dict_value_to_str(dict: dict[str, Any]) -> dict[str, str]: """Transform all values in the dictionary to string. 
Raise an error if any value is None.""" for key, value in dict.items(): if value is None: @@ -72,7 +70,7 @@ def transform_dict_value_to_str(dict: Dict[str, Any]) -> Dict[str, str]: return {k: str(v).lower() if isinstance(v, bool) else str(v) for k, v in dict.items()} -def _parse_decimal_type(decimal: Any) -> Tuple[int, int]: +def _parse_decimal_type(decimal: Any) -> tuple[int, int]: if isinstance(decimal, str): matches = DECIMAL_REGEX.search(decimal) if matches: @@ -250,7 +248,7 @@ class DecimalType(PrimitiveType): True """ - root: Tuple[int, int] + root: tuple[int, int] def __init__(self, precision: int, scale: int) -> None: super().__init__(root=(precision, scale)) @@ -282,7 +280,7 @@ def __hash__(self) -> int: """Return the hash of the tuple.""" return hash(self.root) - def __getnewargs__(self) -> Tuple[int, int]: + def __getnewargs__(self) -> tuple[int, int]: """Pickle the DecimalType class.""" return self.precision, self.scale @@ -375,7 +373,7 @@ def __init__( super().__init__(**data) @model_serializer() - def serialize_model(self) -> Dict[str, Any]: + def serialize_model(self) -> dict[str, Any]: from pyiceberg.conversions import to_json fields = { @@ -415,7 +413,7 @@ def __repr__(self) -> str: return f"NestedField({', '.join(parts)})" - def __getnewargs__(self) -> Tuple[int, str, IcebergType, bool, str | None]: + def __getnewargs__(self) -> tuple[int, str, IcebergType, bool, str | None]: """Pickle the NestedField class.""" return (self.field_id, self.name, self.field_type, self.required, self.doc) @@ -436,7 +434,7 @@ class StructType(IcebergType): """ type: Literal["struct"] = Field(default="struct") - fields: Tuple[NestedField, ...] = Field(default_factory=tuple) + fields: tuple[NestedField, ...] = Field(default_factory=tuple) _hash: int = PrivateAttr() def __init__(self, *fields: NestedField, **data: Any): @@ -476,7 +474,7 @@ def __len__(self) -> int: """Return the length of an instance of the StructType class.""" return len(self.fields) - def __getnewargs__(self) -> Tuple[NestedField, ...]: + def __getnewargs__(self) -> tuple[NestedField, ...]: """Pickle the StructType class.""" return self.fields @@ -526,7 +524,7 @@ def __str__(self) -> str: """Return the string representation of the ListType class.""" return f"list<{self.element_type}>" - def __getnewargs__(self) -> Tuple[int, IcebergType, bool]: + def __getnewargs__(self) -> tuple[int, IcebergType, bool]: """Pickle the ListType class.""" return (self.element_id, self.element_type, self.element_required) @@ -594,7 +592,7 @@ def __str__(self) -> str: """Return the string representation of the MapType class.""" return f"map<{self.key_type}, {self.value_type}>" - def __getnewargs__(self) -> Tuple[int, IcebergType, int, IcebergType, bool]: + def __getnewargs__(self) -> tuple[int, IcebergType, int, IcebergType, bool]: """Pickle the MapType class.""" return (self.key_id, self.key_type, self.value_id, self.value_type, self.value_required) diff --git a/pyiceberg/utils/bin_packing.py b/pyiceberg/utils/bin_packing.py index 825420d8b7..bee7bd81b3 100644 --- a/pyiceberg/utils/bin_packing.py +++ b/pyiceberg/utils/bin_packing.py @@ -20,7 +20,6 @@ Callable, Generic, Iterable, - List, TypeVar, ) @@ -31,7 +30,7 @@ class Bin(Generic[T]): def __init__(self, target_weight: int) -> None: self.bin_weight = 0 self.target_weight = target_weight - self.items: List[T] = [] + self.items: list[T] = [] def weight(self) -> int: return self.bin_weight @@ -45,7 +44,7 @@ def add(self, item: T, weight: int) -> None: class PackingIterator(Generic[T]): - bins: 
List[Bin[T]] + bins: list[Bin[T]] def __init__( self, @@ -66,7 +65,7 @@ def __iter__(self) -> PackingIterator[T]: """Return an iterator for the PackingIterator class.""" return self - def __next__(self) -> List[T]: + def __next__(self) -> list[T]: """Return the next item when iterating over the PackingIterator class.""" while True: try: @@ -115,7 +114,7 @@ def __init__(self, target_weight: int, lookback: int, largest_bin_first: bool) - self._lookback = lookback self._largest_bin_first = largest_bin_first - def pack(self, items: List[T], weight_func: Callable[[T], int]) -> List[List[T]]: + def pack(self, items: list[T], weight_func: Callable[[T], int]) -> list[list[T]]: return list( PackingIterator( items=items, @@ -126,6 +125,6 @@ def pack(self, items: List[T], weight_func: Callable[[T], int]) -> List[List[T]] ) ) - def pack_end(self, items: List[T], weight_func: Callable[[T], int]) -> List[List[T]]: + def pack_end(self, items: list[T], weight_func: Callable[[T], int]) -> list[list[T]]: packed = self.pack(items=list(reversed(items)), weight_func=weight_func) return [list(reversed(bin_items)) for bin_items in reversed(packed)] diff --git a/pyiceberg/utils/config.py b/pyiceberg/utils/config.py index 98fb292369..ab9b549d25 100644 --- a/pyiceberg/utils/config.py +++ b/pyiceberg/utils/config.py @@ -16,7 +16,6 @@ # under the License. import logging import os -from typing import List import strictyaml @@ -106,7 +105,7 @@ def _from_environment_variables(config: RecursiveDict) -> RecursiveDict: Amended configuration. """ - def set_property(_config: RecursiveDict, path: List[str], config_value: str) -> None: + def set_property(_config: RecursiveDict, path: list[str], config_value: str) -> None: while len(path) > 0: element = path.pop(0) if len(path) == 0: @@ -159,7 +158,7 @@ def get_catalog_config(self, catalog_name: str) -> RecursiveDict | None: return catalog_conf return None - def get_known_catalogs(self) -> List[str]: + def get_known_catalogs(self) -> list[str]: catalogs = self.config.get(CATALOG, {}) if not isinstance(catalogs, dict): raise ValueError("Catalog configurations needs to be an object") diff --git a/pyiceberg/utils/lazydict.py b/pyiceberg/utils/lazydict.py index 4b616c5c27..f1a3718dc7 100644 --- a/pyiceberg/utils/lazydict.py +++ b/pyiceberg/utils/lazydict.py @@ -16,7 +16,6 @@ # under the License. from typing import ( - Dict, Iterator, Mapping, Sequence, @@ -41,9 +40,9 @@ class LazyDict(Mapping[K, V]): # that the developer has correctly used the class and that the contents are valid. 
def __init__(self, contents: Sequence[Sequence[K | V]]): self._contents = contents - self._dict: Dict[K, V] | None = None + self._dict: dict[K, V] | None = None - def _build_dict(self) -> Dict[K, V]: + def _build_dict(self) -> dict[K, V]: self._dict = {} for item in self._contents: self._dict.update(dict(zip(cast(Sequence[K], item[::2]), cast(Sequence[V], item[1::2]), strict=True))) @@ -65,6 +64,6 @@ def __len__(self) -> int: source = self._dict or self._build_dict() return len(source) - def __dict__(self) -> Dict[K, V]: # type: ignore + def __dict__(self) -> dict[K, V]: # type: ignore """Convert the lazy dict in a dict.""" return self._dict or self._build_dict() diff --git a/pyiceberg/utils/properties.py b/pyiceberg/utils/properties.py index 11241e485c..2a95b39a50 100644 --- a/pyiceberg/utils/properties.py +++ b/pyiceberg/utils/properties.py @@ -17,7 +17,6 @@ from typing import ( Any, - Dict, ) from pyiceberg.typedef import Properties @@ -27,7 +26,7 @@ def property_as_int( - properties: Dict[str, str], + properties: dict[str, str], property_name: str, default: int | None = None, ) -> int | None: @@ -41,7 +40,7 @@ def property_as_int( def property_as_float( - properties: Dict[str, str], + properties: dict[str, str], property_name: str, default: float | None = None, ) -> float | None: @@ -55,7 +54,7 @@ def property_as_float( def property_as_bool( - properties: Dict[str, str], + properties: dict[str, str], property_name: str, default: bool, ) -> bool: diff --git a/pyiceberg/utils/schema_conversion.py b/pyiceberg/utils/schema_conversion.py index 0ec8dce084..66e57d5d9f 100644 --- a/pyiceberg/utils/schema_conversion.py +++ b/pyiceberg/utils/schema_conversion.py @@ -19,9 +19,6 @@ import logging from typing import ( Any, - Dict, - List, - Tuple, ) from pyiceberg.schema import ( @@ -59,7 +56,7 @@ logger = logging.getLogger(__name__) -PRIMITIVE_FIELD_TYPE_MAPPING: Dict[str, PrimitiveType] = { +PRIMITIVE_FIELD_TYPE_MAPPING: dict[str, PrimitiveType] = { "boolean": BooleanType(), "bytes": BinaryType(), "double": DoubleType(), @@ -71,7 +68,7 @@ "null": UnknownType(), } -LOGICAL_FIELD_TYPE_MAPPING: Dict[Tuple[str, str], PrimitiveType] = { +LOGICAL_FIELD_TYPE_MAPPING: dict[tuple[str, str], PrimitiveType] = { ("date", "int"): DateType(), ("time-micros", "long"): TimeType(), ("timestamp-micros", "long"): TimestampType(), @@ -83,7 +80,7 @@ class AvroSchemaConversion: - def avro_to_iceberg(self, avro_schema: Dict[str, Any]) -> Schema: + def avro_to_iceberg(self, avro_schema: dict[str, Any]) -> Schema: """Convert an Apache Avro into an Apache Iceberg schema equivalent. This expects to have field id's to be encoded in the Avro schema: @@ -132,7 +129,7 @@ def iceberg_to_avro(self, schema: Schema, schema_name: str | None = None) -> Avr """Convert an Iceberg schema into an Avro dictionary that can be serialized to JSON.""" return visit(schema, ConvertSchemaToAvro(schema_name)) - def _resolve_union(self, type_union: Dict[str, str] | List[str | Dict[str, str]] | str) -> Tuple[str | Dict[str, Any], bool]: + def _resolve_union(self, type_union: dict[str, str] | list[str | dict[str, str]] | str) -> tuple[str | dict[str, Any], bool]: """ Convert Unions into their type and resolves if the field is required. @@ -155,7 +152,7 @@ def _resolve_union(self, type_union: Dict[str, str] | List[str | Dict[str, str]] Raises: TypeError: In the case non-optional union types are encountered. 
""" - avro_types: Dict[str, str] | List[Dict[str, str] | str] + avro_types: dict[str, str] | list[dict[str, str] | str] if isinstance(type_union, str): # It is a primitive and required return type_union, True @@ -181,7 +178,7 @@ def _resolve_union(self, type_union: Dict[str, str] | List[str | Dict[str, str]] # Filter the null value and return the type return list(filter(lambda t: t != "null", avro_types))[0], False - def _convert_schema(self, avro_type: str | Dict[str, Any]) -> IcebergType: + def _convert_schema(self, avro_type: str | dict[str, Any]) -> IcebergType: """ Resolve the Avro type. @@ -219,7 +216,7 @@ def _convert_schema(self, avro_type: str | Dict[str, Any]) -> IcebergType: else: raise TypeError(f"Type not recognized: {avro_type}") - def _convert_field(self, field: Dict[str, Any]) -> NestedField: + def _convert_field(self, field: dict[str, Any]) -> NestedField: """Convert an Avro field into an Iceberg equivalent field. Args: @@ -241,7 +238,7 @@ def _convert_field(self, field: Dict[str, Any]) -> NestedField: doc=field.get("doc"), ) - def _convert_record_type(self, record_type: Dict[str, Any]) -> StructType: + def _convert_record_type(self, record_type: dict[str, Any]) -> StructType: """ Convert the fields from a record into an Iceberg struct. @@ -295,7 +292,7 @@ def _convert_record_type(self, record_type: Dict[str, Any]) -> StructType: return StructType(*[self._convert_field(field) for field in record_type["fields"]]) - def _convert_array_type(self, array_type: Dict[str, Any]) -> ListType: + def _convert_array_type(self, array_type: dict[str, Any]) -> ListType: if "element-id" not in array_type: raise ValueError(f"Cannot convert array-type, missing element-id: {array_type}") @@ -307,7 +304,7 @@ def _convert_array_type(self, array_type: Dict[str, Any]) -> ListType: element_required=element_required, ) - def _convert_map_type(self, map_type: Dict[str, Any]) -> MapType: + def _convert_map_type(self, map_type: dict[str, Any]) -> MapType: """Convert an avro map type into an Iceberg MapType. Args: @@ -344,7 +341,7 @@ def _convert_map_type(self, map_type: Dict[str, Any]) -> MapType: value_required=value_required, ) - def _convert_logical_type(self, avro_logical_type: Dict[str, Any]) -> IcebergType: + def _convert_logical_type(self, avro_logical_type: dict[str, Any]) -> IcebergType: """Convert a schema with a logical type annotation into an IcebergType. For the decimal and map we need to fetch more keys from the dict, and for @@ -385,7 +382,7 @@ def _convert_logical_type(self, avro_logical_type: Dict[str, Any]) -> IcebergTyp else: raise ValueError(f"Unknown logical/physical type combination: {avro_logical_type}") - def _convert_logical_decimal_type(self, avro_type: Dict[str, Any]) -> DecimalType: + def _convert_logical_decimal_type(self, avro_type: dict[str, Any]) -> DecimalType: """Convert an avro type to an Iceberg DecimalType. Args: @@ -412,7 +409,7 @@ def _convert_logical_decimal_type(self, avro_type: Dict[str, Any]) -> DecimalTyp """ return DecimalType(precision=avro_type["precision"], scale=avro_type["scale"]) - def _convert_logical_map_type(self, avro_type: Dict[str, Any]) -> MapType: + def _convert_logical_map_type(self, avro_type: dict[str, Any]) -> MapType: """Convert an avro map type to an Iceberg MapType. In the case where a map hasn't a key as a type you can use a logical map to still encode this in Avro. 
@@ -464,7 +461,7 @@ def _convert_logical_map_type(self, avro_type: Dict[str, Any]) -> MapType: value_required=value.required, ) - def _convert_fixed_type(self, avro_type: Dict[str, Any]) -> FixedType: + def _convert_fixed_type(self, avro_type: dict[str, Any]) -> FixedType: """ Convert Avro Type to the equivalent Iceberg fixed type. @@ -519,7 +516,7 @@ def before_map_key(self, key: NestedField) -> None: def before_map_value(self, value: NestedField) -> None: self.last_map_value_field_id = value.field_id - def struct(self, struct: StructType, field_results: List[AvroType]) -> AvroType: + def struct(self, struct: StructType, field_results: list[AvroType]) -> AvroType: return {"type": "record", "fields": field_results} def field(self, field: NestedField, field_result: AvroType) -> AvroType: diff --git a/pyiceberg/utils/singleton.py b/pyiceberg/utils/singleton.py index 06ee62febe..b59f43fbcd 100644 --- a/pyiceberg/utils/singleton.py +++ b/pyiceberg/utils/singleton.py @@ -28,7 +28,7 @@ More information on metaclasses: https://docs.python.org/3/reference/datamodel.html#metaclasses """ -from typing import Any, ClassVar, Dict +from typing import Any, ClassVar def _convert_to_hashable_type(element: Any) -> Any: @@ -40,7 +40,7 @@ def _convert_to_hashable_type(element: Any) -> Any: class Singleton: - _instances: ClassVar[Dict] = {} # type: ignore + _instances: ClassVar[dict] = {} # type: ignore def __new__(cls, *args, **kwargs): # type: ignore key = (cls, tuple(args), _convert_to_hashable_type(kwargs)) @@ -48,7 +48,7 @@ def __new__(cls, *args, **kwargs): # type: ignore cls._instances[key] = super().__new__(cls) return cls._instances[key] - def __deepcopy__(self, memo: Dict[int, Any]) -> Any: + def __deepcopy__(self, memo: dict[int, Any]) -> Any: """ Prevent deep copy operations for singletons. diff --git a/ruff.toml b/ruff.toml index d439caf74e..36391b033a 100644 --- a/ruff.toml +++ b/ruff.toml @@ -59,8 +59,7 @@ select = [ ] ignore = [ "E501", - "UP035", - "UP006" + "UP035" ] # Allow autofix for all enabled rules (when `--fix`) is provided. diff --git a/tests/avro/test_decoder.py b/tests/avro/test_decoder.py index c7c64ea096..26b0a0d15a 100644 --- a/tests/avro/test_decoder.py +++ b/tests/avro/test_decoder.py @@ -20,7 +20,7 @@ import struct from io import SEEK_SET from types import TracebackType -from typing import Callable, Type +from typing import Callable from unittest.mock import MagicMock, patch import pytest @@ -129,7 +129,7 @@ def close(self) -> None: def __enter__(self) -> OneByteAtATimeInputStream: return self - def __exit__(self, exctype: Type[BaseException] | None, excinst: BaseException | None, exctb: TracebackType | None) -> None: + def __exit__(self, exctype: type[BaseException] | None, excinst: BaseException | None, exctb: TracebackType | None) -> None: self.close() diff --git a/tests/catalog/integration_test_dynamodb.py b/tests/catalog/integration_test_dynamodb.py index 895f233c45..4ffe98a4b8 100644 --- a/tests/catalog/integration_test_dynamodb.py +++ b/tests/catalog/integration_test_dynamodb.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
-from typing import Generator, List +from typing import Generator import boto3 import pytest @@ -115,7 +115,7 @@ def test_load_table(test_catalog: Catalog, table_schema_nested: Schema, database assert table.metadata == loaded_table.metadata -def test_list_tables(test_catalog: Catalog, table_schema_nested: Schema, database_name: str, table_list: List[str]) -> None: +def test_list_tables(test_catalog: Catalog, table_schema_nested: Schema, database_name: str, table_list: list[str]) -> None: test_catalog.create_namespace(database_name) for table_name in table_list: test_catalog.create_table((database_name, table_name), table_schema_nested) @@ -204,7 +204,7 @@ def test_create_namespace_with_comment_and_location(test_catalog: Catalog, datab assert properties["location"] == test_location -def test_list_namespaces(test_catalog: Catalog, database_list: List[str]) -> None: +def test_list_namespaces(test_catalog: Catalog, database_list: list[str]) -> None: for database_name in database_list: test_catalog.create_namespace(database_name) db_list = test_catalog.list_namespaces() diff --git a/tests/catalog/integration_test_glue.py b/tests/catalog/integration_test_glue.py index 475fc07ead..a3b8f17c30 100644 --- a/tests/catalog/integration_test_glue.py +++ b/tests/catalog/integration_test_glue.py @@ -16,7 +16,7 @@ # under the License. import time -from typing import Any, Dict, Generator, List +from typing import Any, Generator from uuid import uuid4 import boto3 @@ -70,7 +70,7 @@ def __init__(self) -> None: self._output_bucket = get_bucket_name() self._output_path = f"athena_results_{uuid4()}" - def get_query_results(self, query: str) -> List[Dict[str, Any]]: + def get_query_results(self, query: str) -> list[dict[str, Any]]: query_execution_id = self._athena_client.start_query_execution( QueryString=query, ResultConfiguration={"OutputLocation": f"s3://{self._output_bucket}/{self._output_path}"} )["QueryExecutionId"] @@ -222,7 +222,7 @@ def test_load_table(test_catalog: Catalog, table_schema_nested: Schema, table_na assert MetastoreCatalog._parse_metadata_version(table.metadata_location) == 0 -def test_list_tables(test_catalog: Catalog, table_schema_nested: Schema, database_name: str, table_list: List[str]) -> None: +def test_list_tables(test_catalog: Catalog, table_schema_nested: Schema, database_name: str, table_list: list[str]) -> None: test_catalog.create_namespace(database_name) for table_name in table_list: test_catalog.create_table((database_name, table_name), table_schema_nested) @@ -312,7 +312,7 @@ def test_create_namespace_with_comment_and_location(test_catalog: Catalog, datab assert properties["location"] == test_location -def test_list_namespaces(test_catalog: Catalog, database_list: List[str]) -> None: +def test_list_namespaces(test_catalog: Catalog, database_list: list[str]) -> None: for database_name in database_list: test_catalog.create_namespace(database_name) db_list = test_catalog.list_namespaces() diff --git a/tests/catalog/test_dynamodb.py b/tests/catalog/test_dynamodb.py index c7c39a600d..5933e7d472 100644 --- a/tests/catalog/test_dynamodb.py +++ b/tests/catalog/test_dynamodb.py @@ -14,7 +14,6 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
-from typing import List from unittest import mock import boto3 @@ -393,7 +392,7 @@ def test_fail_on_rename_non_iceberg_table( @mock_aws def test_list_tables( - _bucket_initialize: None, moto_endpoint_url: str, table_schema_nested: Schema, database_name: str, table_list: List[str] + _bucket_initialize: None, moto_endpoint_url: str, table_schema_nested: Schema, database_name: str, table_list: list[str] ) -> None: test_catalog = DynamoDbCatalog("test_ddb_catalog", **{"warehouse": f"s3://{BUCKET_NAME}", "s3.endpoint": moto_endpoint_url}) test_catalog.create_namespace(namespace=database_name) @@ -405,7 +404,7 @@ def test_list_tables( @mock_aws -def test_list_namespaces(_bucket_initialize: None, database_list: List[str]) -> None: +def test_list_namespaces(_bucket_initialize: None, database_list: list[str]) -> None: test_catalog = DynamoDbCatalog("test_ddb_catalog") for database_name in database_list: test_catalog.create_namespace(namespace=database_name) diff --git a/tests/catalog/test_glue.py b/tests/catalog/test_glue.py index 0ff43cd52b..5273db22f8 100644 --- a/tests/catalog/test_glue.py +++ b/tests/catalog/test_glue.py @@ -14,7 +14,6 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -from typing import List from unittest import mock import boto3 @@ -439,7 +438,7 @@ def test_list_tables( moto_endpoint_url: str, table_schema_nested: Schema, database_name: str, - table_list: List[str], + table_list: list[str], ) -> None: test_catalog = GlueCatalog("glue", **{"s3.endpoint": moto_endpoint_url, "warehouse": f"s3://{BUCKET_NAME}/"}) test_catalog.create_namespace(namespace=database_name) @@ -475,7 +474,7 @@ def test_list_tables( @mock_aws -def test_list_namespaces(_bucket_initialize: None, moto_endpoint_url: str, database_list: List[str]) -> None: +def test_list_namespaces(_bucket_initialize: None, moto_endpoint_url: str, database_list: list[str]) -> None: test_catalog = GlueCatalog("glue", **{"s3.endpoint": moto_endpoint_url}) for database_name in database_list: test_catalog.create_namespace(namespace=database_name) diff --git a/tests/catalog/test_rest.py b/tests/catalog/test_rest.py index d2ecd02aba..efc89c7c7e 100644 --- a/tests/catalog/test_rest.py +++ b/tests/catalog/test_rest.py @@ -17,7 +17,7 @@ # pylint: disable=redefined-outer-name,unused-argument import base64 import os -from typing import Any, Callable, Dict, cast +from typing import Any, Callable, cast from unittest import mock import pytest @@ -69,7 +69,7 @@ @pytest.fixture -def example_table_metadata_with_snapshot_v1_rest_json(example_table_metadata_with_snapshot_v1: Dict[str, Any]) -> Dict[str, Any]: +def example_table_metadata_with_snapshot_v1_rest_json(example_table_metadata_with_snapshot_v1: dict[str, Any]) -> dict[str, Any]: return { "metadata-location": "s3://warehouse/database/table/metadata/00001-5f2f8166-244c-4eae-ac36-384ecdec81fc.gz.metadata.json", "metadata": example_table_metadata_with_snapshot_v1, @@ -81,7 +81,7 @@ def example_table_metadata_with_snapshot_v1_rest_json(example_table_metadata_wit @pytest.fixture -def example_table_metadata_with_no_location(example_table_metadata_with_snapshot_v1: Dict[str, Any]) -> Dict[str, Any]: +def example_table_metadata_with_no_location(example_table_metadata_with_snapshot_v1: dict[str, Any]) -> dict[str, Any]: return { "metadata": example_table_metadata_with_snapshot_v1, "config": { @@ -92,7 +92,7 @@ def example_table_metadata_with_no_location(example_table_metadata_with_snapshot @pytest.fixture 
-def example_table_metadata_no_snapshot_v1_rest_json(example_table_metadata_no_snapshot_v1: Dict[str, Any]) -> Dict[str, Any]: +def example_table_metadata_no_snapshot_v1_rest_json(example_table_metadata_no_snapshot_v1: dict[str, Any]) -> dict[str, Any]: return { "metadata-location": "s3://warehouse/database/table/metadata.json", "metadata": example_table_metadata_no_snapshot_v1, @@ -837,7 +837,7 @@ def test_update_namespace_properties_404(rest_mock: Mocker) -> None: assert "Namespace does not exist" in str(e.value) -def test_load_table_200(rest_mock: Mocker, example_table_metadata_with_snapshot_v1_rest_json: Dict[str, Any]) -> None: +def test_load_table_200(rest_mock: Mocker, example_table_metadata_with_snapshot_v1_rest_json: dict[str, Any]) -> None: rest_mock.get( f"{TEST_URI}v1/namespaces/fokko/tables/table", json=example_table_metadata_with_snapshot_v1_rest_json, @@ -859,7 +859,7 @@ def test_load_table_200(rest_mock: Mocker, example_table_metadata_with_snapshot_ def test_load_table_200_loading_mode( - rest_mock: Mocker, example_table_metadata_with_snapshot_v1_rest_json: Dict[str, Any] + rest_mock: Mocker, example_table_metadata_with_snapshot_v1_rest_json: dict[str, Any] ) -> None: rest_mock.get( f"{TEST_URI}v1/namespaces/fokko/tables/table?snapshots=refs", @@ -882,7 +882,7 @@ def test_load_table_200_loading_mode( def test_load_table_honor_access_delegation( - rest_mock: Mocker, example_table_metadata_with_snapshot_v1_rest_json: Dict[str, Any] + rest_mock: Mocker, example_table_metadata_with_snapshot_v1_rest_json: dict[str, Any] ) -> None: test_headers_with_remote_signing = {**TEST_HEADERS, "X-Iceberg-Access-Delegation": "remote-signing"} rest_mock.get( @@ -914,7 +914,7 @@ def test_load_table_honor_access_delegation( def test_load_table_from_self_identifier_200( - rest_mock: Mocker, example_table_metadata_with_snapshot_v1_rest_json: Dict[str, Any] + rest_mock: Mocker, example_table_metadata_with_snapshot_v1_rest_json: dict[str, Any] ) -> None: rest_mock.get( f"{TEST_URI}v1/namespaces/pdames/tables/table", @@ -1017,7 +1017,7 @@ def test_drop_table_404(rest_mock: Mocker) -> None: def test_create_table_200( - rest_mock: Mocker, table_schema_simple: Schema, example_table_metadata_no_snapshot_v1_rest_json: Dict[str, Any] + rest_mock: Mocker, table_schema_simple: Schema, example_table_metadata_no_snapshot_v1_rest_json: dict[str, Any] ) -> None: rest_mock.post( f"{TEST_URI}v1/namespaces/fokko/tables", @@ -1047,7 +1047,7 @@ def test_create_table_200( def test_create_table_with_given_location_removes_trailing_slash_200( - rest_mock: Mocker, table_schema_simple: Schema, example_table_metadata_no_snapshot_v1_rest_json: Dict[str, Any] + rest_mock: Mocker, table_schema_simple: Schema, example_table_metadata_no_snapshot_v1_rest_json: dict[str, Any] ) -> None: rest_mock.post( f"{TEST_URI}v1/namespaces/fokko/tables", @@ -1074,8 +1074,8 @@ def test_create_table_with_given_location_removes_trailing_slash_200( def test_create_staged_table_200( rest_mock: Mocker, table_schema_simple: Schema, - example_table_metadata_with_no_location: Dict[str, Any], - example_table_metadata_no_snapshot_v1_rest_json: Dict[str, Any], + example_table_metadata_with_no_location: dict[str, Any], + example_table_metadata_no_snapshot_v1_rest_json: dict[str, Any], ) -> None: rest_mock.post( f"{TEST_URI}v1/namespaces/fokko/tables", @@ -1163,12 +1163,12 @@ def test_create_table_409(rest_mock: Mocker, table_schema_simple: Schema) -> Non def test_create_table_if_not_exists_200( - rest_mock: Mocker, table_schema_simple: Schema, 
example_table_metadata_no_snapshot_v1_rest_json: Dict[str, Any] + rest_mock: Mocker, table_schema_simple: Schema, example_table_metadata_no_snapshot_v1_rest_json: dict[str, Any] ) -> None: - def json_callback() -> Callable[[Any, Any], Dict[str, Any]]: + def json_callback() -> Callable[[Any, Any], dict[str, Any]]: call_count = 0 - def callback(request: Any, context: Any) -> Dict[str, Any]: + def callback(request: Any, context: Any) -> dict[str, Any]: nonlocal call_count call_count += 1 @@ -1250,7 +1250,7 @@ def test_create_table_419(rest_mock: Mocker, table_schema_simple: Schema) -> Non def test_register_table_200( - rest_mock: Mocker, table_schema_simple: Schema, example_table_metadata_no_snapshot_v1_rest_json: Dict[str, Any] + rest_mock: Mocker, table_schema_simple: Schema, example_table_metadata_no_snapshot_v1_rest_json: dict[str, Any] ) -> None: rest_mock.post( f"{TEST_URI}v1/namespaces/default/register", @@ -1318,7 +1318,7 @@ def test_delete_table_204(rest_mock: Mocker) -> None: def test_delete_table_from_self_identifier_204( - rest_mock: Mocker, example_table_metadata_with_snapshot_v1_rest_json: Dict[str, Any] + rest_mock: Mocker, example_table_metadata_with_snapshot_v1_rest_json: dict[str, Any] ) -> None: rest_mock.get( f"{TEST_URI}v1/namespaces/pdames/tables/table", @@ -1337,7 +1337,7 @@ def test_delete_table_from_self_identifier_204( catalog.drop_table(table.name()) -def test_rename_table_200(rest_mock: Mocker, example_table_metadata_with_snapshot_v1_rest_json: Dict[str, Any]) -> None: +def test_rename_table_200(rest_mock: Mocker, example_table_metadata_with_snapshot_v1_rest_json: dict[str, Any]) -> None: rest_mock.post( f"{TEST_URI}v1/tables/rename", json={ @@ -1374,7 +1374,7 @@ def test_rename_table_200(rest_mock: Mocker, example_table_metadata_with_snapsho def test_rename_table_from_self_identifier_200( - rest_mock: Mocker, example_table_metadata_with_snapshot_v1_rest_json: Dict[str, Any] + rest_mock: Mocker, example_table_metadata_with_snapshot_v1_rest_json: dict[str, Any] ) -> None: rest_mock.get( f"{TEST_URI}v1/namespaces/pdames/tables/source", @@ -1795,7 +1795,7 @@ def test_catalog_from_parameters_empty_env(rest_mock: Mocker) -> None: def test_table_identifier_in_commit_table_request( - rest_mock: Mocker, table_schema_simple: Schema, example_table_metadata_v2: Dict[str, Any] + rest_mock: Mocker, table_schema_simple: Schema, example_table_metadata_v2: dict[str, Any] ) -> None: metadata_location = "s3://some_bucket/metadata.json" rest_mock.post( diff --git a/tests/conftest.py b/tests/conftest.py index 706baea38d..947fc00a83 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -38,9 +38,7 @@ from typing import ( TYPE_CHECKING, Any, - Dict, Generator, - List, ) import boto3 @@ -103,7 +101,7 @@ from pyiceberg.io.pyarrow import PyArrowFileIO -def pytest_collection_modifyitems(items: List[pytest.Item]) -> None: +def pytest_collection_modifyitems(items: list[pytest.Item]) -> None: for item in items: if not any(item.iter_markers()): item.add_marker("unmarked") @@ -546,7 +544,7 @@ def iceberg_schema_nested_no_ids() -> Schema: @pytest.fixture(scope="session") -def all_avro_types() -> Dict[str, Any]: +def all_avro_types() -> dict[str, Any]: return { "type": "record", "name": "all_avro_types", @@ -650,7 +648,7 @@ def all_avro_types() -> Dict[str, Any]: @pytest.fixture(scope="session") -def example_table_metadata_v1() -> Dict[str, Any]: +def example_table_metadata_v1() -> dict[str, Any]: return EXAMPLE_TABLE_METADATA_V1 @@ -724,7 +722,7 @@ def example_table_metadata_v1() -> Dict[str, 
Any]: @pytest.fixture -def example_table_metadata_with_snapshot_v1() -> Dict[str, Any]: +def example_table_metadata_with_snapshot_v1() -> dict[str, Any]: return EXAMPLE_TABLE_METADATA_WITH_SNAPSHOT_V1 @@ -777,18 +775,18 @@ def example_table_metadata_with_snapshot_v1() -> Dict[str, Any]: @pytest.fixture -def example_table_metadata_no_snapshot_v1() -> Dict[str, Any]: +def example_table_metadata_no_snapshot_v1() -> dict[str, Any]: return EXAMPLE_TABLE_METADATA_NO_SNAPSHOT_V1 @pytest.fixture -def example_table_metadata_v2_with_extensive_snapshots() -> Dict[str, Any]: +def example_table_metadata_v2_with_extensive_snapshots() -> dict[str, Any]: def generate_snapshot( snapshot_id: int, parent_snapshot_id: int | None = None, timestamp_ms: int | None = None, sequence_number: int = 0, - ) -> Dict[str, Any]: + ) -> dict[str, Any]: return { "snapshot-id": snapshot_id, "parent-snapshot-id": parent_snapshot_id, @@ -1116,22 +1114,22 @@ def generate_snapshot( @pytest.fixture -def example_table_metadata_v2() -> Dict[str, Any]: +def example_table_metadata_v2() -> dict[str, Any]: return EXAMPLE_TABLE_METADATA_V2 @pytest.fixture -def table_metadata_v2_with_fixed_and_decimal_types() -> Dict[str, Any]: +def table_metadata_v2_with_fixed_and_decimal_types() -> dict[str, Any]: return TABLE_METADATA_V2_WITH_FIXED_AND_DECIMAL_TYPES @pytest.fixture -def table_metadata_v2_with_statistics() -> Dict[str, Any]: +def table_metadata_v2_with_statistics() -> dict[str, Any]: return TABLE_METADATA_V2_WITH_STATISTICS @pytest.fixture -def example_table_metadata_v3() -> Dict[str, Any]: +def example_table_metadata_v3() -> dict[str, Any]: return EXAMPLE_TABLE_METADATA_V3 @@ -1487,7 +1485,7 @@ def metadata_location_gz(tmp_path_factory: pytest.TempPathFactory) -> str: @pytest.fixture(scope="session") -def avro_schema_manifest_file_v1() -> Dict[str, Any]: +def avro_schema_manifest_file_v1() -> dict[str, Any]: return { "type": "record", "name": "manifest_file", @@ -1589,7 +1587,7 @@ def avro_schema_manifest_file_v1() -> Dict[str, Any]: @pytest.fixture(scope="session") -def avro_schema_manifest_file_v2() -> Dict[str, Any]: +def avro_schema_manifest_file_v2() -> dict[str, Any]: return { "type": "record", "name": "manifest_file", @@ -1668,7 +1666,7 @@ def avro_schema_manifest_file_v2() -> Dict[str, Any]: @pytest.fixture(scope="session") -def avro_schema_manifest_entry() -> Dict[str, Any]: +def avro_schema_manifest_entry() -> dict[str, Any]: return { "type": "record", "name": "manifest_entry", @@ -1898,7 +1896,7 @@ def test_partition_spec() -> Schema: @pytest.fixture(scope="session") def generated_manifest_entry_file( - avro_schema_manifest_entry: Dict[str, Any], test_schema: Schema, test_partition_spec: PartitionSpec + avro_schema_manifest_entry: dict[str, Any], test_schema: Schema, test_partition_spec: PartitionSpec ) -> Generator[str, None, None]: from fastavro import parse_schema, writer @@ -1921,7 +1919,7 @@ def generated_manifest_entry_file( @pytest.fixture(scope="session") def generated_manifest_file_file_v1( - avro_schema_manifest_file_v1: Dict[str, Any], generated_manifest_entry_file: str + avro_schema_manifest_file_v1: dict[str, Any], generated_manifest_entry_file: str ) -> Generator[str, None, None]: from fastavro import parse_schema, writer @@ -1939,7 +1937,7 @@ def generated_manifest_file_file_v1( @pytest.fixture(scope="session") def generated_manifest_file_file_v2( - avro_schema_manifest_file_v2: Dict[str, Any], generated_manifest_entry_file: str + avro_schema_manifest_file_v2: dict[str, Any], generated_manifest_entry_file: 
str ) -> Generator[str, None, None]: from fastavro import parse_schema, writer @@ -2288,7 +2286,7 @@ def table_name() -> str: @pytest.fixture() -def table_list(table_name: str) -> List[str]: +def table_list(table_name: str) -> list[str]: return [f"{table_name}_{idx}" for idx in range(NUM_TABLES)] @@ -2307,7 +2305,7 @@ def gcp_dataset_name() -> str: @pytest.fixture() -def database_list(database_name: str) -> List[str]: +def database_list(database_name: str) -> list[str]: return [f"{database_name}_{idx}" for idx in range(NUM_TABLES)] @@ -2320,7 +2318,7 @@ def hierarchical_namespace_name() -> str: @pytest.fixture() -def hierarchical_namespace_list(hierarchical_namespace_name: str) -> List[str]: +def hierarchical_namespace_list(hierarchical_namespace_name: str) -> list[str]: return [f"{hierarchical_namespace_name}_{idx}" for idx in range(NUM_TABLES)] @@ -2466,7 +2464,7 @@ def warehouse(tmp_path_factory: pytest.TempPathFactory) -> Path: @pytest.fixture -def table_v1(example_table_metadata_v1: Dict[str, Any]) -> Table: +def table_v1(example_table_metadata_v1: dict[str, Any]) -> Table: table_metadata = TableMetadataV1(**example_table_metadata_v1) return Table( identifier=("database", "table"), @@ -2478,7 +2476,7 @@ def table_v1(example_table_metadata_v1: Dict[str, Any]) -> Table: @pytest.fixture -def table_v2(example_table_metadata_v2: Dict[str, Any]) -> Table: +def table_v2(example_table_metadata_v2: dict[str, Any]) -> Table: table_metadata = TableMetadataV2(**example_table_metadata_v2) return Table( identifier=("database", "table"), @@ -2490,7 +2488,7 @@ def table_v2(example_table_metadata_v2: Dict[str, Any]) -> Table: @pytest.fixture -def table_v3(example_table_metadata_v3: Dict[str, Any]) -> Table: +def table_v3(example_table_metadata_v3: dict[str, Any]) -> Table: table_metadata = TableMetadataV3(**example_table_metadata_v3) return Table( identifier=("database", "table"), @@ -2502,7 +2500,7 @@ def table_v3(example_table_metadata_v3: Dict[str, Any]) -> Table: @pytest.fixture -def table_v2_orc(example_table_metadata_v2: Dict[str, Any]) -> Table: +def table_v2_orc(example_table_metadata_v2: dict[str, Any]) -> Table: import copy metadata_dict = copy.deepcopy(example_table_metadata_v2) @@ -2521,7 +2519,7 @@ def table_v2_orc(example_table_metadata_v2: Dict[str, Any]) -> Table: @pytest.fixture def table_v2_with_fixed_and_decimal_types( - table_metadata_v2_with_fixed_and_decimal_types: Dict[str, Any], + table_metadata_v2_with_fixed_and_decimal_types: dict[str, Any], ) -> Table: table_metadata = TableMetadataV2( **table_metadata_v2_with_fixed_and_decimal_types, @@ -2536,7 +2534,7 @@ def table_v2_with_fixed_and_decimal_types( @pytest.fixture -def table_v2_with_extensive_snapshots(example_table_metadata_v2_with_extensive_snapshots: Dict[str, Any]) -> Table: +def table_v2_with_extensive_snapshots(example_table_metadata_v2_with_extensive_snapshots: dict[str, Any]) -> Table: table_metadata = TableMetadataV2(**example_table_metadata_v2_with_extensive_snapshots) return Table( identifier=("database", "table"), @@ -2548,7 +2546,7 @@ def table_v2_with_extensive_snapshots(example_table_metadata_v2_with_extensive_s @pytest.fixture -def table_v2_with_statistics(table_metadata_v2_with_statistics: Dict[str, Any]) -> Table: +def table_v2_with_statistics(table_metadata_v2_with_statistics: dict[str, Any]) -> Table: table_metadata = TableMetadataV2(**table_metadata_v2_with_statistics) return Table( identifier=("database", "table"), diff --git a/tests/expressions/test_literals.py b/tests/expressions/test_literals.py 
index 2137681e79..c3ace5d368 100644 --- a/tests/expressions/test_literals.py +++ b/tests/expressions/test_literals.py @@ -21,9 +21,6 @@ from decimal import Decimal from typing import ( Any, - List, - Set, - Type, ) import pytest @@ -95,14 +92,14 @@ def test_literal_from_nan_error() -> None: BinaryLiteral, ], ) -def test_literal_classes_with_none_type_error(literal_class: Type[PrimitiveType]) -> None: +def test_literal_classes_with_none_type_error(literal_class: type[PrimitiveType]) -> None: with pytest.raises(TypeError) as e: literal_class(None) assert "Invalid literal value: None" in str(e.value) @pytest.mark.parametrize("literal_class", [FloatLiteral, DoubleLiteral]) -def test_literal_classes_with_nan_value_error(literal_class: Type[PrimitiveType]) -> None: +def test_literal_classes_with_nan_value_error(literal_class: type[PrimitiveType]) -> None: with pytest.raises(ValueError) as e: literal_class(float("nan")) assert "Cannot create expression literal from NaN." in str(e.value) @@ -824,7 +821,7 @@ def test_invalid_binary_conversions() -> None: ) -def assert_invalid_conversions(lit: Literal[Any], types: List[PrimitiveType]) -> None: +def assert_invalid_conversions(lit: Literal[Any], types: list[PrimitiveType]) -> None: for type_var in types: with pytest.raises(TypeError): _ = lit.to(type_var) @@ -958,4 +955,4 @@ def test_to_json() -> None: assert_type(literal(123.4), Literal[float]) assert_type(literal(bytes([0x01, 0x02, 0x03])), Literal[bytes]) assert_type(literal(Decimal("19.25")), Literal[Decimal]) -assert_type({literal(1), literal(2), literal(3)}, Set[Literal[int]]) +assert_type({literal(1), literal(2), literal(3)}, set[Literal[int]]) diff --git a/tests/expressions/test_visitors.py b/tests/expressions/test_visitors.py index d0b6ab5ab4..2847859db5 100644 --- a/tests/expressions/test_visitors.py +++ b/tests/expressions/test_visitors.py @@ -16,7 +16,7 @@ # under the License. # pylint:disable=redefined-outer-name -from typing import Any, List, Set +from typing import Any import pytest @@ -91,7 +91,7 @@ ) -class ExampleVisitor(BooleanExpressionVisitor[List[str]]): +class ExampleVisitor(BooleanExpressionVisitor[list[str]]): """A test implementation of a BooleanExpressionVisitor As this visitor visits each node, it appends an element to a `visit_history` list. 
This enables testing that a given expression is @@ -99,119 +99,119 @@ class ExampleVisitor(BooleanExpressionVisitor[List[str]]): """ def __init__(self) -> None: - self.visit_history: List[str] = [] + self.visit_history: list[str] = [] - def visit_true(self) -> List[str]: + def visit_true(self) -> list[str]: self.visit_history.append("TRUE") return self.visit_history - def visit_false(self) -> List[str]: + def visit_false(self) -> list[str]: self.visit_history.append("FALSE") return self.visit_history - def visit_not(self, child_result: List[str]) -> List[str]: + def visit_not(self, child_result: list[str]) -> list[str]: self.visit_history.append("NOT") return self.visit_history - def visit_and(self, left_result: List[str], right_result: List[str]) -> List[str]: + def visit_and(self, left_result: list[str], right_result: list[str]) -> list[str]: self.visit_history.append("AND") return self.visit_history - def visit_or(self, left_result: List[str], right_result: List[str]) -> List[str]: + def visit_or(self, left_result: list[str], right_result: list[str]) -> list[str]: self.visit_history.append("OR") return self.visit_history - def visit_unbound_predicate(self, predicate: UnboundPredicate[Any]) -> List[str]: + def visit_unbound_predicate(self, predicate: UnboundPredicate[Any]) -> list[str]: self.visit_history.append(str(predicate.__class__.__name__).upper()) return self.visit_history - def visit_bound_predicate(self, predicate: BoundPredicate[Any]) -> List[str]: + def visit_bound_predicate(self, predicate: BoundPredicate[Any]) -> list[str]: self.visit_history.append(str(predicate.__class__.__name__).upper()) return self.visit_history -class FooBoundBooleanExpressionVisitor(BoundBooleanExpressionVisitor[List[str]]): +class FooBoundBooleanExpressionVisitor(BoundBooleanExpressionVisitor[list[str]]): """A test implementation of a BoundBooleanExpressionVisitor As this visitor visits each node, it appends an element to a `visit_history` list. This enables testing that a given bound expression is visited in an expected order by the `visit` method. 
""" def __init__(self) -> None: - self.visit_history: List[str] = [] + self.visit_history: list[str] = [] - def visit_in(self, term: BoundTerm[Any], literals: Set[Any]) -> List[str]: + def visit_in(self, term: BoundTerm[Any], literals: set[Any]) -> list[str]: self.visit_history.append("IN") return self.visit_history - def visit_not_in(self, term: BoundTerm[Any], literals: Set[Any]) -> List[str]: + def visit_not_in(self, term: BoundTerm[Any], literals: set[Any]) -> list[str]: self.visit_history.append("NOT_IN") return self.visit_history - def visit_is_nan(self, term: BoundTerm[Any]) -> List[str]: + def visit_is_nan(self, term: BoundTerm[Any]) -> list[str]: self.visit_history.append("IS_NAN") return self.visit_history - def visit_not_nan(self, term: BoundTerm[Any]) -> List[str]: + def visit_not_nan(self, term: BoundTerm[Any]) -> list[str]: self.visit_history.append("NOT_NAN") return self.visit_history - def visit_is_null(self, term: BoundTerm[Any]) -> List[str]: + def visit_is_null(self, term: BoundTerm[Any]) -> list[str]: self.visit_history.append("IS_NULL") return self.visit_history - def visit_not_null(self, term: BoundTerm[Any]) -> List[str]: + def visit_not_null(self, term: BoundTerm[Any]) -> list[str]: self.visit_history.append("NOT_NULL") return self.visit_history - def visit_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> List[str]: # pylint: disable=redefined-outer-name + def visit_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> list[str]: # pylint: disable=redefined-outer-name self.visit_history.append("EQUAL") return self.visit_history - def visit_not_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> List[str]: # pylint: disable=redefined-outer-name + def visit_not_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> list[str]: # pylint: disable=redefined-outer-name self.visit_history.append("NOT_EQUAL") return self.visit_history - def visit_greater_than_or_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> List[str]: # pylint: disable=redefined-outer-name + def visit_greater_than_or_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> list[str]: # pylint: disable=redefined-outer-name self.visit_history.append("GREATER_THAN_OR_EQUAL") return self.visit_history - def visit_greater_than(self, term: BoundTerm[Any], literal: Literal[Any]) -> List[str]: # pylint: disable=redefined-outer-name + def visit_greater_than(self, term: BoundTerm[Any], literal: Literal[Any]) -> list[str]: # pylint: disable=redefined-outer-name self.visit_history.append("GREATER_THAN") return self.visit_history - def visit_less_than(self, term: BoundTerm[Any], literal: Literal[Any]) -> List[str]: # pylint: disable=redefined-outer-name + def visit_less_than(self, term: BoundTerm[Any], literal: Literal[Any]) -> list[str]: # pylint: disable=redefined-outer-name self.visit_history.append("LESS_THAN") return self.visit_history - def visit_less_than_or_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> List[str]: # pylint: disable=redefined-outer-name + def visit_less_than_or_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> list[str]: # pylint: disable=redefined-outer-name self.visit_history.append("LESS_THAN_OR_EQUAL") return self.visit_history - def visit_true(self) -> List[str]: + def visit_true(self) -> list[str]: self.visit_history.append("TRUE") return self.visit_history - def visit_false(self) -> List[str]: + def visit_false(self) -> list[str]: self.visit_history.append("FALSE") return self.visit_history - def visit_not(self, 
child_result: List[str]) -> List[str]: + def visit_not(self, child_result: list[str]) -> list[str]: self.visit_history.append("NOT") return self.visit_history - def visit_and(self, left_result: List[str], right_result: List[str]) -> List[str]: + def visit_and(self, left_result: list[str], right_result: list[str]) -> list[str]: self.visit_history.append("AND") return self.visit_history - def visit_or(self, left_result: List[str], right_result: List[str]) -> List[str]: + def visit_or(self, left_result: list[str], right_result: list[str]) -> list[str]: self.visit_history.append("OR") return self.visit_history - def visit_starts_with(self, term: BoundTerm[Any], literal: Literal[Any]) -> List[str]: + def visit_starts_with(self, term: BoundTerm[Any], literal: Literal[Any]) -> list[str]: self.visit_history.append("STARTS_WITH") return self.visit_history - def visit_not_starts_with(self, term: BoundTerm[Any], literal: Literal[Any]) -> List[str]: + def visit_not_starts_with(self, term: BoundTerm[Any], literal: Literal[Any]) -> list[str]: self.visit_history.append("NOT_STARTS_WITH") return self.visit_history @@ -1041,7 +1041,7 @@ def test_not_nan(schema: Schema, manifest: ManifestFile) -> None: def test_missing_stats(schema: Schema, manifest_no_stats: ManifestFile) -> None: - expressions: List[BooleanExpression] = [ + expressions: list[BooleanExpression] = [ LessThan(Reference("id"), 5), LessThanOrEqual(Reference("id"), 30), EqualTo(Reference("id"), 70), diff --git a/tests/integration/test_catalog.py b/tests/integration/test_catalog.py index 3590d0837e..12bbdc3d2e 100644 --- a/tests/integration/test_catalog.py +++ b/tests/integration/test_catalog.py @@ -17,7 +17,7 @@ import os from pathlib import Path, PosixPath -from typing import Generator, List +from typing import Generator import pytest @@ -171,7 +171,7 @@ def test_load_table(test_catalog: Catalog, table_schema_nested: Schema, database @pytest.mark.integration @pytest.mark.parametrize("test_catalog", CATALOGS) -def test_list_tables(test_catalog: Catalog, table_schema_nested: Schema, database_name: str, table_list: List[str]) -> None: +def test_list_tables(test_catalog: Catalog, table_schema_nested: Schema, database_name: str, table_list: list[str]) -> None: test_catalog.create_namespace(database_name) for table_name in table_list: test_catalog.create_table((database_name, table_name), table_schema_nested) @@ -443,7 +443,7 @@ def test_create_namespace_with_comment(test_catalog: Catalog, database_name: str @pytest.mark.integration @pytest.mark.parametrize("test_catalog", CATALOGS) -def test_list_namespaces(test_catalog: Catalog, database_list: List[str]) -> None: +def test_list_namespaces(test_catalog: Catalog, database_list: list[str]) -> None: for database_name in database_list: test_catalog.create_namespace(database_name) db_list = test_catalog.list_namespaces() diff --git a/tests/integration/test_delete_count.py b/tests/integration/test_delete_count.py index 0ba9d2d6da..d0d83a24dc 100644 --- a/tests/integration/test_delete_count.py +++ b/tests/integration/test_delete_count.py @@ -17,7 +17,7 @@ # pylint:disable=redefined-outer-name import random from datetime import datetime, timedelta -from typing import Generator, List +from typing import Generator import pyarrow as pa import pytest @@ -34,7 +34,7 @@ from pyiceberg.types import LongType, NestedField, StringType -def run_spark_commands(spark: SparkSession, sqls: List[str]) -> None: +def run_spark_commands(spark: SparkSession, sqls: list[str]) -> None: for sql in sqls: spark.sql(sql) diff --git 
a/tests/integration/test_deletes.py b/tests/integration/test_deletes.py index 21c3d12999..a0ee59cc46 100644 --- a/tests/integration/test_deletes.py +++ b/tests/integration/test_deletes.py @@ -16,7 +16,7 @@ # under the License. # pylint:disable=redefined-outer-name from datetime import datetime -from typing import Generator, List +from typing import Generator import pyarrow as pa import pytest @@ -34,7 +34,7 @@ from pyiceberg.types import FloatType, IntegerType, LongType, NestedField, StringType, TimestampType -def run_spark_commands(spark: SparkSession, sqls: List[str]) -> None: +def run_spark_commands(spark: SparkSession, sqls: list[str]) -> None: for sql in sqls: spark.sql(sql) diff --git a/tests/integration/test_partitioning_key.py b/tests/integration/test_partitioning_key.py index fcc5dc0e35..0419fcf23a 100644 --- a/tests/integration/test_partitioning_key.py +++ b/tests/integration/test_partitioning_key.py @@ -17,7 +17,7 @@ # pylint:disable=redefined-outer-name from datetime import date, datetime, timedelta, timezone from decimal import Decimal -from typing import Any, List +from typing import Any import pytest from pyspark.sql import SparkSession @@ -728,8 +728,8 @@ def test_partition_key( session_catalog: Catalog, spark: SparkSession, - partition_fields: List[PartitionField], - partition_values: List[Any], + partition_fields: list[PartitionField], + partition_values: list[Any], expected_partition_record: Record, expected_hive_partition_path_slice: str, spark_create_table_sql_for_justification: str, diff --git a/tests/integration/test_rest_manifest.py b/tests/integration/test_rest_manifest.py index 5d7a3d9441..6c2bf7baed 100644 --- a/tests/integration/test_rest_manifest.py +++ b/tests/integration/test_rest_manifest.py @@ -20,7 +20,7 @@ from copy import copy from enum import Enum from tempfile import TemporaryDirectory -from typing import Any, List +from typing import Any import pytest from fastavro import reader @@ -36,7 +36,7 @@ # helper function to serialize our objects to dicts to enable # direct comparison with the dicts returned by fastavro -def todict(obj: Any, spec_keys: List[str]) -> Any: +def todict(obj: Any, spec_keys: list[str]) -> Any: if type(obj) is Record: return {key: obj[pos] for key, pos in zip(spec_keys, range(len(obj)), strict=True)} if isinstance(obj, dict) or isinstance(obj, LazyDict): diff --git a/tests/integration/test_writes/test_partitioned_writes.py b/tests/integration/test_writes/test_partitioned_writes.py index 1913f7beb7..d194669bd3 100644 --- a/tests/integration/test_writes/test_partitioned_writes.py +++ b/tests/integration/test_writes/test_partitioned_writes.py @@ -18,7 +18,7 @@ from datetime import date -from typing import Any, Set +from typing import Any import pyarrow as pa import pytest @@ -1038,7 +1038,7 @@ def test_append_transform_partition_verify_partitions_count( arrow_table_date_timestamps: pa.Table, table_date_timestamps_schema: Schema, transform: Transform[Any, Any], - expected_partitions: Set[Any], + expected_partitions: set[Any], format_version: int, ) -> None: # Given diff --git a/tests/integration/test_writes/test_writes.py b/tests/integration/test_writes/test_writes.py index e7bac5e3b8..835eda087c 100644 --- a/tests/integration/test_writes/test_writes.py +++ b/tests/integration/test_writes/test_writes.py @@ -24,7 +24,7 @@ from datetime import date, datetime, timedelta from decimal import Decimal from pathlib import Path -from typing import Any, Dict +from typing import Any from urllib.parse import urlparse import fastavro @@ -639,7 
+639,7 @@ def test_write_parquet_compression_properties( session_catalog: Catalog, arrow_table_with_null: pa.Table, format_version: int, - properties: Dict[str, Any], + properties: dict[str, Any], expected_compression_name: str, ) -> None: identifier = "default.write_parquet_compression_properties" @@ -674,8 +674,8 @@ def test_write_parquet_other_properties( spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table, - properties: Dict[str, Any], - expected_kwargs: Dict[str, Any], + properties: dict[str, Any], + expected_kwargs: dict[str, Any], ) -> None: identifier = "default.test_write_parquet_other_properties" @@ -701,7 +701,7 @@ def test_write_parquet_unsupported_properties( spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table, - properties: Dict[str, str], + properties: dict[str, str], ) -> None: identifier = "default.write_parquet_unsupported_properties" diff --git a/tests/integration/test_writes/utils.py b/tests/integration/test_writes/utils.py index ce30c19477..4ab54d97e7 100644 --- a/tests/integration/test_writes/utils.py +++ b/tests/integration/test_writes/utils.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. # pylint:disable=redefined-outer-name -from typing import List, Union +from typing import Union import pyarrow as pa @@ -63,7 +63,7 @@ def _create_table( session_catalog: Catalog, identifier: str, properties: Properties = EMPTY_DICT, - data: List[pa.Table] | None = None, + data: list[pa.Table] | None = None, partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC, schema: Union[Schema, "pa.Schema"] = TABLE_SCHEMA, ) -> Table: diff --git a/tests/io/test_fsspec.py b/tests/io/test_fsspec.py index 7111aaa87c..c28eb0714f 100644 --- a/tests/io/test_fsspec.py +++ b/tests/io/test_fsspec.py @@ -20,7 +20,6 @@ import tempfile import threading import uuid -from typing import List from unittest import mock import pytest @@ -59,8 +58,8 @@ def test_fsspec_local_fs_can_create_path_without_parent_dir(fsspec_fileio: Fsspe def test_fsspec_get_fs_instance_per_thread_caching(fsspec_fileio: FsspecFileIO) -> None: """Test that filesystem instances are cached per-thread by `FsspecFileIO.get_fs`""" - fs_instances: List[AbstractFileSystem] = [] - start_work_events: List[threading.Event] = [threading.Event() for _ in range(2)] + fs_instances: list[AbstractFileSystem] = [] + start_work_events: list[threading.Event] = [threading.Event() for _ in range(2)] def get_fs(start_work_event: threading.Event) -> None: # Wait to be told to actually start getting the filesystem instances diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py index 3bec6fd157..5758dbe4e5 100644 --- a/tests/io/test_pyarrow.py +++ b/tests/io/test_pyarrow.py @@ -22,7 +22,7 @@ import warnings from datetime import date, datetime, timezone from pathlib import Path -from typing import Any, List +from typing import Any from unittest.mock import MagicMock, patch from uuid import uuid4 @@ -1014,7 +1014,7 @@ def file_map(schema_map: Schema, tmpdir: str) -> str: def project( - schema: Schema, files: List[str], expr: BooleanExpression | None = None, table_schema: Schema | None = None + schema: Schema, files: list[str], expr: BooleanExpression | None = None, table_schema: Schema | None = None ) -> pa.Table: def _set_spec_id(datafile: DataFile) -> DataFile: datafile.spec_id = 0 @@ -2160,7 +2160,7 @@ def test_make_compatible_name() -> None: ([None, None, None], DateType(), None), ], ) -def test_stats_aggregator_update_min(vals: 
List[Any], primitive_type: PrimitiveType, expected_result: Any) -> None: +def test_stats_aggregator_update_min(vals: list[Any], primitive_type: PrimitiveType, expected_result: Any) -> None: stats = StatsAggregator(primitive_type, _primitive_to_physical(primitive_type)) for val in vals: @@ -2180,7 +2180,7 @@ def test_stats_aggregator_update_min(vals: List[Any], primitive_type: PrimitiveT ([None, None, None], DateType(), None), ], ) -def test_stats_aggregator_update_max(vals: List[Any], primitive_type: PrimitiveType, expected_result: Any) -> None: +def test_stats_aggregator_update_max(vals: list[Any], primitive_type: PrimitiveType, expected_result: Any) -> None: stats = StatsAggregator(primitive_type, _primitive_to_physical(primitive_type)) for val in vals: diff --git a/tests/io/test_pyarrow_stats.py b/tests/io/test_pyarrow_stats.py index fd175cae60..0e628829eb 100644 --- a/tests/io/test_pyarrow_stats.py +++ b/tests/io/test_pyarrow_stats.py @@ -30,9 +30,6 @@ from decimal import Decimal from typing import ( Any, - Dict, - List, - Tuple, ) import pyarrow as pa @@ -81,8 +78,8 @@ class TestStruct: def construct_test_table( - write_statistics: bool | List[str] = True, -) -> Tuple[pq.FileMetaData, TableMetadataV1 | TableMetadataV2]: + write_statistics: bool | list[str] = True, +) -> tuple[pq.FileMetaData, TableMetadataV1 | TableMetadataV2]: table_metadata = { "format-version": 2, "location": "s3://bucket/test/location", @@ -143,7 +140,7 @@ def construct_test_table( _list = [[1, 2, 3], [4, 5, 6], None, [7, 8, 9]] - _maps: List[Dict[int, int] | None] = [ + _maps: list[dict[int, int] | None] = [ {1: 2, 3: 4}, None, {5: 6}, @@ -167,7 +164,7 @@ def construct_test_table( }, schema=arrow_schema, ) - metadata_collector: List[Any] = [] + metadata_collector: list[Any] = [] with pa.BufferOutputStream() as f: with pq.ParquetWriter( @@ -422,7 +419,7 @@ def test_column_metrics_mode() -> None: assert 1 not in datafile.upper_bounds -def construct_test_table_primitive_types() -> Tuple[pq.FileMetaData, TableMetadataV1 | TableMetadataV2]: +def construct_test_table_primitive_types() -> tuple[pq.FileMetaData, TableMetadataV1 | TableMetadataV2]: table_metadata = { "format-version": 2, "location": "s3://bucket/test/location", @@ -506,7 +503,7 @@ def construct_test_table_primitive_types() -> Tuple[pq.FileMetaData, TableMetada schema=arrow_schema, ) - metadata_collector: List[Any] = [] + metadata_collector: list[Any] = [] with pa.BufferOutputStream() as f: with pq.ParquetWriter(f, table.schema, metadata_collector=metadata_collector, store_decimal_as_integer=True) as writer: @@ -576,7 +573,7 @@ def test_metrics_primitive_types() -> None: assert not any(key in datafile.upper_bounds.keys() for key in [16, 17, 18]) -def construct_test_table_invalid_upper_bound() -> Tuple[pq.FileMetaData, TableMetadataV1 | TableMetadataV2]: +def construct_test_table_invalid_upper_bound() -> tuple[pq.FileMetaData, TableMetadataV1 | TableMetadataV2]: table_metadata = { "format-version": 2, "location": "s3://bucket/test/location", @@ -618,7 +615,7 @@ def construct_test_table_invalid_upper_bound() -> Tuple[pq.FileMetaData, TableMe schema=arrow_schema, ) - metadata_collector: List[Any] = [] + metadata_collector: list[Any] = [] with pa.BufferOutputStream() as f: with pq.ParquetWriter(f, table.schema, metadata_collector=metadata_collector) as writer: diff --git a/tests/table/test_expire_snapshots.py b/tests/table/test_expire_snapshots.py index d11851f246..106e5b786c 100644 --- a/tests/table/test_expire_snapshots.py +++ 
b/tests/table/test_expire_snapshots.py @@ -16,7 +16,6 @@ # under the License. import threading from datetime import datetime, timedelta -from typing import Dict from unittest.mock import MagicMock, Mock from uuid import uuid4 @@ -253,7 +252,7 @@ def test_thread_safety_fix() -> None: def test_concurrent_operations() -> None: """Test concurrent operations with separate ExpireSnapshots instances.""" - results: Dict[str, set[int]] = {"expire1_snapshots": set(), "expire2_snapshots": set()} + results: dict[str, set[int]] = {"expire1_snapshots": set(), "expire2_snapshots": set()} def worker1() -> None: expire1 = ExpireSnapshots(Mock()) diff --git a/tests/table/test_init.py b/tests/table/test_init.py index 5cc68b62a4..37d7f46e38 100644 --- a/tests/table/test_init.py +++ b/tests/table/test_init.py @@ -18,7 +18,7 @@ import json import uuid from copy import copy -from typing import Any, Dict +from typing import Any import pytest from pydantic import ValidationError @@ -545,7 +545,7 @@ def test_update_column(table_v1: Table, table_v2: Table) -> None: def test_add_primitive_type_column(table_v2: Table) -> None: - primitive_type: Dict[str, PrimitiveType] = { + primitive_type: dict[str, PrimitiveType] = { "boolean": BooleanType(), "int": IntegerType(), "long": LongType(), @@ -1221,7 +1221,7 @@ def test_correct_schema() -> None: assert "Snapshot not found: -1" in str(exc_info.value) -def test_table_properties(example_table_metadata_v2: Dict[str, Any]) -> None: +def test_table_properties(example_table_metadata_v2: dict[str, Any]) -> None: # metadata properties are all strings for k, v in example_table_metadata_v2["properties"].items(): assert isinstance(k, str) @@ -1239,7 +1239,7 @@ def test_table_properties(example_table_metadata_v2: Dict[str, Any]) -> None: assert isinstance(new_metadata.properties["property_name"], str) -def test_table_properties_raise_for_none_value(example_table_metadata_v2: Dict[str, Any]) -> None: +def test_table_properties_raise_for_none_value(example_table_metadata_v2: dict[str, Any]) -> None: property_with_none = {"property_name": None} example_table_metadata_v2 = {**example_table_metadata_v2, "properties": property_with_none} with pytest.raises(ValidationError) as exc_info: diff --git a/tests/table/test_metadata.py b/tests/table/test_metadata.py index 9141189ec5..c163c90626 100644 --- a/tests/table/test_metadata.py +++ b/tests/table/test_metadata.py @@ -19,7 +19,7 @@ import io import json from copy import copy -from typing import Any, Dict +from typing import Any from unittest.mock import MagicMock, patch from uuid import UUID @@ -57,34 +57,34 @@ ) -def test_from_dict_v1(example_table_metadata_v1: Dict[str, Any]) -> None: +def test_from_dict_v1(example_table_metadata_v1: dict[str, Any]) -> None: """Test initialization of a TableMetadata instance from a dictionary""" TableMetadataUtil.parse_obj(example_table_metadata_v1) -def test_from_dict_v1_parse_raw(example_table_metadata_v1: Dict[str, Any]) -> None: +def test_from_dict_v1_parse_raw(example_table_metadata_v1: dict[str, Any]) -> None: """Test initialization of a TableMetadata instance from a str""" TableMetadataUtil.parse_raw(json.dumps(example_table_metadata_v1)) -def test_from_dict_v2(example_table_metadata_v2: Dict[str, Any]) -> None: +def test_from_dict_v2(example_table_metadata_v2: dict[str, Any]) -> None: """Test initialization of a TableMetadata instance from a dictionary""" TableMetadataUtil.parse_obj(example_table_metadata_v2) -def test_from_dict_v2_parse_raw(example_table_metadata_v2: Dict[str, Any]) -> None: +def 
test_from_dict_v2_parse_raw(example_table_metadata_v2: dict[str, Any]) -> None: """Test initialization of a TableMetadata instance from a str""" TableMetadataUtil.parse_raw(json.dumps(example_table_metadata_v2)) -def test_from_byte_stream(example_table_metadata_v2: Dict[str, Any]) -> None: +def test_from_byte_stream(example_table_metadata_v2: dict[str, Any]) -> None: """Test generating a TableMetadata instance from a file-like byte stream""" data = bytes(json.dumps(example_table_metadata_v2), encoding=UTF8) byte_stream = io.BytesIO(data) FromByteStream.table_metadata(byte_stream=byte_stream) -def test_v2_metadata_parsing(example_table_metadata_v2: Dict[str, Any]) -> None: +def test_v2_metadata_parsing(example_table_metadata_v2: dict[str, Any]) -> None: """Test retrieving values from a TableMetadata instance of version 2""" table_metadata = TableMetadataUtil.parse_obj(example_table_metadata_v2) @@ -107,7 +107,7 @@ def test_v2_metadata_parsing(example_table_metadata_v2: Dict[str, Any]) -> None: assert table_metadata.default_sort_order_id == 3 -def test_v1_metadata_parsing_directly(example_table_metadata_v1: Dict[str, Any]) -> None: +def test_v1_metadata_parsing_directly(example_table_metadata_v1: dict[str, Any]) -> None: """Test retrieving values from a TableMetadata instance of version 1""" table_metadata = TableMetadataV1(**example_table_metadata_v1) @@ -138,14 +138,14 @@ def test_v1_metadata_parsing_directly(example_table_metadata_v1: Dict[str, Any]) assert table_metadata.default_sort_order_id == 0 -def test_parsing_correct_types(example_table_metadata_v2: Dict[str, Any]) -> None: +def test_parsing_correct_types(example_table_metadata_v2: dict[str, Any]) -> None: table_metadata = TableMetadataV2(**example_table_metadata_v2) assert isinstance(table_metadata.schemas[0], Schema) assert isinstance(table_metadata.schemas[0].fields[0], NestedField) assert isinstance(table_metadata.schemas[0].fields[0].field_type, LongType) -def test_updating_metadata(example_table_metadata_v2: Dict[str, Any]) -> None: +def test_updating_metadata(example_table_metadata_v2: dict[str, Any]) -> None: """Test creating a new TableMetadata instance that's an updated version of an existing TableMetadata instance""" table_metadata = TableMetadataV2(**example_table_metadata_v2) @@ -170,20 +170,20 @@ def test_updating_metadata(example_table_metadata_v2: Dict[str, Any]) -> None: assert table_metadata.schemas[-1] == Schema(**new_schema) -def test_serialize_v1(example_table_metadata_v1: Dict[str, Any]) -> None: +def test_serialize_v1(example_table_metadata_v1: dict[str, Any]) -> None: table_metadata = TableMetadataV1(**example_table_metadata_v1) table_metadata_json = table_metadata.model_dump_json() expected = 
"""{"location":"s3://bucket/test/location","table-uuid":"d20125c8-7284-442c-9aea-15fee620737c","last-updated-ms":1602638573874,"last-column-id":3,"schemas":[{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true},{"id":2,"name":"y","type":"long","required":true,"doc":"comment"},{"id":3,"name":"z","type":"long","required":true}],"schema-id":0,"identifier-field-ids":[]}],"current-schema-id":0,"partition-specs":[{"spec-id":0,"fields":[{"source-id":1,"field-id":1000,"transform":"identity","name":"x"}]}],"default-spec-id":0,"last-partition-id":1000,"properties":{},"snapshots":[{"snapshot-id":1925,"timestamp-ms":1602638573822,"manifest-list":"s3://bucket/test/manifest-list"}],"snapshot-log":[],"metadata-log":[],"sort-orders":[{"order-id":0,"fields":[]}],"default-sort-order-id":0,"refs":{},"statistics":[],"partition-statistics":[],"format-version":1,"schema":{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true},{"id":2,"name":"y","type":"long","required":true,"doc":"comment"},{"id":3,"name":"z","type":"long","required":true}],"schema-id":0,"identifier-field-ids":[]},"partition-spec":[{"name":"x","transform":"identity","source-id":1,"field-id":1000}]}""" assert table_metadata_json == expected -def test_serialize_v2(example_table_metadata_v2: Dict[str, Any]) -> None: +def test_serialize_v2(example_table_metadata_v2: dict[str, Any]) -> None: table_metadata = TableMetadataV2(**example_table_metadata_v2).model_dump_json() expected = """{"location":"s3://bucket/test/location","table-uuid":"9c12d441-03fe-4693-9a96-a0705ddf69c1","last-updated-ms":1602638573590,"last-column-id":3,"schemas":[{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true}],"schema-id":0,"identifier-field-ids":[]},{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true},{"id":2,"name":"y","type":"long","required":true,"doc":"comment"},{"id":3,"name":"z","type":"long","required":true}],"schema-id":1,"identifier-field-ids":[1,2]}],"current-schema-id":1,"partition-specs":[{"spec-id":0,"fields":[{"source-id":1,"field-id":1000,"transform":"identity","name":"x"}]}],"default-spec-id":0,"last-partition-id":1000,"properties":{"read.split.target.size":"134217728"},"current-snapshot-id":3055729675574597004,"snapshots":[{"snapshot-id":3051729675574597004,"sequence-number":0,"timestamp-ms":1515100955770,"manifest-list":"s3://a/b/1.avro","summary":{"operation":"append"}},{"snapshot-id":3055729675574597004,"parent-snapshot-id":3051729675574597004,"sequence-number":1,"timestamp-ms":1555100955770,"manifest-list":"s3://a/b/2.avro","summary":{"operation":"append"},"schema-id":1}],"snapshot-log":[{"snapshot-id":3051729675574597004,"timestamp-ms":1515100955770},{"snapshot-id":3055729675574597004,"timestamp-ms":1555100955770}],"metadata-log":[{"metadata-file":"s3://bucket/.../v1.json","timestamp-ms":1515100}],"sort-orders":[{"order-id":3,"fields":[{"source-id":2,"transform":"identity","direction":"asc","null-order":"nulls-first"},{"source-id":3,"transform":"bucket[4]","direction":"desc","null-order":"nulls-last"}]}],"default-sort-order-id":3,"refs":{"test":{"snapshot-id":3051729675574597004,"type":"tag","max-ref-age-ms":10000000},"main":{"snapshot-id":3055729675574597004,"type":"branch"}},"statistics":[],"partition-statistics":[],"format-version":2,"last-sequence-number":34}""" assert table_metadata == expected -def test_serialize_v3(example_table_metadata_v3: Dict[str, Any]) -> None: +def test_serialize_v3(example_table_metadata_v3: dict[str, Any]) -> None: # Writing 
will be part of https://github.com/apache/iceberg-python/issues/1551 with pytest.raises(NotImplementedError) as exc_info: @@ -192,7 +192,7 @@ def test_serialize_v3(example_table_metadata_v3: Dict[str, Any]) -> None: assert "Writing V3 is not yet supported, see: https://github.com/apache/iceberg-python/issues/1551" in str(exc_info.value) -def test_migrate_v1_schemas(example_table_metadata_v1: Dict[str, Any]) -> None: +def test_migrate_v1_schemas(example_table_metadata_v1: dict[str, Any]) -> None: table_metadata = TableMetadataV1(**example_table_metadata_v1) assert isinstance(table_metadata, TableMetadataV1) @@ -200,7 +200,7 @@ def test_migrate_v1_schemas(example_table_metadata_v1: Dict[str, Any]) -> None: assert table_metadata.schemas[0] == table_metadata.schema_ -def test_migrate_v1_partition_specs(example_table_metadata_v1: Dict[str, Any]) -> None: +def test_migrate_v1_partition_specs(example_table_metadata_v1: dict[str, Any]) -> None: # Copy the example, and add a spec table_metadata = TableMetadataV1(**example_table_metadata_v1) assert isinstance(table_metadata, TableMetadataV1) @@ -281,7 +281,7 @@ def test_new_table_metadata_with_explicit_v1_format() -> None: assert actual.sort_orders == [expected_sort_order] -def test_invalid_format_version(example_table_metadata_v1: Dict[str, Any]) -> None: +def test_invalid_format_version(example_table_metadata_v1: dict[str, Any]) -> None: """Test the exception when trying to load an unknown version""" example_table_metadata_v22 = copy(example_table_metadata_v1) @@ -449,7 +449,7 @@ def test_invalid_partition_spec() -> None: assert "default-spec-id 1 can't be found" in str(exc_info.value) -def test_v1_writing_metadata(example_table_metadata_v1: Dict[str, Any]) -> None: +def test_v1_writing_metadata(example_table_metadata_v1: dict[str, Any]) -> None: """ https://iceberg.apache.org/spec/#version-2 @@ -464,7 +464,7 @@ def test_v1_writing_metadata(example_table_metadata_v1: Dict[str, Any]) -> None: assert "last-sequence-number" not in metadata_v1 -def test_v1_metadata_for_v2(example_table_metadata_v1: Dict[str, Any]) -> None: +def test_v1_metadata_for_v2(example_table_metadata_v1: dict[str, Any]) -> None: """ https://iceberg.apache.org/spec/#version-2 @@ -548,7 +548,7 @@ def test_v1_write_metadata_for_v2() -> None: assert "partition-spec" not in metadata_v2 -def test_v2_ref_creation(example_table_metadata_v2: Dict[str, Any]) -> None: +def test_v2_ref_creation(example_table_metadata_v2: dict[str, Any]) -> None: table_metadata = TableMetadataV2(**example_table_metadata_v2) assert table_metadata.refs == { "main": SnapshotRef( diff --git a/tests/table/test_puffin.py b/tests/table/test_puffin.py index 2140915389..bf8c82014c 100644 --- a/tests/table/test_puffin.py +++ b/tests/table/test_puffin.py @@ -15,7 +15,6 @@ # specific language governing permissions and limitations # under the License. from os import path -from typing import List import pytest from pyroaring import BitMap @@ -32,7 +31,7 @@ def _open_file(file: str) -> bytes: def test_map_empty() -> None: puffin = _open_file("64mapempty.bin") - expected: List[BitMap] = [] + expected: list[BitMap] = [] actual = _deserialize_bitmap(puffin) assert expected == actual diff --git a/tests/table/test_sorting.py b/tests/table/test_sorting.py index 3efda56509..cb7a2c187a 100644 --- a/tests/table/test_sorting.py +++ b/tests/table/test_sorting.py @@ -16,7 +16,7 @@ # under the License. 
# pylint:disable=redefined-outer-name,eval-used import json -from typing import Any, Dict +from typing import Any import pytest @@ -63,7 +63,7 @@ def test_deserialize_sort_order(sort_order: SortOrder) -> None: assert SortOrder.model_validate_json(payload) == sort_order -def test_sorting_schema(example_table_metadata_v2: Dict[str, Any]) -> None: +def test_sorting_schema(example_table_metadata_v2: dict[str, Any]) -> None: table_metadata = TableMetadataUtil.parse_raw(json.dumps(example_table_metadata_v2)) assert table_metadata.sort_orders == [ diff --git a/tests/test_avro_sanitization.py b/tests/test_avro_sanitization.py index 0ca23e3165..a053bf90ad 100644 --- a/tests/test_avro_sanitization.py +++ b/tests/test_avro_sanitization.py @@ -18,7 +18,7 @@ import tempfile -from typing import Any, Dict +from typing import Any from fastavro import reader @@ -72,7 +72,7 @@ def test_comprehensive_field_name_sanitization() -> None: schema = Schema(NestedField(field_id=1, name=original_name, field_type=StringType(), required=True)) avro_schema: AvroType = AvroSchemaConversion().iceberg_to_avro(schema) - avro_dict: Dict[str, Any] = avro_schema + avro_dict: dict[str, Any] = avro_schema assert avro_dict["fields"][0]["name"] == expected_sanitized @@ -126,7 +126,7 @@ def test_comprehensive_avro_compatibility() -> None: avro_reader = reader(fo) avro_schema: AvroType = avro_reader.writer_schema - avro_dict: Dict[str, Any] = avro_schema + avro_dict: dict[str, Any] = avro_schema field_names = [field["name"] for field in avro_dict["fields"]] # Expected sanitized names (matching Java implementation) @@ -143,7 +143,7 @@ def test_comprehensive_avro_compatibility() -> None: # Verify iceberg-field-name properties for field in avro_dict["fields"]: - field_dict: Dict[str, Any] = field + field_dict: dict[str, Any] = field if field_dict["name"] == "invalid_x2Efield": assert "iceberg-field-name" in field_dict assert field_dict["iceberg-field-name"] == "invalid.field" @@ -201,7 +201,7 @@ def test_emoji_field_name_sanitization() -> None: ) avro_schema: AvroType = AvroSchemaConversion().iceberg_to_avro(schema, schema_name="emoji_test") - avro_dict: Dict[str, Any] = avro_schema + avro_dict: dict[str, Any] = avro_schema field_names = [field["name"] for field in avro_dict["fields"]] expected_field_names = [ @@ -213,7 +213,7 @@ def test_emoji_field_name_sanitization() -> None: assert field_names == expected_field_names for field in avro_dict["fields"]: - field_dict: Dict[str, Any] = field + field_dict: dict[str, Any] = field if field_dict["name"] == "_x1F60E": assert field_dict["iceberg-field-name"] == "😎" elif field_dict["name"] == "_x1F60E_with_text": @@ -240,13 +240,13 @@ def test_emoji_field_name_sanitization() -> None: avro_reader = reader(fo) avro_schema_reader: AvroType = avro_reader.writer_schema - avro_dict_reader: Dict[str, Any] = avro_schema_reader + avro_dict_reader: dict[str, Any] = avro_schema_reader field_names_reader = [field["name"] for field in avro_dict_reader["fields"]] assert field_names_reader == expected_field_names for field in avro_dict_reader["fields"]: - field_dict_reader: Dict[str, Any] = field + field_dict_reader: dict[str, Any] = field if field_dict_reader["name"] == "_x1F60E": assert field_dict_reader["iceberg-field-name"] == "😎" elif field_dict_reader["name"] == "_x1F60E_with_text": diff --git a/tests/test_schema.py b/tests/test_schema.py index e0dba59eaa..589a45c3b4 100644 --- a/tests/test_schema.py +++ b/tests/test_schema.py @@ -16,7 +16,7 @@ # under the License. 
from textwrap import dedent -from typing import Any, Dict, List +from typing import Any import pyarrow as pa import pytest @@ -409,8 +409,8 @@ def test_build_position_accessors(table_schema_nested: Schema) -> None: def test_build_position_accessors_with_struct(table_schema_nested: Schema) -> None: class TestStruct(StructProtocol): - def __init__(self, pos: Dict[int, Any] = EMPTY_DICT): - self._pos: Dict[int, Any] = pos + def __init__(self, pos: dict[int, Any] = EMPTY_DICT): + self._pos: dict[int, Any] = pos def __setitem__(self, pos: int, value: Any) -> None: pass @@ -952,14 +952,14 @@ def test_unknown_type_promotion_to_non_primitive_raises_resolve_error() -> None: @pytest.fixture() -def primitive_fields() -> List[NestedField]: +def primitive_fields() -> list[NestedField]: return [ NestedField(field_id=1, name=str(primitive_type), field_type=primitive_type, required=False) for primitive_type in TEST_PRIMITIVE_TYPES ] -def test_add_top_level_primitives(primitive_fields: List[NestedField], table_v2: Table) -> None: +def test_add_top_level_primitives(primitive_fields: list[NestedField], table_v2: Table) -> None: for primitive_field in primitive_fields: new_schema = Schema(primitive_field) applied = UpdateSchema(transaction=Transaction(table_v2), schema=Schema()).union_by_name(new_schema)._apply() @@ -1025,7 +1025,7 @@ def test_add_nested_primitive(primitive_fields: NestedField, table_v2: Table) -> assert applied.as_struct() == new_schema.as_struct() -def _primitive_fields(types: List[PrimitiveType], start_id: int = 0) -> List[NestedField]: +def _primitive_fields(types: list[PrimitiveType], start_id: int = 0) -> list[NestedField]: fields = [] for iceberg_type in types: fields.append(NestedField(field_id=start_id, name=str(iceberg_type), field_type=iceberg_type, required=False)) diff --git a/tests/test_serializers.py b/tests/test_serializers.py index 3f2bd73e48..53ce6fcd42 100644 --- a/tests/test_serializers.py +++ b/tests/test_serializers.py @@ -18,7 +18,7 @@ import json import os import uuid -from typing import Any, Dict, Tuple +from typing import Any import pytest from pytest_mock import MockFixture @@ -31,7 +31,7 @@ def test_legacy_current_snapshot_id( - mocker: MockFixture, tmp_path_factory: pytest.TempPathFactory, example_table_metadata_no_snapshot_v1: Dict[str, Any] + mocker: MockFixture, tmp_path_factory: pytest.TempPathFactory, example_table_metadata_no_snapshot_v1: dict[str, Any] ) -> None: from pyiceberg.io.pyarrow import PyArrowFileIO @@ -54,7 +54,7 @@ def test_legacy_current_snapshot_id( def test_null_serializer_field() -> None: class ExampleRequest(IcebergBaseModel): - requirements: Tuple[TableRequirement, ...] + requirements: tuple[TableRequirement, ...] request = ExampleRequest(requirements=(AssertRefSnapshotId(ref="main", snapshot_id=None),)) dumped_json = request.model_dump_json() diff --git a/tests/test_types.py b/tests/test_types.py index 6d671e951f..707deb160e 100644 --- a/tests/test_types.py +++ b/tests/test_types.py @@ -16,7 +16,6 @@ # under the License. 
# pylint: disable=W0123,W0613 import pickle -from typing import Type import pydantic_core import pytest @@ -79,7 +78,7 @@ @pytest.mark.parametrize("input_index, input_type", non_parameterized_types) -def test_repr_primitive_types(input_index: int, input_type: Type[PrimitiveType]) -> None: +def test_repr_primitive_types(input_index: int, input_type: type[PrimitiveType]) -> None: assert isinstance(eval(repr(input_type())), input_type) assert input_type == pickle.loads(pickle.dumps(input_type)) @@ -273,7 +272,7 @@ def test_nested_field_primitive_type_as_str() -> None: @pytest.mark.parametrize("input_index,input_type", non_parameterized_types) @pytest.mark.parametrize("check_index,check_type", non_parameterized_types) def test_non_parameterized_type_equality( - input_index: int, input_type: Type[PrimitiveType], check_index: int, check_type: Type[PrimitiveType] + input_index: int, input_type: type[PrimitiveType], check_index: int, check_type: type[PrimitiveType] ) -> None: if input_index == check_index: assert input_type() == check_type() diff --git a/tests/utils/test_bin_packing.py b/tests/utils/test_bin_packing.py index 3bfacdf481..add6e56156 100644 --- a/tests/utils/test_bin_packing.py +++ b/tests/utils/test_bin_packing.py @@ -16,7 +16,6 @@ # under the License. import random -from typing import List import pytest @@ -38,11 +37,11 @@ ), # sparse ], ) -def test_bin_packing(splits: List[int], lookback: int, split_size: int, open_cost: int) -> None: +def test_bin_packing(splits: list[int], lookback: int, split_size: int, open_cost: int) -> None: def weight_func(x: int) -> int: return max(x, open_cost) - item_list_sums: List[int] = [sum(item) for item in PackingIterator(splits, split_size, lookback, weight_func)] + item_list_sums: list[int] = [sum(item) for item in PackingIterator(splits, split_size, lookback, weight_func)] assert all(split_size >= item_sum >= 0 for item_sum in item_list_sums) @@ -80,7 +79,7 @@ def weight_func(x: int) -> int: ], ) def test_bin_packing_lookback( - splits: List[int], target_weight: int, lookback: int, largest_bin_first: bool, expected_lists: List[List[int]] + splits: list[int], target_weight: int, lookback: int, largest_bin_first: bool, expected_lists: list[list[int]] ) -> None: def weight_func(x: int) -> int: return x @@ -123,7 +122,7 @@ def weight_func(x: int) -> int: ], ) def test_reverse_bin_packing_lookback( - splits: List[int], target_weight: int, lookback: int, largest_bin_first: bool, expected_lists: List[List[int]] + splits: list[int], target_weight: int, lookback: int, largest_bin_first: bool, expected_lists: list[list[int]] ) -> None: packer: ListPacker[int] = ListPacker(target_weight, lookback, largest_bin_first) result = packer.pack_end(splits, lambda x: x) diff --git a/tests/utils/test_concurrent.py b/tests/utils/test_concurrent.py index 48039e0c24..f1070a7bf8 100644 --- a/tests/utils/test_concurrent.py +++ b/tests/utils/test_concurrent.py @@ -18,14 +18,14 @@ import multiprocessing import os from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor -from typing import Dict, Generator +from typing import Generator from unittest import mock import pytest from pyiceberg.utils.concurrent import ExecutorFactory -EMPTY_ENV: Dict[str, str | None] = {} +EMPTY_ENV: dict[str, str | None] = {} VALID_ENV = {"PYICEBERG_MAX_WORKERS": "5"} INVALID_ENV = {"PYICEBERG_MAX_WORKERS": "invalid"} diff --git a/tests/utils/test_config.py b/tests/utils/test_config.py index 8953754103..5cd6a7203a 100644 --- a/tests/utils/test_config.py +++ 
b/tests/utils/test_config.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. import os -from typing import Any, Dict +from typing import Any from unittest import mock import pytest @@ -148,7 +148,7 @@ def test_from_configuration_files_get_typed_value(tmp_path_factory: pytest.TempP def test_config_lookup_order( monkeypatch: pytest.MonkeyPatch, tmp_path_factory: pytest.TempPathFactory, - config_setup: Dict[str, Any], + config_setup: dict[str, Any], expected_result: str | None, ) -> None: """ diff --git a/tests/utils/test_manifest.py b/tests/utils/test_manifest.py index df2166fdbf..d12019c9e2 100644 --- a/tests/utils/test_manifest.py +++ b/tests/utils/test_manifest.py @@ -16,7 +16,6 @@ # under the License. # pylint: disable=redefined-outer-name,arguments-renamed,fixme from tempfile import TemporaryDirectory -from typing import Dict from unittest.mock import patch import fastavro @@ -51,7 +50,7 @@ def clear_global_manifests_cache() -> None: _manifest_cache.clear() -def _verify_metadata_with_fastavro(avro_file: str, expected_metadata: Dict[str, str]) -> None: +def _verify_metadata_with_fastavro(avro_file: str, expected_metadata: dict[str, str]) -> None: with open(avro_file, "rb") as f: reader = fastavro.reader(f) metadata = reader.metadata diff --git a/tests/utils/test_schema_conversion.py b/tests/utils/test_schema_conversion.py index eb44dcdff3..44322849d8 100644 --- a/tests/utils/test_schema_conversion.py +++ b/tests/utils/test_schema_conversion.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=W0212 -from typing import Any, Dict +from typing import Any import pytest @@ -40,7 +40,7 @@ from pyiceberg.utils.schema_conversion import AvroSchemaConversion -def test_avro_to_iceberg(avro_schema_manifest_file_v1: Dict[str, Any]) -> None: +def test_avro_to_iceberg(avro_schema_manifest_file_v1: dict[str, Any]) -> None: iceberg_schema = AvroSchemaConversion().avro_to_iceberg(avro_schema_manifest_file_v1) expected_iceberg_schema = Schema( NestedField( @@ -377,14 +377,14 @@ def test_logical_map_with_invalid_fields() -> None: assert "Invalid key-value pair schema:" in str(exc_info.value) -def test_iceberg_to_avro_manifest_list(avro_schema_manifest_file_v1: Dict[str, Any]) -> None: +def test_iceberg_to_avro_manifest_list(avro_schema_manifest_file_v1: dict[str, Any]) -> None: """Round trip the manifest list""" iceberg_schema = AvroSchemaConversion().avro_to_iceberg(avro_schema_manifest_file_v1) avro_result = AvroSchemaConversion().iceberg_to_avro(iceberg_schema, schema_name="manifest_file") assert avro_schema_manifest_file_v1 == avro_result -def test_iceberg_to_avro_manifest(avro_schema_manifest_entry: Dict[str, Any]) -> None: +def test_iceberg_to_avro_manifest(avro_schema_manifest_entry: dict[str, Any]) -> None: """Round trip the manifest itself""" iceberg_schema = AvroSchemaConversion().avro_to_iceberg(avro_schema_manifest_entry) avro_result = AvroSchemaConversion().iceberg_to_avro(iceberg_schema, schema_name="manifest_entry")