From 6d5534633b9ce0bb680678dcd77d9ef069e959f2 Mon Sep 17 00:00:00 2001 From: EkaterinaVoloshina Date: Tue, 28 Nov 2023 12:47:54 +0300 Subject: [PATCH 1/2] fix types --- DPF/dataset_reader.py | 2 +- DPF/processors/processor.py | 12 ++++++------ DPF/processors/writers/sharded_files_writer.py | 6 +++--- DPF/processors/writers/shards_writer.py | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/DPF/dataset_reader.py b/DPF/dataset_reader.py index 04b277e..60ac39a 100644 --- a/DPF/dataset_reader.py +++ b/DPF/dataset_reader.py @@ -13,7 +13,7 @@ ) -def help_reader(filesystem: FileSystem, required_columns: Optional[list[str]], path: str): +def help_reader(filesystem: FileSystem, required_columns: Optional[list], path: str): df = filesystem.read_dataframe(path) if required_columns: diff --git a/DPF/processors/processor.py b/DPF/processors/processor.py index 435b6d6..567b1e3 100644 --- a/DPF/processors/processor.py +++ b/DPF/processors/processor.py @@ -42,7 +42,7 @@ def __setitem__(self, key: str, value: Union[List[str], pd.Series]): self._df[key] = value @abstractmethod - def rename_columns(self, column_map: Dict[str, str], workers: int = 16) -> List[str]: + def rename_columns(self, column_map: dict, workers: int = 16) -> List[str]: pass @abstractmethod @@ -57,7 +57,7 @@ def update_columns(self, columns: List[str], workers: int = 16) -> List[str]: def get_torch_dataset( self, modalities: List[str], - meta_columns: Optional[List[str]] = None, + meta_columns: Optional[list] = None, preprocess_f: Callable[[dict, dict], Any] = default_preprocess, return_none_on_error: bool = False ) -> Dataset: @@ -98,7 +98,7 @@ def validate( def get_random_sample( self, df_filter: Optional[pd.Series] = None - ) -> (Dict[str, bytes], Dict[str, str]): + ) -> (dict, dict): if df_filter: df_to_sample = self.df[df_filter] else: @@ -111,7 +111,7 @@ def get_random_sample( @abstractmethod def _read_files_from_sample( self, - sample: Dict[str, str] + sample: dict ) -> Dict[str, bytes]: pass @@ -170,7 +170,7 @@ def to_sharded_files( max_files_in_shard: int = 1000, datafiles_ext: str = "csv", meta_columns: Optional[List[str]] = None, - keys_mapping: Optional[dict[str, str]] = None, + keys_mapping: Optional[dict] = None, workers: int = 8, pbar: bool = True ): @@ -196,7 +196,7 @@ def to_shards( datafiles_ext: str = "csv", archives_ext: Optional[str] = "tar", meta_columns: Optional[List[str]] = None, - keys_mapping: Optional[dict[str, str]] = None, + keys_mapping: Optional[dict] = None, workers: int = 8, pbar: bool = True ): diff --git a/DPF/processors/writers/sharded_files_writer.py b/DPF/processors/writers/sharded_files_writer.py index d62f820..8e25709 100644 --- a/DPF/processors/writers/sharded_files_writer.py +++ b/DPF/processors/writers/sharded_files_writer.py @@ -18,7 +18,7 @@ def __init__( self, filesystem: FileSystem, destination_dir: str, - keys_mapping: Optional[dict[str, str]] = None, + keys_mapping: Optional = None, max_files_in_shard: int = 1000, datafiles_ext: str = "csv", ) -> None: @@ -63,8 +63,8 @@ def __enter__(self) -> "FileWriter": def __exit__( self, - exception_type: Optional[type], - exception_value: Optional[Exception], + exception_type: Optional, + exception_value: Optional, exception_traceback: traceback, ) -> None: if len(self.df_raw) != 0: diff --git a/DPF/processors/writers/shards_writer.py b/DPF/processors/writers/shards_writer.py index 0a11f88..c024081 100644 --- a/DPF/processors/writers/shards_writer.py +++ b/DPF/processors/writers/shards_writer.py @@ -20,7 +20,7 @@ def __init__( self, filesystem: FileSystem, destination_dir: str, - keys_mapping: Optional[dict[str, str]] = None, + keys_mapping: Optional[dict] = None, max_files_in_shard: Optional[int] = 1000, datafiles_ext: Optional[str] = "csv", archives_ext: Optional[str] = "tar", From b60118f3ac259e5b8a9cdc6e0662c324f2d6baab Mon Sep 17 00:00:00 2001 From: boomb0om Date: Tue, 28 Nov 2023 21:36:17 +0300 Subject: [PATCH 2/2] Fix typehinting --- DPF/dataset_reader.py | 2 +- DPF/processors/processor.py | 12 ++++++------ DPF/processors/writers/sharded_files_writer.py | 6 +++--- DPF/processors/writers/shards_writer.py | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/DPF/dataset_reader.py b/DPF/dataset_reader.py index 60ac39a..848b327 100644 --- a/DPF/dataset_reader.py +++ b/DPF/dataset_reader.py @@ -13,7 +13,7 @@ ) -def help_reader(filesystem: FileSystem, required_columns: Optional[list], path: str): +def help_reader(filesystem: FileSystem, required_columns: Optional[List[str]], path: str): df = filesystem.read_dataframe(path) if required_columns: diff --git a/DPF/processors/processor.py b/DPF/processors/processor.py index 567b1e3..8a2b3fc 100644 --- a/DPF/processors/processor.py +++ b/DPF/processors/processor.py @@ -42,7 +42,7 @@ def __setitem__(self, key: str, value: Union[List[str], pd.Series]): self._df[key] = value @abstractmethod - def rename_columns(self, column_map: dict, workers: int = 16) -> List[str]: + def rename_columns(self, column_map: Dict[str, str], workers: int = 16) -> List[str]: pass @abstractmethod @@ -57,7 +57,7 @@ def update_columns(self, columns: List[str], workers: int = 16) -> List[str]: def get_torch_dataset( self, modalities: List[str], - meta_columns: Optional[list] = None, + meta_columns: Optional[List[str]] = None, preprocess_f: Callable[[dict, dict], Any] = default_preprocess, return_none_on_error: bool = False ) -> Dataset: @@ -98,7 +98,7 @@ def validate( def get_random_sample( self, df_filter: Optional[pd.Series] = None - ) -> (dict, dict): + ) -> (Dict[str, bytes], Dict[str, str]): if df_filter: df_to_sample = self.df[df_filter] else: @@ -111,7 +111,7 @@ def get_random_sample( @abstractmethod def _read_files_from_sample( self, - sample: dict + sample: Dict[str, str] ) -> Dict[str, bytes]: pass @@ -170,7 +170,7 @@ def to_sharded_files( max_files_in_shard: int = 1000, datafiles_ext: str = "csv", meta_columns: Optional[List[str]] = None, - keys_mapping: Optional[dict] = None, + keys_mapping: Optional[Dict[str, str]] = None, workers: int = 8, pbar: bool = True ): @@ -196,7 +196,7 @@ def to_shards( datafiles_ext: str = "csv", archives_ext: Optional[str] = "tar", meta_columns: Optional[List[str]] = None, - keys_mapping: Optional[dict] = None, + keys_mapping: Optional[Dict[str, str]] = None, workers: int = 8, pbar: bool = True ): diff --git a/DPF/processors/writers/sharded_files_writer.py b/DPF/processors/writers/sharded_files_writer.py index 8e25709..036bc9d 100644 --- a/DPF/processors/writers/sharded_files_writer.py +++ b/DPF/processors/writers/sharded_files_writer.py @@ -18,7 +18,7 @@ def __init__( self, filesystem: FileSystem, destination_dir: str, - keys_mapping: Optional = None, + keys_mapping: Optional[Dict[str, str]] = None, max_files_in_shard: int = 1000, datafiles_ext: str = "csv", ) -> None: @@ -63,8 +63,8 @@ def __enter__(self) -> "FileWriter": def __exit__( self, - exception_type: Optional, - exception_value: Optional, + exception_type: Optional[type], + exception_value: Optional[Exception], exception_traceback: traceback, ) -> None: if len(self.df_raw) != 0: diff --git a/DPF/processors/writers/shards_writer.py b/DPF/processors/writers/shards_writer.py index c024081..30c167b 100644 --- a/DPF/processors/writers/shards_writer.py +++ b/DPF/processors/writers/shards_writer.py @@ -20,7 +20,7 @@ def __init__( self, filesystem: FileSystem, destination_dir: str, - keys_mapping: Optional[dict] = None, + keys_mapping: Optional[Dict[str, str]] = None, max_files_in_shard: Optional[int] = 1000, datafiles_ext: Optional[str] = "csv", archives_ext: Optional[str] = "tar",