Skip to content

Commit

Permalink
Add config options for label preprocessing (Refs #5)
Browse files Browse the repository at this point in the history
- Expose normalize_unicode parameter of LmdbDataset
- Add remove_whitespace flag to allow disabling whitespace removal in labels
  • Loading branch information
baudm committed Jul 28, 2022
1 parent 98959c9 commit e8ea463
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 7 deletions.
6 changes: 4 additions & 2 deletions configs/main.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,15 @@ data:
_target_: strhub.data.module.SceneTextDataModule
root_dir: data
train_dir: ???
batch_size: ${model.batch_size}
img_size: ${model.img_size}
charset_train: ${model.charset_train}
charset_test: ${model.charset_test}
max_label_length: ${model.max_label_length}
batch_size: ${model.batch_size}
num_workers: 2
remove_whitespace: true
normalize_unicode: true
augment: true
num_workers: 2

trainer:
_target_: pytorch_lightning.Trainer
Expand Down
9 changes: 6 additions & 3 deletions strhub/data/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,12 +55,14 @@ class LmdbDataset(Dataset):
"""

def __init__(self, root: str, charset: str, max_label_len: int, min_image_dim: int = 0,
normalize_unicode: bool = True, unlabelled: bool = False, transform: Optional[Callable] = None,
remove_whitespace: bool = True, normalize_unicode: bool = True,
unlabelled: bool = False, transform: Optional[Callable] = None,
num_workers: int = 1):
self.env = lmdb.open(root, max_readers=num_workers, max_spare_txns=num_workers,
readonly=True, create=False, readahead=False, meminit=False, lock=False)
self.max_label_len = max_label_len
self.min_image_dim = min_image_dim
self.remove_whitespace = remove_whitespace
self.normalize_unicode = normalize_unicode
self.unlabelled = unlabelled
self.transform = transform
Expand All @@ -81,8 +83,9 @@ def _preprocess_labels(self, charset):
index += 1 # lmdb starts with 1
label_key = f'label-{index:09d}'.encode()
label = txn.get(label_key).decode()
# There shouldn't be any whitespace in the labels but try to remove them for good measure
label = ''.join(label.split())
# Normally, whitespace is removed from the labels.
if self.remove_whitespace:
label = ''.join(label.split())
# Normalize unicode composites (if any) and convert to compatible ASCII characters
if self.normalize_unicode:
label = unicodedata.normalize('NFKD', label).encode('ascii', 'ignore').decode()
Expand Down
7 changes: 6 additions & 1 deletion strhub/data/module.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ class SceneTextDataModule(pl.LightningDataModule):

def __init__(self, root_dir: str, train_dir: str, img_size: Sequence[int], max_label_length: int,
charset_train: str, charset_test: str, batch_size: int, num_workers: int, augment: bool,
remove_whitespace: bool = True, normalize_unicode: bool = True,
min_image_dim: int = 0, rotation: int = 0, collate_fn: Optional[Callable] = None):
super().__init__()
self.root_dir = root_dir
Expand All @@ -42,6 +43,8 @@ def __init__(self, root_dir: str, train_dir: str, img_size: Sequence[int], max_l
self.batch_size = batch_size
self.num_workers = num_workers
self.augment = augment
self.remove_whitespace = remove_whitespace
self.normalize_unicode = normalize_unicode
self.min_image_dim = min_image_dim
self.rotation = rotation
self.collate_fn = collate_fn
Expand Down Expand Up @@ -69,7 +72,7 @@ def train_dataset(self):
transform = self.get_transform(self.img_size, self.augment)
root = PurePath(self.root_dir, 'train', self.train_dir)
self._train_dataset = build_tree_dataset(root, self.charset_train, self.max_label_length,
self.min_image_dim,
self.min_image_dim, self.remove_whitespace, self.normalize_unicode,
transform=transform, num_workers=self.num_workers)
return self._train_dataset

Expand All @@ -79,6 +82,7 @@ def val_dataset(self):
transform = self.get_transform(self.img_size)
root = PurePath(self.root_dir, 'val')
self._val_dataset = build_tree_dataset(root, self.charset_test, self.max_label_length,
self.min_image_dim, self.remove_whitespace, self.normalize_unicode,
transform=transform, num_workers=self.num_workers)
return self._val_dataset

Expand All @@ -96,6 +100,7 @@ def test_dataloaders(self, subset):
transform = self.get_transform(self.img_size, rotation=self.rotation)
root = PurePath(self.root_dir, 'test')
datasets = {s: LmdbDataset(str(root.joinpath(s)), self.charset_test, self.max_label_length,
self.min_image_dim, self.remove_whitespace, self.normalize_unicode,
transform=transform) for s in subset}
return {k: DataLoader(v, batch_size=self.batch_size, num_workers=self.num_workers,
pin_memory=True, collate_fn=self.collate_fn)
Expand Down
2 changes: 1 addition & 1 deletion test.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def main():
model.freeze() # disable autograd
hp = model.hparams
datamodule = SceneTextDataModule('data', '_unused_', hp.img_size, hp.max_label_length, hp.charset_train,
hp.charset_test, args.batch_size, args.num_workers, False, args.rotation)
hp.charset_test, args.batch_size, args.num_workers, False, rotation=args.rotation)

test_set = SceneTextDataModule.TEST_BENCHMARK_SUB + SceneTextDataModule.TEST_BENCHMARK
if args.new:
Expand Down

0 comments on commit e8ea463

Please sign in to comment.