This repository has been archived by the owner on Dec 16, 2022. It is now read-only.

Making Token class a "slots" class (#4312)
* ensure linting and typechecking ran on all code

* make Token a __slots__ class

* add benchmarks

* update CHANGELOG

* fix test with custom token subclass

* Update allennlp/data/tokenizers/token.py

Co-authored-by: Matt Gardner <mattg@allenai.org>

Co-authored-by: Matt Gardner <mattg@allenai.org>
epwalsh and matt-gardner committed Jun 2, 2020
1 parent 32bccfb commit 11a08ae
Showing 12 changed files with 115 additions and 21 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -33,6 +33,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- Similar to our caching mechanism, we introduced a lock file to the vocab to avoid race
conditions when saving/loading the vocab from/to the same serialization directory in different processes.
- Changed the `Token` class to a "slots" class, which dramatically reduces the size in memory of `Token` instances.

## [v1.0.0rc5](https://github.com/allenai/allennlp/releases/tag/v1.0.0rc5) - 2020-05-26

10 changes: 7 additions & 3 deletions Makefile
@@ -42,15 +42,15 @@ check-for-cuda :

.PHONY : lint
lint :
flake8 ./scripts ./tests $(SRC)
flake8 .

.PHONY : format
format :
black --check ./scripts ./tests $(SRC)
black --check .

.PHONY : typecheck
typecheck :
mypy $(SRC) \
mypy . \
--ignore-missing-imports \
--no-strict-optional \
--no-site-packages \
@@ -71,6 +71,10 @@ test-with-cov :
gpu-test :
pytest --color=yes -v -rf -m gpu

.PHONY : benchmarks
benchmarks :
pytest -c benchmarks/pytest.ini benchmarks/

#
# Setup helpers
#
63 changes: 52 additions & 11 deletions allennlp/data/tokenizers/token.py
@@ -2,7 +2,7 @@
from typing import Optional


@dataclass
@dataclass(init=False, repr=False)
class Token:
"""
A simple token representation, keeping track of the token's text, offset in the passage it was
@@ -40,16 +40,57 @@ class Token:
    added, similar to spacy's `lex_id`.
    """

    text: Optional[str] = None
    idx: Optional[int] = None
    idx_end: Optional[int] = None
    lemma_: Optional[str] = None
    pos_: Optional[str] = None
    tag_: Optional[str] = None
    dep_: Optional[str] = None
    ent_type_: Optional[str] = None
    text_id: Optional[int] = None
    type_id: Optional[int] = None
    __slots__ = [
        "text",
        "idx",
        "idx_end",
        "lemma_",
        "pos_",
        "tag_",
        "dep_",
        "ent_type_",
        "text_id",
        "type_id",
    ]
    # Defining the `__slots__` of this class is an optimization that dramatically reduces
    # the size in memory of a `Token` instance. The downside of using `__slots__`
    # with a dataclass is that you can't assign default values at the class level,
    # which is why we need a custom `__init__` function that provides the default values.

    text: Optional[str]
    idx: Optional[int]
    idx_end: Optional[int]
    lemma_: Optional[str]
    pos_: Optional[str]
    tag_: Optional[str]
    dep_: Optional[str]
    ent_type_: Optional[str]
    text_id: Optional[int]
    type_id: Optional[int]

    def __init__(
        self,
        text: str = None,
        idx: int = None,
        idx_end: int = None,
        lemma_: str = None,
        pos_: str = None,
        tag_: str = None,
        dep_: str = None,
        ent_type_: str = None,
        text_id: int = None,
        type_id: int = None,
    ) -> None:
        self.text = text
        self.idx = idx
        self.idx_end = idx_end
        self.lemma_ = lemma_
        self.pos_ = pos_
        self.tag_ = tag_
        self.dep_ = dep_
        self.ent_type_ = ent_type_
        self.text_id = text_id
        self.type_id = type_id

    def __str__(self):
        return self.text
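As an aside (this snippet is not part of the commit): the memory saving comes from dropping the per-instance `__dict__`. A minimal, illustrative sketch of the effect, using hypothetical `DictToken`/`SlotsToken` stand-ins rather than the real class:

# Illustrative sketch only; exact byte counts vary by Python version and platform.
import sys


class DictToken:
    # Attributes live in a per-instance __dict__.
    def __init__(self, text=None, idx=None):
        self.text = text
        self.idx = idx


class SlotsToken:
    # Attributes live in fixed slots; no per-instance __dict__ is allocated.
    __slots__ = ["text", "idx"]

    def __init__(self, text=None, idx=None):
        self.text = text
        self.idx = idx


dict_token = DictToken("hello", 0)
slots_token = SlotsToken("hello", 0)

# The dict-based instance pays for the object plus its attribute dict.
print(sys.getsizeof(dict_token) + sys.getsizeof(dict_token.__dict__))
# The slotted instance is a single, smaller object.
print(sys.getsizeof(slots_token))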
Empty file added benchmarks/__init__.py
Empty file.
Empty file added benchmarks/data/__init__.py
Empty file.
Empty file added benchmarks/data/tokenizers/__init__.py
Empty file.
16 changes: 16 additions & 0 deletions benchmarks/data/tokenizers/character_tokenizer_bench.py
@@ -0,0 +1,16 @@
from allennlp.data.tokenizers import CharacterTokenizer


tokenizer = CharacterTokenizer()
passage = (
"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor "
"incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis "
"nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. "
"Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu "
"fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in "
"culpa qui officia deserunt mollit anim id est laborum."
)


def bench_character_tokenizer(benchmark):
benchmark(tokenizer.tokenize, passage)
8 changes: 8 additions & 0 deletions benchmarks/pytest.ini
@@ -0,0 +1,8 @@
# We use pytest to run benchmarks, which is weird, but so far the best benchmarking
# framework we've found is only available as a pytest plugin.
# That said, we like to organize our benchmarks separately and with different naming
# conventions from our tests, which requires using a separate pytest configuration.
[pytest]
python_files = *_bench.py
python_functions = bench_* *_bench
python_classes =
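To illustrate these conventions (the file below is hypothetical, not one of the files added in this commit): a benchmark gets collected because its file name ends in `_bench.py` and its function name starts with `bench_`, and the `benchmark` fixture comes from pytest-benchmark.

# benchmarks/data/tokenizers/token_bench.py (hypothetical example)
from allennlp.data.tokenizers import Token


def bench_token_construction(benchmark):
    # pytest-benchmark passes the `benchmark` fixture, which times the callable.
    benchmark(Token, text="benchmark", idx=0, idx_end=9)

Such a file would then be picked up by `make benchmarks`, i.e. `pytest -c benchmarks/pytest.ini benchmarks/`.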
3 changes: 3 additions & 0 deletions dev-requirements.txt
@@ -28,6 +28,9 @@ responses>=0.7
# For running tests that aren't 100% reliable.
flaky

# For running benchmarks.
pytest-benchmark

#### DOC-RELATED PACKAGES ####

# YAML manipulation
13 changes: 11 additions & 2 deletions setup.py
@@ -15,7 +15,7 @@

# version.py defines the VERSION and VERSION_SHORT variables.
# We use exec here so we don't import allennlp whilst setting up.
VERSION = {}
VERSION = {} # type: ignore
with open("allennlp/version.py", "r") as version_file:
    exec(version_file.read(), VERSION)

@@ -38,7 +38,16 @@
    author_email="allennlp@allenai.org",
    license="Apache",
    packages=find_packages(
        exclude=["*.tests", "*.tests.*", "tests.*", "tests", "test_fixtures", "test_fixtures.*"]
        exclude=[
            "*.tests",
            "*.tests.*",
            "tests.*",
            "tests",
            "test_fixtures",
            "test_fixtures.*",
            "benchmarks",
            "benchmarks.*",
        ]
    ),
    install_requires=[
        "torch>=1.5.0,<1.6.0",
8 changes: 4 additions & 4 deletions tests/commands/train_test.py
@@ -32,10 +32,10 @@

@BatchCallback.register("training_data_logger")
class TrainingDataLoggerBatchCallback(BatchCallback):
    def __call__(
    def __call__(  # type: ignore
        self,
        trainer: "GradientDescentTrainer",
        batch_inputs: List[List[TensorDict]],
        batch_inputs: List[TensorDict],
        batch_outputs: List[Dict[str, Any]],
        epoch: int,
        batch_number: int,
@@ -46,7 +46,7 @@ def __call__(
        logger = logging.getLogger(__name__)
        for batch in batch_inputs:
            for metadata in batch["metadata"]:
                logger.info(f"First word from training data: '{metadata['words'][0]}'")
                logger.info(f"First word from training data: '{metadata['words'][0]}'")  # type: ignore


class TestTrain(AllenNlpTestCase):
@@ -311,7 +311,7 @@ def test_train_model_distributed_without_sharded_reader(self, lazy: bool):
        import re

        pattern = re.compile(r"First word from training data: '([^']*)'")
        first_word_counts = Counter()
        first_word_counts = Counter()  # type: ignore
        with open(os.path.join(out_dir, "stdout_worker0.log")) as f:
            worker0_log = f.read()
            assert train_complete in worker0_log
14 changes: 13 additions & 1 deletion tests/data/token_indexers/single_id_token_indexer_test.py
@@ -1,4 +1,5 @@
from collections import defaultdict
from dataclasses import dataclass

import pytest

@@ -8,6 +9,17 @@
from allennlp.data.tokenizers import SpacyTokenizer


@dataclass(init=False)
class TokenWithStyle(Token):
__slots__ = ["is_bold"]

is_bold: bool

def __init__(self, text: str = None, is_bold: bool = False):
super().__init__(text=text)
self.is_bold = is_bold


class TestSingleIdTokenIndexer(AllenNlpTestCase):
def test_count_vocab_items_respects_casing(self):
indexer = SingleIdTokenIndexer("words")
@@ -30,7 +42,7 @@ def test_as_array_produces_token_sequence(self):
    def test_count_other_features(self):
        indexer = SingleIdTokenIndexer("other_features", feature_name="is_bold")
        counter = defaultdict(lambda: defaultdict(int))
        token = Token("Header")
        token = TokenWithStyle("Header")
        token.is_bold = "True"
        indexer.count_vocab_items(token, counter)
        assert counter["other_features"] == {"True": 1}
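The `TokenWithStyle` subclass above is needed because a `__slots__` class rejects attributes that are not declared slots. Roughly (an illustrative snippet, not part of the test file):

from allennlp.data.tokenizers import Token

token = Token("Header")
try:
    token.is_bold = "True"  # "is_bold" is not in Token.__slots__, so this raises
except AttributeError as err:
    print(f"AttributeError: {err}")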
