This repository has been archived by the owner on Dec 16, 2022. It is now read-only.

Making Token class a "slots" class (#4312)
* ensure linting and typechecking ran on all code

* make Token a __slots__ class

* add benchmarks

* update CHANGELOG

* fix test with custom token subclass

* Update allennlp/data/tokenizers/token.py

Co-authored-by: Matt Gardner <mattg@allenai.org>

Co-authored-by: Matt Gardner <mattg@allenai.org>
epwalsh and matt-gardner committed Jun 2, 2020
1 parent 32bccfb commit 11a08ae
Showing 12 changed files with 115 additions and 21 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -33,6 +33,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- Similar to our caching mechanism, we introduced a lock file to the vocab to avoid race
conditions when saving/loading the vocab from/to the same serialization directory in different processes.
- Changed the `Token` class to a "slots" class, which dramatically reduces the size in memory of `Token` instances.

## [v1.0.0rc5](https://github.com/allenai/allennlp/releases/tag/v1.0.0rc5) - 2020-05-26

10 changes: 7 additions & 3 deletions Makefile
@@ -42,15 +42,15 @@ check-for-cuda :

.PHONY : lint
lint :
flake8 ./scripts ./tests $(SRC)
flake8 .

.PHONY : format
format :
black --check ./scripts ./tests $(SRC)
black --check .

.PHONY : typecheck
typecheck :
mypy $(SRC) \
mypy . \
--ignore-missing-imports \
--no-strict-optional \
--no-site-packages \
@@ -71,6 +71,10 @@ test-with-cov :
gpu-test :
pytest --color=yes -v -rf -m gpu

.PHONY : benchmarks
benchmarks :
pytest -c benchmarks/pytest.ini benchmarks/

#
# Setup helpers
#
63 changes: 52 additions & 11 deletions allennlp/data/tokenizers/token.py
@@ -2,7 +2,7 @@
from typing import Optional


@dataclass
@dataclass(init=False, repr=False)
class Token:
"""
A simple token representation, keeping track of the token's text, offset in the passage it was
@@ -40,16 +40,57 @@ class Token:
    added, similar to spacy's `lex_id`.
    """

    text: Optional[str] = None
    idx: Optional[int] = None
    idx_end: Optional[int] = None
    lemma_: Optional[str] = None
    pos_: Optional[str] = None
    tag_: Optional[str] = None
    dep_: Optional[str] = None
    ent_type_: Optional[str] = None
    text_id: Optional[int] = None
    type_id: Optional[int] = None
    __slots__ = [
        "text",
        "idx",
        "idx_end",
        "lemma_",
        "pos_",
        "tag_",
        "dep_",
        "ent_type_",
        "text_id",
        "type_id",
    ]
    # Defining the `__slots__` of this class is an optimization that dramatically reduces
    # the size in memory of a `Token` instance. The downside of using `__slots__`
    # with a dataclass is that you can't assign default values at the class level,
    # which is why we need a custom `__init__` function that provides the default values.

    text: Optional[str]
    idx: Optional[int]
    idx_end: Optional[int]
    lemma_: Optional[str]
    pos_: Optional[str]
    tag_: Optional[str]
    dep_: Optional[str]
    ent_type_: Optional[str]
    text_id: Optional[int]
    type_id: Optional[int]

    def __init__(
        self,
        text: str = None,
        idx: int = None,
        idx_end: int = None,
        lemma_: str = None,
        pos_: str = None,
        tag_: str = None,
        dep_: str = None,
        ent_type_: str = None,
        text_id: int = None,
        type_id: int = None,
    ) -> None:
        self.text = text
        self.idx = idx
        self.idx_end = idx_end
        self.lemma_ = lemma_
        self.pos_ = pos_
        self.tag_ = tag_
        self.dep_ = dep_
        self.ent_type_ = ent_type_
        self.text_id = text_id
        self.type_id = type_id

    def __str__(self):
        return self.text
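As an aside (this snippet is not part of the commit): the memory saving comes from dropping the per-instance `__dict__`. A minimal, illustrative sketch of the effect, using hypothetical `DictToken`/`SlotsToken` stand-ins rather than the real class:

# Illustrative sketch only; exact byte counts vary by Python version and platform.
import sys


class DictToken:
    # Attributes live in a per-instance __dict__.
    def __init__(self, text=None, idx=None):
        self.text = text
        self.idx = idx


class SlotsToken:
    # Attributes live in fixed slots; no per-instance __dict__ is allocated.
    __slots__ = ["text", "idx"]

    def __init__(self, text=None, idx=None):
        self.text = text
        self.idx = idx


dict_token = DictToken("hello", 0)
slots_token = SlotsToken("hello", 0)

# The dict-based instance pays for the object plus its attribute dict.
print(sys.getsizeof(dict_token) + sys.getsizeof(dict_token.__dict__))
# The slotted instance is a single, smaller object.
print(sys.getsizeof(slots_token))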
Empty file added benchmarks/__init__.py
Empty file.
Empty file added benchmarks/data/__init__.py
Empty file.
Empty file added benchmarks/data/tokenizers/__init__.py
Empty file.
16 changes: 16 additions & 0 deletions benchmarks/data/tokenizers/character_tokenizer_bench.py
@@ -0,0 +1,16 @@
from allennlp.data.tokenizers import CharacterTokenizer


tokenizer = CharacterTokenizer()
passage = (
"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor "
"incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis "
"nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. "
"Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu "
"fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in "
"culpa qui officia deserunt mollit anim id est laborum."
)


def bench_character_tokenizer(benchmark):
benchmark(tokenizer.tokenize, passage)
8 changes: 8 additions & 0 deletions benchmarks/pytest.ini
@@ -0,0 +1,8 @@
# We use pytest to run benchmarks, which is weird, but so far the best benchmarking
# framework we've found is only available as a pytest plugin.
# That said, we like to organize our benchmarks separately and with different naming
# conventions from our tests, which requires using a separate pytest configuration.
[pytest]
python_files = *_bench.py
python_functions = bench_* *_bench
python_classes =
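To illustrate these conventions (the file below is hypothetical, not one of the files added in this commit): a benchmark gets collected because its file name ends in `_bench.py` and its function name starts with `bench_`, and the `benchmark` fixture comes from pytest-benchmark.

# benchmarks/data/tokenizers/token_bench.py (hypothetical example)
from allennlp.data.tokenizers import Token


def bench_token_construction(benchmark):
    # pytest-benchmark passes the `benchmark` fixture, which times the callable.
    benchmark(Token, text="benchmark", idx=0, idx_end=9)

Such a file would then be picked up by `make benchmarks`, i.e. `pytest -c benchmarks/pytest.ini benchmarks/`.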
3 changes: 3 additions & 0 deletions dev-requirements.txt
@@ -28,6 +28,9 @@ responses>=0.7
# For running tests that aren't 100% reliable.
flaky

# For running benchmarks.
pytest-benchmark

#### DOC-RELATED PACKAGES ####

# YAML manipulation
13 changes: 11 additions & 2 deletions setup.py
@@ -15,7 +15,7 @@

# version.py defines the VERSION and VERSION_SHORT variables.
# We use exec here so we don't import allennlp whilst setting up.
VERSION = {}
VERSION = {} # type: ignore
with open("allennlp/version.py", "r") as version_file:
    exec(version_file.read(), VERSION)

@@ -38,7 +38,16 @@
    author_email="allennlp@allenai.org",
    license="Apache",
    packages=find_packages(
        exclude=["*.tests", "*.tests.*", "tests.*", "tests", "test_fixtures", "test_fixtures.*"]
        exclude=[
            "*.tests",
            "*.tests.*",
            "tests.*",
            "tests",
            "test_fixtures",
            "test_fixtures.*",
            "benchmarks",
            "benchmarks.*",
        ]
    ),
    install_requires=[
        "torch>=1.5.0,<1.6.0",
8 changes: 4 additions & 4 deletions tests/commands/train_test.py
@@ -32,10 +32,10 @@

@BatchCallback.register("training_data_logger")
class TrainingDataLoggerBatchCallback(BatchCallback):
    def __call__(
    def __call__(  # type: ignore
        self,
        trainer: "GradientDescentTrainer",
        batch_inputs: List[List[TensorDict]],
        batch_inputs: List[TensorDict],
        batch_outputs: List[Dict[str, Any]],
        epoch: int,
        batch_number: int,
@@ -46,7 +46,7 @@ def __call__(
        logger = logging.getLogger(__name__)
        for batch in batch_inputs:
            for metadata in batch["metadata"]:
                logger.info(f"First word from training data: '{metadata['words'][0]}'")
                logger.info(f"First word from training data: '{metadata['words'][0]}'")  # type: ignore


class TestTrain(AllenNlpTestCase):
@@ -311,7 +311,7 @@ def test_train_model_distributed_without_sharded_reader(self, lazy: bool):
        import re

        pattern = re.compile(r"First word from training data: '([^']*)'")
        first_word_counts = Counter()
        first_word_counts = Counter()  # type: ignore
        with open(os.path.join(out_dir, "stdout_worker0.log")) as f:
            worker0_log = f.read()
            assert train_complete in worker0_log
14 changes: 13 additions & 1 deletion tests/data/token_indexers/single_id_token_indexer_test.py
@@ -1,4 +1,5 @@
from collections import defaultdict
from dataclasses import dataclass

import pytest

@@ -8,6 +9,17 @@
from allennlp.data.tokenizers import SpacyTokenizer


@dataclass(init=False)
class TokenWithStyle(Token):
__slots__ = ["is_bold"]

is_bold: bool

def __init__(self, text: str = None, is_bold: bool = False):
super().__init__(text=text)
self.is_bold = is_bold


class TestSingleIdTokenIndexer(AllenNlpTestCase):
def test_count_vocab_items_respects_casing(self):
indexer = SingleIdTokenIndexer("words")
@@ -30,7 +42,7 @@ def test_as_array_produces_token_sequence(self):
    def test_count_other_features(self):
        indexer = SingleIdTokenIndexer("other_features", feature_name="is_bold")
        counter = defaultdict(lambda: defaultdict(int))
        token = Token("Header")
        token = TokenWithStyle("Header")
        token.is_bold = "True"
        indexer.count_vocab_items(token, counter)
        assert counter["other_features"] == {"True": 1}
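The `TokenWithStyle` subclass above is needed because a `__slots__` class rejects attributes that are not declared slots. Roughly (an illustrative snippet, not part of the test file):

from allennlp.data.tokenizers import Token

token = Token("Header")
try:
    token.is_bold = "True"  # "is_bold" is not in Token.__slots__, so this raises
except AttributeError as err:
    print(f"AttributeError: {err}")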
