diff --git a/Dockerfile b/Dockerfile
index 4fb465202e2..419dfb7d301 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -49,6 +49,7 @@
 COPY allennlp/ allennlp/
 COPY tests/ tests/
 COPY pytest.ini pytest.ini
 COPY scripts/ scripts/
+COPY tutorials/ tutorials/
 # Run tests to verify the Docker build
 RUN PYTHONDONTWRITEBYTECODE=1 pytest
diff --git a/allennlp/data/fields/label_field.py b/allennlp/data/fields/label_field.py
index 81d4454597b..66e1f40da3c 100644
--- a/allennlp/data/fields/label_field.py
+++ b/allennlp/data/fields/label_field.py
@@ -42,9 +42,9 @@ def __init__(self,
         self.label = label
         self._label_namespace = label_namespace
         self._label_id = None
-        if not self._label_namespace.endswith("labels"):
+        if not (self._label_namespace.endswith("labels") or self._label_namespace.endswith("tags")):
             logger.warning("Your label namespace was '%s'. We recommend you use a namespace "
-                           "ending with 'labels', so we don't add UNK and PAD tokens by "
+                           "ending with 'labels' or 'tags', so we don't add UNK and PAD tokens by "
                            "default to your vocabulary. See documentation for "
                            "`non_padded_namespaces` parameter in Vocabulary.", self._label_namespace)
         if skip_indexing:
diff --git a/tests/notebooks_test.py b/tests/notebooks_test.py
index e7d3f452bf2..8240810eb06 100644
--- a/tests/notebooks_test.py
+++ b/tests/notebooks_test.py
@@ -1,5 +1,4 @@
 import os
-import pytest
 
 import nbformat
 from nbconvert.preprocessors.execute import CellExecutionError
@@ -7,13 +6,13 @@
 from allennlp.common.testing import AllenNlpTestCase
 
 
-# This test started failing in the Docker build of
-# https://github.com/allenai/allennlp/commit/cb2913d52765ba3d63a0c85b3da92d4e01871d8d
-@pytest.mark.skip(reason="this test throws a low-level C exception in our Docker build")
 class TestNotebooks(AllenNlpTestCase):
     def test_vocabulary_tutorial(self):
         assert self.execute_notebook("tutorials/notebooks/vocabulary.ipynb")
 
+    def test_data_pipeline_tutorial(self):
+        assert self.execute_notebook("tutorials/notebooks/data_pipeline.ipynb")
+
     @staticmethod
     def execute_notebook(notebook_path: str):
         with open(notebook_path, encoding='utf-8') as notebook:
diff --git a/tutorials/notebooks/data_pipeline.ipynb b/tutorials/notebooks/data_pipeline.ipynb
new file mode 100644
index 00000000000..f5717c16c02
--- /dev/null
+++ b/tutorials/notebooks/data_pipeline.ipynb
@@ -0,0 +1,316 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "collapsed": true
+   },
+   "source": [
+    "\n",
+    "AllenNLP uses a hierarchical system of data structures to represent a Dataset, which allows for easy padding, batching and iteration. This tutorial will cover some of the basic concepts.\n",
+    "\n",
+    "\n",
+    "At a high level, we use `DatasetReaders` (we'll sketch a minimal one at the end of this tutorial) to read a particular dataset into a `Dataset` of self-contained individual `Instances`, \n",
+    "which are made up of a dictionary of named `Fields`. There are many types of `Fields` which are useful for different types of data, such as `TextField` for sentences, or `LabelField` for representing a categorical class label. Users who are familiar with the `torchtext` library from `PyTorch` will find a similar abstraction here. \n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# This cell just makes sure the library paths are correct. \n",
\n", + "# You need to run this cell before you run the rest of this\n", + "# tutorial, but you can ignore the contents!\n", + "import os\n", + "import sys\n", + "module_path = os.path.abspath(os.path.join('../..'))\n", + "if module_path not in sys.path:\n", + " sys.path.append(module_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's create two of the most common `Fields`, imagining we are preparing some data for a sentiment analysis model. " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['This', 'movie', 'was', 'awful', '!']\n", + "negative\n" + ] + } + ], + "source": [ + "from allennlp.data.fields import TextField, LabelField\n", + "from allennlp.data.token_indexers import SingleIdTokenIndexer\n", + "\n", + "review = TextField([\"This\", \"movie\", \"was\", \"awful\", \"!\"], token_indexers={\"tokens\": SingleIdTokenIndexer()})\n", + "review_sentiment = LabelField(\"negative\", label_namespace=\"tags\")\n", + "\n", + "# Access the original strings and labels using the methods on the Fields.\n", + "print(\"Tokens in TextField: \", review.tokens)\n", + "print(\"Label of LabelField\", review_sentiment.label)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once we've made our `Fields`, we need to pair them together to form an `Instance`. " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'review': , 'label': }\n" + ] + } + ], + "source": [ + "from allennlp.data import Instance\n", + "\n", + "instance1 = Instance({\"review\": review, \"label\": review_sentiment})\n", + "print(\"Fields in instance: \", instance1.fields)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "... and once we've made our `Instance`, we can group several of these into a `Dataset`." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from allennlp.data import Dataset\n", + "# Create another \n", + "review2 = TextField([\"This\", \"movie\", \"was\", \"quite\", \"slow\", \"but\", \"good\" \".\"], token_indexers={\"tokens\": SingleIdTokenIndexer()})\n", + "review_sentiment2 = LabelField(\"positive\", label_namespace=\"tags\")\n", + "instance2 = Instance({\"review\": review2, \"label\": review_sentiment2})\n", + "\n", + "review_dataset = Dataset([instance1, instance2])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In order to get our tiny sentiment analysis dataset ready for use in a model, we need to be able to do a few things: \n", + "- Create a vocabulary from the Dataset (using `Vocabulary.from_dataset`)\n", + "- Index the words and labels in the`Fields` to use the integer indices specified by the `Vocabulary`\n", + "- Pad the instances to the same length\n", + "- Convert them into arrays.\n", + "The `Dataset`, `Instance` and `Fields` have some similar parts of their API. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 2/2 [00:00<00:00, 9857.35it/s]\n", + "100%|██████████| 2/2 [00:00<00:00, 10578.32it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is the id -> word mapping for the 'tokens' namespace: \n", + "{0: '@@PADDING@@', 1: '@@UNKNOWN@@', 2: 'This', 3: 'was', 4: 'movie', 5: 'slow', 6: 'quite', 7: '!', 8: 'good.', 9: 'but', 10: 'awful'}\n", + "This is the id -> word mapping for the 'tags' namespace: \n", + "{0: 'positive', 1: 'negative'}\n", + "defaultdict(None, {'tokens': {'slow': 5, '@@PADDING@@': 0, 'This': 2, '!': 7, 'quite': 6, 'was': 3, 'good.': 8, '@@UNKNOWN@@': 1, 'awful': 10, 'but': 9, 'movie': 4}, 'tags': {'positive': 0, 'negative': 1}})\n", + "Lengths used for padding: {'review': {'num_tokens': 7}}\n", + "{'review': {'tokens': array([[ 2, 4, 3, 10, 7, 0, 0],\n", + " [ 2, 4, 3, 6, 5, 9, 8]])}, 'label': array([[1],\n", + " [0]])}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "from allennlp.data import Vocabulary \n", + "\n", + "# This will automatically create a vocab from our dataset.\n", + "# It will have \"namespaces\" which correspond to two things:\n", + "# 1. Namespaces passed to fields (e.g. the \"tags\" namespace we passed to our LabelField)\n", + "# 2. The keys of the 'Token Indexer' dictionary in 'TextFields'.\n", + "# passed to Fields (so it will have a 'tags' namespace).\n", + "vocab = Vocabulary.from_dataset(review_dataset)\n", + "\n", + "print(\"This is the id -> word mapping for the 'tokens' namespace: \")\n", + "print(vocab.get_index_to_token_vocabulary(\"tokens\"), \"\\n\")\n", + "print(\"This is the id -> word mapping for the 'tags' namespace: \")\n", + "print(vocab.get_index_to_token_vocabulary(\"tags\"), \"\\n\")\n", + "print(\"Vocab Token to Index dictionary: \", vocab._token_to_index, \"\\n\")\n", + "# Note that the \"tags\" namespace doesn't contain padding or unknown tokens.\n", + "\n", + "# Next, we index our dataset using our newly generated vocabulary.\n", + "# This modifies the current object. You must perform this step before \n", + "# trying to generate arrays. \n", + "review_dataset.index_instances(vocab)\n", + "\n", + "# Finally, we return the dataset as arrays, padded using padding lengths\n", + "# extracted from the dataset itself, which will be the max sentence length\n", + "# from our two instances.\n", + "padding_lengths = review_dataset.get_padding_lengths()\n", + "print(\"Lengths used for padding: \", padding_lengths, \"\\n\")\n", + "array_dict = review_dataset.as_array_dict(padding_lengths, verbose=False)\n", + "print(array_dict)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here, we've seen how to transform a dataset of 2 instances into arrays for feeding into an allennlp `Model`. One nice thing about the `Dataset` API is that we don't require the concept of a `Batch` - it's just a small dataset! If you are iterating over a large number of `Instances`, such as during training, you may want to look into `allennlp.data.Iterators`, which specify several different ways of iterating over a `Dataset` in batches, such as fixed batch sizes, bucketing and stochastic sorting. 
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There's been one thing we've left out of this tutorial so far - explaining the role of the `TokenIndexer` in `TextField`. We decided to introduce a new step into the typical `tokenisation -> indexing -> embedding` pipeline, because for more complicated encodings of words, such as those including character embeddings, this pipeline becomes difficult. Our pipeline contains the following steps: `tokenisation -> TokenIndexers -> TokenEmbedders -> TextFieldEmbedders`. \n", + "\n", + "The token indexer we used above is the most basic one - it assigns a single ID to each word in the `TextField`. This is classically what you might think of when indexing words. \n", + "However, let's take a look at using a `TokenCharacterIndexer` as well - this takes the words in a `TextField` and generates indices for the characters in the words.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 1/1 [00:00<00:00, 4364.52it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 3758.34it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is the id -> word mapping for the 'tokens' namespace: \n", + "{0: '@@PADDING@@', 1: '@@UNKNOWN@@', 2: 'This', 3: 'was', 4: 'movie', 5: 'slow', 6: 'quite', 7: '!', 8: 'good.', 9: 'but', 10: 'awful'}\n", + "This is the id -> word mapping for the 'chars' namespace: \n", + "{0: '@@PADDING@@', 1: '@@UNKNOWN@@'}\n", + "Lengths used for padding (Note that we now have a new padding key from the TokenCharactersIndexer): {'sentence': {'num_tokens': 5, 'num_token_characters': 5}}\n", + "{'sentence': {'num_tokens': 5, 'num_token_characters': 5}}\n", + "{'sentence': {'chars': array([[[ 6, 2, 3, 2, 0],\n", + " [11, 3, 2, 0, 0],\n", + " [ 5, 4, 10, 2, 0],\n", + " [ 8, 4, 3, 7, 5],\n", + " [ 9, 0, 0, 0, 0]]]), 'tokens': array([[2, 5, 3, 4, 6]])}}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "from allennlp.data.token_indexers import TokenCharactersIndexer\n", + "\n", + "word_and_character_text_field = TextField([\"Here\", \"are\", \"some\", \"longer\", \"words\", \".\"], \n", + " token_indexers={\"tokens\": SingleIdTokenIndexer(), \"chars\": TokenCharactersIndexer()})\n", + "mini_dataset = Dataset([Instance({\"sentence\": word_and_character_text_field})])\n", + "\n", + "# Fit a new vocabulary to this Field and index it:\n", + "word_and_char_vocab = Vocabulary.from_dataset(mini_dataset)\n", + "mini_dataset.index_instances(word_and_char_vocab)\n", + "\n", + "print(\"This is the id -> word mapping for the 'tokens' namespace: \")\n", + "print(vocab.get_index_to_token_vocabulary(\"tokens\"), \"\\n\")\n", + "print(\"This is the id -> word mapping for the 'chars' namespace: \")\n", + "print(vocab.get_index_to_token_vocabulary(\"chars\"), \"\\n\")\n", + "\n", + "\n", + "# Now, the padding lengths method will find the max sentence length \n", + "# _and_ max word length in the batch and pad all sentences to the max\n", + "# sentence length and all words to the max word length.\n", + "padding_lengths = mini_dataset.get_padding_lengths()\n", + "print(\"Lengths used for padding (Note that we now have a new \"\n", + " \"padding key num_token_characters from the TokenCharactersIndexer): \")\n", + "print(padding_lengths, \"\\n\")\n", + "\n", + "array_dict = mini_dataset.as_array_dict(padding_lengths, 
+    "\n",
+    "print(array_dict)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "collapsed": true
+   },
+   "source": [
+    "Now that we've used a new token indexer, you can see that the `sentence` field of the returned dictionary has two elements: `tokens`, an array representing the indexed tokens, and `chars`, an array representing each word in the `TextField` as a list of character indices. Crucially, the list of integers for each word has been padded to the length of the longest word in the sentence. "
+   ]
+  },
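+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Finally, the introduction mentioned `DatasetReaders`, which bundle all of the steps above - creating `Fields`, wrapping them in `Instances` and returning a `Dataset` - behind a single `read` call. The cell below is a hedged sketch, not a reader from the library: the base-class API shown here and the tab-separated file format are assumptions for illustration, so check the `DatasetReader` documentation for your version before copying it."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# A hedged sketch of a custom reader; the import path and the exact\n",
+    "# base-class API may differ between allennlp versions.\n",
+    "from allennlp.data.dataset_readers import DatasetReader\n",
+    "\n",
+    "class TsvSentimentReader(DatasetReader):\n",
+    "    # Hypothetical input format: one 'text<TAB>label' pair per line.\n",
+    "    def read(self, file_path: str) -> Dataset:\n",
+    "        instances = []\n",
+    "        with open(file_path, encoding='utf-8') as data_file:\n",
+    "            for line in data_file:\n",
+    "                text, sentiment = line.strip().split(\"\\t\")\n",
+    "                review_field = TextField(text.split(), token_indexers={\"tokens\": SingleIdTokenIndexer()})\n",
+    "                label_field = LabelField(sentiment, label_namespace=\"tags\")\n",
+    "                instances.append(Instance({\"review\": review_field, \"label\": label_field}))\n",
+    "        return Dataset(instances)"
+   ]
+  }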
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3.0
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.5.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
\ No newline at end of file
diff --git a/tutorials/notebooks/vocabulary.ipynb b/tutorials/notebooks/vocabulary.ipynb
index 0d1b6f3de3a..9857ec41d53 100644
--- a/tutorials/notebooks/vocabulary.ipynb
+++ b/tutorials/notebooks/vocabulary.ipynb
@@ -108,7 +108,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "{0: '@@PADDING@@', 1: '@@UNKOWN@@', 2: 'Barack', 3: 'Obama'}\n",
+      "{0: '@@PADDING@@', 1: '@@UNKNOWN@@', 2: 'Barack', 3: 'Obama'}\n",
       "{0: 'PERSON', 1: 'PLACE'}\n"
      ]
     }
@@ -195,18 +195,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {
-    "collapsed": true
-   },
+   "execution_count": 11,
+   "metadata": {},
    "outputs": [],
    "source": [
-    "from allennlp.data.fields import TextField, TagField\n",
+    "from allennlp.data.fields import TextField, SequenceLabelField\n",
     "from allennlp.data import Dataset, Instance\n",
     "from allennlp.data.token_indexers import SingleIdTokenIndexer\n",
     "sentence = TextField(tokens=[\"Barack\", \"Obama\", \"is\", \"a\", \"great\", \"guy\", \".\"],\n",
     "                     token_indexers={\"tokens\": SingleIdTokenIndexer()})\n",
-    "tags = TagField([\"PERSON\", \"PERSON\", \"O\", \"O\", \"O\", \"O\", \"O\"], sentence, tag_namespace=\"tags\")\n",
+    "tags = SequenceLabelField([\"PERSON\", \"PERSON\", \"O\", \"O\", \"O\", \"O\", \"O\"], sentence, label_namespace=\"tags\")\n",
     "toy_dataset = Dataset([Instance({\"sentence\": sentence, \"tags\": tags})])"
    ]
   },
@@ -229,21 +227,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100%|██████████| 1/1 [00:00<00:00, 6132.02it/s]"
+      "100%|██████████| 1/1 [00:00<00:00, 6797.90it/s]"
      ]
     },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "{0: '@@PADDING@@', 1: '@@UNKOWN@@', 2: 'Barack', 3: 'is', 4: 'Obama', 5: '.', 6: 'great', 7: 'guy', 8: 'a'}\n",
+      "{0: '@@PADDING@@', 1: '@@UNKNOWN@@', 2: 'Barack', 3: 'is', 4: 'guy', 5: 'Obama', 6: 'a', 7: '.', 8: 'great'}\n",
       "{0: 'O', 1: 'PERSON'}\n"
      ]
     },
@@ -267,6 +265,17 @@
    "source": [
-    "Note that the vocab we created has `tokens` and `tags` namespaces. These come from the key in the `token_indexers` dict in the `TextField` and the `tag_namespace` parameter in the `TagField`. At first, it seems confusing as to why it's possible to have multiple `TokenIndexers`. This is because in `allennlp`, we make a distinction between _tokenisation_ and _token representation_. More on this in the NLP API Tutorial!"
+    "Note that the vocab we created has `tokens` and `tags` namespaces. These come from the key in the `token_indexers` dict in the `TextField` and the `label_namespace` parameter in the `SequenceLabelField`. At first, it might seem confusing that it's possible to have multiple `TokenIndexers`. This is because in `allennlp`, we make a distinction between _tokenisation_ and _token representation_. More on this in the NLP API Tutorial!"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    ""
+   ]
   }
  ],
  "metadata": {
@@ -290,4 +299,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 0
-}
+}
\ No newline at end of file