
Data tutorial #217

Merged · 10 commits · Sep 1, 2017
1 change: 1 addition & 0 deletions Dockerfile
@@ -49,6 +49,7 @@ COPY allennlp/ allennlp/
COPY tests/ tests/
COPY pytest.ini pytest.ini
COPY scripts/ scripts/
COPY tutorials/ tutorials/

# Run tests to verify the Docker build
RUN PYTHONDONTWRITEBYTECODE=1 pytest
4 changes: 2 additions & 2 deletions allennlp/data/fields/label_field.py
@@ -42,9 +42,9 @@ def __init__(self,
self.label = label
self._label_namespace = label_namespace
self._label_id = None
if not self._label_namespace.endswith("labels"):
if not (self._label_namespace.endswith("labels") or self._label_namespace.endswith("tags")):
logger.warning("Your label namespace was '%s'. We recommend you use a namespace "
"ending with 'labels', so we don't add UNK and PAD tokens by "
"ending with 'labels' or 'tags', so we don't add UNK and PAD tokens by "
"default to your vocabulary. See documentation for "
"`non_padded_namespaces` parameter in Vocabulary.", self._label_namespace)
if skip_indexing:
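A quick sketch of what this relaxed check means for callers, assuming the `LabelField` constructor shown in this file (the `"tags"` namespace is the one the new tutorial uses):

from allennlp.data.fields import LabelField

# Namespaces ending in "labels" or "tags" match Vocabulary's default
# non_padded_namespaces, so after this change neither suffix triggers the
# warning about UNK/PAD tokens being added to the vocabulary.
sentiment = LabelField("negative", label_namespace="tags")       # no warning
pos_tag = LabelField("NN", label_namespace="pos_tags")           # no warning
category = LabelField("sports", label_namespace="categories")    # still warns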
7 changes: 3 additions & 4 deletions tests/notebooks_test.py
@@ -1,19 +1,18 @@
import os
import pytest

import nbformat
from nbconvert.preprocessors.execute import CellExecutionError
from nbconvert.preprocessors import ExecutePreprocessor

from allennlp.common.testing import AllenNlpTestCase

# This test started failing in the Docker build of
# https://github.com/allenai/allennlp/commit/cb2913d52765ba3d63a0c85b3da92d4e01871d8d
@pytest.mark.skip(reason="this test throws a low-level C exception in our Docker build")
class TestNotebooks(AllenNlpTestCase):
def test_vocabulary_tutorial(self):
assert self.execute_notebook("tutorials/notebooks/vocabulary.ipynb")

def test_data_pipeline_tutorial(self):
assert self.execute_notebook("tutorials/notebooks/data_pipeline.ipynb")

@staticmethod
def execute_notebook(notebook_path: str):
with open(notebook_path, encoding='utf-8') as notebook:
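The hunk above cuts off inside `execute_notebook`; purely as an illustration (not the PR's actual body), a runner built from the imports shown might look like this:

import os

import nbformat
from nbconvert.preprocessors import ExecutePreprocessor
from nbconvert.preprocessors.execute import CellExecutionError

def execute_notebook(notebook_path: str) -> bool:
    # Load the notebook and execute every cell in a fresh kernel.
    with open(notebook_path, encoding='utf-8') as notebook:
        contents = nbformat.read(notebook, as_version=4)
    executor = ExecutePreprocessor(timeout=600, kernel_name='python3')
    try:
        executor.preprocess(contents, {'metadata': {'path': os.path.dirname(notebook_path)}})
    except CellExecutionError:
        return False
    return True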
316 changes: 316 additions & 0 deletions tutorials/notebooks/data_pipeline.ipynb
@@ -0,0 +1,316 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"\n",
"Allennlp uses a hierarchical system of data structures to represent a Dataset which allow easy padding, batching and iteration. This tutorial will cover some of the basic concepts.\n",
"\n",
"\n",
"At a high level, we use `DatasetReaders` to read a particular dataset into a `Dataset` of self-contained individual `Instances`, \n",
"which are made up of a dictionary of named `Fields`. There are many types of `Fields` which are useful for different types of data, such as `TextField`, for sentences, or `LabelField` for representing a categorical class label. Users who are familiar with the `torchtext` library from `Pytorch` will find a similar abstraction here. \n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# This cell just makes sure the library paths are correct. \n",
"# You need to run this cell before you run the rest of this\n",
"# tutorial, but you can ignore the contents!\n",
"import os\n",
"import sys\n",
"module_path = os.path.abspath(os.path.join('../..'))\n",
"if module_path not in sys.path:\n",
" sys.path.append(module_path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's create two of the most common `Fields`, imagining we are preparing some data for a sentiment analysis model. "
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['This', 'movie', 'was', 'awful', '!']\n",
"negative\n"
]
}
],
"source": [
"from allennlp.data.fields import TextField, LabelField\n",
"from allennlp.data.token_indexers import SingleIdTokenIndexer\n",
"\n",
"review = TextField([\"This\", \"movie\", \"was\", \"awful\", \"!\"], token_indexers={\"tokens\": SingleIdTokenIndexer()})\n",
"review_sentiment = LabelField(\"negative\", label_namespace=\"tags\")\n",
"\n",
"# Access the original strings and labels using the methods on the Fields.\n",
"print(\"Tokens in TextField: \", review.tokens)\n",
"print(\"Label of LabelField\", review_sentiment.label)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Once we've made our `Fields`, we need to pair them together to form an `Instance`. "
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'review': <allennlp.data.fields.text_field.TextField object at 0x10bc93eb8>, 'label': <allennlp.data.fields.label_field.LabelField object at 0x10bc93e80>}\n"
]
}
],
"source": [
"from allennlp.data import Instance\n",
"\n",
"instance1 = Instance({\"review\": review, \"label\": review_sentiment})\n",
"print(\"Fields in instance: \", instance1.fields)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"... and once we've made our `Instance`, we can group several of these into a `Dataset`."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from allennlp.data import Dataset\n",
"# Create another \n",
"review2 = TextField([\"This\", \"movie\", \"was\", \"quite\", \"slow\", \"but\", \"good\" \".\"], token_indexers={\"tokens\": SingleIdTokenIndexer()})\n",
"review_sentiment2 = LabelField(\"positive\", label_namespace=\"tags\")\n",
"instance2 = Instance({\"review\": review2, \"label\": review_sentiment2})\n",
"\n",
"review_dataset = Dataset([instance1, instance2])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In order to get our tiny sentiment analysis dataset ready for use in a model, we need to be able to do a few things: \n",
"- Create a vocabulary from the Dataset (using `Vocabulary.from_dataset`)\n",
"- Index the words and labels in the`Fields` to use the integer indices specified by the `Vocabulary`\n",
"- Pad the instances to the same length\n",
"- Convert them into arrays.\n",
"The `Dataset`, `Instance` and `Fields` have some similar parts of their API. "
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 2/2 [00:00<00:00, 9857.35it/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 10578.32it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"This is the id -> word mapping for the 'tokens' namespace: \n",
"{0: '@@PADDING@@', 1: '@@UNKNOWN@@', 2: 'This', 3: 'was', 4: 'movie', 5: 'slow', 6: 'quite', 7: '!', 8: 'good.', 9: 'but', 10: 'awful'}\n",
"This is the id -> word mapping for the 'tags' namespace: \n",
"{0: 'positive', 1: 'negative'}\n",
"defaultdict(None, {'tokens': {'slow': 5, '@@PADDING@@': 0, 'This': 2, '!': 7, 'quite': 6, 'was': 3, 'good.': 8, '@@UNKNOWN@@': 1, 'awful': 10, 'but': 9, 'movie': 4}, 'tags': {'positive': 0, 'negative': 1}})\n",
"Lengths used for padding: {'review': {'num_tokens': 7}}\n",
"{'review': {'tokens': array([[ 2, 4, 3, 10, 7, 0, 0],\n",
" [ 2, 4, 3, 6, 5, 9, 8]])}, 'label': array([[1],\n",
" [0]])}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"from allennlp.data import Vocabulary \n",
"\n",
"# This will automatically create a vocab from our dataset.\n",
"# It will have \"namespaces\" which correspond to two things:\n",
"# 1. Namespaces passed to fields (e.g. the \"tags\" namespace we passed to our LabelField)\n",
"# 2. The keys of the 'Token Indexer' dictionary in 'TextFields'.\n",
"# passed to Fields (so it will have a 'tags' namespace).\n",
"vocab = Vocabulary.from_dataset(review_dataset)\n",
"\n",
"print(\"This is the id -> word mapping for the 'tokens' namespace: \")\n",
"print(vocab.get_index_to_token_vocabulary(\"tokens\"), \"\\n\")\n",
"print(\"This is the id -> word mapping for the 'tags' namespace: \")\n",
"print(vocab.get_index_to_token_vocabulary(\"tags\"), \"\\n\")\n",
"print(\"Vocab Token to Index dictionary: \", vocab._token_to_index, \"\\n\")\n",
"# Note that the \"tags\" namespace doesn't contain padding or unknown tokens.\n",
"\n",
"# Next, we index our dataset using our newly generated vocabulary.\n",
"# This modifies the current object. You must perform this step before \n",
"# trying to generate arrays. \n",
"review_dataset.index_instances(vocab)\n",
"\n",
"# Finally, we return the dataset as arrays, padded using padding lengths\n",
"# extracted from the dataset itself, which will be the max sentence length\n",
"# from our two instances.\n",
"padding_lengths = review_dataset.get_padding_lengths()\n",
"print(\"Lengths used for padding: \", padding_lengths, \"\\n\")\n",
"array_dict = review_dataset.as_array_dict(padding_lengths, verbose=False)\n",
"print(array_dict)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Here, we've seen how to transform a dataset of 2 instances into arrays for feeding into an allennlp `Model`. One nice thing about the `Dataset` API is that we don't require the concept of a `Batch` - it's just a small dataset! If you are iterating over a large number of `Instances`, such as during training, you may want to look into `allennlp.data.Iterators`, which specify several different ways of iterating over a `Dataset` in batches, such as fixed batch sizes, bucketing and stochastic sorting. "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"There's been one thing we've left out of this tutorial so far - explaining the role of the `TokenIndexer` in `TextField`. We decided to introduce a new step into the typical `tokenisation -> indexing -> embedding` pipeline, because for more complicated encodings of words, such as those including character embeddings, this pipeline becomes difficult. Our pipeline contains the following steps: `tokenisation -> TokenIndexers -> TokenEmbedders -> TextFieldEmbedders`. \n",
"\n",
"The token indexer we used above is the most basic one - it assigns a single ID to each word in the `TextField`. This is classically what you might think of when indexing words. \n",
"However, let's take a look at using a `TokenCharacterIndexer` as well - this takes the words in a `TextField` and generates indices for the characters in the words.\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1/1 [00:00<00:00, 4364.52it/s]\n",
"100%|██████████| 1/1 [00:00<00:00, 3758.34it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"This is the id -> word mapping for the 'tokens' namespace: \n",
"{0: '@@PADDING@@', 1: '@@UNKNOWN@@', 2: 'This', 3: 'was', 4: 'movie', 5: 'slow', 6: 'quite', 7: '!', 8: 'good.', 9: 'but', 10: 'awful'}\n",
"This is the id -> word mapping for the 'chars' namespace: \n",
"{0: '@@PADDING@@', 1: '@@UNKNOWN@@'}\n",
"Lengths used for padding (Note that we now have a new padding key from the TokenCharactersIndexer): {'sentence': {'num_tokens': 5, 'num_token_characters': 5}}\n",
"{'sentence': {'num_tokens': 5, 'num_token_characters': 5}}\n",
"{'sentence': {'chars': array([[[ 6, 2, 3, 2, 0],\n",
" [11, 3, 2, 0, 0],\n",
" [ 5, 4, 10, 2, 0],\n",
" [ 8, 4, 3, 7, 5],\n",
" [ 9, 0, 0, 0, 0]]]), 'tokens': array([[2, 5, 3, 4, 6]])}}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"from allennlp.data.token_indexers import TokenCharactersIndexer\n",
"\n",
"word_and_character_text_field = TextField([\"Here\", \"are\", \"some\", \"longer\", \"words\", \".\"], \n",
" token_indexers={\"tokens\": SingleIdTokenIndexer(), \"chars\": TokenCharactersIndexer()})\n",
"mini_dataset = Dataset([Instance({\"sentence\": word_and_character_text_field})])\n",
"\n",
"# Fit a new vocabulary to this Field and index it:\n",
"word_and_char_vocab = Vocabulary.from_dataset(mini_dataset)\n",
"mini_dataset.index_instances(word_and_char_vocab)\n",
"\n",
"print(\"This is the id -> word mapping for the 'tokens' namespace: \")\n",
"print(vocab.get_index_to_token_vocabulary(\"tokens\"), \"\\n\")\n",
"print(\"This is the id -> word mapping for the 'chars' namespace: \")\n",
"print(vocab.get_index_to_token_vocabulary(\"chars\"), \"\\n\")\n",
"\n",
"\n",
"# Now, the padding lengths method will find the max sentence length \n",
"# _and_ max word length in the batch and pad all sentences to the max\n",
"# sentence length and all words to the max word length.\n",
"padding_lengths = mini_dataset.get_padding_lengths()\n",
"print(\"Lengths used for padding (Note that we now have a new \"\n",
" \"padding key num_token_characters from the TokenCharactersIndexer): \")\n",
"print(padding_lengths, \"\\n\")\n",
"\n",
"array_dict = mini_dataset.as_array_dict(padding_lengths, verbose=False)\n",
"\n",
"print(array_dict)"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"Now we've used a new token indexer, you can see that the `review` field of the returned dictionary now has 2 elements: `tokens`, an array representing the indexed tokens and `chars`, an array representing each word in the `TextField` as a list of character indices. Crucially, each list of integers for each word has been padded to the length of the maximum word in the sentence. "
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3.0
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
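For reference, here is the notebook's pipeline condensed into a single script, using only the calls that appear in the cells above (the final comment is just a pointer to the iterators the last markdown cells mention, whose exact API this PR doesn't show):

from allennlp.data import Dataset, Instance, Vocabulary
from allennlp.data.fields import TextField, LabelField
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenCharactersIndexer

# 1. Fields hold tokenised text and labels, plus the indexers that map tokens to ids.
indexers = {"tokens": SingleIdTokenIndexer(), "chars": TokenCharactersIndexer()}
review = TextField(["This", "movie", "was", "awful", "!"], token_indexers=indexers)
sentiment = LabelField("negative", label_namespace="tags")

# 2. An Instance is a dictionary of named fields; a Dataset is a list of instances.
dataset = Dataset([Instance({"review": review, "label": sentiment})])

# 3. The Vocabulary collects string -> id mappings, one per namespace
#    ("tokens" and "chars" from the indexer keys, "tags" from the LabelField).
vocab = Vocabulary.from_dataset(dataset)

# 4. Index in place, compute padding lengths, and convert to padded arrays.
dataset.index_instances(vocab)
padding_lengths = dataset.get_padding_lengths()
print(dataset.as_array_dict(padding_lengths, verbose=False))

# For real training loops over many instances, the notebook points to
# allennlp.data.iterators for batched iteration (e.g. bucketing by length).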