
Data tutorial #217

Merged: 10 commits, Sep 1, 2017
Changes from 8 commits
1 change: 1 addition & 0 deletions Dockerfile
@@ -50,6 +50,7 @@ COPY allennlp/ allennlp/
COPY tests/ tests/
COPY pytest.ini pytest.ini
COPY scripts/ scripts/
COPY tutorials/ tutorials/

# Run tests to verify the Docker build
RUN PYTHONDONTWRITEBYTECODE=1 pytest
1 change: 1 addition & 0 deletions Dockerfile.cpu
@@ -50,6 +50,7 @@ COPY allennlp/ allennlp/
COPY tests/ tests/
COPY pytest.ini pytest.ini
COPY scripts/ scripts/
COPY tutorials/ tutorials/

# Run tests to verify the Docker build
RUN PYTHONDONTWRITEBYTECODE=1 pytest
4 changes: 2 additions & 2 deletions allennlp/data/fields/label_field.py
@@ -42,9 +42,9 @@ def __init__(self,
        self.label = label
        self._label_namespace = label_namespace
        self._label_id = None
-       if not self._label_namespace.endswith("labels"):
+       if not (self._label_namespace.endswith("labels") or self._label_namespace.endswith("tags")):
            logger.warning("Your label namespace was '%s'. We recommend you use a namespace "
-                          "ending with 'labels', so we don't add UNK and PAD tokens by "
+                          "ending with 'labels' or 'tags', so we don't add UNK and PAD tokens by "
                           "default to your vocabulary. See documentation for "
                           "`non_padded_namespaces` parameter in Vocabulary.", self._label_namespace)
        if skip_indexing:
7 changes: 3 additions & 4 deletions tests/notebooks_test.py
@@ -1,19 +1,18 @@
import os
import pytest

import nbformat
from nbconvert.preprocessors.execute import CellExecutionError
from nbconvert.preprocessors import ExecutePreprocessor

from allennlp.common.testing import AllenNlpTestCase

# This test started failing in the Docker build of
# https://github.com/allenai/allennlp/commit/cb2913d52765ba3d63a0c85b3da92d4e01871d8d
@pytest.mark.skip(reason="this test throws a low-level C exception in our Docker build")
class TestNotebooks(AllenNlpTestCase):
def test_vocabulary_tutorial(self):
assert self.execute_notebook("tutorials/notebooks/vocabulary.ipynb")

def test_data_pipeline_tutorial(self):
assert self.execute_notebook("tutorials/notebooks/data_pipeline.ipynb")

@staticmethod
def execute_notebook(notebook_path: str):
with open(notebook_path, encoding='utf-8') as notebook:
301 changes: 301 additions & 0 deletions tutorials/notebooks/data_pipeline.ipynb
@@ -0,0 +1,301 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"\n",
"Allennlp uses \n",
Contributor comment: stray sentence fragment

"\n",
"\n",
"At a high level, we use `DatasetReaders` to read a particular dataset into a `Dataset` of self-contained individual `Instances`, \n",
"which are made up of a dictionary of named `Fields`. There are many types of `Fields` which are useful for different types of data, such as `TextField`, for sentences, or `LabelField` for representing a categorical class label. Users who are familiar with the `torchtext` library from `Pytorch` will find a similar abstraction here. \n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# This cell just makes sure the library paths are correct. \n",
"# You need to run this cell before you run the rest of this\n",
"# tutorial, but you can ignore the contents!\n",
"import os\n",
"import sys\n",
"module_path = os.path.abspath(os.path.join('../..'))\n",
"if module_path not in sys.path:\n",
" sys.path.append(module_path)"
]
},
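{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before we build these pieces by hand, it is worth sketching where they usually come from: a `DatasetReader` that turns a file into a `Dataset` of `Instances`. The sketch below is an illustration only - the helper name and the tab-separated file format are made up for this example, and it is written as a plain function so that it only relies on the classes used later in this tutorial; a real `DatasetReader` subclass would wrap the same logic in its `read` method."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A sketch only: a hypothetical helper that reads lines of the form \"<sentence>\\t<label>\".\n",
"# A real DatasetReader subclass would wrap this same logic in its read() method.\n",
"from allennlp.data import Dataset, Instance\n",
"from allennlp.data.fields import TextField, LabelField\n",
"from allennlp.data.token_indexers import SingleIdTokenIndexer\n",
"\n",
"def read_sentiment_tsv(file_path: str) -> Dataset:\n",
"    instances = []\n",
"    with open(file_path) as data_file:\n",
"        for line in data_file:\n",
"            sentence, label = line.rstrip(\"\\n\").split(\"\\t\")\n",
"            # A real reader would use a Tokenizer; whitespace splitting keeps the sketch short.\n",
"            fields = {\"review\": TextField(sentence.split(), token_indexers={\"tokens\": SingleIdTokenIndexer()}),\n",
"                      \"label\": LabelField(label, label_namespace=\"tags\")}\n",
"            instances.append(Instance(fields))\n",
"    return Dataset(instances)"
]
},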
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's create two of the most common `Fields`, imagining we are preparing some data for a sentiment analysis model. "
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['This', 'movie', 'was', 'awful', '!']\n",
"negative\n"
]
}
],
"source": [
"from allennlp.data.fields import TextField, LabelField\n",
"from allennlp.data.token_indexers import SingleIdTokenIndexer\n",
"\n",
"review = TextField([\"This\", \"movie\", \"was\", \"awful\", \"!\"], token_indexers={\"tokens\": SingleIdTokenIndexer()})\n",
"review_sentiment = LabelField(\"negative\", label_namespace=\"tags\")\n",
"\n",
"# Access the original strings and labels using the methods on the Fields.\n",
"print(review.tokens)\n",
"print(review_sentiment.label)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Once we've made our `Fields`, we need to pair them together to form an `Instance`. "
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'review': <allennlp.data.fields.text_field.TextField object at 0x105f39eb8>, 'label': <allennlp.data.fields.label_field.LabelField object at 0x105f39e80>}\n"
]
}
],
"source": [
"from allennlp.data import Instance\n",
"\n",
"instance1 = Instance({\"review\": review, \"label\": review_sentiment})\n",
"print(instance1.fields)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"... and once we've made our `Instance`, we can group several of these into a `Dataset`."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"from allennlp.data import Dataset\n",
"# Create another \n",
"review2 = TextField([\"This\", \"movie\", \"was\", \"quite\", \"slow\", \"but\", \"good\" \".\"], token_indexers={\"tokens\": SingleIdTokenIndexer()})\n",
"review_sentiment2 = LabelField(\"positive\", label_namespace=\"tags\")\n",
"instance2 = Instance({\"review\": review2, \"label\": review_sentiment2})\n",
"\n",
"review_dataset = Dataset([instance1, instance2])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In order to get our tiny sentiment analysis dataset ready for use in a model, we need to be able to do a few things: \n",
"- Create a vocabulary from the Dataset (using `Vocabulary.from_dataset`)\n",
"- Index the words and labels in the`Fields` to use the integer indices specified by the `Vocabulary`\n",
"- Pad the instances to the same length\n",
"- Convert them into arrays.\n",
"The `Dataset`, `Instance` and `Fields` have some similar parts of their API. "
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 2/2 [00:00<00:00, 11618.57it/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 10472.67it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"This is the id -> word mapping for the 'tokens' namespace: \n",
"{0: '@@PADDING@@', 1: '@@UNKNOWN@@', 2: 'movie', 3: 'was', 4: 'This', 5: 'but', 6: 'good.', 7: '!', 8: 'awful', 9: 'quite', 10: 'slow'}\n",
"This is the id -> word mapping for the 'tags' namespace: \n",
"{0: 'positive', 1: 'negative'}\n",
"defaultdict(None, {'tokens': {'slow': 10, 'good.': 6, 'quite': 9, 'movie': 2, '!': 7, '@@UNKNOWN@@': 1, 'but': 5, 'was': 3, '@@PADDING@@': 0, 'This': 4, 'awful': 8}, 'tags': {'positive': 0, 'negative': 1}})\n",
"{'review': {'tokens': array([[ 4, 2, 3, 8, 7, 0, 0],\n",
" [ 4, 2, 3, 9, 10, 5, 6]])}, 'label': array([[1],\n",
" [0]])}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"from allennlp.data import Vocabulary \n",
"\n",
"# This will automatically create a vocab from our dataset.\n",
"# It will have \"namespaces\" which correspond to two things:\n",
"# 1. Namespaces passed to fields (e.g. the \"tags\" namespace we passed to our LabelField)\n",
"# 2. The keys of the 'Token Indexer' dictionary in 'TextFields'.\n",
"# passed to Fields (so it will have a 'tags' namespace).\n",
"vocab = Vocabulary.from_dataset(review_dataset)\n",
"\n",
"print(\"This is the id -> word mapping for the 'tokens' namespace: \")\n",
"print(vocab.get_index_to_token_vocabulary(\"tokens\"))\n",
"print(\"This is the id -> word mapping for the 'tags' namespace: \")\n",
"print(vocab.get_index_to_token_vocabulary(\"tags\"))\n",
"print(vocab._token_to_index)\n",
"# Note that the \"tags\" namespace doesn't contain padding or unknown tokens.\n",
"\n",
"# Next, we index our dataset using our newly generated vocabulary.\n",
"# This modifies the current object. You must perform this step before \n",
"# trying to generate arrays. \n",
"review_dataset.index_instances(vocab)\n",
"\n",
"# Finally, we return the dataset as arrays, padded using padding lengths\n",
"# extracted from the dataset itself.\n",
"padding_lengths = review_dataset.get_padding_lengths()\n",
"array_dict = review_dataset.as_array_dict(padding_lengths, verbose=False)\n",
"print(array_dict)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Here, we've seen how to transform a dataset of 2 instances into arrays for feeding into an allennlp `Model`. One nice thing about the `Dataset` API is that we don't require the concept of a `Batch` - it's just a small dataset! If you are iterating over a large number of `Instances`, such as during training, you may want to look into `allennlp.data.Iterators`, which specify several different ways of iterating over a `Dataset` in batches, such as fixed batch sizes, bucketing and stochastic sorting. "
]
},
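{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a rough illustration of that last point, the sketch below shows what batching with a bucket iterator might look like. It assumes the `BucketIterator` in `allennlp.data.iterators` takes a `batch_size` and a list of `sorting_keys` (a field name plus a padding key) and can be called directly on a `Dataset` - check that module for the exact constructor arguments and behaviour in your version."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A sketch only: the exact BucketIterator arguments are an assumption here.\n",
"from allennlp.data.iterators import BucketIterator\n",
"\n",
"# Group instances with a similar number of tokens in the 'review' field into\n",
"# the same batch, so that each batch needs as little padding as possible.\n",
"iterator = BucketIterator(batch_size=2, sorting_keys=[(\"review\", \"num_tokens\")])\n",
"\n",
"# Each batch is a dictionary of padded arrays, just like the output of\n",
"# Dataset.as_array_dict above (review_dataset was already indexed above).\n",
"for batch in iterator(review_dataset, num_epochs=1):\n",
"    print(batch[\"review\"][\"tokens\"].shape)"
]
},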
{
"cell_type": "markdown",
"metadata": {},
"source": [
"There's been one thing we've left out of this tutorial so far - explaining the role of the `TokenIndexer` in `TextField`. We decided to introduce a new step into the typical `tokenisation -> indexing -> embedding` pipeline, because for more complicated encodings of words, such as those including character embeddings, this pipeline becomes difficult. Our pipeline contains the following steps: `tokenisation -> TokenIndexers -> TokenEmbedders -> TextFieldEmbedders`. \n",
"\n",
"The token indexer we used above is the most basic one - it assigns a single ID to each word in the `TextField`. This is classically what you might think of when indexing words. \n",
"However, let's take a look at using a `TokenCharacterIndexer` as well - this takes the words in a `TextField` and generates indices for the characters in the words.\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1/1 [00:00<00:00, 5468.45it/s]\n",
"100%|██████████| 1/1 [00:00<00:00, 10512.04it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"This is the id -> word mapping for the 'tokens' namespace: \n",
"{0: '@@PADDING@@', 1: '@@UNKNOWN@@', 2: 'movie', 3: 'was', 4: 'This', 5: 'but', 6: 'good.', 7: '!', 8: 'awful', 9: 'quite', 10: 'slow'}\n",
"This is the id -> word mapping for the 'chars' namespace: \n",
"{0: '@@PADDING@@', 1: '@@UNKNOWN@@'}\n",
"{'sentence': {'tokens': array([[6, 3, 2, 4, 5]]), 'chars': array([[[ 6, 2, 3, 2, 0],\n",
" [ 7, 3, 2, 0, 0],\n",
" [ 4, 5, 9, 2, 0],\n",
" [10, 5, 3, 11, 4],\n",
" [ 8, 0, 0, 0, 0]]])}}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"from allennlp.data.token_indexers import TokenCharactersIndexer\n",
"\n",
"word_and_character_text_field = TextField([\"Here\", \"are\", \"some\", \"words\", \".\"], \n",
" token_indexers={\"tokens\": SingleIdTokenIndexer(), \"chars\": TokenCharactersIndexer()})\n",
"mini_dataset = Dataset([Instance({\"sentence\": word_and_character_text_field})])\n",
"\n",
"# Fit a new vocabulary to this Field and index it:\n",
"word_and_char_vocab = Vocabulary.from_dataset(mini_dataset)\n",
"mini_dataset.index_instances(word_and_char_vocab)\n",
"\n",
"print(\"This is the id -> word mapping for the 'tokens' namespace: \")\n",
"print(vocab.get_index_to_token_vocabulary(\"tokens\"))\n",
"print(\"This is the id -> word mapping for the 'chars' namespace: \")\n",
"print(vocab.get_index_to_token_vocabulary(\"chars\"))\n",
"\n",
"padding_lengths = mini_dataset.get_padding_lengths()\n",
"array_dict = mini_dataset.as_array_dict(padding_lengths, verbose=False)\n",
"\n",
"print(array_dict)"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"Now we've used a new token indexer, you can see that the `review` field of the returned dictionary now has 2 elements: `tokens`, an array representing the indexed tokens and `chars`, an array representing each word in the `TextField` as a list of character indices. Crucially, each list of integers for each word has been padded to the length of the maximum word in the sentence. "
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3.0
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}