Upgrade conllu from 0.11 to 1.3.1 (#3115)

* Upgrade to the latest version of conllu.
* Elided tokens no longer have an id of None; use isinstance to filter them. https://github.com/EmilStenstrom/conllu/wiki/Migrating-from-0.1-to-1.0#parsing-of-ids-now-include-ranges-and-decimals
* No need to cast to int; ids are already ints now. https://github.com/EmilStenstrom/conllu/wiki/Migrating-from-0.1-to-1.0#parsing-of-ids-now-include-ranges-and-decimals
* Incremental parsing is built into conllu. See "Advanced usage" under https://github.com/EmilStenstrom/conllu#use-parse-to-parse-into-a-list-of-sentences
allenai · Aug 6, 2019 · 9db0042 · 9db0042
1 parent 6746d12
commit 9db0042
Show file tree

Hide file tree

Showing 4 changed files with 8 additions and 16 deletions.
diff --git a/allennlp/data/dataset_readers/universal_dependencies.py b/allennlp/data/dataset_readers/universal_dependencies.py
@@ -2,7 +2,7 @@
 import logging
 
 from overrides import overrides
-from conllu.parser import parse_line, DEFAULT_FIELDS
+from conllu import parse_incr
 
 from allennlp.common.file_utils import cached_path
 from allennlp.data.dataset_readers.dataset_reader import DatasetReader
@@ -14,14 +14,6 @@
 logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
 
 
-def lazy_parse(text: str, fields: Tuple[str, ...]=DEFAULT_FIELDS):
-    for sentence in text.split("\n\n"):
-        if sentence:
-            yield [parse_line(line, fields)
-                   for line in sentence.split("\n")
-                   if line and not line.strip().startswith("#")]
-
-
 @DatasetReader.register("universal_dependencies")
 class UniversalDependenciesDatasetReader(DatasetReader):
     """
@@ -56,13 +48,13 @@ def _read(self, file_path: str):
         with open(file_path, 'r') as conllu_file:
             logger.info("Reading UD instances from conllu dataset at: %s", file_path)
 
-            for annotation in  lazy_parse(conllu_file.read()):
+            for annotation in parse_incr(conllu_file):
                 # CoNLLU annotations sometimes add back in words that have been elided
                 # in the original sentence; we remove these, as we're just predicting
                 # dependencies for the original sentence.
                 # We filter by None here as elided words have a non-integer word id,
                 # and are replaced with None by the conllu python library.
-                annotation = [x for x in annotation if x["id"] is not None]
+                annotation = [x for x in annotation if isinstance(x["id"], int)]
 
                 heads = [x["head"] for x in annotation]
                 tags = [x["deprel"] for x in annotation]
@@ -112,7 +104,7 @@ def text_to_instance(self,  # type: ignore
             fields["head_tags"] = SequenceLabelField([x[0] for x in dependencies],
                                                      text_field,
                                                      label_namespace="head_tags")
-            fields["head_indices"] = SequenceLabelField([int(x[1]) for x in dependencies],
+            fields["head_indices"] = SequenceLabelField([x[1] for x in dependencies],
                                                         text_field,
                                                         label_namespace="head_index_tags")
 

diff --git a/allennlp/data/dataset_readers/universal_dependencies_multilang.py b/allennlp/data/dataset_readers/universal_dependencies_multilang.py
@@ -6,14 +6,14 @@
 import numpy as np
 
 from overrides import overrides
+from conllu import parse_incr
 
 from allennlp.common.checks import ConfigurationError
 from allennlp.data.dataset_readers.dataset_reader import DatasetReader
 from allennlp.data.fields import Field, TextField, SequenceLabelField, MetadataField
 from allennlp.data.instance import Instance
 from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
 from allennlp.data.tokenizers import Token
-from allennlp.data.dataset_readers.universal_dependencies import lazy_parse
 
 logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
 
@@ -103,7 +103,7 @@ def _read_one_file(self, lang: str, file_path: str):
         with open(file_path, 'r') as conllu_file:
             logger.info("Reading UD instances for %s language from conllu dataset at: %s", lang, file_path)
 
-            for annotation in lazy_parse(conllu_file.read()):
+            for annotation in parse_incr(conllu_file):
                 # CoNLLU annotations sometimes add back in words that have been elided
                 # in the original sentence; we remove these, as we're just predicting
                 # dependencies for the original sentence.

diff --git a/requirements.txt b/requirements.txt
@@ -86,7 +86,7 @@ h5py
 pytz>=2017.3
 
 # Reads Universal Dependencies files.
-conllu==0.11
+conllu==1.3.1
 
 #### ESSENTIAL TESTING-RELATED PACKAGES ####
 

diff --git a/setup.py b/setup.py
@@ -125,7 +125,7 @@
           'flaky',
           'responses>=0.7',
           'numpydoc>=0.8.0',
-          'conllu==0.11',
+          'conllu==1.3.1',
           'parsimonious>=0.8.0',
           'ftfy',
           'sqlparse>=0.2.4',