Add spaCy loader for DDT dataset
hvingelby committed Sep 22, 2019
1 parent b00ebaf commit 4af8f9e
Showing 2 changed files with 58 additions and 12 deletions.
danlp/datasets/ddt.py (35 additions, 0 deletions)
@@ -1,5 +1,7 @@
 import os
+
+
 
 from danlp.download import DEFAULT_CACHE_DIR, download_dataset, _unzip_process_func, DATASETS
 
 
@@ -84,3 +86,36 @@ def load_with_flair(self, predefined_splits: bool = False):
                 token.tags['ner'].value = token.tags['ner'].value.split("=")[1].replace("|SpaceAfter", "")
 
         return corpus
+
+    def load_with_spacy(self):
+        """
+        Converts the conllu files to json in the spaCy format.
+        jsonl is not used because of:
+        https://github.com/explosion/spaCy/issues/3523
+
+        :return: a spaCy GoldCorpus over the train and dev splits
+        """
+        import srsly
+        from pathlib import Path
+        from spacy.cli.converters import conllu2json
+        from spacy.gold import GoldCorpus
+
+        for part in ['train', 'dev', 'test']:
+            conll_path = os.path.join(self.dataset_dir, '{}.{}{}'.format(self.dataset_name, part, self.file_extension))
+            json_path = os.path.join(self.dataset_dir, "ddt.{}.json".format(part))
+
+            if not os.path.isfile(json_path):  # Convert the conllu files to json
+                with open(conll_path, 'r') as file:
+                    file_as_string = file.read()
+                # Strip "name=" and "|SpaceAfter" from the MISC column before conversion
+                file_as_string = file_as_string.replace("name=", "").replace("|SpaceAfter", "")
+                file_as_json = conllu2json(file_as_string)
+
+                srsly.write_json(json_path, file_as_json)
+
+        train_json_path = os.path.join(self.dataset_dir, "ddt.train.json")
+        dev_json_path = os.path.join(self.dataset_dir, "ddt.dev.json")
+
+        assert os.path.isfile(train_json_path)
+        assert os.path.isfile(dev_json_path)
+
+        return GoldCorpus(Path(train_json_path), Path(dev_json_path))
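
For reference, a minimal usage sketch for the new loader, assuming spaCy 2.x, where spacy.gold.GoldCorpus exposes count_train() and dev_docs() yielding (Doc, GoldParse) pairs. This is illustration only, not part of the commit:

import spacy
from danlp.datasets import DDT

nlp = spacy.blank('da')               # tokenizer-only Danish pipeline
corpus = DDT().load_with_spacy()      # downloads DDT and converts it to JSON on first call

print(corpus.count_train())           # number of training examples
for doc, gold in corpus.dev_docs(nlp):
    print(doc.text, gold.ner)         # gold NER tags in the BILUO scheme
    break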
tests/test_datasets.py (23 additions, 12 deletions)
@@ -1,34 +1,41 @@
 import shutil
 import unittest
 
 from flair.datasets import ColumnCorpus
 from pyconll.unit import Conll
+from spacy.gold import GoldCorpus
 
 from danlp.datasets import DDT, WikiAnn, DATASETS
 
 
 class TestNerDatasets(unittest.TestCase):
 
-    def test_ddt_dataset(self):
-        train_len = 4383
-        dev_len = 564
-        test_len = 565
+    def setUp(self):
+        self.train_len = 4383
+        self.dev_len = 564
+        self.test_len = 565
 
-        ddt = DDT()  # Load dataset
+        self.ddt = DDT()  # Load dataset
 
-        train, dev, test = ddt.load_as_conllu(predefined_splits=True)
+    def test_ddt_dataset(self):
+        train, dev, test = self.ddt.load_as_conllu(predefined_splits=True)
 
         self.assertIsInstance(train, Conll)
         self.assertIsInstance(dev, Conll)
         self.assertIsInstance(test, Conll)
 
-        self.assertEqual([len(train), len(dev), len(test)], [train_len, dev_len, test_len])
+        self.assertEqual([len(train), len(dev), len(test)], [self.train_len, self.dev_len, self.test_len])
 
-        full_dataset = ddt.load_as_conllu(predefined_splits=False)
-        self.assertEqual(len(full_dataset), train_len+dev_len+test_len)
+        full_dataset = self.ddt.load_as_conllu(predefined_splits=False)
+        self.assertEqual(len(full_dataset), self.train_len + self.dev_len + self.test_len)
 
-        flair_corpus = ddt.load_with_flair()
+    def test_ddt_dataset_with_flair(self):
+        flair_corpus = self.ddt.load_with_flair()
 
         self.assertIsInstance(flair_corpus, ColumnCorpus)
 
         flair_lens = [len(flair_corpus.train), len(flair_corpus.dev), len(flair_corpus.test)]
-        self.assertEqual(flair_lens, [train_len, dev_len, test_len])
+        self.assertEqual(flair_lens, [self.train_len, self.dev_len, self.test_len])
 
         ner_tags = flair_corpus.make_tag_dictionary('ner').idx2item
         asserted_ner_tags = [
@@ -38,6 +45,11 @@ def test_ddt_dataset(self):
         ]
         self.assertCountEqual(ner_tags, asserted_ner_tags)
 
+    def test_ddt_dataset_with_spacy(self):
+        ddt = DDT()  # Load dataset
+        corpus = ddt.load_with_spacy()
+        self.assertIsInstance(corpus, GoldCorpus)
+
     def test_wikiann_dataset(self):
         # Change to a sample of the full wikiann to ease test computation
         DATASETS['wikiann']['url'] = "https://danlp.s3.eu-central-1.amazonaws.com/test-datasets/da.tar.gz"
@@ -60,4 +72,3 @@ def test_wikiann_dataset(self):
 
         shutil.rmtree(wikiann.dataset_dir)
-
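
The new test case can also be run in isolation with the standard unittest runner; a sketch, assuming the repository root is on the Python path:

import unittest
from tests.test_datasets import TestNerDatasets

# Build a suite containing only the new spaCy loader test and run it
suite = unittest.TestSuite([TestNerDatasets('test_ddt_dataset_with_spacy')])
unittest.TextTestRunner(verbosity=2).run(suite)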

