diff --git a/danlp/datasets/ddt.py b/danlp/datasets/ddt.py
index 3f2aff9..33991d7 100644
--- a/danlp/datasets/ddt.py
+++ b/danlp/datasets/ddt.py
@@ -1,5 +1,7 @@
 import os
+
+
 
 from danlp.download import DEFAULT_CACHE_DIR, download_dataset, _unzip_process_func, DATASETS
 
 
@@ -84,3 +86,36 @@ def load_with_flair(self, predefined_splits: bool = False):
                 token.tags['ner'].value = token.tags['ner'].value.split("=")[1].replace("|SpaceAfter", "")
 
         return corpus
+
+    def load_with_spacy(self):
+        """
+        Converts the conllu files to json in the spaCy format.
+
+        Not using jsonl because of:
+        https://github.com/explosion/spaCy/issues/3523
+        :return:
+        """
+        import srsly
+        from spacy.cli.converters import conllu2json
+        from spacy.gold import GoldCorpus
+        from spacy.gold import Path
+
+        for part in ['train', 'dev', 'test']:
+            conll_path = os.path.join(self.dataset_dir, '{}.{}{}'.format(self.dataset_name, part, self.file_extension))
+            json_path = os.path.join(self.dataset_dir, "ddt.{}.json".format(part))
+
+            if not os.path.isfile(json_path):  # Convert the conllu files to json
+                with open(conll_path, 'r') as file:
+                    file_as_string = file.read()
+                file_as_string = file_as_string.replace("name=", "").replace("|SpaceAfter", "")
+                file_as_json = conllu2json(file_as_string)
+
+                srsly.write_json(json_path, file_as_json)
+
+        train_json_path = os.path.join(self.dataset_dir, "ddt.train.json")
+        dev_json_path = os.path.join(self.dataset_dir, "ddt.dev.json")
+
+        assert os.path.isfile(train_json_path)
+        assert os.path.isfile(dev_json_path)
+
+        return GoldCorpus(Path(train_json_path), Path(dev_json_path))
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index dff2f1c..7948427 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -1,34 +1,41 @@
 import shutil
 import unittest
 
+from flair.datasets import ColumnCorpus
 from pyconll.unit import Conll
+from spacy.gold import GoldCorpus
 
 from danlp.datasets import DDT, WikiAnn, DATASETS
 
 
 class TestNerDatasets(unittest.TestCase):
 
-    def test_ddt_dataset(self):
-        train_len = 4383
-        dev_len = 564
-        test_len = 565
+    def setUp(self):
+        self.train_len = 4383
+        self.dev_len = 564
+        self.test_len = 565
 
-        ddt = DDT()  # Load dataset
+        self.ddt = DDT()  # Load dataset
 
-        train, dev, test = ddt.load_as_conllu(predefined_splits=True)
+    def test_ddt_dataset(self):
+        train, dev, test = self.ddt.load_as_conllu(predefined_splits=True)
 
         self.assertIsInstance(train, Conll)
         self.assertIsInstance(dev, Conll)
         self.assertIsInstance(test, Conll)
 
-        self.assertEqual([len(train), len(dev), len(test)], [train_len, dev_len, test_len])
+        self.assertEqual([len(train), len(dev), len(test)], [self.train_len, self.dev_len, self.test_len])
+
+        full_dataset = self.ddt.load_as_conllu(predefined_splits=False)
+        self.assertEqual(len(full_dataset), self.train_len + self.dev_len + self.test_len)
 
-        full_dataset = ddt.load_as_conllu(predefined_splits=False)
-        self.assertEqual(len(full_dataset), train_len+dev_len+test_len)
+    def test_ddt_dataset_with_flair(self):
+        flair_corpus = self.ddt.load_with_flair()
+
+        self.assertIsInstance(flair_corpus, ColumnCorpus)
 
-        flair_corpus = ddt.load_with_flair()
         flair_lens = [len(flair_corpus.train), len(flair_corpus.dev), len(flair_corpus.test)]
-        self.assertEqual(flair_lens, [train_len, dev_len, test_len])
+        self.assertEqual(flair_lens, [self.train_len, self.dev_len, self.test_len])
 
         ner_tags = flair_corpus.make_tag_dictionary('ner').idx2item
         asserted_ner_tags = [
@@ -38,6 +45,11 @@ def test_ddt_dataset(self):
         ]
         self.assertCountEqual(ner_tags, asserted_ner_tags)
 
+    def test_ddt_dataset_with_spacy(self):
+        ddt = DDT()  # Load dataset
+        corpus = ddt.load_with_spacy()
+        self.assertIsInstance(corpus, GoldCorpus)
+
     def test_wikiann_dataset(self):
         # Change to a sample of the full wikiann to ease test computation
         DATASETS['wikiann']['url'] = "https://danlp.s3.eu-central-1.amazonaws.com/test-datasets/da.tar.gz"
@@ -60,4 +72,3 @@ def test_wikiann_dataset(self):
 
         shutil.rmtree(wikiann.dataset_dir)
 
-
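
Usage sketch (not part of the patch): the new DDT.load_with_spacy() returns a spacy.gold.GoldCorpus, so under spaCy 2.x it can feed a training loop directly. This is a minimal, hypothetical example assuming danlp and a spaCy 2.x install where GoldCorpus.train_docs(nlp) is available; names like n_train are illustrative only.

    from danlp.datasets import DDT
    import spacy

    ddt = DDT()                     # downloads/caches the Danish Dependency Treebank if needed
    corpus = ddt.load_with_spacy()  # converts conllu -> spaCy JSON and wraps it in a GoldCorpus

    nlp = spacy.blank("da")
    # In spaCy 2.x, GoldCorpus.train_docs(nlp) yields (Doc, GoldParse) pairs
    n_train = sum(1 for _ in corpus.train_docs(nlp))
    print("training documents:", n_train)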