-
Notifications
You must be signed in to change notification settings - Fork 33
/
test_datasets.py
74 lines (52 loc) · 2.47 KB
/
test_datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import shutil
import unittest
from flair.datasets import ColumnCorpus
from pyconll.unit import Conll
from spacy.gold import GoldCorpus
from danlp.datasets import DDT, WikiAnn, DATASETS
class TestNerDatasets(unittest.TestCase):
    """Tests for loading the DDT and WikiAnn datasets through danlp.

    The tests check the type and size of the objects returned by the
    conllu, flair and spaCy loaders, and the NER tag dictionary that
    flair builds from the loaded corpora.
    """

    # Items expected in the flair 'ner' tag dictionary (idx2item holds bytes).
    EXPECTED_NER_TAGS = (
        b'B-ORG', b'B-PER', b'B-LOC',
        b'I-ORG', b'I-PER', b'I-LOC',
        b'O', b'<START>', b'<STOP>', b'<unk>',
    )

    def setUp(self):
        """Record the expected split sizes and load the DDT dataset."""
        self.train_len = 4383
        self.dev_len = 564
        self.test_len = 565

        self.ddt = DDT()  # Load dataset

    def _assert_ner_tags(self, corpus):
        """Assert that the 'ner' tag dictionary of *corpus* holds exactly the expected tags."""
        ner_tags = corpus.make_tag_dictionary('ner').idx2item
        self.assertCountEqual(ner_tags, self.EXPECTED_NER_TAGS)

    def test_ddt_dataset(self):
        """The conllu loader returns Conll objects of the expected sizes."""
        train, dev, test = self.ddt.load_as_conllu(predefined_splits=True)

        for split in (train, dev, test):
            self.assertIsInstance(split, Conll)

        self.assertEqual(
            [len(train), len(dev), len(test)],
            [self.train_len, self.dev_len, self.test_len],
        )

        # Without predefined splits the loader returns everything as one object.
        full_dataset = self.ddt.load_as_conllu(predefined_splits=False)
        self.assertEqual(
            len(full_dataset),
            self.train_len + self.dev_len + self.test_len,
        )

    def test_ddt_dataset_with_flair(self):
        """The flair loader returns a ColumnCorpus with the expected sizes and tags."""
        flair_corpus = self.ddt.load_with_flair()
        self.assertIsInstance(flair_corpus, ColumnCorpus)

        flair_lens = [
            len(flair_corpus.train),
            len(flair_corpus.dev),
            len(flair_corpus.test),
        ]
        self.assertEqual(
            flair_lens,
            [self.train_len, self.dev_len, self.test_len],
        )

        self._assert_ner_tags(flair_corpus)

    def test_ddt_dataset_with_spacy(self):
        """The spaCy loader returns a GoldCorpus."""
        # Reuse the dataset loaded in setUp instead of constructing a second DDT.
        corpus = self.ddt.load_with_spacy()
        self.assertIsInstance(corpus, GoldCorpus)

    def test_wikiann_dataset(self):
        """A small WikiAnn sample loads through flair with the expected sizes and tags."""
        wikiann_entry = DATASETS['wikiann']
        original_entry = dict(wikiann_entry)

        def restore_entry():
            # Put the shared registry entry back exactly as it was, so the
            # sample settings below do not leak into other tests.
            wikiann_entry.clear()
            wikiann_entry.update(original_entry)

        self.addCleanup(restore_entry)

        # Change to a sample of the full wikiann to ease test computation
        wikiann_entry['url'] = "https://danlp.s3.eu-central-1.amazonaws.com/test-datasets/da.tar.gz"
        wikiann_entry['size'] = 2502
        wikiann_entry['md5_checksum'] = 'd0271de38ae23f215b5117450efb9ace'

        wikiann = WikiAnn()
        # Registered before any assertion so the sample data is removed even
        # when the test fails; dataset_dir is read lazily at cleanup time.
        self.addCleanup(lambda: shutil.rmtree(wikiann.dataset_dir))

        corpus = wikiann.load_ner_with_flair()

        self.assertEqual([len(corpus.train), len(corpus.test)], [21, 3])

        self._assert_ner_tags(corpus)