
Commit

Use smaller models for the tests
hvingelby committed Sep 5, 2019
1 parent 8c9685d commit 10501b2
Showing 3 changed files with 81 additions and 22 deletions.
2 changes: 1 addition & 1 deletion docs/datasets.md
@@ -5,7 +5,7 @@ This section keeps a list of Danish NLP datasets publicly available.
| Dataset | Task | Annotated | Size | Author/Org | License |
|---------|------|------|--------|---------|---------|
| [WordSim-353-da](https://github.com/fnielsen/dasem/tree/master/dasem/data/wordsim353-da) | Word Similarity | :heavy_check_mark: | 353 words | Finn Årup Nielsen, Original English Data belongs to [Evgeniy Gabrilovich](<http://www.cs.technion.ac.il/~gabr/resources/data/wordsim353/>) | [Creative Commons Attribution 4.0 International License](http://creativecommons.org/licenses/by/4.0/). (For the original English data) |
| [Danish UD treebank](<https://github.com/UniversalDependencies/UD_Danish-DDT/tree/master>) | Part of speech tags | :heavy_check_mark: | 100k words | Annotations: PAROLE-DK project by the Danish Society for Language and Literature, Code: [github contributors ](<https://github.com/UniversalDependencies/UD_Danish-DDT/graphs/contributors>) | GNU GPL |
| [Danish Dependency Treebank](<https://github.com/UniversalDependencies/UD_Danish-DDT/tree/master>) | Part of speech tags | :heavy_check_mark: | 100k words | Annotations: PAROLE-DK project by the Danish Society for Language and Literature, Code: [github contributors ](<https://github.com/UniversalDependencies/UD_Danish-DDT/graphs/contributors>) | GNU GPL |
| [EuroParl](<http://opus.nlpl.eu/Europarl.php>) | Plain text/ Embeddings | :x: | 0.3 GB | [Statistical Machine Translation](<http://www.statmt.org/europarl/>) and [OPUS](<http://opus.nlpl.eu/>) with the paper by J. Tiedemann, 2012, [*Parallel Data, Tools and Interfaces in OPUS.*](http://www.lrec-conf.org/proceedings/lrec2012/pdf/463_Paper.pdf) | From [Statistical Machine Translation](<http://www.statmt.org/europarl/>): "Not aware of any copyright restrictions of the material" |
| [OpenSubtitles2018](<http://opus.nlpl.eu/OpenSubtitles2018.php>) | Plain text/ Embeddings | :x: | 0.8GB | [Open Subtitles](<https://www.opensubtitles.org/da>) and [OPUS](<http://opus.nlpl.eu/OpenSubtitles2018.php>) with the paper by P. Lison and J. Tiedemann, 2016, [*OpenSubtitles2016: Extracting Large Parallel Corpora from Movie and TV Subtitles.*](http://stp.lingfil.uu.se/~joerg/paper/opensubs2016.pdf) | NONE. But please link to [Open Subtitles](<https://www.opensubtitles.org/da>) and cite the paper |
| [Wikipedia](<https://dumps.wikimedia.org/dawiki/latest/>) | Plain text/ Embeddings | :x: | 0.3GB | [Wikipedia Dumps](<https://dumps.wikimedia.org>) | NONE |
32 changes: 32 additions & 0 deletions tests/README.md
@@ -0,0 +1,32 @@
Tests in DaNLP
==============
In order to make the CI more efficient, some of the models have been shrunk for use in the tests.
Currently, both the static word embeddings and the subword embeddings (fastText) have been made smaller.

## Smaller static word embeddings
To shrink the static word embeddings, the vocabulary has simply been reduced to the 5000 most frequent
words. This has been done for the `wiki.da.wv` embeddings using the following code, which is inspired
by an answer on [Stack Overflow](https://stackoverflow.com/a/53899885).

```python
import numpy as np

# wv is a gensim KeyedVectors object, e.g. loaded with load_wv_with_gensim('wiki.da.wv')
words_to_trim = wv.index2word[5000:]
ids_to_trim = [wv.vocab[w].index for w in words_to_trim]

# Remove the trimmed words from the vocabulary
for w in words_to_trim:
    del wv.vocab[w]

# Drop the corresponding rows from the vector matrix and re-normalise
wv.vectors = np.delete(wv.vectors, ids_to_trim, axis=0)
wv.init_sims(replace=True)

# Keep index2word consistent with the reduced vocabulary
for i in sorted(ids_to_trim, reverse=True):
    del wv.index2word[i]
```
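
After trimming, a quick sanity check and save step might look roughly like this (a sketch against gensim's 3.x `KeyedVectors` API; the output filename is only illustrative):

```python
# Exactly the 5000 most frequent words should remain.
assert len(wv.vocab) == 5000
assert wv.vectors.shape[0] == 5000

# Write the reduced vectors in word2vec binary format for packaging.
wv.save_word2vec_format('wiki.da.small.bin', binary=True)
```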

## Smaller subword embeddings
To make the subword embeddings smaller, we have trained a new fastText model on the training part of the [Danish Dependency Treebank](<https://github.com/UniversalDependencies/UD_Danish-DDT/tree/master>) dataset.
The training has been done with the official [fastText implementation](https://github.com/facebookresearch/fastText/)
using the following command.

```bash
./fasttext skipgram -input ddt_train.txt -output ddt.swv -cutoff 5000
```
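
To sanity-check the resulting model, the `.bin` file written by fastText can be loaded with gensim. This is only a sketch, assuming gensim ≥ 3.8 for `load_facebook_vectors`; it mirrors the out-of-vocabulary check used in the test suite below.

```python
from gensim.models.fasttext import load_facebook_vectors

# fastText writes ddt.swv.bin (full model) and ddt.swv.vec (text vectors)
swv = load_facebook_vectors('ddt.swv.bin')

# 'institutmedarbejdskontrakt' is not in the vocabulary, but subword units still yield a vector
assert 'institutmedarbejdskontrakt' not in swv.vocab
print(swv['institutmedarbejdskontrakt'].shape)  # (100,) with fastText's default dimensionality
```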
69 changes: 48 additions & 21 deletions tests/test_embeddings.py
@@ -2,14 +2,32 @@

from gensim.models.keyedvectors import FastTextKeyedVectors

from danlp.models.embeddings import load_wv_with_spacy, load_wv_with_gensim, load_context_embeddings_with_flair
from danlp.download import MODELS, download_model, _unzip_process_func
from danlp.models.embeddings import load_wv_with_spacy, load_wv_with_gensim, load_context_embeddings_with_flair, \
    AVAILABLE_EMBEDDINGS, AVAILABLE_SUBWORD_EMBEDDINGS


class TestEmbeddings(unittest.TestCase):

    def setUp(self):
        # First we add the smaller test embeddings to the MODELS registry
        MODELS['wiki.da.small.wv'] = {
            'url': 'https://danlp.s3.eu-central-1.amazonaws.com/test-models/wiki.da.small.zip',
            'vocab_size': 5000,
            'dimensions': 300,
            'md5_checksum': 'fcaa981a613b325ae4dc61aba235aa82',
            'size': 5594508,
            'file_extension': '.bin'
        }

        AVAILABLE_EMBEDDINGS.append('wiki.da.small.wv')

        # Let's download the model and unzip it
        download_model('wiki.da.small.wv', process_func=_unzip_process_func)

    def test_embeddings_with_spacy(self):
        with self.assertRaises(ValueError):
            load_wv_with_spacy("wiki.da.swv")
            load_wv_with_spacy("wiki.da.small.swv")

        embeddings = load_wv_with_spacy("wiki.da.wv")

@@ -18,11 +18,11 @@ def test_embeddings_with_spacy(self):
            self.assertTrue(token.has_vector)

    def test_embeddings_with_gensim(self):
        embeddings = load_wv_with_gensim('connl.da.wv')
        embeddings = load_wv_with_gensim('wiki.da.small.wv')

        most_similar = embeddings.most_similar(positive=['københavn', 'england'], negative=['danmark'], topn=1)

        self.assertEqual(most_similar[0], ('london', 0.7156291604042053))
        self.assertEqual(most_similar[0], ('london', 0.5180857181549072))

    def test_embeddings_with_flair(self):
        from flair.data import Sentence
@@ -39,22 +39,31 @@ def test_embeddings_with_flair(self):
        self.assertEqual(len(sentence1[2].embedding), 2364)
        self.assertEqual(len(sentence2[4].embedding), 2364)

        # Show that the embeddings differ: only 52 of the 2364 entries coincide
        self.assertEqual(int(sum(sentence2[4].embedding == sentence1[2].embedding)), 52)

    ####################################################################################
    # Commented out as this test requires too much memory for the instances on Travis CI
    #
    # def test_fasttext_embeddings(self):
    #     fasttext_embeddings = load_wv_with_gensim('wiki.da.swv')
    #
    #     self.assertEqual(type(fasttext_embeddings), FastTextKeyedVectors)
    #
    #     # The word is not in the vocab
    #     self.assertNotIn('institutmedarbejdskontrakt', fasttext_embeddings.vocab)
    #
    #     # However we can get an embedding because of subword units
    #     self.assertEqual(fasttext_embeddings['institutmedarbejdskontrakt'].size, 300)
    def test_fasttext_embeddings(self):
        # First we add the smaller subword test embeddings to the MODELS registry
        MODELS['ddt.swv'] = {
            'url': 'https://danlp.s3.eu-central-1.amazonaws.com/test-models/ddt.swv.zip',
            'vocab_size': 5000,
            'dimensions': 100,
            'md5_checksum': 'c50c61e1b434908e2732c80660abf8bf',
            'size': 741125088,
            'file_extension': '.bin'
        }

        AVAILABLE_SUBWORD_EMBEDDINGS.append('ddt.swv')

        download_model('ddt.swv', process_func=_unzip_process_func)

        fasttext_embeddings = load_wv_with_gensim('ddt.swv')

        self.assertEqual(type(fasttext_embeddings), FastTextKeyedVectors)

        # The word is not in the vocab
        self.assertNotIn('institutmedarbejdskontrakt', fasttext_embeddings.vocab)

        # However, we can get an embedding because of subword units
        self.assertEqual(fasttext_embeddings['institutmedarbejdskontrakt'].size, 100)


if __name__ == '__main__':
    unittest.main()
    unittest.main()
