
Commit

Use smaller models for the tests
hvingelby committed Sep 5, 2019
1 parent 8c9685d commit 10501b2
Showing 3 changed files with 81 additions and 22 deletions.
2 changes: 1 addition & 1 deletion docs/datasets.md
@@ -5,7 +5,7 @@ This section keeps a list of Danish NLP datasets publicly available.
| Dataset | Task | Annotated | Size | Author/Org | License |
|---------|------|------|--------|---------|---------|
| [WordSim-353-da](https://github.com/fnielsen/dasem/tree/master/dasem/data/wordsim353-da) | Word Similarity | :heavy_check_mark: | 353 words | Finn Årup Nielsen, Original English Data belongs to [Evgeniy Gabrilovich](<http://www.cs.technion.ac.il/~gabr/resources/data/wordsim353/>) | [Creative Commons Attribution 4.0 International License](http://creativecommons.org/licenses/by/4.0/). (For the original English data) |
| [Danish UD treebank](<https://github.com/UniversalDependencies/UD_Danish-DDT/tree/master>) | Part of speech tags | :heavy_check_mark: | 100k words | Annotations: PAROLE-DK project by the Danish Society for Language and Literature, Code: [github contributors ](<https://github.com/UniversalDependencies/UD_Danish-DDT/graphs/contributors>) | GNU GPL |
| [Danish Dependency Treebank](<https://github.com/UniversalDependencies/UD_Danish-DDT/tree/master>) | Part of speech tags | :heavy_check_mark: | 100k words | Annotations: PAROLE-DK project by the Danish Society for Language and Literature, Code: [github contributors ](<https://github.com/UniversalDependencies/UD_Danish-DDT/graphs/contributors>) | GNU GPL |
| [EuroParl](<http://opus.nlpl.eu/Europarl.php>) | Plain text/ Embeddings | :x: | 0.3 GB | [Statistical Machine Translation](<http://www.statmt.org/europarl/>) and [OPUS](<http://opus.nlpl.eu/>) with the paper by J. Tiedemann, 2012, [*Parallel Data, Tools and Interfaces in OPUS.*](http://www.lrec-conf.org/proceedings/lrec2012/pdf/463_Paper.pdf) | From [Statistical Machine Translation](<http://www.statmt.org/europarl/>): "Not aware of any copyright restrictions of the material" |
| [OpenSubtitles2018](<http://opus.nlpl.eu/OpenSubtitles2018.php>) | Plain text/ Embeddings | :x: | 0.8GB | [Open Subtitles](<https://www.opensubtitles.org/da>) and [OPUS](<http://opus.nlpl.eu/OpenSubtitles2018.php>) with the paper by P. Lison and J. Tiedemann, 2016, [*OpenSubtitles2016: Extracting Large Parallel Corpora from Movie and TV Subtitles.*](http://stp.lingfil.uu.se/~joerg/paper/opensubs2016.pdf) | NONE. But please link to [Open Subtitles](<https://www.opensubtitles.org/da>) and cite the paper |
| [Wikipedia](<https://dumps.wikimedia.org/dawiki/latest/>) | Plain text/ Embeddings | :x: | 0.3GB | [Wikipedia Dumps](<https://dumps.wikimedia.org>) | NONE |
32 changes: 32 additions & 0 deletions tests/README.md
@@ -0,0 +1,32 @@
Tests in DaNLP
==============
In order to make the CI more efficient, some of the models have been shrunk for use in the tests.
Currently, both the static word embeddings and the subword embeddings (fastText) have been made smaller.

## Smaller static word embeddings
To shrink the static word embeddings, the vocabulary has simply been reduced to the 5000 most frequent
words. This has been done for the `wiki.da.wv` embeddings using the following code, which is inspired
by an answer on [Stack Overflow](https://stackoverflow.com/a/53899885).

```python
import numpy as np

# wv is a gensim KeyedVectors object, e.g. loaded with load_wv_with_gensim('wiki.da.wv')
words_to_trim = wv.index2word[5000:]
ids_to_trim = [wv.vocab[w].index for w in words_to_trim]

# Remove the trimmed words from the vocabulary
for w in words_to_trim:
    del wv.vocab[w]

# Drop the corresponding rows from the vector matrix and re-normalise
wv.vectors = np.delete(wv.vectors, ids_to_trim, axis=0)
wv.init_sims(replace=True)

# Keep index2word consistent with the reduced vocabulary
for i in sorted(ids_to_trim, reverse=True):
    del wv.index2word[i]
```
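
After trimming, a quick sanity check and save step might look roughly like this (a sketch against gensim's 3.x `KeyedVectors` API; the output filename is only illustrative):

```python
# Exactly the 5000 most frequent words should remain.
assert len(wv.vocab) == 5000
assert wv.vectors.shape[0] == 5000

# Write the reduced vectors in word2vec binary format for packaging.
wv.save_word2vec_format('wiki.da.small.bin', binary=True)
```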

## Smaller subword embeddings
To make the subword embeddings smaller, we have trained a new fastText model on the training part of the [Danish Dependency Treebank](<https://github.com/UniversalDependencies/UD_Danish-DDT/tree/master>) dataset.
The training has been done with the official [fastText implementation](https://github.com/facebookresearch/fastText/)
using the following command.

```bash
./fasttext skipgram -input ddt_train.txt -output ddt.swv -cutoff 5000
```
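
To sanity-check the resulting model, the `.bin` file written by fastText can be loaded with gensim. This is only a sketch, assuming gensim ≥ 3.8 for `load_facebook_vectors`; it mirrors the out-of-vocabulary check used in the test suite below.

```python
from gensim.models.fasttext import load_facebook_vectors

# fastText writes ddt.swv.bin (full model) and ddt.swv.vec (text vectors)
swv = load_facebook_vectors('ddt.swv.bin')

# 'institutmedarbejdskontrakt' is not in the vocabulary, but subword units still yield a vector
assert 'institutmedarbejdskontrakt' not in swv.vocab
print(swv['institutmedarbejdskontrakt'].shape)  # (100,) with fastText's default dimensionality
```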
69 changes: 48 additions & 21 deletions tests/test_embeddings.py
@@ -2,14 +2,32 @@

from gensim.models.keyedvectors import FastTextKeyedVectors

from danlp.models.embeddings import load_wv_with_spacy, load_wv_with_gensim, load_context_embeddings_with_flair
from danlp.download import MODELS, download_model, _unzip_process_func
from danlp.models.embeddings import load_wv_with_spacy, load_wv_with_gensim, load_context_embeddings_with_flair, \
    AVAILABLE_EMBEDDINGS, AVAILABLE_SUBWORD_EMBEDDINGS


class TestEmbeddings(unittest.TestCase):

    def setUp(self):
        # First we add the smaller test embeddings to the MODELS registry
        MODELS['wiki.da.small.wv'] = {
            'url': 'https://danlp.s3.eu-central-1.amazonaws.com/test-models/wiki.da.small.zip',
            'vocab_size': 5000,
            'dimensions': 300,
            'md5_checksum': 'fcaa981a613b325ae4dc61aba235aa82',
            'size': 5594508,
            'file_extension': '.bin'
        }

        AVAILABLE_EMBEDDINGS.append('wiki.da.small.wv')

        # Let's download the model and unzip it
        download_model('wiki.da.small.wv', process_func=_unzip_process_func)

    def test_embeddings_with_spacy(self):
        with self.assertRaises(ValueError):
            load_wv_with_spacy("wiki.da.swv")
            load_wv_with_spacy("wiki.da.small.swv")

        embeddings = load_wv_with_spacy("wiki.da.wv")

@@ -18,11 +18,11 @@ def test_embeddings_with_spacy(self):
            self.assertTrue(token.has_vector)

    def test_embeddings_with_gensim(self):
        embeddings = load_wv_with_gensim('connl.da.wv')
        embeddings = load_wv_with_gensim('wiki.da.small.wv')

        most_similar = embeddings.most_similar(positive=['københavn', 'england'], negative=['danmark'], topn=1)

        self.assertEqual(most_similar[0], ('london', 0.7156291604042053))
        self.assertEqual(most_similar[0], ('london', 0.5180857181549072))

    def test_embeddings_with_flair(self):
        from flair.data import Sentence
@@ -39,22 +39,31 @@ def test_embeddings_with_flair(self):
        self.assertEqual(len(sentence1[2].embedding), 2364)
        self.assertEqual(len(sentence2[4].embedding), 2364)

        # Show that the embeddings differ: only 52 of the 2364 entries coincide
        self.assertEqual(int(sum(sentence2[4].embedding == sentence1[2].embedding)), 52)

    ####################################################################################
    # Commented out as this test requires too much memory for the instances on Travis CI
    #
    # def test_fasttext_embeddings(self):
    #     fasttext_embeddings = load_wv_with_gensim('wiki.da.swv')
    #
    #     self.assertEqual(type(fasttext_embeddings), FastTextKeyedVectors)
    #
    #     # The word is not in the vocab
    #     self.assertNotIn('institutmedarbejdskontrakt', fasttext_embeddings.vocab)
    #
    #     # However we can get an embedding because of subword units
    #     self.assertEqual(fasttext_embeddings['institutmedarbejdskontrakt'].size, 300)
    def test_fasttext_embeddings(self):
        # First we add the smaller subword test embeddings to the MODELS registry
        MODELS['ddt.swv'] = {
            'url': 'https://danlp.s3.eu-central-1.amazonaws.com/test-models/ddt.swv.zip',
            'vocab_size': 5000,
            'dimensions': 100,
            'md5_checksum': 'c50c61e1b434908e2732c80660abf8bf',
            'size': 741125088,
            'file_extension': '.bin'
        }

        AVAILABLE_SUBWORD_EMBEDDINGS.append('ddt.swv')

        download_model('ddt.swv', process_func=_unzip_process_func)

        fasttext_embeddings = load_wv_with_gensim('ddt.swv')

        self.assertEqual(type(fasttext_embeddings), FastTextKeyedVectors)

        # The word is not in the vocab
        self.assertNotIn('institutmedarbejdskontrakt', fasttext_embeddings.vocab)

        # However, we can get an embedding because of subword units
        self.assertEqual(fasttext_embeddings['institutmedarbejdskontrakt'].size, 100)


if __name__ == '__main__':
    unittest.main()
    unittest.main()
