Skip to content

Commit

Permalink
Merge 0584ca9 into bf48772
Browse files Browse the repository at this point in the history
  • Loading branch information
ophelielacroix committed Dec 7, 2020
2 parents bf48772 + 0584ca9 commit 37e80eb
Show file tree
Hide file tree
Showing 5 changed files with 326 additions and 23 deletions.
3 changes: 2 additions & 1 deletion danlp/datasets/__init__.py
Expand Up @@ -2,4 +2,5 @@
from .wiki_ann import *
from .word_sim import *
from .sentiment import *
from .dacoref import *
from .dacoref import *
from .dannet import *
214 changes: 214 additions & 0 deletions danlp/datasets/dannet.py
@@ -0,0 +1,214 @@
import os
import pandas as pd
import json

from danlp.download import DEFAULT_CACHE_DIR, download_dataset, _unzip_process_func, DATASETS


class DanNet():
    """
    DanNet wrapper, providing functions to access the main features of DanNet.
    See also: https://cst.ku.dk/projekter/dannet/.

    DanNet consists of a set of 4 databases:

        * words
        * word senses
        * relations
        * synsets

    :param str cache_dir: the directory for storing cached models
    :param bool verbose: `True` to increase verbosity
    """

    def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):

        self.dataset_name = 'dannet'
        self.file_extension = DATASETS[self.dataset_name]['file_extension']

        self.dataset_dir = download_dataset(self.dataset_name, process_func=_unzip_process_func, cache_dir=cache_dir)

        # The four '@'-separated CSV databases. Ids are loaded as strings so
        # they can be matched across dataframes without int/str mismatches.
        self.words = pd.read_csv(os.path.join(self.dataset_dir, "words.csv"),
                                 sep='@',
                                 names=['word_id', 'form', 'pos', 'nan'],
                                 encoding='unicode_escape',
                                 usecols=[0, 1, 2],
                                 dtype={'word_id': str})
        self.wordsenses = pd.read_csv(os.path.join(self.dataset_dir, "wordsenses.csv"),
                                      sep='@',
                                      names=['wordsense_id', 'word_id', 'synset_id', 'register', 'nan'],
                                      encoding='unicode_escape',
                                      usecols=[1, 2],
                                      dtype={'wordsense_id': str, 'word_id': str, 'synset_id': str})
        self.relations = pd.read_csv(os.path.join(self.dataset_dir, "relations.csv"),
                                     sep='@',
                                     names=['synset_id', 'wordnetowl', 'relation', 'value', 'taxonomic', 'inheritance_comment', 'nan'],
                                     encoding='unicode_escape',
                                     usecols=[0, 1, 2, 3, 4, 5],
                                     dtype={'synset_id': str, 'value': str})
        self.synsets = pd.read_csv(os.path.join(self.dataset_dir, "synsets.csv"),
                                   sep='@',
                                   names=['synset_id', 'label', 'gloss', 'ontological_type'],
                                   encoding='unicode_escape',
                                   usecols=[0, 1, 2, 3],
                                   dtype={'synset_id': str})

    def load_with_pandas(self):
        """
        Loads the datasets in 4 dataframes

        :return: 4 dataframes: words, wordsenses, relations, synsets
        """
        return self.words, self.wordsenses, self.relations, self.synsets

    def synonyms(self, word, pos=None):
        """
        Returns the synonyms of `word`, i.e. the other words belonging to the
        same synset(s) as `word`.

        :param word: text
        :param pos: (list of) part of speech tag(s) (in "Noun", "Verb", "Adjective")
        :return: list of synonyms

        :Example:

            "`hav`"
            returns
            ["sø", "ocean"]
        """
        word_ids = self._word_ids(word, pos)
        synset_ids = self._synset_ids(word, pos)
        # Words sharing a synset with `word`, excluding `word` itself.
        synonym_ids = self.wordsenses[self.wordsenses['synset_id'].isin(synset_ids) & ~self.wordsenses['word_id'].isin(word_ids)]['word_id'].tolist()
        synonyms = self.words[self.words['word_id'].isin(synonym_ids)]['form'].tolist()
        return synonyms

    def meanings(self, word, pos=None):
        """
        Returns the meanings (glosses) of `word`.

        :param word: text
        :param pos: (list of) part of speech tag(s) (in "Noun", "Verb", "Adjective")
        :return: list of meanings
        """
        synset_ids = self._synset_ids(word, pos)
        meanings = self.synsets[self.synsets['synset_id'].isin(synset_ids)]['gloss'].tolist()

        return meanings

    def hypernyms(self, word, pos=None):
        """
        Returns the hypernyms of `word` (more general concepts).

        :param word: text
        :param pos: (list of) part of speech tag(s) (in "Noun", "Verb", "Adjective")
        :return: list of hypernyms
        """
        # Fix: forward `pos` so the POS filter is actually applied
        # (previously ignored, unlike in `hyponyms`).
        word_synset_ids = self._synset_ids(word, pos)
        # A hypernym link is stored either as `has_hyperonym` from the word's
        # synset, or as the inverse `has_hyponym` pointing to it.
        hyper_synset_ids = self.relations[self.relations['synset_id'].isin(word_synset_ids) & (self.relations['relation'] == 'has_hyperonym')]['value'].tolist()
        hyper_synset_ids += self.relations[self.relations['value'].isin(word_synset_ids) & (self.relations['relation'] == 'has_hyponym')]['synset_id'].tolist()
        # Some `value` entries are not numeric synset ids; drop them.
        hyper_synset_ids = [val for val in hyper_synset_ids if val.isdigit()]
        hypernym_ids = self.wordsenses[self.wordsenses['synset_id'].isin(hyper_synset_ids)]['word_id'].tolist()
        hypernyms = self.words[self.words['word_id'].isin(hypernym_ids)]['form'].tolist()

        return hypernyms

    def hyponyms(self, word, pos=None):
        """
        Returns the hyponyms of `word` (more specific concepts).

        :param word: text
        :param pos: (list of) part of speech tag(s) (in "Noun", "Verb", "Adjective")
        :return: list of hyponyms
        """
        word_synset_ids = self._synset_ids(word, pos)
        # A hyponym link is stored either as `has_hyponym` from the word's
        # synset, or as the inverse `has_hyperonym` pointing to it.
        hypo_synset_ids = self.relations[self.relations['synset_id'].isin(word_synset_ids) & (self.relations['relation'] == 'has_hyponym')]['value'].tolist()
        hypo_synset_ids += self.relations[self.relations['value'].isin(word_synset_ids) & (self.relations['relation'] == 'has_hyperonym')]['synset_id'].tolist()
        # Some `value` entries are not numeric synset ids; drop them.
        hypo_synset_ids = [val for val in hypo_synset_ids if val.isdigit()]
        hyponym_ids = self.wordsenses[self.wordsenses['synset_id'].isin(hypo_synset_ids)]['word_id'].tolist()
        hyponyms = self.words[self.words['word_id'].isin(hyponym_ids)]['form'].tolist()

        return hyponyms

    def wordnet_relations(self, word, pos=None, eurowordnet=True):
        """
        Returns the name of the relations `word` is associated with.

        :param word: text
        :param pos: (list of) part of speech tag(s) (in "Noun", "Verb", "Adjective")
        :param eurowordnet: `True` for EuroWordNet relation names, `False` for WordNet-OWL names
        :return: set of relation names
        """
        if eurowordnet:
            rel_name = "relation"
        else:
            rel_name = "wordnetowl"

        synset_ids = self._synset_ids(word, pos)
        relations = self.relations[self.relations['synset_id'].isin(synset_ids)][rel_name].tolist()

        return set(relations)

    def pos(self, word):
        """
        Returns the part-of-speech tags `word` can be categorized with among "Noun", "Verb" or "Adjective".

        :param word: text
        :return: list of part-of-speech tags
        """
        return list(self.words[self.words['form'] == word]['pos'].unique())

    def _word_ids(self, word, pos=None):
        # Word lookup is case-insensitive: forms are stored lowercased.
        pos = _get_pos_list(pos)
        word = word.lower()

        return self.words[(self.words['form'] == word) & self.words['pos'].isin(pos)]['word_id'].tolist()

    def _synset_ids(self, word, pos=None):
        # Ids of the synsets `word` belongs to (optionally filtered by POS).
        word_ids = self._word_ids(word, pos)
        return self.wordsenses[self.wordsenses['word_id'].isin(word_ids)]['synset_id'].tolist()

    def _word_from_id(self, word_id):
        """
        Returns the word form(s) for a (numeric) word id.

        :param word_id: int or digit-only str
        :return: list of word forms
        """
        # Fix: the str method is `isdigit()`, not `is_digit()` — the old code
        # raised AttributeError for any string input.
        assert isinstance(word_id, int) or (isinstance(word_id, str) and word_id.isdigit())
        word_id = str(word_id)

        return self.words[self.words['word_id'] == word_id]['form'].tolist()

    def _synset_from_id(self, synset_id):
        """
        Returns the set of word forms belonging to a (numeric) synset id.

        :param synset_id: int or digit-only str
        :return: set of word forms
        """
        # Fix: `isdigit()`, not `is_digit()` (see `_word_from_id`).
        assert isinstance(synset_id, int) or (isinstance(synset_id, str) and synset_id.isdigit())
        synset_id = str(synset_id)

        synset_labels = self.synsets[self.synsets['synset_id'] == synset_id]['label'].tolist()
        # Labels look like "{form_1; form_2; ...}" where each entry may carry a
        # "_sense" suffix; strip braces and suffixes to get the bare forms.
        return set([w.split('_')[0] for s in synset_labels for w in s[1:-1].split('; ')])

    def __str__(self):

        return "DanNet: {} word forms, {} lexemes, {} synsets".format(len(set(self.words['form'])), len(self.words['word_id']), len(set(self.wordsenses['synset_id'])))


def _get_pos_list(pos):
if pos == None:
return ['Noun', 'Verb', 'Adjective']
elif type(pos) == str:
return [pos]
assert(type(pos) == list)
return pos

12 changes: 9 additions & 3 deletions danlp/download.py
Expand Up @@ -194,15 +194,21 @@
'md5_checksum': '5e7dad9e6c8c32aa9dd17830bed5e0f6',
'size': 3489,
'file_extension': '.csv'
},
},
# coreference dataset
'dacoref': {
'url': 'http://danlp-downloads.alexandra.dk/datasets/dacoref.zip',
'md5_checksum': 'e6f2707f4f600a0d357dc7afa1b01f92',
'size': 1005278,
'file_extension': ''
},

},
# Danish Wordnet
'dannet': {
'url': DANLP_STORAGE_URL + '/datasets/dannet.zip',
'md5_checksum': 'a5aa388bb08487bd59d72257aa15d8fa',
'size': 6083044,
'file_extension': '.csv'
},

# SENTIMENT EVALUATION
'europarl.sentiment1': {
Expand Down
100 changes: 84 additions & 16 deletions docs/docs/datasets.md
Expand Up @@ -3,22 +3,23 @@ Datasets

This section keeps a list of Danish NLP datasets publicly available.

| Dataset | Task | Words | Sents | License | DaNLP |
| ------------------------------------------------------------ | ---------------------- | --------------- | ---------------------- | ------------------------------------------------------------ | ----- |
| [OpenSubtitles2018](<http://opus.nlpl.eu/OpenSubtitles2018.php>) | Translation | 206,700,000 | 30,178,452 | [None](http://opus.nlpl.eu/OpenSubtitles2018.php) ||
| [EU Bookshop](http://opus.nlpl.eu/EUbookshop-v2.php) | Translation | 208,175,843 | 8,650,537 | - ||
| [Europarl7](http://www.statmt.org/europarl/) | Translation | 47,761,381 | 2,323,099 | [None](http://www.statmt.org/europarl/) ||
| [ParaCrawl5](https://paracrawl.eu/) | Translation | - | - | [CC0](https://paracrawl.eu/releases.html) ||
| [WikiANN](#wikiann) | NER | 832.901 | 95.924 | [ODC-BY 1.0](http://nlp.cs.rpi.edu/wikiann/) | ✔️ |
| [UD-DDT (DaNE)](#dane) | DEP, POS, NER | 100,733 | 5,512 | [CC BY-SA 4.0](https://github.com/UniversalDependencies/UD_Danish-DDT/blob/master/README.md) | ✔️ |
| [LCC Sentiment](#lcc-sentiment) | Sentiment | 10.588 | 499 | [CC BY](https://github.com/fnielsen/lcc-sentiment/blob/master/LICENSE) | ✔️ |
| [Europarl Sentiment1](#europarl-sentiment1) | Sentiment | 3.359 | 184 | None | ✔️ |
| [Europarl Sentiment2](#europarl-sentiment2) | sentiment | | 957 | CC BY-SA 4.0 | ✔️ |
| [Wikipedia](https://dumps.wikimedia.org/dawiki/latest/) | Raw | - | - | [CC BY-SA 3.0](https://dumps.wikimedia.org/legal.html) ||
| [WordSim-353](#wordsim-353) | Word Similarity | 353 | - | [CC BY 4.0](https://github.com/fnielsen/dasem/blob/master/dasem/data/wordsim353-da/LICENSE) | ✔️ |
| [Danish Similarity Dataset](#danish-similarity-dataset) | Word Similarity | 99 | - | [CC BY 4.0](https://github.com/fnielsen/dasem/blob/master/dasem/data/wordsim353-da/LICENSE) | ✔️ |
| [Twitter Sentiment](#twitter-sentiment) | Sentiment | - | train: 1215, test: 512 | Twitter privacy policy applies | ✔️ |
| [Dacoref](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#dacoref) | coreference resolution | 64.076 (tokens) | 3.403 | GNU Public License version 2 | ✔️ |
| Dataset | Task | Words | Sents | License | DaNLP |
| ------------------------------------------------------------ | ---------------------- | ----------------- | ---------------------- | ------------------------------------------------------------ | ----- |
| [OpenSubtitles2018](<http://opus.nlpl.eu/OpenSubtitles2018.php>) | Translation | 206,700,000 | 30,178,452 | [None](http://opus.nlpl.eu/OpenSubtitles2018.php) ||
| [EU Bookshop](http://opus.nlpl.eu/EUbookshop-v2.php) | Translation | 208,175,843 | 8,650,537 | - ||
| [Europarl7](http://www.statmt.org/europarl/) | Translation | 47,761,381 | 2,323,099 | [None](http://www.statmt.org/europarl/) ||
| [ParaCrawl5](https://paracrawl.eu/) | Translation | - | - | [CC0](https://paracrawl.eu/releases.html) ||
| [WikiANN](#wikiann) | NER | 832.901 | 95.924 | [ODC-BY 1.0](http://nlp.cs.rpi.edu/wikiann/) | ✔️ |
| [UD-DDT (DaNE)](#dane) | DEP, POS, NER | 100,733 | 5,512 | [CC BY-SA 4.0](https://github.com/UniversalDependencies/UD_Danish-DDT/blob/master/README.md) | ✔️ |
| [LCC Sentiment](#lcc-sentiment) | Sentiment | 10.588 | 499 | [CC BY](https://github.com/fnielsen/lcc-sentiment/blob/master/LICENSE) | ✔️ |
| [Europarl Sentiment1](#europarl-sentiment1) | Sentiment | 3.359 | 184 | None | ✔️ |
| [Europarl Sentiment2](#europarl-sentiment2) | sentiment | | 957 | CC BY-SA 4.0 | ✔️ |
| [Wikipedia](https://dumps.wikimedia.org/dawiki/latest/) | Raw | - | - | [CC BY-SA 3.0](https://dumps.wikimedia.org/legal.html) ||
| [WordSim-353](#wordsim-353) | Word Similarity | 353 | - | [CC BY 4.0](https://github.com/fnielsen/dasem/blob/master/dasem/data/wordsim353-da/LICENSE) | ✔️ |
| [Danish Similarity Dataset](#danish-similarity-dataset) | Word Similarity | 99 | - | [CC BY 4.0](https://github.com/fnielsen/dasem/blob/master/dasem/data/wordsim353-da/LICENSE) | ✔️ |
| [Twitter Sentiment](#twitter-sentiment) | Sentiment | - | train: 1215, test: 512 | Twitter privacy policy applies | ✔️ |
| [Dacoref](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#dacoref) | coreference resolution | 64.076 (tokens) | 3.403 | GNU Public License version 2 | ✔️ |
| [DanNet](#dannet) | Wordnet | 66.308 (concepts) | - | [license](https://cst.ku.dk/projekter/dannet/license.txt) | ✔️ |

It is also recommended to check out Finn Årup Nielsen's [dasem github](https://github.com/fnielsen/dasem), which also provides scripts for loading different Danish corpora.

Expand Down Expand Up @@ -147,11 +148,78 @@ df = lccsent.load_with_pandas()
```


### DanNet

[DanNet](https://cst.ku.dk/projekter/dannet/) is a lexical database similar to [WordNet](https://wordnet.princeton.edu/). It is maintained by the "Center for sprogteknologi" at the University of Copenhagen; more details about it can be found in the paper by Pedersen et al. (2009).

DanNet depicts the relations between words in Danish (mostly nouns, verbs and adjectives).
The main relation among words in WordNet is synonymy.

The dataset consists of 4 databases:

* words
* word senses
* relations
* synsets

DanNet uses the concept of `synset` to link words together. All the words in the database are part of one or multiple synsets. A synset is a set of synonyms (words which have the same meanings).


For downloading DanNet through DaNLP, you can do:

```python
from danlp.datasets import DanNet

dannet = DanNet()

# you can load the databases if you want to look into the databases by yourself
words, wordsenses, relations, synsets = dannet.load_with_pandas()
```

We also provide helper functions to search for synonyms, hypernyms and hyponyms through the databases.
Once you have initialized the DanNet wrapper, you can use the following features:

```python

word = "myre"
# synonyms
dannet.synonyms(word)
""" ['tissemyre'] """
# hypernyms
dannet.hypernyms(word)
""" ['årevingede insekter'] """
# hyponyms
dannet.hyponyms(word)
""" ['hærmyre', 'skovmyre', 'pissemyre', 'tissemyre'] """
# meanings
dannet.meanings(word)
""" ['ca. 1 cm langt, årevinget insekt med en kraftig in ... (Brug: "Myrer på terrassen, og andre steder udendørs, kan hurtigt blive meget generende")'] """


# to help you dive into the databases
# we also provide the following functions:

# part-of-speech (returns a list comprised in 'Noun', 'Verb' or 'Adjective')
dannet.pos(word)
# wordnet relations (EUROWORDNET or WORDNETOWL)
dannet.wordnet_relations(word, eurowordnet=True)
# word ids
dannet._word_ids(word)
# synset ids
dannet._synset_ids(word)
# word from id
dannet._word_from_id(11034863)
# synset from id
dannet._synset_from_id(3514)
```


## 🎓 References
- Johannsen, Anders, Martínez Alonso, Héctor and Plank, Barbara. “Universal Dependencies for Danish”. TLT14, 2015.
- Keson, Britt (1998). Documentation of The Danish Morpho-syntactically Tagged PAROLE Corpus. Technical report, DSL
- Matthias T. Buch-Kromann, Line Mikkelsen, and Stine Kern Lynge. 2003. "Danish dependency treebank". In **TLT**.
- Rasmus Hvingelby, Amalie B. Pauli, Maria Barrett, Christina Rosted, Lasse M. Lidegaard and Anders Søgaard. 2020. DaNE: A Named Entity Resource for Danish. In **LREC**.
- Pedersen, Bolette S. Sanni Nimb, Jørg Asmussen, Nicolai H. Sørensen, Lars Trap-Jensen og Henrik Lorentzen (2009). [DanNet – the challenge of compiling a WordNet for Danish by reusing a monolingual dictionary](https://pdfs.semanticscholar.org/6891/69de00c63d58bd68229cb0b3469a617f5ab3.pdf). *Lang Resources & Evaluation* 43:269–299.
- Xiaoman Pan, Boliang Zhang, Jonathan May, Joel Nothman, Kevin Knight and Heng Ji. 2017. [Cross-lingual Name Tagging and Linking for 282 Languages](https://aclweb.org/anthology/P17-1178). In **ACL**.
- Lev Finkelstein, Evgeniy Gabrilovich, Yossi Matias, Ehud Rivlin, Zach Solan, Gadi Wolfman, and Eytan Ruppin. 2002. [Placing Search in Context: The Concept Revisited](http://www.cs.technion.ac.il/~gabr/papers/tois_context.pdf). In **ACM TOIS**.
- Uwe Quasthoff, Matthias Richter and Christian Biemann. 2006. [Corpus Portal for Search in Monolingual Corpora](https://www.aclweb.org/anthology/L06-1396/). In **LREC**.
Expand Down

0 comments on commit 37e80eb

Please sign in to comment.