From e03f605bf1c5949fe8b3cdc4cf50b0c01f5022b5 Mon Sep 17 00:00:00 2001
From: ophelielacroix
Date: Thu, 3 Dec 2020 15:53:54 +0100
Subject: [PATCH 1/4] add DanNet dataset

* class DanNet in datasets
* download params
* unit test
---
 danlp/datasets/__init__.py |   3 +-
 danlp/datasets/dannet.py   | 214 +++++++++++++++++++++++++++++++++++++
 danlp/download.py          |  12 ++-
 tests/test_datasets.py     |  10 +-
 4 files changed, 234 insertions(+), 5 deletions(-)
 create mode 100644 danlp/datasets/dannet.py

diff --git a/danlp/datasets/__init__.py b/danlp/datasets/__init__.py
index 1445347..00ff2f7 100644
--- a/danlp/datasets/__init__.py
+++ b/danlp/datasets/__init__.py
@@ -2,4 +2,5 @@
 from .wiki_ann import *
 from .word_sim import *
 from .sentiment import *
-from .dacoref import *
\ No newline at end of file
+from .dacoref import *
+from .dannet import *
\ No newline at end of file
diff --git a/danlp/datasets/dannet.py b/danlp/datasets/dannet.py
new file mode 100644
index 0000000..203bd1c
--- /dev/null
+++ b/danlp/datasets/dannet.py
@@ -0,0 +1,214 @@
+import os
+import pandas as pd
+import json
+
+from danlp.download import DEFAULT_CACHE_DIR, download_dataset, _unzip_process_func, DATASETS
+
+
+class DanNet():
+    """
+    DanNet wrapper, providing functions to access the main features of DanNet.
+    See also: https://cst.ku.dk/projekter/dannet/.
+
+    DanNet consists of a set of 4 databases:
+
+        * words
+        * word senses
+        * relations
+        * synsets
+
+    :param str cache_dir: the directory for storing cached models
+    :param bool verbose: `True` to increase verbosity
+
+    """
+
+    def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
+
+        self.dataset_name = 'dannet'
+        self.file_extension = DATASETS[self.dataset_name]['file_extension']
+
+        self.dataset_dir = download_dataset(self.dataset_name, process_func=_unzip_process_func, cache_dir=cache_dir)
+
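+        # the raw DanNet dumps are '@'-separated CSV files with
+        # unicode-escaped characters, hence the read_csv parameters below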
+        self.words = pd.read_csv(os.path.join(self.dataset_dir, "words.csv"),
+                                 sep='@',
+                                 names=['word_id', 'form', 'pos', 'nan'],
+                                 encoding='unicode_escape',
+                                 usecols=[0, 1, 2],
+                                 dtype={'word_id': str})
+        self.wordsenses = pd.read_csv(os.path.join(self.dataset_dir, "wordsenses.csv"),
+                                      sep='@',
+                                      names=['wordsense_id', 'word_id', 'synset_id', 'register', 'nan'],
+                                      encoding='unicode_escape',
+                                      usecols=[1, 2],
+                                      dtype={'wordsense_id': str, 'word_id': str, 'synset_id': str})
+        self.relations = pd.read_csv(os.path.join(self.dataset_dir, "relations.csv"),
+                                     sep='@',
+                                     names=['synset_id', 'wordnetowl', 'relation', 'value', 'taxonomic', 'inheritance_comment', 'nan'],
+                                     encoding='unicode_escape',
+                                     usecols=[0, 1, 2, 3, 4, 5],
+                                     dtype={'synset_id': str, 'value': str})
+        self.synsets = pd.read_csv(os.path.join(self.dataset_dir, "synsets.csv"),
+                                   sep='@',
+                                   names=['synset_id', 'label', 'gloss', 'ontological_type'],
+                                   encoding='unicode_escape',
+                                   usecols=[0, 1, 2, 3],
+                                   dtype={'synset_id': str})
+
+    def load_with_pandas(self):
+        """
+        Loads the 4 databases as dataframes.
+
+        :return: 4 dataframes: words, wordsenses, relations, synsets
+
+        """
+        return self.words, self.wordsenses, self.relations, self.synsets
+
+
+    def synonyms(self, word, pos=None):
+        """
+        Returns the synonyms of `word`.
+
+        :param word: text
+        :param pos: (list of) part-of-speech tag(s) (among "Noun", "Verb", "Adjective")
+        :return: list of synonyms
+
+        :Example:
+
+            "`hav`"
+            returns
+            ["sø", "ocean"]
+        """
+
+        word_ids = self._word_ids(word, pos)
+        synset_ids = self._synset_ids(word, pos)
+        # words that share a synset with `word`, excluding `word` itself
+        synonym_ids = self.wordsenses[self.wordsenses['synset_id'].isin(synset_ids) & ~self.wordsenses['word_id'].isin(word_ids)]['word_id'].tolist()
+        synonyms = self.words[self.words['word_id'].isin(synonym_ids)]['form'].tolist()
+        return synonyms
+
+    def meanings(self, word, pos=None):
+        """
+        Returns the meanings of `word`.
+
+        :param word: text
+        :param pos: (list of) part-of-speech tag(s) (among "Noun", "Verb", "Adjective")
+        :return: list of meanings
+
+        """
+
+        synset_ids = self._synset_ids(word, pos)
+        meanings = self.synsets[self.synsets['synset_id'].isin(synset_ids)]['gloss'].tolist()
+
+        return meanings
+
+
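+    # DanNet stores hyperonymy/hyponymy as a pair of inverse relations
+    # ('has_hyperonym'/'has_hyponym'). The two methods below therefore
+    # query both directions and merge the results, in case a link is only
+    # stored one way.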
+    def hypernyms(self, word, pos=None):
+        """
+        Returns the hypernyms of `word`.
+
+        :param word: text
+        :param pos: (list of) part-of-speech tag(s) (among "Noun", "Verb", "Adjective")
+        :return: list of hypernyms
+
+        """
+
+        word_synset_ids = self._synset_ids(word, pos)
+        hyper_synset_ids = self.relations[self.relations['synset_id'].isin(word_synset_ids) & (self.relations['relation']=='has_hyperonym')]['value'].tolist()
+        hyper_synset_ids += self.relations[self.relations['value'].isin(word_synset_ids) & (self.relations['relation']=='has_hyponym')]['synset_id'].tolist()
+        # some relation values are external references; keep numeric synset ids only
+        hyper_synset_ids = [val for val in hyper_synset_ids if val.isdigit()]
+        hypernym_ids = self.wordsenses[self.wordsenses['synset_id'].isin(hyper_synset_ids)]['word_id'].tolist()
+        hypernyms = self.words[self.words['word_id'].isin(hypernym_ids)]['form'].tolist()
+
+        return hypernyms
+
+
+    def hyponyms(self, word, pos=None):
+        """
+        Returns the hyponyms of `word`.
+
+        :param word: text
+        :param pos: (list of) part-of-speech tag(s) (among "Noun", "Verb", "Adjective")
+        :return: list of hyponyms
+
+        """
+
+        word_synset_ids = self._synset_ids(word, pos)
+        hypo_synset_ids = self.relations[self.relations['synset_id'].isin(word_synset_ids) & (self.relations['relation']=='has_hyponym')]['value'].tolist()
+        hypo_synset_ids += self.relations[self.relations['value'].isin(word_synset_ids) & (self.relations['relation']=='has_hyperonym')]['synset_id'].tolist()
+        hypo_synset_ids = [val for val in hypo_synset_ids if val.isdigit()]
+        hyponym_ids = self.wordsenses[self.wordsenses['synset_id'].isin(hypo_synset_ids)]['word_id'].tolist()
+        hyponyms = self.words[self.words['word_id'].isin(hyponym_ids)]['form'].tolist()
+
+        return hyponyms
+
+    def wordnet_relations(self, word, pos=None, eurowordnet=True):
+        """
+        Returns the names of the relations `word` is associated with.
+
+        :param word: text
+        :param pos: (list of) part-of-speech tag(s) (among "Noun", "Verb", "Adjective")
+        :param eurowordnet: `True` to return EuroWordnet relation names, `False` for WordNet-OWL names
+        :return: set of relations
+
+        """
+        if eurowordnet:
+            rel_name = "relation"
+        else:
+            rel_name = "wordnetowl"
+
+        synset_ids = self._synset_ids(word, pos)
+        relations = self.relations[self.relations['synset_id'].isin(synset_ids)][rel_name].tolist()
+
+        return set(relations)
+
+
+
+    def pos(self, word):
+        """
+        Returns the part-of-speech tags `word` can be categorized with, among "Noun", "Verb" or "Adjective".
+
+        :param word: text
+        :return: list of part-of-speech tags
+        """
+
+        return list(self.words[self.words['form'] == word]['pos'].unique())
+
+    def _word_ids(self, word, pos=None):
+
+        pos = _get_pos_list(pos)
+        word = word.lower()
+
+        return self.words[(self.words['form'] == word) & self.words['pos'].isin(pos)]['word_id'].tolist()
+
+    def _synset_ids(self, word, pos=None):
+
+        word_ids = self._word_ids(word, pos)
+        return self.wordsenses[self.wordsenses['word_id'].isin(word_ids)]['synset_id'].tolist()
+
+    def _word_from_id(self, word_id):
+
+        assert isinstance(word_id, int) or (isinstance(word_id, str) and word_id.isdigit())
+        word_id = str(word_id)
+
+        return self.words[self.words['word_id'] == word_id]['form'].tolist()
+
+    def _synset_from_id(self, synset_id):
+
+        assert isinstance(synset_id, int) or (isinstance(synset_id, str) and synset_id.isdigit())
+        synset_id = str(synset_id)
+
+        synset_labels = self.synsets[self.synsets['synset_id'] == synset_id]['label'].tolist()
+        # labels look like "{word_1; word_2; ...}"; strip the braces and sense suffixes
+        return set([w.split('_')[0] for s in synset_labels for w in s[1:-1].split('; ')])
+
+
+    def __str__(self):
+
+        return "DanNet: {} word forms, {} lexemes, {} synsets".format(len(set(self.words['form'])), len(self.words['word_id']), len(set(self.wordsenses['synset_id'])))
+
+
+def _get_pos_list(pos):
+    if pos is None:
+        return ['Noun', 'Verb', 'Adjective']
+    elif isinstance(pos, str):
+        return [pos]
+    assert isinstance(pos, list)
+    return pos
+
diff --git a/danlp/download.py b/danlp/download.py
index 7a68731..16e70ed 100644
--- a/danlp/download.py
+++ b/danlp/download.py
@@ -194,15 +194,21 @@
         'md5_checksum': '5e7dad9e6c8c32aa9dd17830bed5e0f6',
         'size': 3489,
         'file_extension': '.csv'
-    },
+    },
 
     # coreference dataset
     'dacoref': {
         'url': 'http://danlp-downloads.alexandra.dk/datasets/dacoref.zip',
         'md5_checksum': 'e6f2707f4f600a0d357dc7afa1b01f92',
         'size': 1005278,
         'file_extension': ''
-    },
-
+    },
+    # Danish Wordnet
+    'dannet': {
+        'url': DANLP_STORAGE_URL + '/datasets/dannet.zip',
+        'md5_checksum': 'a5aa388bb08487bd59d72257aa15d8fa',
+        'size': 6083044,
+        'file_extension': '.csv'
+    },
 
     # SENTIMENT EVALUATION
     'europarl.sentiment1': {
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index 23f25de..61cb130 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -7,7 +7,7 @@
 from pyconll.unit import Conll
 from spacy.gold import GoldCorpus
 
-from danlp.datasets import DDT, WikiAnn, DATASETS, DSD, EuroparlSentiment1,EuroparlSentiment2, LccSentiment, TwitterSent, Dacoref
+from danlp.datasets import DDT, WikiAnn, DATASETS, DSD, EuroparlSentiment1,EuroparlSentiment2, LccSentiment, TwitterSent, Dacoref, DanNet
 from danlp.datasets.word_sim import WordSim353Da
 from danlp.utils import write_simple_ner_dataset, read_simple_ner_dataset
 
@@ -168,6 +168,14 @@ def test_dacoreg(self):
         self.assertEqual(len(corpus[0])+len(corpus[1])+len(corpus[2]), 3403)
         self.assertEqual(corpus[0][0][0]['form'], 'På')
 
+
+class TestDannetDataset(unittest.TestCase):
+    def test_dannet(self):
+        dannet = Dannet()
+        corpus = dannet.load_with_pandas()
+        self.assertEqual(len(corpus), 4)
+        self.assertEqual(dannet.synonyms('kat'), ['missekat', 'mis'])
+
 if __name__ == '__main__':
     unittest.main()
\ No newline at end of file

From 74f39110cbe3294b7d47b814ccc366d1a08adb68 Mon Sep 17 00:00:00 2001
From: ophelielacroix
Date: Thu, 3 Dec 2020 16:03:33 +0100
Subject: [PATCH 2/4] add dannet documentation

---
 docs/docs/datasets.md | 66 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/docs/docs/datasets.md b/docs/docs/datasets.md
index 6c9bd05..1832815 100644
--- a/docs/docs/datasets.md
+++ b/docs/docs/datasets.md
@@ -147,6 +147,72 @@ df = lccsent.load_with_pandas()
 ```
 
 
+### DanNet
+
+[DanNet](https://cst.ku.dk/projekter/dannet/) is a lexical database similar to [Wordnet](https://wordnet.princeton.edu/).
+
+DanNet depicts the relations between words in Danish (mostly nouns, verbs and adjectives).
+The main relation among words in WordNet is synonymy.
+
+The dataset consists of 4 databases:
+
+ * words
+ * word senses
+ * relations
+ * synsets
+
+DanNet uses the concept of a `synset` to link words together. All the words in the database are part of one or more synsets. A synset is a set of synonyms (words which have the same meaning).
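+
+As a quick illustration of how the databases link together, here is a minimal sketch of a synonym lookup done directly on the dataframes (it uses the `words` and `wordsenses` dataframes returned by `load_with_pandas` below; the word "kat" is just an example):
+
+```python
+word_ids = words[words['form'] == 'kat']['word_id']
+synset_ids = wordsenses[wordsenses['word_id'].isin(word_ids)]['synset_id']
+# the synonyms are the other words belonging to one of these synsets
+member_ids = wordsenses[wordsenses['synset_id'].isin(synset_ids)]['word_id']
+synonyms = words[words['word_id'].isin(member_ids) & ~words['word_id'].isin(word_ids)]['form']
+```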
+
+
+To download DanNet through DaNLP, you can do:
+
+```python
+from danlp.datasets import DanNet
+
+dannet = DanNet()
+
+# you can load the databases if you want to look into the data by yourself
+words, wordsenses, relations, synsets = dannet.load_with_pandas()
+```
+
+We also provide helper functions to search for synonyms, hypernyms and hyponyms through the databases.
+Once you have loaded the DanNet wrapper, you can use the following features:
+
+```python
+
+word = "myre"
+# synonyms
+dannet.synonyms(word)
+""" ['tissemyre'] """
+# hypernyms
+dannet.hypernyms(word)
+""" ['årevingede insekter'] """
+# hyponyms
+dannet.hyponyms(word)
+""" ['hærmyre', 'skovmyre', 'pissemyre', 'tissemyre'] """
+# meanings
+dannet.meanings(word)
+""" ['ca. 1 cm langt, årevinget insekt med en kraftig in ... (Brug: "Myrer på terrassen, og andre steder udendørs, kan hurtigt blive meget generende")'] """
+
+
+# to help you dive into the databases
+# we also provide the following functions:
+
+# part-of-speech (returns a list of tags among 'Noun', 'Verb' and 'Adjective')
+dannet.pos(word)
+# wordnet relations (EuroWordnet or WordNet-OWL names)
+dannet.wordnet_relations(word, eurowordnet=True)
+# word ids
+dannet._word_ids(word)
+# synset ids
+dannet._synset_ids(word)
+# word from id
+dannet._word_from_id(11034863)
+# synset from id
+dannet._synset_from_id(3514)
+```
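+
+The wrapper also implements `__str__`, so printing the instance gives a quick summary of the loaded databases (the counts below are placeholders; they depend on the downloaded version):
+
+```python
+print(dannet)
+# DanNet: ... word forms, ... lexemes, ... synsets
+```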
+
+
 ## 🎓 References
 - Johannsen, Anders, Martínez Alonso, Héctor and Plank, Barbara. "Universal Dependencies for Danish". TLT14, 2015.
 - Keson, Britt (1998). Documentation of The Danish Morpho-syntactically Tagged PAROLE Corpus. Technical report, DSL

From 6fa7d4b081c90c0eca3e33efd4001b92524e2b9d Mon Sep 17 00:00:00 2001
From: ophelielacroix
Date: Fri, 4 Dec 2020 10:47:14 +0100
Subject: [PATCH 3/4] fix typo

---
 tests/test_datasets.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index 61cb130..3ac9a9a 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -171,7 +171,7 @@ def test_dacoreg(self):
 
 class TestDannetDataset(unittest.TestCase):
     def test_dannet(self):
-        dannet = Dannet()
+        dannet = DanNet()
         corpus = dannet.load_with_pandas()
         self.assertEqual(len(corpus), 4)
         self.assertEqual(dannet.synonyms('kat'), ['missekat', 'mis'])

From e21cbf56b401b6cc351548fc0993240e9db923ff Mon Sep 17 00:00:00 2001
From: "ALEX5739\\amalien"
Date: Fri, 4 Dec 2020 16:48:34 +0100
Subject: [PATCH 4/4] Add more tests for dannet functions, and add reference
 in documentation

---
 docs/docs/datasets.md  | 36 +++++++++++++++++++-----------------
 tests/test_datasets.py | 10 ++++++----
 2 files changed, 25 insertions(+), 21 deletions(-)

diff --git a/docs/docs/datasets.md b/docs/docs/datasets.md
index 1832815..333737c 100644
--- a/docs/docs/datasets.md
+++ b/docs/docs/datasets.md
@@ -3,22 +3,23 @@ Datasets
 
 This section keeps a list of Danish NLP datasets publicly available.
 
-| Dataset | Task | Words | Sents | License | DaNLP |
-| ------------------------------------------------------------ | ---------------------- | --------------- | ---------------------- | ------------------------------------------------------------ | ----- |
-| [OpenSubtitles2018]() | Translation | 206,700,000 | 30,178,452 | [None](http://opus.nlpl.eu/OpenSubtitles2018.php) | ❌ |
-| [EU Bookshop](http://opus.nlpl.eu/EUbookshop-v2.php) | Translation | 208,175,843 | 8,650,537 | - | ❌ |
-| [Europarl7](http://www.statmt.org/europarl/) | Translation | 47,761,381 | 2,323,099 | [None](http://www.statmt.org/europarl/) | ❌ |
-| [ParaCrawl5](https://paracrawl.eu/) | Translation | - | - | [CC0](https://paracrawl.eu/releases.html) | ❌ |
-| [WikiANN](#wikiann) | NER | 832.901 | 95.924 | [ODC-BY 1.0](http://nlp.cs.rpi.edu/wikiann/) | ✔️ |
-| [UD-DDT (DaNE)](#dane) | DEP, POS, NER | 100,733 | 5,512 | [CC BY-SA 4.0](https://github.com/UniversalDependencies/UD_Danish-DDT/blob/master/README.md) | ✔️ |
-| [LCC Sentiment](#lcc-sentiment) | Sentiment | 10.588 | 499 | [CC BY](https://github.com/fnielsen/lcc-sentiment/blob/master/LICENSE) | ✔️ |
-| [Europarl Sentiment1](#europarl-sentiment1) | Sentiment | 3.359 | 184 | None | ✔️ |
-| [Europarl Sentiment2](#europarl-sentiment2) | sentiment | | 957 | CC BY-SA 4.0 | ✔️ |
-| [Wikipedia](https://dumps.wikimedia.org/dawiki/latest/) | Raw | - | - | [CC BY-SA 3.0](https://dumps.wikimedia.org/legal.html) | ❌ |
-| [WordSim-353](#wordsim-353) | Word Similarity | 353 | - | [CC BY 4.0](https://github.com/fnielsen/dasem/blob/master/dasem/data/wordsim353-da/LICENSE) | ✔️ |
-| [Danish Similarity Dataset](#danish-similarity-dataset) | Word Similarity | 99 | - | [CC BY 4.0](https://github.com/fnielsen/dasem/blob/master/dasem/data/wordsim353-da/LICENSE) | ✔️ |
-| [Twitter Sentiment](#twitter-sentiment) | Sentiment | - | train: 1215, test: 512 | Twitter privacy policy applies | ✔️ |
-| [Dacoref](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#dacoref) | coreference resolution | 64.076 (tokens) | 3.403 | GNU Public License version 2 | ✔️ |
+| Dataset | Task | Words | Sents | License | DaNLP |
+| ------------------------------------------------------------ | ---------------------- | ----------------- | ---------------------- | ------------------------------------------------------------ | ----- |
+| [OpenSubtitles2018]() | Translation | 206,700,000 | 30,178,452 | [None](http://opus.nlpl.eu/OpenSubtitles2018.php) | ❌ |
+| [EU Bookshop](http://opus.nlpl.eu/EUbookshop-v2.php) | Translation | 208,175,843 | 8,650,537 | - | ❌ |
+| [Europarl7](http://www.statmt.org/europarl/) | Translation | 47,761,381 | 2,323,099 | [None](http://www.statmt.org/europarl/) | ❌ |
+| [ParaCrawl5](https://paracrawl.eu/) | Translation | - | - | [CC0](https://paracrawl.eu/releases.html) | ❌ |
+| [WikiANN](#wikiann) | NER | 832.901 | 95.924 | [ODC-BY 1.0](http://nlp.cs.rpi.edu/wikiann/) | ✔️ |
+| [UD-DDT (DaNE)](#dane) | DEP, POS, NER | 100,733 | 5,512 | [CC BY-SA 4.0](https://github.com/UniversalDependencies/UD_Danish-DDT/blob/master/README.md) | ✔️ |
+| [LCC Sentiment](#lcc-sentiment) | Sentiment | 10.588 | 499 | [CC BY](https://github.com/fnielsen/lcc-sentiment/blob/master/LICENSE) | ✔️ |
+| [Europarl Sentiment1](#europarl-sentiment1) | Sentiment | 3.359 | 184 | None | ✔️ |
+| [Europarl Sentiment2](#europarl-sentiment2) | sentiment | | 957 | CC BY-SA 4.0 | ✔️ |
+| [Wikipedia](https://dumps.wikimedia.org/dawiki/latest/) | Raw | - | - | [CC BY-SA 3.0](https://dumps.wikimedia.org/legal.html) | ❌ |
+| [WordSim-353](#wordsim-353) | Word Similarity | 353 | - | [CC BY 4.0](https://github.com/fnielsen/dasem/blob/master/dasem/data/wordsim353-da/LICENSE) | ✔️ |
+| [Danish Similarity Dataset](#danish-similarity-dataset) | Word Similarity | 99 | - | [CC BY 4.0](https://github.com/fnielsen/dasem/blob/master/dasem/data/wordsim353-da/LICENSE) | ✔️ |
+| [Twitter Sentiment](#twitter-sentiment) | Sentiment | - | train: 1215, test: 512 | Twitter privacy policy applies | ✔️ |
+| [Dacoref](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#dacoref) | coreference resolution | 64.076 (tokens) | 3.403 | GNU Public License version 2 | ✔️ |
+| [DanNet](#dannet) | Wordnet | 66.308 (concepts) | - | [license](https://cst.ku.dk/projekter/dannet/license.txt) | ✔️ |
 
 It is also recommend to check out Finn Årup Nielsen's [dasem github](https://github.com/fnielsen/dasem) which also provides script for loading different Danish corpus.
@@ -149,7 +150,7 @@ df = lccsent.load_with_pandas()
 
 ### DanNet
 
-[DanNet](https://cst.ku.dk/projekter/dannet/) is a lexical database similar to [Wordnet](https://wordnet.princeton.edu/).
+[DanNet](https://cst.ku.dk/projekter/dannet/) is a lexical database similar to [Wordnet](https://wordnet.princeton.edu/). "Center for sprogteknologi" at the University of Copenhagen is behind it; more details can be found in Pedersen et al. (2009).
 
 DanNet depicts the relations between words in Danish (mostly nouns, verbs and adjectives).
 The main relation among words in WordNet is synonymy.
@@ -218,6 +219,7 @@ dannet._synset_from_id(3514)
 - Keson, Britt (1998). Documentation of The Danish Morpho-syntactically Tagged PAROLE Corpus. Technical report, DSL
 - Matthias T. Buch-Kromann, Line Mikkelsen, and Stine Kern Lynge. 2003. "Danish dependency treebank". In **TLT**.
 - Rasmus Hvingelby, Amalie B. Pauli, Maria Barrett, Christina Rosted, Lasse M. Lidegaard and Anders Søgaard. 2020. DaNE: A Named Entity Resource for Danish. In **LREC**.
+- Pedersen, Bolette S., Sanni Nimb, Jørg Asmussen, Nicolai H. Sørensen, Lars Trap-Jensen and Henrik Lorentzen (2009). [DanNet – the challenge of compiling a WordNet for Danish by reusing a monolingual dictionary](https://pdfs.semanticscholar.org/6891/69de00c63d58bd68229cb0b3469a617f5ab3.pdf). *Lang Resources & Evaluation* 43:269–299.
 - Xiaoman Pan, Boliang Zhang, Jonathan May, Joel Nothman, Kevin Knight and Heng Ji. 2017. [Cross-lingual Name Tagging and Linking for 282 Languages](https://aclweb.org/anthology/P17-1178). In **ACL**.
 - Lev Finkelstein, Evgeniy Gabrilovich, Yossi Matias, Ehud Rivlin, Zach Solan, Gadi Wolfman, and Eytan Ruppin. 2002. [Placing Search in Context: The Concept Revisited](http://www.cs.technion.ac.il/~gabr/papers/tois_context.pdf). In **ACM TOIS**.
 - Uwe Quasthoff, Matthias Richter and Christian Biemann. 2006. [Corpus Portal for Search in Monolingual Corpora](https://www.aclweb.org/anthology/L06-1396/). In **LREC**.
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index 3ac9a9a..acd74e9 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -167,15 +167,17 @@ def test_dacoreg(self):
         self.assertEqual(len(corpus), 3)
         self.assertEqual(len(corpus[0])+len(corpus[1])+len(corpus[2]), 3403)
         self.assertEqual(corpus[0][0][0]['form'], 'På')
-
-
+
 class TestDannetDataset(unittest.TestCase):
     def test_dannet(self):
         dannet = DanNet()
         corpus = dannet.load_with_pandas()
         self.assertEqual(len(corpus), 4)
         self.assertEqual(dannet.synonyms('kat'), ['missekat', 'mis'])
-
-
+        self.assertEqual(dannet.hypernyms('myre'), ['årevingede insekter'])
+        self.assertEqual(dannet.hyponyms('myre'), ['hærmyre', 'skovmyre', 'pissemyre', 'tissemyre'])
+        self.assertEqual(dannet.pos('myre'), ['Noun'])
+        self.assertEqual(dannet.meanings('myre'), ['ca. 1 cm langt, årevinget insekt med en kraftig in ... (Brug: "Myrer på terrassen, og andre steder udendørs, kan hurtigt blive meget generende")'])
+
 if __name__ == '__main__':
     unittest.main()
\ No newline at end of file