diff --git a/danlp/datasets/__init__.py b/danlp/datasets/__init__.py
index 1445347..00ff2f7 100644
--- a/danlp/datasets/__init__.py
+++ b/danlp/datasets/__init__.py
@@ -2,4 +2,5 @@
 from .wiki_ann import *
 from .word_sim import *
 from .sentiment import *
-from .dacoref import *
\ No newline at end of file
+from .dacoref import *
+from .dannet import *
\ No newline at end of file
diff --git a/danlp/datasets/dannet.py b/danlp/datasets/dannet.py
new file mode 100644
index 0000000..203bd1c
--- /dev/null
+++ b/danlp/datasets/dannet.py
@@ -0,0 +1,214 @@
+import os
+
+import pandas as pd
+
+from danlp.download import DEFAULT_CACHE_DIR, download_dataset, _unzip_process_func, DATASETS
+
+
+class DanNet():
+    """
+    DanNet wrapper, providing functions to access the main features of DanNet.
+    See also: https://cst.ku.dk/projekter/dannet/.
+
+    DanNet consists of a set of 4 databases:
+
+        * words
+        * word senses
+        * relations
+        * synsets
+
+    :param str cache_dir: the directory for storing cached data
+    :param bool verbose: `True` to increase verbosity
+    """
+
+    def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
+
+        self.dataset_name = 'dannet'
+        self.file_extension = DATASETS[self.dataset_name]['file_extension']
+
+        self.dataset_dir = download_dataset(self.dataset_name, process_func=_unzip_process_func, cache_dir=cache_dir)
+
+        self.words = pd.read_csv(os.path.join(self.dataset_dir, "words.csv"),
+                                 sep='@',
+                                 names=['word_id', 'form', 'pos', 'nan'],
+                                 encoding='unicode_escape',
+                                 usecols=[0, 1, 2],
+                                 dtype={'word_id': str})
+        self.wordsenses = pd.read_csv(os.path.join(self.dataset_dir, "wordsenses.csv"),
+                                      sep='@',
+                                      names=['wordsense_id', 'word_id', 'synset_id', 'register', 'nan'],
+                                      encoding='unicode_escape',
+                                      usecols=[1, 2],
+                                      dtype={'wordsense_id': str, 'word_id': str, 'synset_id': str})
+        self.relations = pd.read_csv(os.path.join(self.dataset_dir, "relations.csv"),
+                                     sep='@',
+                                     names=['synset_id', 'wordnetowl', 'relation', 'value', 'taxonomic', 'inheritance_comment', 'nan'],
+                                     encoding='unicode_escape',
+                                     usecols=[0, 1, 2, 3, 4, 5],
+                                     dtype={'synset_id': str, 'value': str})
+        self.synsets = pd.read_csv(os.path.join(self.dataset_dir, "synsets.csv"),
+                                   sep='@',
+                                   names=['synset_id', 'label', 'gloss', 'ontological_type'],
+                                   encoding='unicode_escape',
+                                   usecols=[0, 1, 2, 3],
+                                   dtype={'synset_id': str})
+
+    def load_with_pandas(self):
+        """
+        Loads the dataset in 4 dataframes.
+
+        :return: 4 dataframes: words, wordsenses, relations, synsets
+        """
+        return self.words, self.wordsenses, self.relations, self.synsets
+
+    def synonyms(self, word, pos=None):
+        """
+        Returns the synonyms of `word`.
+
+        :param word: text
+        :param pos: (list of) part-of-speech tag(s) (among "Noun", "Verb", "Adjective")
+        :return: list of synonyms
+
+        :Example:
+
+            "`hav`" returns ["sø", "ocean"]
+        """
+        word_ids = self._word_ids(word, pos)
+        synset_ids = self._synset_ids(word, pos)
+        # words sharing a synset with `word`, excluding `word` itself, are its synonyms
+        synonym_ids = self.wordsenses[self.wordsenses['synset_id'].isin(synset_ids) & ~self.wordsenses['word_id'].isin(word_ids)]['word_id'].tolist()
+        synonyms = self.words[self.words['word_id'].isin(synonym_ids)]['form'].tolist()
+        return synonyms
+
+    def meanings(self, word, pos=None):
+        """
+        Returns the meanings of `word`.
+
+        :param word: text
+        :param pos: (list of) part-of-speech tag(s) (among "Noun", "Verb", "Adjective")
+        :return: list of meanings (glosses)
+        """
+        synset_ids = self._synset_ids(word, pos)
+        meanings = self.synsets[self.synsets['synset_id'].isin(synset_ids)]['gloss'].tolist()
+
+        return meanings
+
+    def hypernyms(self, word, pos=None):
+        """
+        Returns the hypernyms of `word`.
+
+        :param word: text
+        :param pos: (list of) part-of-speech tag(s) (among "Noun", "Verb", "Adjective")
+        :return: list of hypernyms
+        """
+        word_synset_ids = self._synset_ids(word, pos)
+        # follow 'has_hyperonym' relations from the word's synsets,
+        # plus 'has_hyponym' relations pointing back at them
+        hyper_synset_ids = self.relations[self.relations['synset_id'].isin(word_synset_ids) & (self.relations['relation'] == 'has_hyperonym')]['value'].tolist()
+        hyper_synset_ids += self.relations[self.relations['value'].isin(word_synset_ids) & (self.relations['relation'] == 'has_hyponym')]['synset_id'].tolist()
+        hyper_synset_ids = [val for val in hyper_synset_ids if val.isdigit()]
+        hypernym_ids = self.wordsenses[self.wordsenses['synset_id'].isin(hyper_synset_ids)]['word_id'].tolist()
+        hypernyms = self.words[self.words['word_id'].isin(hypernym_ids)]['form'].tolist()
+
+        return hypernyms
+
+    def hyponyms(self, word, pos=None):
+        """
+        Returns the hyponyms of `word`.
+
+        :param word: text
+        :param pos: (list of) part-of-speech tag(s) (among "Noun", "Verb", "Adjective")
+        :return: list of hyponyms
+        """
+        word_synset_ids = self._synset_ids(word, pos)
+        hypo_synset_ids = self.relations[self.relations['synset_id'].isin(word_synset_ids) & (self.relations['relation'] == 'has_hyponym')]['value'].tolist()
+        hypo_synset_ids += self.relations[self.relations['value'].isin(word_synset_ids) & (self.relations['relation'] == 'has_hyperonym')]['synset_id'].tolist()
+        hypo_synset_ids = [val for val in hypo_synset_ids if val.isdigit()]
+        hyponym_ids = self.wordsenses[self.wordsenses['synset_id'].isin(hypo_synset_ids)]['word_id'].tolist()
+        hyponyms = self.words[self.words['word_id'].isin(hyponym_ids)]['form'].tolist()
+
+        return hyponyms
+
+    def wordnet_relations(self, word, pos=None, eurowordnet=True):
+        """
+        Returns the names of the relations `word` is associated with.
+
+        :param word: text
+        :param pos: (list of) part-of-speech tag(s) (among "Noun", "Verb", "Adjective")
+        :param eurowordnet: `True` to return EuroWordNet relation names, `False` for WordNet OWL names
+        :return: set of relation names
+        """
+        if eurowordnet:
+            rel_name = "relation"
+        else:
+            rel_name = "wordnetowl"
+
+        synset_ids = self._synset_ids(word, pos)
+        relations = self.relations[self.relations['synset_id'].isin(synset_ids)][rel_name].tolist()
+
+        return set(relations)
+
+    def pos(self, word):
+        """
+        Returns the part-of-speech tags `word` can be categorized with, among "Noun", "Verb" or "Adjective".
+
+        :param word: text
+        :return: list of part-of-speech tags
+        """
+        return list(self.words[self.words['form'] == word]['pos'].unique())
+
+    def _word_ids(self, word, pos=None):
+
+        pos = _get_pos_list(pos)
+        word = word.lower()
+
+        return self.words[(self.words['form'] == word) & self.words['pos'].isin(pos)]['word_id'].tolist()
+
+    def _synset_ids(self, word, pos=None):
+
+        word_ids = self._word_ids(word, pos)
+        return self.wordsenses[self.wordsenses['word_id'].isin(word_ids)]['synset_id'].tolist()
+
+    def _word_from_id(self, word_id):
+
+        assert isinstance(word_id, int) or (isinstance(word_id, str) and word_id.isdigit())
+        word_id = str(word_id)
+
+        return self.words[self.words['word_id'] == word_id]['form'].tolist()
+
+    def _synset_from_id(self, synset_id):
+
+        assert isinstance(synset_id, int) or (isinstance(synset_id, str) and synset_id.isdigit())
+        synset_id = str(synset_id)
+
+        synset_labels = self.synsets[self.synsets['synset_id'] == synset_id]['label'].tolist()
+        # strip the surrounding braces and the '_<sense number>' suffixes from the label
+        return set([w.split('_')[0] for s in synset_labels for w in s[1:-1].split('; ')])
+
+    def __str__(self):
+
+        return "DanNet: {} word forms, {} lexemes, {} synsets".format(len(set(self.words['form'])), len(self.words['word_id']), len(set(self.wordsenses['synset_id'])))
+
+
+def _get_pos_list(pos):
+    if pos is None:
+        return ['Noun', 'Verb', 'Adjective']
+    if isinstance(pos, str):
+        return [pos]
+    assert isinstance(pos, list)
+    return pos
diff --git a/danlp/download.py b/danlp/download.py
index 7a68731..16e70ed 100644
--- a/danlp/download.py
+++ b/danlp/download.py
@@ -194,15 +194,21 @@
         'md5_checksum': '5e7dad9e6c8c32aa9dd17830bed5e0f6',
         'size': 3489,
         'file_extension': '.csv'
-    },
+    },
     # coreference dataset
     'dacoref': {
         'url': 'http://danlp-downloads.alexandra.dk/datasets/dacoref.zip',
         'md5_checksum': 'e6f2707f4f600a0d357dc7afa1b01f92',
         'size': 1005278,
         'file_extension': ''
-    },
-
+    },
+    # Danish WordNet
+    'dannet': {
+        'url': DANLP_STORAGE_URL + '/datasets/dannet.zip',
+        'md5_checksum': 'a5aa388bb08487bd59d72257aa15d8fa',
+        'size': 6083044,
+        'file_extension': '.csv'
+    },
 
     # SENTIMENT EVALUATION
     'europarl.sentiment1': {
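Note for reviewers: the `md5_checksum` and `size` fields registered above are the kind of metadata used to verify a fetched archive. A minimal sketch of such a check, using only `hashlib` and `os` (`verify_download` is a hypothetical illustration, not danlp's API):

```python
import hashlib
import os


def verify_download(path, md5_checksum, size):
    """Hypothetical helper: compare the size and md5 digest of a
    downloaded file against the values in the DATASETS registry."""
    if os.path.getsize(path) != size:
        return False
    md5 = hashlib.md5()
    with open(path, 'rb') as f:
        # hash the file in chunks to keep memory usage constant
        for chunk in iter(lambda: f.read(8192), b''):
            md5.update(chunk)
    return md5.hexdigest() == md5_checksum


# e.g. verify_download('dannet.zip', 'a5aa388bb08487bd59d72257aa15d8fa', 6083044)
```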
diff --git a/docs/docs/datasets.md b/docs/docs/datasets.md
index 6c9bd05..333737c 100644
--- a/docs/docs/datasets.md
+++ b/docs/docs/datasets.md
@@ -3,22 +3,23 @@ Datasets
 
 This section keeps a list of Danish NLP datasets publicly available.
 
-| Dataset | Task | Words | Sents | License | DaNLP |
-| --- | --- | --- | --- | --- | --- |
-| [OpenSubtitles2018]() | Translation | 206,700,000 | 30,178,452 | [None](http://opus.nlpl.eu/OpenSubtitles2018.php) | ❌ |
-| [EU Bookshop](http://opus.nlpl.eu/EUbookshop-v2.php) | Translation | 208,175,843 | 8,650,537 | - | ❌ |
-| [Europarl7](http://www.statmt.org/europarl/) | Translation | 47,761,381 | 2,323,099 | [None](http://www.statmt.org/europarl/) | ❌ |
-| [ParaCrawl5](https://paracrawl.eu/) | Translation | - | - | [CC0](https://paracrawl.eu/releases.html) | ❌ |
-| [WikiANN](#wikiann) | NER | 832.901 | 95.924 | [ODC-BY 1.0](http://nlp.cs.rpi.edu/wikiann/) | ✔️ |
-| [UD-DDT (DaNE)](#dane) | DEP, POS, NER | 100,733 | 5,512 | [CC BY-SA 4.0](https://github.com/UniversalDependencies/UD_Danish-DDT/blob/master/README.md) | ✔️ |
-| [LCC Sentiment](#lcc-sentiment) | Sentiment | 10.588 | 499 | [CC BY](https://github.com/fnielsen/lcc-sentiment/blob/master/LICENSE) | ✔️ |
-| [Europarl Sentiment1](#europarl-sentiment1) | Sentiment | 3.359 | 184 | None | ✔️ |
-| [Europarl Sentiment2](#europarl-sentiment2) | sentiment | | 957 | CC BY-SA 4.0 | ✔️ |
-| [Wikipedia](https://dumps.wikimedia.org/dawiki/latest/) | Raw | - | - | [CC BY-SA 3.0](https://dumps.wikimedia.org/legal.html) | ❌ |
-| [WordSim-353](#wordsim-353) | Word Similarity | 353 | - | [CC BY 4.0](https://github.com/fnielsen/dasem/blob/master/dasem/data/wordsim353-da/LICENSE) | ✔️ |
-| [Danish Similarity Dataset](#danish-similarity-dataset) | Word Similarity | 99 | - | [CC BY 4.0](https://github.com/fnielsen/dasem/blob/master/dasem/data/wordsim353-da/LICENSE) | ✔️ |
-| [Twitter Sentiment](#twitter-sentiment) | Sentiment | - | train: 1215, test: 512 | Twitter privacy policy applies | ✔️ |
-| [Dacoref](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#dacoref) | coreference resolution | 64.076 (tokens) | 3.403 | GNU Public License version 2 | ✔️ |
+| Dataset | Task | Words | Sents | License | DaNLP |
+| --- | --- | --- | --- | --- | --- |
+| [OpenSubtitles2018]() | Translation | 206,700,000 | 30,178,452 | [None](http://opus.nlpl.eu/OpenSubtitles2018.php) | ❌ |
+| [EU Bookshop](http://opus.nlpl.eu/EUbookshop-v2.php) | Translation | 208,175,843 | 8,650,537 | - | ❌ |
+| [Europarl7](http://www.statmt.org/europarl/) | Translation | 47,761,381 | 2,323,099 | [None](http://www.statmt.org/europarl/) | ❌ |
+| [ParaCrawl5](https://paracrawl.eu/) | Translation | - | - | [CC0](https://paracrawl.eu/releases.html) | ❌ |
+| [WikiANN](#wikiann) | NER | 832,901 | 95,924 | [ODC-BY 1.0](http://nlp.cs.rpi.edu/wikiann/) | ✔️ |
+| [UD-DDT (DaNE)](#dane) | DEP, POS, NER | 100,733 | 5,512 | [CC BY-SA 4.0](https://github.com/UniversalDependencies/UD_Danish-DDT/blob/master/README.md) | ✔️ |
+| [LCC Sentiment](#lcc-sentiment) | Sentiment | 10,588 | 499 | [CC BY](https://github.com/fnielsen/lcc-sentiment/blob/master/LICENSE) | ✔️ |
+| [Europarl Sentiment1](#europarl-sentiment1) | Sentiment | 3,359 | 184 | None | ✔️ |
+| [Europarl Sentiment2](#europarl-sentiment2) | Sentiment | - | 957 | CC BY-SA 4.0 | ✔️ |
+| [Wikipedia](https://dumps.wikimedia.org/dawiki/latest/) | Raw | - | - | [CC BY-SA 3.0](https://dumps.wikimedia.org/legal.html) | ❌ |
+| [WordSim-353](#wordsim-353) | Word Similarity | 353 | - | [CC BY 4.0](https://github.com/fnielsen/dasem/blob/master/dasem/data/wordsim353-da/LICENSE) | ✔️ |
+| [Danish Similarity Dataset](#danish-similarity-dataset) | Word Similarity | 99 | - | [CC BY 4.0](https://github.com/fnielsen/dasem/blob/master/dasem/data/wordsim353-da/LICENSE) | ✔️ |
+| [Twitter Sentiment](#twitter-sentiment) | Sentiment | - | train: 1215, test: 512 | Twitter privacy policy applies | ✔️ |
+| [Dacoref](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#dacoref) | Coreference resolution | 64,076 (tokens) | 3,403 | GNU Public License version 2 | ✔️ |
+| [DanNet](#dannet) | WordNet | 66,308 (concepts) | - | [DanNet license](https://cst.ku.dk/projekter/dannet/license.txt) | ✔️ |
 
 It is also recommended to check out Finn Årup Nielsen's [dasem github](https://github.com/fnielsen/dasem), which also provides scripts for loading various Danish corpora.
 
@@ -147,11 +148,78 @@ df = lccsent.load_with_pandas()
 ```
 
+### DanNet
+
+[DanNet](https://cst.ku.dk/projekter/dannet/) is a lexical database for Danish, similar to [WordNet](https://wordnet.princeton.edu/) for English. It is built by the Centre for Language Technology ("Center for Sprogteknologi") at the University of Copenhagen; see Pedersen et al. (2009) for details.
+
+DanNet describes the relations between Danish words (mostly nouns, verbs and adjectives). As in WordNet, the main relation among words is synonymy.
+
+The dataset consists of 4 databases:
+
+ * words
+ * word senses
+ * relations
+ * synsets
+
+A synset is a set of synonyms, i.e. words that share the same meaning. DanNet uses synsets to link words together: every word in the database belongs to one or more synsets.
+
+To download DanNet through DaNLP, you can do:
+
+```python
+from danlp.datasets import DanNet
+
+dannet = DanNet()
+
+# you can load the databases if you want to explore them by yourself
+words, wordsenses, relations, synsets = dannet.load_with_pandas()
+```
+
+We also provide helper functions for searching the databases for synonyms, hypernyms and hyponyms.
+Once you have initialized the DanNet wrapper, you can use the following features:
+
+```python
+word = "myre"
+# synonyms
+dannet.synonyms(word)
+""" ['tissemyre'] """
+# hypernyms
+dannet.hypernyms(word)
+""" ['årevingede insekter'] """
+# hyponyms
+dannet.hyponyms(word)
+""" ['hærmyre', 'skovmyre', 'pissemyre', 'tissemyre'] """
+# meanings
+dannet.meanings(word)
+""" ['ca. 1 cm langt, årevinget insekt med en kraftig in ... (Brug: "Myrer på terrassen, og andre steder udendørs, kan hurtigt blive meget generende")'] """
+
+# to help you dive into the databases
+# we also provide the following functions:
+
+# part of speech (returns a list of tags among 'Noun', 'Verb' and 'Adjective')
+dannet.pos(word)
+# wordnet relations (EuroWordNet or WordNet OWL names)
+dannet.wordnet_relations(word, eurowordnet=True)
+# word ids
+dannet._word_ids(word)
+# synset ids
+dannet._synset_ids(word)
+# word from id
+dannet._word_from_id(11034863)
+# synset from id
+dannet._synset_from_id(3514)
+```
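+
+The wrapper also implements `__str__`, so printing it gives summary statistics, and the helpers compose naturally, e.g. for walking one step further up the taxonomy. (An illustrative sketch; the exact figures and word forms depend on the DanNet release you download:)
+
+```python
+# summary statistics of the loaded databases
+print(dannet)
+
+# climb the taxonomy: list the hypernyms of each hypernym of "myre"
+for hyper in dannet.hypernyms("myre"):
+    print(hyper, "->", dannet.hypernyms(hyper))
+```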
+
+
 ## 🎓 References
 - Johannsen, Anders, Martínez Alonso, Héctor and Plank, Barbara. "Universal Dependencies for Danish". TLT14, 2015.
 - Keson, Britt (1998). Documentation of The Danish Morpho-syntactically Tagged PAROLE Corpus. Technical report, DSL
 - Matthias T. Buch-Kromann, Line Mikkelsen, and Stine Kern Lynge. 2003. "Danish dependency treebank". In **TLT**.
 - Rasmus Hvingelby, Amalie B. Pauli, Maria Barrett, Christina Rosted, Lasse M. Lidegaard and Anders Søgaard. 2020. DaNE: A Named Entity Resource for Danish. In **LREC**.
+- Bolette S. Pedersen, Sanni Nimb, Jørg Asmussen, Nicolai H. Sørensen, Lars Trap-Jensen and Henrik Lorentzen. 2009. [DanNet – the challenge of compiling a WordNet for Danish by reusing a monolingual dictionary](https://pdfs.semanticscholar.org/6891/69de00c63d58bd68229cb0b3469a617f5ab3.pdf). In **Language Resources & Evaluation**, 43:269–299.
 - Xiaoman Pan, Boliang Zhang, Jonathan May, Joel Nothman, Kevin Knight and Heng Ji. 2017. [Cross-lingual Name Tagging and Linking for 282 Languages](https://aclweb.org/anthology/P17-1178). In **ACL**.
 - Lev Finkelstein, Evgeniy Gabrilovich, Yossi Matias, Ehud Rivlin, Zach Solan, Gadi Wolfman, and Eytan Ruppin. 2002. [Placing Search in Context: The Concept Revisited](http://www.cs.technion.ac.il/~gabr/papers/tois_context.pdf). In **ACM TOIS**.
 - Uwe Quasthoff, Matthias Richter and Christian Biemann. 2006. [Corpus Portal for Search in Monolingual Corpora](https://www.aclweb.org/anthology/L06-1396/). In **LREC**.
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index 23f25de..acd74e9 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -7,7 +7,7 @@
 from pyconll.unit import Conll
 from spacy.gold import GoldCorpus
 
-from danlp.datasets import DDT, WikiAnn, DATASETS, DSD, EuroparlSentiment1,EuroparlSentiment2, LccSentiment, TwitterSent, Dacoref
+from danlp.datasets import DDT, WikiAnn, DATASETS, DSD, EuroparlSentiment1, EuroparlSentiment2, LccSentiment, TwitterSent, Dacoref, DanNet
 from danlp.datasets.word_sim import WordSim353Da
 from danlp.utils import write_simple_ner_dataset, read_simple_ner_dataset
 
@@ -167,7 +167,17 @@ def test_dacoreg(self):
         self.assertEqual(len(corpus), 3)
         self.assertEqual(len(corpus[0])+len(corpus[1])+len(corpus[2]), 3403)
         self.assertEqual(corpus[0][0][0]['form'], 'På')
-
-
+
+
+class TestDanNetDataset(unittest.TestCase):
+    def test_dannet(self):
+        dannet = DanNet()
+        corpus = dannet.load_with_pandas()
+        self.assertEqual(len(corpus), 4)
+        self.assertEqual(dannet.synonyms('kat'), ['missekat', 'mis'])
+        self.assertEqual(dannet.hypernyms('myre'), ['årevingede insekter'])
+        self.assertEqual(dannet.hyponyms('myre'), ['hærmyre', 'skovmyre', 'pissemyre', 'tissemyre'])
+        self.assertEqual(dannet.pos('myre'), ['Noun'])
+        self.assertEqual(dannet.meanings('myre'), ['ca. 1 cm langt, årevinget insekt med en kraftig in ... (Brug: "Myrer på terrassen, og andre steder udendørs, kan hurtigt blive meget generende")'])
+
 if __name__ == '__main__':
     unittest.main()
\ No newline at end of file
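Reviewer note: the lookup methods added in `dannet.py` all follow the same join pattern across the tables (`words` → `wordsenses` → back to `words`). A self-contained toy sketch of that traversal, using made-up rows with the same schema the loader builds from `words.csv` and `wordsenses.csv` (`toy_synonyms` and both frames are hypothetical stand-ins, not part of the PR):

```python
import pandas as pd

# hypothetical stand-ins mirroring the columns of words.csv and wordsenses.csv
words = pd.DataFrame({'word_id': ['1', '2', '3'],
                      'form': ['hav', 'sø', 'ocean'],
                      'pos': ['Noun', 'Noun', 'Noun']})
wordsenses = pd.DataFrame({'word_id': ['1', '2', '3'],
                           'synset_id': ['10', '10', '10']})

def toy_synonyms(word):
    # word form -> word ids
    word_ids = words[words['form'] == word]['word_id'].tolist()
    # word ids -> synset ids, via the wordsenses (membership) table
    synset_ids = wordsenses[wordsenses['word_id'].isin(word_ids)]['synset_id'].tolist()
    # other members of the same synsets -> synonym forms
    syn_ids = wordsenses[wordsenses['synset_id'].isin(synset_ids)
                         & ~wordsenses['word_id'].isin(word_ids)]['word_id'].tolist()
    return words[words['word_id'].isin(syn_ids)]['form'].tolist()

print(toy_synonyms('hav'))  # ['sø', 'ocean'] -- matches the docstring example
```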