Skip to content

Commit

Permalink
Merge 0584ca9 into bf48772
Browse files Browse the repository at this point in the history
  • Loading branch information
ophelielacroix committed Dec 7, 2020
2 parents bf48772 + 0584ca9 commit 37e80eb
Show file tree
Hide file tree
Showing 5 changed files with 326 additions and 23 deletions.
3 changes: 2 additions & 1 deletion danlp/datasets/__init__.py
Expand Up @@ -2,4 +2,5 @@
from .wiki_ann import *
from .word_sim import *
from .sentiment import *
from .dacoref import *
from .dacoref import *
from .dannet import *
214 changes: 214 additions & 0 deletions danlp/datasets/dannet.py
@@ -0,0 +1,214 @@
import os
import pandas as pd
import json

from danlp.download import DEFAULT_CACHE_DIR, download_dataset, _unzip_process_func, DATASETS


class DanNet():
    """
    DanNet wrapper, providing functions to access the main features of DanNet.
    See also: https://cst.ku.dk/projekter/dannet/.

    DanNet consists of a set of 4 databases:

        * words
        * word senses
        * relations
        * synsets

    :param str cache_dir: the directory for storing cached models
    :param bool verbose: `True` to increase verbosity
    """

    def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):

        self.dataset_name = 'dannet'
        self.file_extension = DATASETS[self.dataset_name]['file_extension']

        self.dataset_dir = download_dataset(self.dataset_name, process_func=_unzip_process_func, cache_dir=cache_dir)

        # The four '@'-separated CSV databases. Ids are loaded as strings so
        # they can be matched across dataframes without int/str mismatches.
        self.words = pd.read_csv(os.path.join(self.dataset_dir, "words.csv"),
                                 sep='@',
                                 names=['word_id', 'form', 'pos', 'nan'],
                                 encoding='unicode_escape',
                                 usecols=[0, 1, 2],
                                 dtype={'word_id': str})
        self.wordsenses = pd.read_csv(os.path.join(self.dataset_dir, "wordsenses.csv"),
                                      sep='@',
                                      names=['wordsense_id', 'word_id', 'synset_id', 'register', 'nan'],
                                      encoding='unicode_escape',
                                      usecols=[1, 2],
                                      dtype={'wordsense_id': str, 'word_id': str, 'synset_id': str})
        self.relations = pd.read_csv(os.path.join(self.dataset_dir, "relations.csv"),
                                     sep='@',
                                     names=['synset_id', 'wordnetowl', 'relation', 'value', 'taxonomic', 'inheritance_comment', 'nan'],
                                     encoding='unicode_escape',
                                     usecols=[0, 1, 2, 3, 4, 5],
                                     dtype={'synset_id': str, 'value': str})
        self.synsets = pd.read_csv(os.path.join(self.dataset_dir, "synsets.csv"),
                                   sep='@',
                                   names=['synset_id', 'label', 'gloss', 'ontological_type'],
                                   encoding='unicode_escape',
                                   usecols=[0, 1, 2, 3],
                                   dtype={'synset_id': str})

    def load_with_pandas(self):
        """
        Loads the datasets in 4 dataframes

        :return: 4 dataframes: words, wordsenses, relations, synsets
        """
        return self.words, self.wordsenses, self.relations, self.synsets

    def synonyms(self, word, pos=None):
        """
        Returns the synonyms of `word`, i.e. the other words belonging to the
        same synset(s) as `word`.

        :param word: text
        :param pos: (list of) part of speech tag(s) (in "Noun", "Verb", "Adjective")
        :return: list of synonyms

        :Example:

            "`hav`"
            returns
            ["sø", "ocean"]
        """
        word_ids = self._word_ids(word, pos)
        synset_ids = self._synset_ids(word, pos)
        # Words sharing a synset with `word`, excluding `word` itself.
        synonym_ids = self.wordsenses[self.wordsenses['synset_id'].isin(synset_ids) & ~self.wordsenses['word_id'].isin(word_ids)]['word_id'].tolist()
        synonyms = self.words[self.words['word_id'].isin(synonym_ids)]['form'].tolist()
        return synonyms

    def meanings(self, word, pos=None):
        """
        Returns the meanings (glosses) of `word`.

        :param word: text
        :param pos: (list of) part of speech tag(s) (in "Noun", "Verb", "Adjective")
        :return: list of meanings
        """
        synset_ids = self._synset_ids(word, pos)
        meanings = self.synsets[self.synsets['synset_id'].isin(synset_ids)]['gloss'].tolist()

        return meanings

    def hypernyms(self, word, pos=None):
        """
        Returns the hypernyms of `word` (more general concepts).

        :param word: text
        :param pos: (list of) part of speech tag(s) (in "Noun", "Verb", "Adjective")
        :return: list of hypernyms
        """
        # Fix: forward `pos` so the POS filter is actually applied
        # (previously ignored, unlike in `hyponyms`).
        word_synset_ids = self._synset_ids(word, pos)
        # A hypernym link is stored either as `has_hyperonym` from the word's
        # synset, or as the inverse `has_hyponym` pointing to it.
        hyper_synset_ids = self.relations[self.relations['synset_id'].isin(word_synset_ids) & (self.relations['relation'] == 'has_hyperonym')]['value'].tolist()
        hyper_synset_ids += self.relations[self.relations['value'].isin(word_synset_ids) & (self.relations['relation'] == 'has_hyponym')]['synset_id'].tolist()
        # Some `value` entries are not numeric synset ids; drop them.
        hyper_synset_ids = [val for val in hyper_synset_ids if val.isdigit()]
        hypernym_ids = self.wordsenses[self.wordsenses['synset_id'].isin(hyper_synset_ids)]['word_id'].tolist()
        hypernyms = self.words[self.words['word_id'].isin(hypernym_ids)]['form'].tolist()

        return hypernyms

    def hyponyms(self, word, pos=None):
        """
        Returns the hyponyms of `word` (more specific concepts).

        :param word: text
        :param pos: (list of) part of speech tag(s) (in "Noun", "Verb", "Adjective")
        :return: list of hyponyms
        """
        word_synset_ids = self._synset_ids(word, pos)
        # A hyponym link is stored either as `has_hyponym` from the word's
        # synset, or as the inverse `has_hyperonym` pointing to it.
        hypo_synset_ids = self.relations[self.relations['synset_id'].isin(word_synset_ids) & (self.relations['relation'] == 'has_hyponym')]['value'].tolist()
        hypo_synset_ids += self.relations[self.relations['value'].isin(word_synset_ids) & (self.relations['relation'] == 'has_hyperonym')]['synset_id'].tolist()
        # Some `value` entries are not numeric synset ids; drop them.
        hypo_synset_ids = [val for val in hypo_synset_ids if val.isdigit()]
        hyponym_ids = self.wordsenses[self.wordsenses['synset_id'].isin(hypo_synset_ids)]['word_id'].tolist()
        hyponyms = self.words[self.words['word_id'].isin(hyponym_ids)]['form'].tolist()

        return hyponyms

    def wordnet_relations(self, word, pos=None, eurowordnet=True):
        """
        Returns the name of the relations `word` is associated with.

        :param word: text
        :param pos: (list of) part of speech tag(s) (in "Noun", "Verb", "Adjective")
        :param eurowordnet: `True` for EuroWordNet relation names, `False` for WordNet-OWL names
        :return: set of relation names
        """
        if eurowordnet:
            rel_name = "relation"
        else:
            rel_name = "wordnetowl"

        synset_ids = self._synset_ids(word, pos)
        relations = self.relations[self.relations['synset_id'].isin(synset_ids)][rel_name].tolist()

        return set(relations)

    def pos(self, word):
        """
        Returns the part-of-speech tags `word` can be categorized with among "Noun", "Verb" or "Adjective".

        :param word: text
        :return: list of part-of-speech tags
        """
        return list(self.words[self.words['form'] == word]['pos'].unique())

    def _word_ids(self, word, pos=None):
        # Word lookup is case-insensitive: forms are stored lowercased.
        pos = _get_pos_list(pos)
        word = word.lower()

        return self.words[(self.words['form'] == word) & self.words['pos'].isin(pos)]['word_id'].tolist()

    def _synset_ids(self, word, pos=None):
        # Ids of the synsets `word` belongs to (optionally filtered by POS).
        word_ids = self._word_ids(word, pos)
        return self.wordsenses[self.wordsenses['word_id'].isin(word_ids)]['synset_id'].tolist()

    def _word_from_id(self, word_id):
        """
        Returns the word form(s) for a (numeric) word id.

        :param word_id: int or digit-only str
        :return: list of word forms
        """
        # Fix: the str method is `isdigit()`, not `is_digit()` — the old code
        # raised AttributeError for any string input.
        assert isinstance(word_id, int) or (isinstance(word_id, str) and word_id.isdigit())
        word_id = str(word_id)

        return self.words[self.words['word_id'] == word_id]['form'].tolist()

    def _synset_from_id(self, synset_id):
        """
        Returns the set of word forms belonging to a (numeric) synset id.

        :param synset_id: int or digit-only str
        :return: set of word forms
        """
        # Fix: `isdigit()`, not `is_digit()` (see `_word_from_id`).
        assert isinstance(synset_id, int) or (isinstance(synset_id, str) and synset_id.isdigit())
        synset_id = str(synset_id)

        synset_labels = self.synsets[self.synsets['synset_id'] == synset_id]['label'].tolist()
        # Labels look like "{form_1; form_2; ...}" where each entry may carry a
        # "_sense" suffix; strip braces and suffixes to get the bare forms.
        return set([w.split('_')[0] for s in synset_labels for w in s[1:-1].split('; ')])

    def __str__(self):

        return "DanNet: {} word forms, {} lexemes, {} synsets".format(len(set(self.words['form'])), len(self.words['word_id']), len(set(self.wordsenses['synset_id'])))


def _get_pos_list(pos):
if pos == None:
return ['Noun', 'Verb', 'Adjective']
elif type(pos) == str:
return [pos]
assert(type(pos) == list)
return pos

12 changes: 9 additions & 3 deletions danlp/download.py
Expand Up @@ -194,15 +194,21 @@
'md5_checksum': '5e7dad9e6c8c32aa9dd17830bed5e0f6',
'size': 3489,
'file_extension': '.csv'
},
},
# coreference dataset
'dacoref': {
'url': 'http://danlp-downloads.alexandra.dk/datasets/dacoref.zip',
'md5_checksum': 'e6f2707f4f600a0d357dc7afa1b01f92',
'size': 1005278,
'file_extension': ''
},

},
# Danish Wordnet
'dannet': {
'url': DANLP_STORAGE_URL + '/datasets/dannet.zip',
'md5_checksum': 'a5aa388bb08487bd59d72257aa15d8fa',
'size': 6083044,
'file_extension': '.csv'
},

# SENTIMENT EVALUATION
'europarl.sentiment1': {
Expand Down
100 changes: 84 additions & 16 deletions docs/docs/datasets.md
Expand Up @@ -3,22 +3,23 @@ Datasets

This section keeps a list of Danish NLP datasets publicly available.

| Dataset | Task | Words | Sents | License | DaNLP |
| ------------------------------------------------------------ | ---------------------- | --------------- | ---------------------- | ------------------------------------------------------------ | ----- |
| [OpenSubtitles2018](<http://opus.nlpl.eu/OpenSubtitles2018.php>) | Translation | 206,700,000 | 30,178,452 | [None](http://opus.nlpl.eu/OpenSubtitles2018.php) ||
| [EU Bookshop](http://opus.nlpl.eu/EUbookshop-v2.php) | Translation | 208,175,843 | 8,650,537 | - ||
| [Europarl7](http://www.statmt.org/europarl/) | Translation | 47,761,381 | 2,323,099 | [None](http://www.statmt.org/europarl/) ||
| [ParaCrawl5](https://paracrawl.eu/) | Translation | - | - | [CC0](https://paracrawl.eu/releases.html) ||
| [WikiANN](#wikiann) | NER | 832.901 | 95.924 | [ODC-BY 1.0](http://nlp.cs.rpi.edu/wikiann/) | ✔️ |
| [UD-DDT (DaNE)](#dane) | DEP, POS, NER | 100,733 | 5,512 | [CC BY-SA 4.0](https://github.com/UniversalDependencies/UD_Danish-DDT/blob/master/README.md) | ✔️ |
| [LCC Sentiment](#lcc-sentiment) | Sentiment | 10.588 | 499 | [CC BY](https://github.com/fnielsen/lcc-sentiment/blob/master/LICENSE) | ✔️ |
| [Europarl Sentiment1](#europarl-sentiment1) | Sentiment | 3.359 | 184 | None | ✔️ |
| [Europarl Sentiment2](#europarl-sentiment2) | sentiment | | 957 | CC BY-SA 4.0 | ✔️ |
| [Wikipedia](https://dumps.wikimedia.org/dawiki/latest/) | Raw | - | - | [CC BY-SA 3.0](https://dumps.wikimedia.org/legal.html) ||
| [WordSim-353](#wordsim-353) | Word Similarity | 353 | - | [CC BY 4.0](https://github.com/fnielsen/dasem/blob/master/dasem/data/wordsim353-da/LICENSE) | ✔️ |
| [Danish Similarity Dataset](#danish-similarity-dataset) | Word Similarity | 99 | - | [CC BY 4.0](https://github.com/fnielsen/dasem/blob/master/dasem/data/wordsim353-da/LICENSE) | ✔️ |
| [Twitter Sentiment](#twitter-sentiment) | Sentiment | - | train: 1215, test: 512 | Twitter privacy policy applies | ✔️ |
| [Dacoref](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#dacoref) | coreference resolution | 64.076 (tokens) | 3.403 | GNU Public License version 2 | ✔️ |
| Dataset | Task | Words | Sents | License | DaNLP |
| ------------------------------------------------------------ | ---------------------- | ----------------- | ---------------------- | ------------------------------------------------------------ | ----- |
| [OpenSubtitles2018](<http://opus.nlpl.eu/OpenSubtitles2018.php>) | Translation | 206,700,000 | 30,178,452 | [None](http://opus.nlpl.eu/OpenSubtitles2018.php) ||
| [EU Bookshop](http://opus.nlpl.eu/EUbookshop-v2.php) | Translation | 208,175,843 | 8,650,537 | - ||
| [Europarl7](http://www.statmt.org/europarl/) | Translation | 47,761,381 | 2,323,099 | [None](http://www.statmt.org/europarl/) ||
| [ParaCrawl5](https://paracrawl.eu/) | Translation | - | - | [CC0](https://paracrawl.eu/releases.html) ||
| [WikiANN](#wikiann) | NER | 832.901 | 95.924 | [ODC-BY 1.0](http://nlp.cs.rpi.edu/wikiann/) | ✔️ |
| [UD-DDT (DaNE)](#dane) | DEP, POS, NER | 100,733 | 5,512 | [CC BY-SA 4.0](https://github.com/UniversalDependencies/UD_Danish-DDT/blob/master/README.md) | ✔️ |
| [LCC Sentiment](#lcc-sentiment) | Sentiment | 10.588 | 499 | [CC BY](https://github.com/fnielsen/lcc-sentiment/blob/master/LICENSE) | ✔️ |
| [Europarl Sentiment1](#europarl-sentiment1) | Sentiment | 3.359 | 184 | None | ✔️ |
| [Europarl Sentiment2](#europarl-sentiment2) | sentiment | | 957 | CC BY-SA 4.0 | ✔️ |
| [Wikipedia](https://dumps.wikimedia.org/dawiki/latest/) | Raw | - | - | [CC BY-SA 3.0](https://dumps.wikimedia.org/legal.html) ||
| [WordSim-353](#wordsim-353) | Word Similarity | 353 | - | [CC BY 4.0](https://github.com/fnielsen/dasem/blob/master/dasem/data/wordsim353-da/LICENSE) | ✔️ |
| [Danish Similarity Dataset](#danish-similarity-dataset) | Word Similarity | 99 | - | [CC BY 4.0](https://github.com/fnielsen/dasem/blob/master/dasem/data/wordsim353-da/LICENSE) | ✔️ |
| [Twitter Sentiment](#twitter-sentiment) | Sentiment | - | train: 1215, test: 512 | Twitter privacy policy applies | ✔️ |
| [Dacoref](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#dacoref) | coreference resolution | 64.076 (tokens) | 3.403 | GNU Public License version 2 | ✔️ |
| [DanNet](#dannet) | Wordnet | 66.308 (concepts) | - | [license](https://cst.ku.dk/projekter/dannet/license.txt) | ✔️ |

It is also recommended to check out Finn Årup Nielsen's [dasem github](https://github.com/fnielsen/dasem), which also provides scripts for loading different Danish corpora.

Expand Down Expand Up @@ -147,11 +148,78 @@ df = lccsent.load_with_pandas()
```


### DanNet

[DanNet](https://cst.ku.dk/projekter/dannet/) is a lexical database similar to [WordNet](https://wordnet.princeton.edu/). It is maintained by the "Center for sprogteknologi" at the University of Copenhagen; more details about it can be found in the paper by Pedersen et al. (2009).

DanNet depicts the relations between words in Danish (mostly nouns, verbs and adjectives).
The main relation among words in WordNet is synonymy.

The dataset consists of 4 databases:

* words
* word senses
* relations
* synsets

DanNet uses the concept of `synset` to link words together. All the words in the database are part of one or multiple synsets. A synset is a set of synonyms (words which have the same meanings).


For downloading DanNet through DaNLP, you can do:

```python
from danlp.datasets import DanNet

dannet = DanNet()

# you can load the databases if you want to look into the databases by yourself
words, wordsenses, relations, synsets = dannet.load_with_pandas()
```

We also provide helper functions to search for synonyms, hypernyms and hyponyms through the databases.
Once you have initialized the DanNet wrapper, you can use the following features:

```python

word = "myre"
# synonyms
dannet.synonyms(word)
""" ['tissemyre'] """
# hypernyms
dannet.hypernyms(word)
""" ['årevingede insekter'] """
# hyponyms
dannet.hyponyms(word)
""" ['hærmyre', 'skovmyre', 'pissemyre', 'tissemyre'] """
# meanings
dannet.meanings(word)
""" ['ca. 1 cm langt, årevinget insekt med en kraftig in ... (Brug: "Myrer på terrassen, og andre steder udendørs, kan hurtigt blive meget generende")'] """


# to help you dive into the databases
# we also provide the following functions:

# part-of-speech (returns a list comprised in 'Noun', 'Verb' or 'Adjective')
dannet.pos(word)
# wordnet relations (EUROWORDNET or WORDNETOWL)
dannet.wordnet_relations(word, eurowordnet=True)
# word ids
dannet._word_ids(word)
# synset ids
dannet._synset_ids(word)
# word from id
dannet._word_from_id(11034863)
# synset from id
dannet._synset_from_id(3514)
```


## 🎓 References
- Johannsen, Anders, Martínez Alonso, Héctor and Plank, Barbara. “Universal Dependencies for Danish”. TLT14, 2015.
- Keson, Britt (1998). Documentation of The Danish Morpho-syntactically Tagged PAROLE Corpus. Technical report, DSL
- Matthias T. Buch-Kromann, Line Mikkelsen, and Stine Kern Lynge. 2003. "Danish dependency treebank". In **TLT**.
- Rasmus Hvingelby, Amalie B. Pauli, Maria Barrett, Christina Rosted, Lasse M. Lidegaard and Anders Søgaard. 2020. DaNE: A Named Entity Resource for Danish. In **LREC**.
- Pedersen, Bolette S. Sanni Nimb, Jørg Asmussen, Nicolai H. Sørensen, Lars Trap-Jensen og Henrik Lorentzen (2009). [DanNet – the challenge of compiling a WordNet for Danish by reusing a monolingual dictionary](https://pdfs.semanticscholar.org/6891/69de00c63d58bd68229cb0b3469a617f5ab3.pdf). *Lang Resources & Evaluation* 43:269–299.
- Xiaoman Pan, Boliang Zhang, Jonathan May, Joel Nothman, Kevin Knight and Heng Ji. 2017. [Cross-lingual Name Tagging and Linking for 282 Languages](https://aclweb.org/anthology/P17-1178). In **ACL**.
- Lev Finkelstein, Evgeniy Gabrilovich, Yossi Matias, Ehud Rivlin, Zach Solan, Gadi Wolfman, and Eytan Ruppin. 2002. [Placing Search in Context: The Concept Revisited](http://www.cs.technion.ac.il/~gabr/papers/tois_context.pdf). In **ACM TOIS**.
- Uwe Quasthoff, Matthias Richter and Christian Biemann. 2006. [Corpus Portal for Search in Monolingual Corpora](https://www.aclweb.org/anthology/L06-1396/). In **LREC**.
Expand Down

0 comments on commit 37e80eb

Please sign in to comment.