Skip to content

Commit

Permalink
Merge 8b976dc into 33c6697
Browse files Browse the repository at this point in the history
  • Loading branch information
ophelielacroix committed Feb 21, 2022
2 parents 33c6697 + 8b976dc commit 4da9fee
Show file tree
Hide file tree
Showing 5 changed files with 80 additions and 2 deletions.
3 changes: 2 additions & 1 deletion danlp/datasets/__init__.py
Expand Up @@ -7,4 +7,5 @@
from .dkhate import *
from .unimorph import *
from .daned import *
from .dawikined import *
from .dawikined import *
from .ddisco import *
36 changes: 36 additions & 0 deletions danlp/datasets/ddisco.py
@@ -0,0 +1,36 @@
import os
import pandas as pd

from danlp.download import DEFAULT_CACHE_DIR, download_dataset, _unzip_process_func, DATASETS

class DDisco:
    """
    Class for loading the DDisco dataset.
    The DDisco dataset is annotated for discourse coherence.
    It contains user-generated texts from Reddit and Wikipedia.

    Annotation labels are:
        * 1: low coherence
        * 2: medium coherence
        * 3: high coherence

    :param str cache_dir: the directory for storing cached data
    """
    def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
        self.dataset_name = 'ddisco'
        self.file_extension = DATASETS[self.dataset_name]['file_extension']
        # Downloads and unzips the dataset on first use; afterwards it is served from cache_dir.
        self.dataset_dir = download_dataset(self.dataset_name, process_func=_unzip_process_func, cache_dir=cache_dir)

    def _load_split(self, split: str) -> pd.DataFrame:
        """Load one split ('train' or 'test') as a dataframe, dropping rows with missing values."""
        file_path = os.path.join(self.dataset_dir, self.dataset_name + '.' + split + self.file_extension)
        return pd.read_csv(file_path, sep='\t', index_col=0, encoding='utf-8').dropna()

    def load_with_pandas(self):
        """
        Loads the DDisco dataset in dataframes with pandas.

        :return: 2 dataframes -- train, test
        """
        return self._load_split('train'), self._load_split('test')
7 changes: 7 additions & 0 deletions danlp/download.py
Expand Up @@ -273,6 +273,13 @@
'size': 12244995,
'file_extension': '.tsv'
},
# DDisco
'ddisco': {
'url': DANLP_STORAGE_URL + '/datasets/ddisco.zip',
'md5_checksum': 'd55d145456699fa8be77fd0939324599',
'size': 421885,
'file_extension': '.tsv'
},

# SENTIMENT EVALUATION
'europarl.sentiment1': {
Expand Down
26 changes: 26 additions & 0 deletions docs/docs/datasets.md
Expand Up @@ -25,6 +25,8 @@ This section keeps a list of Danish NLP datasets publicly available.
| [DaUnimorph](#daunimorph) | Morphological Inflection | 25,503 | - | CC BY-SA 3.0 | ✔️ |
| [DaNED](#daned) | Named Entity Disambiguation | -- | train:4,626 dev:544 test:744 | CC BY-SA 4.0 | ✔️ |
| [DaWikiNED](#dawikined) | Named Entity Disambiguation | -- | 21,302 | CC BY-SA 4.0 | ✔️ |
| [DDisco](#ddisco) | Discourse Coherence | -- | train: 801 test: 201 | CC BY-SA 4.0 | ✔️ |


It is also recommended to check out Finn Årup Nielsen's [dasem github](https://github.com/fnielsen/dasem) which also provides scripts for loading different Danish corpora.

Expand Down Expand Up @@ -394,6 +396,30 @@ The dataset can also be downloaded directly:
[Download DaWikiNED](http://danlp-downloads.alexandra.dk/datasets/dawikined.zip)


### DDisco

The DDisco dataset has been developed for DaNLP through a Master's student project.
Each entry in the dataset is annotated with a discourse coherence label (rating from 1 to 3):

* 1: low coherence (difficult to understand, unorganized, contains unnecessary details and cannot be summarized briefly and easily)
* 2: medium coherence
* 3: high coherence (easy to understand, well organized, only contains details that support the main point and can be summarized briefly and easily).

Grammatical and typing errors are ignored (i.e. they do not affect the coherency score) and the coherence of a text is considered within its own domain.

The dataset can be loaded with the DaNLP package:

```python
from danlp.datasets import DDisco
ddisco = DDisco()
train, test = ddisco.load_with_pandas()
```

The dataset can also be downloaded directly:

[Download DDisco](http://danlp-downloads.alexandra.dk/datasets/ddisco.zip)



## 🎓 References

Expand Down
10 changes: 9 additions & 1 deletion tests/test_datasets.py
Expand Up @@ -8,7 +8,7 @@
from spacy.gold import GoldCorpus

from danlp.datasets import DDT, WikiAnn, DATASETS, DSD, EuroparlSentiment1,EuroparlSentiment2, LccSentiment, \
TwitterSent, Dacoref, DanNet, DKHate, DaUnimorph, DaNED, DaWikiNED
TwitterSent, Dacoref, DanNet, DKHate, DaUnimorph, DaNED, DaWikiNED, DDisco
from danlp.datasets.word_sim import WordSim353Da
from danlp.utils import write_simple_ner_dataset, read_simple_ner_dataset

Expand Down Expand Up @@ -239,5 +239,13 @@ def test_dawikined(self):
prop_str, _ = dawikined.get_kg_context_from_qid('Q1748')
self.assertEqual(len(prop_str), 3758)

class TestDDiscoDatasets(unittest.TestCase):
    def test_ddisco(self):
        # Download the dataset (cached after first run) and load both splits.
        dataset = DDisco()
        train_df, test_df = dataset.load_with_pandas()
        # Split sizes are fixed for this release of the corpus.
        self.assertEqual(len(train_df), 801)
        self.assertEqual(len(test_df), 201)
        # Every coherence rating from 1 (low) to 3 (high) occurs in the test split.
        self.assertEqual({1, 2, 3}, set(test_df['rating'].to_list()))

# Allow running this test module directly (e.g. `python test_datasets.py`).
if __name__ == '__main__':
    unittest.main()

0 comments on commit 4da9fee

Please sign in to comment.