Skip to content

Commit

Permalink
Merge 8b976dc into 33c6697
Browse files Browse the repository at this point in the history
  • Loading branch information
ophelielacroix committed Feb 21, 2022
2 parents 33c6697 + 8b976dc commit 4da9fee
Show file tree
Hide file tree
Showing 5 changed files with 80 additions and 2 deletions.
3 changes: 2 additions & 1 deletion danlp/datasets/__init__.py
Expand Up @@ -7,4 +7,5 @@
from .dkhate import *
from .unimorph import *
from .daned import *
from .dawikined import *
from .dawikined import *
from .ddisco import *
36 changes: 36 additions & 0 deletions danlp/datasets/ddisco.py
@@ -0,0 +1,36 @@
import os
import pandas as pd

from danlp.download import DEFAULT_CACHE_DIR, download_dataset, _unzip_process_func, DATASETS

class DDisco:
    """
    Class for loading the DDisco dataset.
    The DDisco dataset is annotated for discourse coherence.
    It contains user-generated texts from Reddit and Wikipedia.

    Annotation labels are:
        * 1: low coherence
        * 2: medium coherence
        * 3: high coherence

    :param str cache_dir: the directory for storing cached data
    """
    def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
        self.dataset_name = 'ddisco'
        self.file_extension = DATASETS[self.dataset_name]['file_extension']
        # Downloads and unzips the dataset on first use; afterwards it is served from cache_dir.
        self.dataset_dir = download_dataset(self.dataset_name, process_func=_unzip_process_func, cache_dir=cache_dir)

    def _load_split(self, split: str) -> pd.DataFrame:
        """Load one split ('train' or 'test') as a dataframe, dropping rows with missing values."""
        file_path = os.path.join(self.dataset_dir, self.dataset_name + '.' + split + self.file_extension)
        return pd.read_csv(file_path, sep='\t', index_col=0, encoding='utf-8').dropna()

    def load_with_pandas(self):
        """
        Loads the DDisco dataset in dataframes with pandas.

        :return: 2 dataframes -- train, test
        """
        return self._load_split('train'), self._load_split('test')
7 changes: 7 additions & 0 deletions danlp/download.py
Expand Up @@ -273,6 +273,13 @@
'size': 12244995,
'file_extension': '.tsv'
},
# DDisco
'ddisco': {
'url': DANLP_STORAGE_URL + '/datasets/ddisco.zip',
'md5_checksum': 'd55d145456699fa8be77fd0939324599',
'size': 421885,
'file_extension': '.tsv'
},

# SENTIMENT EVALUATION
'europarl.sentiment1': {
Expand Down
26 changes: 26 additions & 0 deletions docs/docs/datasets.md
Expand Up @@ -25,6 +25,8 @@ This section keeps a list of Danish NLP datasets publicly available.
| [DaUnimorph](#daunimorph) | Morphological Inflection | 25,503 | - | CC BY-SA 3.0 | ✔️ |
| [DaNED](#daned) | Named Entity Disambiguation | -- | train:4,626 dev:544 test:744 | CC BY-SA 4.0 | ✔️ |
| [DaWikiNED](#dawikined) | Named Entity Disambiguation | -- | 21,302 | CC BY-SA 4.0 | ✔️ |
| [DDisco](#ddisco) | Discourse Coherence | -- | train: 801 test: 201 | CC BY-SA 4.0 | ✔️ |


It is also recommended to check out Finn Årup Nielsen's [dasem github](https://github.com/fnielsen/dasem) which also provides scripts for loading different Danish corpora.

Expand Down Expand Up @@ -394,6 +396,30 @@ The dataset can also be downloaded directly:
[Download DaWikiNED](http://danlp-downloads.alexandra.dk/datasets/dawikined.zip)


### DDisco

The DDisco dataset has been developed for DaNLP through a Master's student project.
Each entry in the dataset is annotated with a discourse coherence label (rating from 1 to 3):

* 1: low coherence (difficult to understand, unorganized, contains unnecessary details and cannot be summarized briefly and easily)
* 2: medium coherence
* 3: high coherence (easy to understand, well organized, only contains details that support the main point and can be summarized briefly and easily).

Grammatical and typing errors are ignored (i.e. they do not affect the coherency score) and the coherence of a text is considered within its own domain.

The dataset can be loaded with the DaNLP package:

```python
from danlp.datasets import DDisco
ddisco = DDisco()
train, test = ddisco.load_with_pandas()
```

The dataset can also be downloaded directly:

[Download DDisco](http://danlp-downloads.alexandra.dk/datasets/ddisco.zip)



## 🎓 References

Expand Down
10 changes: 9 additions & 1 deletion tests/test_datasets.py
Expand Up @@ -8,7 +8,7 @@
from spacy.gold import GoldCorpus

from danlp.datasets import DDT, WikiAnn, DATASETS, DSD, EuroparlSentiment1,EuroparlSentiment2, LccSentiment, \
TwitterSent, Dacoref, DanNet, DKHate, DaUnimorph, DaNED, DaWikiNED
TwitterSent, Dacoref, DanNet, DKHate, DaUnimorph, DaNED, DaWikiNED, DDisco
from danlp.datasets.word_sim import WordSim353Da
from danlp.utils import write_simple_ner_dataset, read_simple_ner_dataset

Expand Down Expand Up @@ -239,5 +239,13 @@ def test_dawikined(self):
prop_str, _ = dawikined.get_kg_context_from_qid('Q1748')
self.assertEqual(len(prop_str), 3758)

class TestDDiscoDatasets(unittest.TestCase):
    def test_ddisco(self):
        # Download the dataset (cached after first run) and load both splits.
        dataset = DDisco()
        train_df, test_df = dataset.load_with_pandas()
        # Split sizes are fixed for this release of the corpus.
        self.assertEqual(len(train_df), 801)
        self.assertEqual(len(test_df), 201)
        # Every coherence rating from 1 (low) to 3 (high) occurs in the test split.
        self.assertEqual({1, 2, 3}, set(test_df['rating'].to_list()))

# Allow running this test module directly (e.g. `python test_datasets.py`).
if __name__ == '__main__':
    unittest.main()

0 comments on commit 4da9fee

Please sign in to comment.