Commit: Merge 0973450 into 6ca5d68

ophelielacroix committed Nov 13, 2020
2 parents 6ca5d68 + 0973450 · commit f6040eb
Showing 40 changed files with 1,162 additions and 223 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -125,3 +125,7 @@ dmypy.json

# PyCharm
.idea

# readthedocs
docs/_build
docs/make.bat
32 changes: 23 additions & 9 deletions danlp/datasets/ddt.py
@@ -14,11 +14,17 @@ def _any_part_exist(parts: list):

class DDT:
"""
Class for loading the Danish Dependency Treebank (DDT) through several frameworks/formats.
The DDT dataset has been annotated with NER tags in the IOB2 format.
The dataset is downloaded in CoNLL-U format, but with this class
it can be converted to spaCy format or a simple NER format
similar to the CoNLL 2003 NER format.
:param str cache_dir: the directory for storing cached models
:param bool verbose: `True` to increase verbosity
"""
def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
self.dataset_name = 'ddt'
@@ -27,9 +33,10 @@ def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):

def load_as_conllu(self, predefined_splits: bool = False):
"""
Load the DDT in CoNLL-U format.
:param predefined_splits:
:return A single pyconll.Conll
:param bool predefined_splits:
:return: A single pyconll.Conll
or a tuple of (train, dev, test) pyconll.Conll
depending on predefined_splits
"""
@@ -75,16 +82,20 @@ def load_as_simple_ner(self, predefined_splits: bool = False):

def load_with_flair(self, predefined_splits: bool = False):
"""
Load the DDT with flair.
This function is inspired by the "Reading Your Own Sequence Labeling Dataset" section of Flair's tutorial
on reading corpora:
https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_6_CORPUS.md
TODO: Make a pull request to flair similar to this:
https://github.com/zalandoresearch/flair/issues/383
:param predefined_splits:
:type predefined_splits: bool
:return: ColumnCorpus
.. note:: TODO: Make a pull request to flair similar to this:
https://github.com/zalandoresearch/flair/issues/383
"""

from flair.data import Corpus
@@ -112,11 +123,14 @@ def load_with_flair(self, predefined_splits: bool = False):

def load_with_spacy(self):
"""
Converts the conllu files to json in the spaCy format.
Loads the DDT with spaCy.
This function converts the conllu files to json in the spaCy format.
:return: GoldCorpus
Not using jsonl because of:
https://github.com/explosion/spaCy/issues/3523
:return:
.. note:: Not using jsonl because of:
https://github.com/explosion/spaCy/issues/3523
"""
import srsly
from spacy.cli.converters import conllu2json
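A minimal usage sketch for the DDT loaders above, assuming the danlp package is installed, that the DDT class is importable from danlp.datasets, and that the dataset can be downloaded to the default cache:

from danlp.datasets import DDT

ddt = DDT()

# CoNLL-U: a single pyconll.Conll, or a (train, dev, test) tuple with predefined_splits=True
conllu = ddt.load_as_conllu()
train, dev, test = ddt.load_as_conllu(predefined_splits=True)

# Flair: a ColumnCorpus carrying the NER tags in IOB2 format
flair_corpus = ddt.load_with_flair()

# spaCy: a GoldCorpus built from the CoNLL-U files converted to json
spacy_corpus = ddt.load_with_spacy()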
65 changes: 55 additions & 10 deletions danlp/datasets/sentiment.py
@@ -7,6 +7,12 @@
from danlp.utils import extract_single_file_from_zip

class EuroparlSentiment1:
"""
Class for loading the Europarl Sentiment dataset.
:param str cache_dir: the directory for storing cached models
"""

def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
self.dataset_name = 'europarl.sentiment1'
@@ -16,27 +22,49 @@ def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
self.file_path = os.path.join(self.dataset_dir, self.dataset_name + self.file_extension)

def load_with_pandas(self):
""" Load and drop duplicates and nan values"""
"""
Loads the dataset into a dataframe
and drops duplicates and NaN values
:return: a dataframe
"""

df = pd.read_csv(self.file_path, sep=',', index_col=0, encoding='utf-8')

df = df[['valence', 'text']].dropna()
return df.drop_duplicates()

class EuroparlSentiment2:

"""
Class for loading the Europarl Sentiment dataset.
:param str cache_dir: the directory for storing cached models
"""
def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
self.dataset_name = 'europarl.sentiment2'
self.dataset_dir = download_dataset(self.dataset_name, cache_dir=cache_dir, process_func=_unzip_process_func)
self.file_path = os.path.join(cache_dir, self.dataset_name + '.csv')

def load_with_pandas(self):

"""
Loads the dataset as a dataframe
:return: a dataframe
"""
return pd.read_csv(self.file_path, sep=',', encoding='utf-8')



class LccSentiment:
"""
Class for loading the LCC Sentiment dataset.
:param str cache_dir: the directory for storing cached models
"""
def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
self.dataset_name1 = 'lcc1.sentiment'
self.file_extension1 = DATASETS[self.dataset_name1]['file_extension']
@@ -51,7 +79,13 @@ def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
self.file_path2 = os.path.join(self.dataset_dir2, self.dataset_name2 + self.file_extension2)

def load_with_pandas(self):
""" Load, combine and drop duplicates and nan values """
"""
Loads the two LCC files into a dataframe,
combines them and drops duplicates and NaN values
:return: a dataframe
"""

df1 = pd.read_csv(self.file_path1, sep=',', encoding='utf-8')
df2 = pd.read_csv(self.file_path2, sep=',', encoding='utf-8')
@@ -63,19 +97,30 @@ def load_with_pandas(self):


class TwitterSent:

def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR, force: bool =False):
"""
Class for loading the Twitter Sentiment dataset.
:param str cache_dir: the directory for storing cached models
"""
def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
self.dataset_name = 'twitter.sentiment'

self.dataset_dir = download_dataset(self.dataset_name, cache_dir=cache_dir, process_func=_twitter_data_process_func)
self.file_path = os.path.join(cache_dir, self.dataset_name + '.csv')

def load_with_pandas(self):
"""
Loads the dataset in a dataframe.
:return: a dataframe of the test set and a dataframe of the train set
"""
df=pd.read_csv(self.file_path, sep=',', encoding='utf-8')
return df[df['part'] == 'test'].drop(columns=['part']), df[df['part'] == 'train'].drop(columns=['part'])


def lookup_tweets(tweet_ids, api):
def _lookup_tweets(tweet_ids, api):
import tweepy
full_tweets = []
tweet_count = len(tweet_ids)
@@ -97,7 +142,7 @@ def _twitter_data_process_func(tmp_file_path: str, meta_info: dict,
verbose: bool = True):
from zipfile import ZipFile

twitter_api = construct_twitter_api_connection()
twitter_api = _construct_twitter_api_connection()

model_name = meta_info['name']
full_path = os.path.join(cache_dir, model_name) + meta_info['file_extension']
@@ -110,7 +155,7 @@ def _twitter_data_process_func(tmp_file_path: str, meta_info: dict,

twitter_ids = list(df['twitterid'])

full_t = lookup_tweets(twitter_ids, twitter_api)
full_t = _lookup_tweets(twitter_ids, twitter_api)
tweet_texts = [[tweet.id, tweet.full_text] for tweet in full_t]
tweet_ids, t_texts = list(zip(*tweet_texts))
tweet_texts_df = pd.DataFrame({'twitterid': tweet_ids, 'text': t_texts})
@@ -127,7 +172,7 @@ def _twitter_data_process_func(tmp_file_path: str, meta_info: dict,
print("Downloaded {} out of {} tweets".format(len(full_t), len(twitter_ids)))


def construct_twitter_api_connection():
def _construct_twitter_api_connection():
if not('TWITTER_CONSUMER_KEY' in os.environ
and 'TWITTER_CONSUMER_SECRET' in os.environ
and 'TWITTER_ACCESS_TOKEN' in os.environ
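A minimal usage sketch for the sentiment loaders above, assuming these classes are exported from danlp.datasets; note that TwitterSent additionally requires Twitter API credentials (TWITTER_CONSUMER_KEY, TWITTER_CONSUMER_SECRET, TWITTER_ACCESS_TOKEN, ...) in the environment so the tweet texts can be fetched:

from danlp.datasets import EuroparlSentiment1, EuroparlSentiment2, LccSentiment, TwitterSent

europarl1_df = EuroparlSentiment1().load_with_pandas()  # 'valence' and 'text' columns, duplicates/NaN dropped
europarl2_df = EuroparlSentiment2().load_with_pandas()
lcc_df = LccSentiment().load_with_pandas()              # the two LCC files combined
test_df, train_df = TwitterSent().load_with_pandas()    # split on the 'part' column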
17 changes: 16 additions & 1 deletion danlp/datasets/wiki_ann.py
@@ -5,13 +5,25 @@


class WikiAnn:
"""
Class for loading the WikiANN dataset.
:param str cache_dir: the directory for storing cached models
"""
def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
self.dataset_name = 'wikiann'
self.file_extension = DATASETS[self.dataset_name]['file_extension']

self.dataset_dir = download_dataset(self.dataset_name, process_func=_wikiann_process_func, cache_dir=cache_dir)

def load_with_flair(self, predefined_splits: bool = False):
"""
Loads the dataset with flair.
:param bool predefined_splits:
:return: ColumnCorpus
"""
from flair.data import Corpus
from flair.datasets import ColumnCorpus

@@ -23,13 +35,16 @@ def load_with_flair(self, predefined_splits: bool = False):

def load_with_spacy(self):
"""
Loads the dataset with spaCy.
This function will convert the CoNLL02/03 format to json format for spaCy.
As the function returns a spacy.gold.GoldCorpus, which needs a dev set,
it also splits the dataset into a 70/30 train/dev split, as done by
Pan et al. (2017).
- Pan et al. (2017): https://aclweb.org/anthology/P17-1178
:return:
:return: GoldCorpus
"""
import srsly
from spacy.cli.converters import conll_ner2json
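A minimal usage sketch for WikiAnn, assuming danlp plus flair and spaCy are installed and the class is importable from danlp.datasets:

from danlp.datasets import WikiAnn

wikiann = WikiAnn()
flair_corpus = wikiann.load_with_flair()  # ColumnCorpus with the NER annotations
spacy_corpus = wikiann.load_with_spacy()  # GoldCorpus with the 70/30 train/dev split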
33 changes: 33 additions & 0 deletions danlp/datasets/word_sim.py
@@ -6,7 +6,13 @@


class WordSim353Da:
"""
Class for loading the WordSim-353 dataset.
:param str cache_dir: the directory for storing cached models
"""
def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
self.dataset_name = 'wordsim353.da'
self.file_extension = DATASETS[self.dataset_name]['file_extension']
@@ -15,9 +21,19 @@ def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
self.file_path = os.path.join(self.dataset_dir, self.dataset_name + self.file_extension)

def load_with_pandas(self):
"""
Loads the dataset in a dataframe.
:return: a dataframe
"""
return pd.read_csv(self.file_path)

def words(self) -> set:
"""
Loads the vocabulary.
:rtype: set
"""
df = self.load_with_pandas()
return set(df['da1']) | set(df['da2'])

@@ -36,6 +52,13 @@ def _word_sim_process_func(tmp_file_path: str, meta_info: dict, cache_dir: str =


class DSD:
"""
Class for loading the Danish Similarity Dataset (DSD).
:param str cache_dir: the directory for storing cached models
"""
def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
self.dataset_name = 'dsd'
self.file_extension = DATASETS[self.dataset_name]['file_extension']
@@ -44,8 +67,18 @@ def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
self.file_path = os.path.join(self.dataset_dir, self.dataset_name + self.file_extension)

def load_with_pandas(self):
"""
Loads the dataset in a dataframe.
:return: a dataframe
"""
return pd.read_csv(self.file_path, delimiter="\t")

def words(self) -> set:
"""
Loads the vocabulary.
:rtype: set
"""
df = self.load_with_pandas()
return set(df['word1']) | set(df['word2'])
