Commit: Merge 0973450 into 6ca5d68

ophelielacroix committed Nov 13, 2020
2 parents 6ca5d68 + 0973450 · commit f6040eb
Showing 40 changed files with 1,162 additions and 223 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -125,3 +125,7 @@ dmypy.json

# PyCharm
.idea

# readthedocs
docs/_build
docs/make.bat
32 changes: 23 additions & 9 deletions danlp/datasets/ddt.py
@@ -14,11 +14,17 @@ def _any_part_exist(parts: list):

class DDT:
"""
Class for loading the Danish Dependency Treebank (DDT) through several frameworks/formats.
The DDT dataset has been annotated with NER tags in the IOB2 format.
The dataset is downloaded in CoNLL-U format, but with this class
it can be converted to spaCy format or a simple NER format
similar to the CoNLL 2003 NER format.
:param str cache_dir: the directory for storing cached models
:param bool verbose: `True` to increase verbosity
"""
def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
self.dataset_name = 'ddt'
@@ -27,9 +33,10 @@ def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):

def load_as_conllu(self, predefined_splits: bool = False):
"""
Load the DDT in CoNLL-U format.
:param predefined_splits:
:return A single pyconll.Conll
:param bool predefined_splits:
:return: A single pyconll.Conll
or a tuple of (train, dev, test) pyconll.Conll
depending on predefined_splits
"""
@@ -75,16 +82,20 @@ def load_as_simple_ner(self, predefined_splits: bool = False):

def load_with_flair(self, predefined_splits: bool = False):
"""
Load the DDT with flair.
This function is inspired by the "Reading Your Own Sequence Labeling Dataset" section of Flair's tutorial
on reading corpora:
https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_6_CORPUS.md
TODO: Make a pull request to flair similar to this:
https://github.com/zalandoresearch/flair/issues/383
:param predefined_splits:
:type predefined_splits: bool
:return: ColumnCorpus
.. note:: TODO: Make a pull request to flair similar to this:
https://github.com/zalandoresearch/flair/issues/383
"""

from flair.data import Corpus
@@ -112,11 +123,14 @@ def load_with_flair(self, predefined_splits: bool = False):

def load_with_spacy(self):
"""
Converts the conllu files to json in the spaCy format.
Loads the DDT with spaCy.
This function converts the conllu files to json in the spaCy format.
:return: GoldCorpus
Not using jsonl because of:
https://github.com/explosion/spaCy/issues/3523
:return:
.. note:: Not using jsonl because of:
https://github.com/explosion/spaCy/issues/3523
"""
import srsly
from spacy.cli.converters import conllu2json
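A minimal usage sketch for the DDT loaders above, assuming the danlp package is installed, that the DDT class is importable from danlp.datasets, and that the dataset can be downloaded to the default cache:

from danlp.datasets import DDT

ddt = DDT()

# CoNLL-U: a single pyconll.Conll, or a (train, dev, test) tuple with predefined_splits=True
conllu = ddt.load_as_conllu()
train, dev, test = ddt.load_as_conllu(predefined_splits=True)

# Flair: a ColumnCorpus carrying the NER tags in IOB2 format
flair_corpus = ddt.load_with_flair()

# spaCy: a GoldCorpus built from the CoNLL-U files converted to json
spacy_corpus = ddt.load_with_spacy()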
65 changes: 55 additions & 10 deletions danlp/datasets/sentiment.py
@@ -7,6 +7,12 @@
from danlp.utils import extract_single_file_from_zip

class EuroparlSentiment1:
"""
Class for loading the Europarl Sentiment dataset.
:param str cache_dir: the directory for storing cached models
"""

def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
self.dataset_name = 'europarl.sentiment1'
@@ -16,27 +22,49 @@ def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
self.file_path = os.path.join(self.dataset_dir, self.dataset_name + self.file_extension)

def load_with_pandas(self):
""" Load and drop duplicates and nan values"""
"""
Loads the dataset into a dataframe
and drops duplicates and NaN values
:return: a dataframe
"""

df = pd.read_csv(self.file_path, sep=',', index_col=0, encoding='utf-8')

df = df[['valence', 'text']].dropna()
return df.drop_duplicates()

class EuroparlSentiment2:

"""
Class for loading the Europarl Sentiment dataset.
:param str cache_dir: the directory for storing cached models
"""
def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
self.dataset_name = 'europarl.sentiment2'
self.dataset_dir = download_dataset(self.dataset_name, cache_dir=cache_dir, process_func=_unzip_process_func)
self.file_path = os.path.join(cache_dir, self.dataset_name + '.csv')

def load_with_pandas(self):

"""
Loads the dataset as a dataframe
:return: a dataframe
"""
return pd.read_csv(self.file_path, sep=',', encoding='utf-8')



class LccSentiment:
"""
Class for loading the LCC Sentiment dataset.
:param str cache_dir: the directory for storing cached models
"""
def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
self.dataset_name1 = 'lcc1.sentiment'
self.file_extension1 = DATASETS[self.dataset_name1]['file_extension']
@@ -51,7 +79,13 @@ def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
self.file_path2 = os.path.join(self.dataset_dir2, self.dataset_name2 + self.file_extension2)

def load_with_pandas(self):
""" Load, combine and drop duplicates and nan values """
"""
Loads the two LCC files into a dataframe,
combines them and drops duplicates and NaN values
:return: a dataframe
"""

df1 = pd.read_csv(self.file_path1, sep=',', encoding='utf-8')
df2 = pd.read_csv(self.file_path2, sep=',', encoding='utf-8')
@@ -63,19 +97,30 @@ def load_with_pandas(self):


class TwitterSent:

def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR, force: bool =False):
"""
Class for loading the Twitter Sentiment dataset.
:param str cache_dir: the directory for storing cached models
"""
def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
self.dataset_name = 'twitter.sentiment'

self.dataset_dir = download_dataset(self.dataset_name, cache_dir=cache_dir, process_func=_twitter_data_process_func)
self.file_path = os.path.join(cache_dir, self.dataset_name + '.csv')

def load_with_pandas(self):
"""
Loads the dataset in a dataframe.
:return: a dataframe of the test set and a dataframe of the train set
"""
df=pd.read_csv(self.file_path, sep=',', encoding='utf-8')
return df[df['part'] == 'test'].drop(columns=['part']), df[df['part'] == 'train'].drop(columns=['part'])


def lookup_tweets(tweet_ids, api):
def _lookup_tweets(tweet_ids, api):
import tweepy
full_tweets = []
tweet_count = len(tweet_ids)
@@ -97,7 +142,7 @@ def _twitter_data_process_func(tmp_file_path: str, meta_info: dict,
verbose: bool = True):
from zipfile import ZipFile

twitter_api = construct_twitter_api_connection()
twitter_api = _construct_twitter_api_connection()

model_name = meta_info['name']
full_path = os.path.join(cache_dir, model_name) + meta_info['file_extension']
@@ -110,7 +155,7 @@ def _twitter_data_process_func(tmp_file_path: str, meta_info: dict,

twitter_ids = list(df['twitterid'])

full_t = lookup_tweets(twitter_ids, twitter_api)
full_t = _lookup_tweets(twitter_ids, twitter_api)
tweet_texts = [[tweet.id, tweet.full_text] for tweet in full_t]
tweet_ids, t_texts = list(zip(*tweet_texts))
tweet_texts_df = pd.DataFrame({'twitterid': tweet_ids, 'text': t_texts})
@@ -127,7 +172,7 @@ def _twitter_data_process_func(tmp_file_path: str, meta_info: dict,
print("Downloaded {} out of {} tweets".format(len(full_t), len(twitter_ids)))


def construct_twitter_api_connection():
def _construct_twitter_api_connection():
if not('TWITTER_CONSUMER_KEY' in os.environ
and 'TWITTER_CONSUMER_SECRET' in os.environ
and 'TWITTER_ACCESS_TOKEN' in os.environ
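A minimal usage sketch for the sentiment loaders above, assuming these classes are exported from danlp.datasets; note that TwitterSent additionally requires Twitter API credentials (TWITTER_CONSUMER_KEY, TWITTER_CONSUMER_SECRET, TWITTER_ACCESS_TOKEN, ...) in the environment so the tweet texts can be fetched:

from danlp.datasets import EuroparlSentiment1, EuroparlSentiment2, LccSentiment, TwitterSent

europarl1_df = EuroparlSentiment1().load_with_pandas()  # 'valence' and 'text' columns, duplicates/NaN dropped
europarl2_df = EuroparlSentiment2().load_with_pandas()
lcc_df = LccSentiment().load_with_pandas()              # the two LCC files combined
test_df, train_df = TwitterSent().load_with_pandas()    # split on the 'part' column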
17 changes: 16 additions & 1 deletion danlp/datasets/wiki_ann.py
@@ -5,13 +5,25 @@


class WikiAnn:
"""
Class for loading the WikiANN dataset.
:param str cache_dir: the directory for storing cached models
"""
def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
self.dataset_name = 'wikiann'
self.file_extension = DATASETS[self.dataset_name]['file_extension']

self.dataset_dir = download_dataset(self.dataset_name, process_func=_wikiann_process_func, cache_dir=cache_dir)

def load_with_flair(self, predefined_splits: bool = False):
"""
Loads the dataset with flair.
:param bool predefined_splits:
:return: ColumnCorpus
"""
from flair.data import Corpus
from flair.datasets import ColumnCorpus

@@ -23,13 +35,16 @@ def load_with_flair(self, predefined_splits: bool = False):

def load_with_spacy(self):
"""
Loads the dataset with spaCy.
This function will convert the CoNLL02/03 format to json format for spaCy.
As the function returns a spacy.gold.GoldCorpus, which needs a dev set,
it also splits the dataset into a 70/30 train/dev split, as done by
Pan et al. (2017).
- Pan et al. (2017): https://aclweb.org/anthology/P17-1178
:return:
:return: GoldCorpus
"""
import srsly
from spacy.cli.converters import conll_ner2json
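A minimal usage sketch for WikiAnn, assuming danlp plus flair and spaCy are installed and the class is importable from danlp.datasets:

from danlp.datasets import WikiAnn

wikiann = WikiAnn()
flair_corpus = wikiann.load_with_flair()  # ColumnCorpus with the NER annotations
spacy_corpus = wikiann.load_with_spacy()  # GoldCorpus with the 70/30 train/dev split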
33 changes: 33 additions & 0 deletions danlp/datasets/word_sim.py
@@ -6,7 +6,13 @@


class WordSim353Da:
"""
Class for loading the WordSim-353 dataset.
:param str cache_dir: the directory for storing cached models
"""
def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
self.dataset_name = 'wordsim353.da'
self.file_extension = DATASETS[self.dataset_name]['file_extension']
@@ -15,9 +21,19 @@ def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
self.file_path = os.path.join(self.dataset_dir, self.dataset_name + self.file_extension)

def load_with_pandas(self):
"""
Loads the dataset in a dataframe.
:return: a dataframe
"""
return pd.read_csv(self.file_path)

def words(self) -> set:
"""
Loads the vocabulary.
:rtype: set
"""
df = self.load_with_pandas()
return set(df['da1']) | set(df['da2'])

@@ -36,6 +52,13 @@ def _word_sim_process_func(tmp_file_path: str, meta_info: dict, cache_dir: str =


class DSD:
"""
Class for loading the Danish Similarity Dataset (DSD).
:param str cache_dir: the directory for storing cached models
"""
def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
self.dataset_name = 'dsd'
self.file_extension = DATASETS[self.dataset_name]['file_extension']
@@ -44,8 +67,18 @@ def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
self.file_path = os.path.join(self.dataset_dir, self.dataset_name + self.file_extension)

def load_with_pandas(self):
"""
Loads the dataset in a dataframe.
:return: a dataframe
"""
return pd.read_csv(self.file_path, delimiter="\t")

def words(self) -> set:
"""
Loads the vocabulary.
:rtype: set
"""
df = self.load_with_pandas()
return set(df['word1']) | set(df['word2'])
