From d339ac319d7b50136b81d74be4f9dda07f0bf303 Mon Sep 17 00:00:00 2001 From: ophelielacroix Date: Thu, 5 Nov 2020 16:09:37 +0100 Subject: [PATCH 1/8] Update docstrings in source code --- danlp/datasets/ddt.py | 32 ++++++++--- danlp/datasets/sentiment.py | 56 ++++++++++++++++-- danlp/datasets/wiki_ann.py | 17 +++++- danlp/datasets/word_sim.py | 33 +++++++++++ danlp/models/bert_models.py | 107 ++++++++++++++++++++++++++--------- danlp/models/embeddings.py | 89 +++++++++++++++++++---------- danlp/models/flair_models.py | 14 +++-- danlp/models/spacy_models.py | 53 +++++++++++++---- 8 files changed, 312 insertions(+), 89 deletions(-) diff --git a/danlp/datasets/ddt.py b/danlp/datasets/ddt.py index bf9523d..47f3c6f 100644 --- a/danlp/datasets/ddt.py +++ b/danlp/datasets/ddt.py @@ -14,11 +14,17 @@ def _any_part_exist(parts: list): class DDT: """ + + Class for loading the Danish Dependency Treebank (DDT) through several frameworks/formats. + The DDT dataset has been annotated with NER tags in the IOB2 format. The dataset is downloaded in CoNLL-U format, but with this class it can be converted to spaCy format or a simple NER format similar to the CoNLL 2003 NER format. + :param str cache_dir: the directory for storing cached models + :param bool verbose: `True` to increase verbosity + """ def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR): self.dataset_name = 'ddt' @@ -27,9 +33,10 @@ def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR): def load_as_conllu(self, predefined_splits: bool = False): """ + Load the DDT in CoNLL-U format. - :param predefined_splits: - :return A single pyconll.Conll + :param bool predefined_splits: + :return: A single pyconll.Conll or a tuple of (train, dev, test) pyconll.Conll depending on predefined_split """ @@ -75,16 +82,20 @@ def load_as_simple_ner(self, predefined_splits: bool = False): def load_with_flair(self, predefined_splits: bool = False): """ + Load the DDT with flair. + This function is inspired by the "Reading Your Own Sequence Labeling Dataset" from Flairs tutorial on reading corpora: https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_6_CORPUS.md - TODO: Make a pull request to flair similar to this: - https://github.com/zalandoresearch/flair/issues/383 - :param predefined_splits: + :type predefined_splits: bool :return: ColumnCorpus + + .. note:: TODO: Make a pull request to flair similar to this: + https://github.com/zalandoresearch/flair/issues/383 + """ from flair.data import Corpus @@ -112,11 +123,14 @@ def load_with_flair(self, predefined_splits: bool = False): def load_with_spacy(self): """ - Converts the conllu files to json in the spaCy format. + Loads the DDT with spaCy. + + This function converts the conllu files to json in the spaCy format. + + :return: GoldCorpus - Not using jsonl because of: - https://github.com/explosion/spaCy/issues/3523 - :return: + .. note:: Not using jsonl because of: + https://github.com/explosion/spaCy/issues/3523 """ import srsly from spacy.cli.converters import conllu2json diff --git a/danlp/datasets/sentiment.py b/danlp/datasets/sentiment.py index c99f049..71db90a 100644 --- a/danlp/datasets/sentiment.py +++ b/danlp/datasets/sentiment.py @@ -7,6 +7,12 @@ from danlp.utils import extract_single_file_from_zip class EuroparlSentiment1: + """ + Class for loading the Europarl Sentiment dataset. 
+ + :param str cache_dir: the directory for storing cached models + + """ def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR): self.dataset_name = 'europarl.sentiment1' @@ -16,7 +22,13 @@ def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR): self.file_path = os.path.join(self.dataset_dir, self.dataset_name + self.file_extension) def load_with_pandas(self): - """ Load and drop duplicates and nan values""" + """ + Loads the dataset in a dataframe + and drop duplicates and nan values + + :return: a dataframe + + """ df = pd.read_csv(self.file_path, sep=',', index_col=0, encoding='utf-8') @@ -24,19 +36,35 @@ def load_with_pandas(self): return df.drop_duplicates() class EuroparlSentiment2: - + """ + Class for loading the Europarl Sentiment dataset. + + :param str cache_dir: the directory for storing cached models + + """ def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR): self.dataset_name = 'europarl.sentiment2' self.dataset_dir = download_dataset(self.dataset_name, cache_dir=cache_dir, process_func=_unzip_process_func) self.file_path = os.path.join(cache_dir, self.dataset_name + '.csv') def load_with_pandas(self): - + """ + Loads the dataset as a dataframe + + :return: a dataframe + + """ return pd.read_csv(self.file_path, sep=',', encoding='utf-8') class LccSentiment: + """ + Class for loading the LCC Sentiment dataset. + + :param str cache_dir: the directory for storing cached models + + """ def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR): self.dataset_name1 = 'lcc1.sentiment' self.file_extension1 = DATASETS[self.dataset_name1]['file_extension'] @@ -51,7 +79,13 @@ def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR): self.file_path2 = os.path.join(self.dataset_dir2, self.dataset_name2 + self.file_extension2) def load_with_pandas(self): - """ Load, combine and drop duplicates and nan values """ + """ + Loads the dataset in a dataframe, + combines and drops duplicates and nan values + + :return: a dataframe + + """ df1 = pd.read_csv(self.file_path1, sep=',', encoding='utf-8') df2 = pd.read_csv(self.file_path2, sep=',', encoding='utf-8') @@ -63,7 +97,13 @@ def load_with_pandas(self): class TwitterSent: - + """ + Class for loading the Twitter Sentiment dataset. + + :param str cache_dir: the directory for storing cached models + :param bool force: + + """ def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR, force: bool =False): self.dataset_name = 'twitter.sentiment' @@ -71,6 +111,12 @@ def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR, force: bool =False): self.file_path = os.path.join(cache_dir, self.dataset_name + '.csv') def load_with_pandas(self): + """ + Loads the dataset in a dataframe. + + :return: a dataframe of the test set and a dataframe of the train set + + """ df=pd.read_csv(self.file_path, sep=',', encoding='utf-8') return df[df['part'] == 'test'].drop(columns=['part']), df[df['part'] == 'train'].drop(columns=['part']) diff --git a/danlp/datasets/wiki_ann.py b/danlp/datasets/wiki_ann.py index 9f3a39b..5b2db40 100644 --- a/danlp/datasets/wiki_ann.py +++ b/danlp/datasets/wiki_ann.py @@ -5,6 +5,12 @@ class WikiAnn: + """ + Class for loading the WikiANN dataset. 
+ + :param str cache_dir: the directory for storing cached models + + """ def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR): self.dataset_name = 'wikiann' self.file_extension = DATASETS[self.dataset_name]['file_extension'] @@ -12,6 +18,12 @@ def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR): self.dataset_dir = download_dataset(self.dataset_name, process_func=_wikiann_process_func, cache_dir=cache_dir) def load_with_flair(self, predefined_splits: bool = False): + """ + Loads the dataset with flair. + + :param bool predefined_splits: + :return: ColumnCorpus + """ from flair.data import Corpus from flair.datasets import ColumnCorpus @@ -23,13 +35,16 @@ def load_with_flair(self, predefined_splits: bool = False): def load_with_spacy(self): """ + Loads the dataset with spaCy. + This function will convert the CoNLL02/03 format to json format for spaCy. As the function will return a spacy.gold.GoldCorpus which needs a dev set this function also splits the dataset into a 70/30 split as is done by Pan et al. (2017). - Pan et al. (2017): https://aclweb.org/anthology/P17-1178 - :return: + + :return: GoldCorpus """ import srsly from spacy.cli.converters import conll_ner2json diff --git a/danlp/datasets/word_sim.py b/danlp/datasets/word_sim.py index b1acc29..82a1a88 100644 --- a/danlp/datasets/word_sim.py +++ b/danlp/datasets/word_sim.py @@ -6,7 +6,13 @@ class WordSim353Da: + """ + Class for loading the WordSim-353 dataset. + + :param str cache_dir: the directory for storing cached models + + """ def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR): self.dataset_name = 'wordsim353.da' self.file_extension = DATASETS[self.dataset_name]['file_extension'] @@ -15,9 +21,19 @@ def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR): self.file_path = os.path.join(self.dataset_dir, self.dataset_name + self.file_extension) def load_with_pandas(self): + """ + Loads the dataset in a dataframe. + + :return: a dataframe + """ return pd.read_csv(self.file_path) def words(self) -> set: + """ + Loads the vocabulary. + + :rtype: set + """ df = self.load_with_pandas() return set(df['da1']) | set(df['da2']) @@ -36,6 +52,13 @@ def _word_sim_process_func(tmp_file_path: str, meta_info: dict, cache_dir: str = class DSD: + """ + + Class for loading the Danish Similarity Dataset dataset. + + :param str cache_dir: the directory for storing cached models + + """ def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR): self.dataset_name = 'dsd' self.file_extension = DATASETS[self.dataset_name]['file_extension'] @@ -44,8 +67,18 @@ def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR): self.file_path = os.path.join(self.dataset_dir, self.dataset_name + self.file_extension) def load_with_pandas(self): + """ + Loads the dataset in a dataframe. + + :return: a dataframe + """ return pd.read_csv(self.file_path, delimiter="\t") def words(self) -> set: + """ + Loads the vocabulary. 
+ + :rtype: set + """ df = self.load_with_pandas() return set(df['word1']) | set(df['word2']) diff --git a/danlp/models/bert_models.py b/danlp/models/bert_models.py index b0014d1..2d1af8d 100644 --- a/danlp/models/bert_models.py +++ b/danlp/models/bert_models.py @@ -7,8 +7,12 @@ class BertNer: """ - Bert NER model + BERT NER model + + :param str cache_dir: the directory for storing cached models + :param bool verbose: `True` to increase verbosity """ + def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False): from transformers import AutoModelForTokenClassification from transformers import AutoTokenizer @@ -30,10 +34,12 @@ def predict(self, text: Union[str, List[str]]): a raw string this method will return the string tokenized with BERTs subword tokens. - E.g. "varme vafler" will become ["varme", "va", "##fler"] + :param text: can either be a raw text or a list of tokens + :return: the tokenized text and the predicted labels + + :Example: - :param text: Can either be a raw text or a list of tokens - :return: The tokenized text and the predicted labels + "`varme vafler`" becomes ["varme", "va", "##fler"] """ if isinstance(text, str): @@ -88,8 +94,13 @@ def predict(self, text: Union[str, List[str]]): class BertEmotion: """ - The class load both a BERT model to classify if emotion or not in the text, - and a BERT model to regonizes eight emotions + BERT Emotion model. + + For classifying whether there is emotion in the text, + and recognizing amongst eight emotions. + + :param str cache_dir: the directory for storing cached models + :param bool verbose: `True` to increase verbosity """ def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False): @@ -136,13 +147,37 @@ def _get_pred(self, tokenizer, model, max_lenght, sentence): return pred def predict_if_emotion(self, sentence): + """ + Predicts whether there is emotion in the text. + + :param str sentence: raw sentence + :return: 0 if no emotion else 1 + :rtype: int + """ pred=self._get_pred(self.tokenizer_reject, self.model_reject, self.max_length_reject, sentence) pred = pred.argmax().item() return self.labels_no[pred] def predict(self, sentence: str, no_emotion=False): - + """ + Predicts emotion among: + + * 0: `Glæde/Sindsro` + * 1: `Tillid/Accept` + * 2: `Forventning/Interrese` + * 3: `Overasket/Målløs` + * 4: `Vrede/Irritation` + * 5: `Foragt/Modvilje` + * 6: `Sorg/trist` + * 7: `Frygt/Bekymret` + + :param str sentence: raw text + :param bool no_emotion: whether there is emotion or not in the text + :return: index of the emotion + :rtype: int + """ + def predict_emotion(): pred=self._get_pred(self.tokenizer, self.model, self.max_length, sentence) pred = pred.argmax().item() @@ -158,6 +193,15 @@ def predict_emotion(): return predict_emotion() def predict_proba(self, sentence: str, emotions=True, no_emotion=True): + """ + Predicts the probabilities of emotions. + + :param str sentence: raw text + :param bool emotions: whether to return the probability of the emotion + :param bool no_emotion: whether to return the probability of the sentence being emotional + :return: a list of probabilities + :rtype: List + """ proba=[] # which emotion @@ -175,13 +219,16 @@ def predict_proba(self, sentence: str, emotions=True, no_emotion=True): class BertTone: ''' - The class load both a BERT model to classify boteh the tone of [subjective or objective] and the tone og [positive, neutral , negativ] - returns: [label_subjective, label_polarity] + BERT Tone model. 
+ + For classifying both the tone [subjective, objective] + and the polarity [positive, neutral, negativ] of sentences. + + :param str cache_dir: the directory for storing cached models + :param bool verbose: `True` to increase verbosity ''' - def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False): - from transformers import BertTokenizer, BertForSequenceClassification # download the model or load the model path path_sub = download_model('bert.subjective', cache_dir, process_func=_unzip_process_func,verbose=verbose) @@ -223,7 +270,15 @@ def _get_pred(self, tokenizer, model, max_lenght, sentence): return pred def predict(self, sentence: str, polarity: bool = True, analytic: bool = True): + """ + Predict the polarity [positive, neutral, negativ] and/or the tone [subjective, objective] of the sentence. + :param str sentence: raw text + :param bool polarity: returns the polarity if `True` + :param bool analytic: returns the tone if `True` + :return: a dictionary for polarity and tone results + :rtype: Dict + """ sentence = self._clean(str(sentence)) predDict = {'analytic': None, 'polarity': None } @@ -264,32 +319,32 @@ def predict_proba(self, sentence: str, polarity: bool = True, analytic: bool = T def load_bert_tone_model(cache_dir=DEFAULT_CACHE_DIR, verbose=False): """ - Wrapper function to ensure that all models in danlp are - loaded in a similar way - :param cache_dir: - :param verbose: - :return: + Loads a BERT Tone model. + + :param str cache_dir: the directory for storing cached models + :param bool verbose: `True` to increase verbosity + :return: a BERT Tone model """ return BertTone(cache_dir, verbose) def load_bert_emotion_model(cache_dir=DEFAULT_CACHE_DIR, verbose=False): """ - Wrapper function to ensure that all models in danlp are - loaded in a similar way - :param cache_dir: - :param verbose: - :return: + Loads a BERT Emotion model. + + :param str cache_dir: the directory for storing cached models + :param bool verbose: `True` to increase verbosity + :return: a BERT Emotion model """ return BertEmotion(cache_dir, verbose) def load_bert_ner_model(cache_dir=DEFAULT_CACHE_DIR, verbose=False): """ - Wrapper function to ensure that all models in danlp are - loaded in a similar way - :param cache_dir: - :param verbose: - :return: + Loads a BERT NER model. + + :param str cache_dir: the directory for storing cached models + :param bool verbose: `True` to increase verbosity + :return: a BERT NER model """ return BertNer(cache_dir, verbose) diff --git a/danlp/models/embeddings.py b/danlp/models/embeddings.py index 43ccc9a..85da97f 100644 --- a/danlp/models/embeddings.py +++ b/danlp/models/embeddings.py @@ -1,3 +1,26 @@ +""" +This module provides you with functions for loading +pretrained Danish word embeddings through several NLP frameworks: + + * flair + * spaCy + * Gensim + +Available word embeddings: + + * wiki.da.wv + * cc.da.wv + * conll17.da.wv + * news.da.wv + * sketchengine.da.wv + +Available subword embeddings: + + * wiki.da.swv + * cc.da.swv + * sketchengine.da.swv +""" + import os from tempfile import TemporaryDirectory from time import sleep @@ -9,30 +32,23 @@ AVAILABLE_EMBEDDINGS = ['wiki.da.wv', 'cc.da.wv', 'conll17.da.wv', 'news.da.wv', 'sketchengine.da.wv', 'dslreddit.da.wv'] +""" +""" AVAILABLE_SUBWORD_EMBEDDINGS = ['wiki.da.swv', 'cc.da.swv', 'sketchengine.da.swv'] +""" +""" def load_wv_with_gensim(pretrained_embedding: str, cache_dir=DEFAULT_CACHE_DIR, verbose: bool = False): """ + Loads word embeddings with Gensim. 
- Available wordembeddings: - - wiki.da.wv - - cc.da.wv - - conll17.da.wv - - news.da.wv - - sketchengine.da.wv - - Available subwordembeddings: - - wiki.da.swv - - cc.da.swv - - sketchengine.da.swv - - :param pretrained_embedding: + :param str pretrained_embedding: :param cache_dir: the directory for storing cached data - :param verbose: + :param bool verbose: `True` to increase verbosity :return: KeyedVectors or FastTextKeyedVectors """ word_embeddings_available(pretrained_embedding, can_use_subword=True) @@ -51,10 +67,12 @@ def load_wv_with_gensim(pretrained_embedding: str, cache_dir=DEFAULT_CACHE_DIR, def load_wv_with_spacy(pretrained_embedding: str, cache_dir: str = DEFAULT_CACHE_DIR, verbose=False): """ + Loads a spaCy model with pretrained embeddings. + :param str pretrained_embedding: :param str cache_dir: the directory for storing cached data - :param bool verbose: - :return + :param bool verbose: `True` to increase verbosity + :return: spaCy model """ import spacy @@ -86,12 +104,13 @@ def load_keras_embedding_layer(pretrained_embedding: str, cache_dir=DEFAULT_CACHE_DIR, verbose=False, **kwargs): """ + Loads a Keras Embedding layer. - :param pretrained_embedding: - :param cache_dir: the directory for storing cached models - :param verbose: - :param kwargs: used to forward arguments to the keras Embedding layer - :return: + :param str pretrained_embedding: + :param str cache_dir: the directory for storing cached models + :param bool verbose: `True` to increase verbosity + :param kwargs: used to forward arguments to the Keras Embedding layer + :return: a Keras Embedding layer and index to word dictionary """ word_embeddings_available(pretrained_embedding, can_use_subword=False) @@ -111,10 +130,12 @@ def load_keras_embedding_layer(pretrained_embedding: str, def load_pytorch_embedding_layer(pretrained_embedding: str, cache_dir=DEFAULT_CACHE_DIR, verbose=False): """ + Loads a pytorch embbeding layer. - :param pretrained_embedding: - :param cache_dir: the directory for storing cached models - :return: an pytorch Embedding module and a list id2word + :param str pretrained_embedding: + :param str cache_dir: the directory for storing cached models + :param bool verbose: `True` to increase verbosity + :return: a pytorch Embedding module and a list id2word """ word_embeddings_available(pretrained_embedding, can_use_subword=False) import torch @@ -131,9 +152,12 @@ def load_context_embeddings_with_flair(direction='bi', word_embeddings=None, cache_dir=DEFAULT_CACHE_DIR, verbose=False): """ - :param bidirectional: - :param cache_dir: - :param verbose: + Loads contextutal (dynamic) word embeddings with flair. 
+ + :param str direction: bidirectional 'bi', forward 'fwd' or backward 'bwd' + :param word_embedding: + :param str cache_dir: the directory for storing cached models + :param bool verbose: `True` to increase verbosity """ from flair.embeddings import FlairEmbeddings from flair.embeddings import WordEmbeddings @@ -196,7 +220,8 @@ def _process_embeddings_for_spacy(tmp_file_path: str, meta_info: dict, :param str tmp_file_path: the file name of the embedding binary file :param str cache_dir: the directory for storing cached data - :param bool verbose: + :param bool clean_up_raw_data: + :param bool verbose: `True` to increase verbosity """ from pathlib import Path from spacy.cli import init_model @@ -237,8 +262,10 @@ def _process_downloaded_embeddings(tmp_file_path: str, meta_info: dict, """ :param str tmp_file_path: + :param dict meta_info: + :param str cache_dir: the directory for storing cached data :param bool clean_up_raw_data: - :param bool verbose: + :param bool verbose: `True` to increase verbosity """ pretrained_embedding = meta_info['name'] @@ -400,10 +427,10 @@ def _process_dslreddit(tmp_file_path: str, cache_dir: str, def assert_wv_dimensions(wv: KeyedVectors, pretrained_embedding: str): """ - This functions will check the dimensions of some wordembeddings wv, + This function will check the dimensions of some word embeddings wv, and check them against the data stored in WORD_EMBEDDINGS. - :param gensim.models.KeyedVectors wv: + :param gensim.models.KeyedVectors wv: word embeddings :param str pretrained_embedding: the name of the pretrained embeddings """ vocab_size = MODELS[pretrained_embedding]['vocab_size'] diff --git a/danlp/models/flair_models.py b/danlp/models/flair_models.py index d474fbe..51b951e 100644 --- a/danlp/models/flair_models.py +++ b/danlp/models/flair_models.py @@ -3,10 +3,11 @@ def load_flair_ner_model(cache_dir=DEFAULT_CACHE_DIR, verbose=False): """ + Loads a flair model for NER. - :param cache_dir: - :param verbose: - :return: + :param str cache_dir: the directory for storing cached models + :param bool verbose: `True` to increase verbosity + :return: an NER flair model """ from flair.models import SequenceTagger @@ -19,10 +20,11 @@ def load_flair_ner_model(cache_dir=DEFAULT_CACHE_DIR, verbose=False): def load_flair_pos_model(cache_dir=DEFAULT_CACHE_DIR, verbose=False): """ + Loads a flair model for Part-of-Speech tagging. - :param cache_dir: - :param verbose: - :return: + :param str cache_dir: the directory for storing cached models + :param bool verbose: `True` to increase verbosity + :return: a POS flair model """ from flair.models import SequenceTagger diff --git a/danlp/models/spacy_models.py b/danlp/models/spacy_models.py index 3607b51..fd4a4b4 100644 --- a/danlp/models/spacy_models.py +++ b/danlp/models/spacy_models.py @@ -7,9 +7,15 @@ def load_spacy_model(cache_dir=DEFAULT_CACHE_DIR, verbose=False, textcat=None, vectorError=False): """ - Loads a spacy model. + Loads a spaCy model. + + :param str cache_dir: the directory for storing cached models + :param bool verbose: `True` to increase verbosity + :param bool textcat: '`sentiment`' for loading the spaCy sentiment analyser + :param bool vectorError: + :return: a spaCy model - OBS vectorError is a TEMP ugly work around error encounted by keeping two models an not been able to find referece name for vectros + .. 
warning:: vectorError is a temporary work around error encounted by keeping two models and not been able to find reference name for vectors """ from spacy.util import load_model_from_path @@ -39,17 +45,34 @@ def load_spacy_model(cache_dir=DEFAULT_CACHE_DIR, verbose=False, textcat=None, v def load_spacy_chunking_model(spacy_model=None,cache_dir=DEFAULT_CACHE_DIR, verbose=False): + """ + Loads a spaCy chunking model. + + :param spacy_model: a (preloaded) spaCy model + :type spacy_model: spaCy model + :param str cache_dir: the directory for storing cached models + :param bool verbose: `True` to increase verbosity + :return: a spaCy Chunking model + + .. note:: A spaCy model can be previously loaded using load_spacy_model + and given as an argument to load_spacy_chunking_model + (for instance, to avoid loading the model twice) + """ return SpacyChunking(model=spacy_model, cache_dir=cache_dir, verbose=verbose) class SpacyChunking: """ - Spacy Chunking Model + Spacy Chunking Model + + :param model: a (preloaded) spaCy model + :type model: spaCy model + :param str cache_dir: the directory for storing cached models + :param bool verbose: `True` to increase verbosity """ def __init__(self, model=None, cache_dir=DEFAULT_CACHE_DIR, verbose=False): - if model == None: self.model = load_spacy_model(cache_dir=cache_dir, verbose=verbose) else: @@ -57,15 +80,23 @@ def __init__(self, model=None, cache_dir=DEFAULT_CACHE_DIR, verbose=False): def predict(self, text: Union[str, List[str]], bio=True): """ - Predict NP chunks (BIO format) from raw text or tokenized text. + Predict NP chunks from raw or tokenized text. + + :param text: can either be a raw text or a list of tokens + :param bio: + `True` to return a list of labels in BIO format (same length as the sentence), + `False` to return a list of tuples `(start id, end id, chunk label)` + :type bio: bool + :return: NP chunks - either a list of labels in BIO format or a list of tuples `(start id, end id, chunk label)` + + :Example: + + "`Jeg kommer fra en lille by`" + becomes - E.g. "Jeg kommer fra en lille by." 
become - - a list of BIO tags: ['B-NP', 'O', 'O', 'O', 'B-NP', 'I-NP', 'I-NP'] - - or a list of triplets (start id, end id, chunk label): [(0, 1, 'NP'), (4, 7, 'NP')] + * a list of BIO tags: ['B-NP', 'O', 'O', 'B-NP', 'I-NP', 'I-NP'] + * or a list of tuples : [(0, 1, 'NP'), (3, 6, 'NP')] - :param text: Can either be a raw text or a list of tokens - :param bool bio: True to return a list of BIO labels (same length as the sentence), False to return a list of NP-chunks - :return: NP chunks """ if isinstance(text, str): From c380c4cd9da164bd90bdec886f66cae6df7bed1e Mon Sep 17 00:00:00 2001 From: ophelielacroix Date: Thu, 5 Nov 2020 16:20:53 +0100 Subject: [PATCH 2/8] Add readthedocs setup files + documentation --- .gitignore | 4 + docs/frameworks.rst | 9 +++ docs/models/models.rst | 13 +++ readthedocs/Makefile | 20 +++++ readthedocs/conf.py | 92 ++++++++++++++++++++++ readthedocs/docs | 1 + readthedocs/gettingstarted/contributing.md | 11 +++ readthedocs/gettingstarted/installation.md | 52 ++++++++++++ readthedocs/index.rst | 32 ++++++++ readthedocs/library/datasets.rst | 33 ++++++++ readthedocs/library/download.rst | 7 ++ readthedocs/library/models.rst | 34 ++++++++ readthedocs/requirements.txt | 5 ++ 13 files changed, 313 insertions(+) create mode 100644 docs/frameworks.rst create mode 100644 docs/models/models.rst create mode 100644 readthedocs/Makefile create mode 100644 readthedocs/conf.py create mode 120000 readthedocs/docs create mode 100644 readthedocs/gettingstarted/contributing.md create mode 100644 readthedocs/gettingstarted/installation.md create mode 100644 readthedocs/index.rst create mode 100644 readthedocs/library/datasets.rst create mode 100644 readthedocs/library/download.rst create mode 100644 readthedocs/library/models.rst create mode 100644 readthedocs/requirements.txt diff --git a/.gitignore b/.gitignore index 8804376..5dbe340 100644 --- a/.gitignore +++ b/.gitignore @@ -125,3 +125,7 @@ dmypy.json # PyCharm .idea + +# readthedocs +readthedocs/_build +readthedocs/make.bat diff --git a/docs/frameworks.rst b/docs/frameworks.rst new file mode 100644 index 0000000..dc39889 --- /dev/null +++ b/docs/frameworks.rst @@ -0,0 +1,9 @@ +Frameworks +========== + + +.. toctree:: + :maxdepth: 1 + :caption: Frameworks + + spacy.md \ No newline at end of file diff --git a/docs/models/models.rst b/docs/models/models.rst new file mode 100644 index 0000000..4643291 --- /dev/null +++ b/docs/models/models.rst @@ -0,0 +1,13 @@ +Models +====== + + +.. toctree:: + :maxdepth: 1 + :caption: Models + + embeddings.md + pos.md + ner.md + dependency.md + sentiment_analysis.md \ No newline at end of file diff --git a/readthedocs/Makefile b/readthedocs/Makefile new file mode 100644 index 0000000..d4bb2cb --- /dev/null +++ b/readthedocs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
+%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/readthedocs/conf.py b/readthedocs/conf.py new file mode 100644 index 0000000..29126ca --- /dev/null +++ b/readthedocs/conf.py @@ -0,0 +1,92 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import os +import sys +sys.path.insert(0, os.path.abspath('..')) + +from recommonmark.transform import AutoStructify + +# -- Project information ----------------------------------------------------- + +project = 'DaNLP' +copyright = '2020, Alexandra Institute' +author = 'Alexandra Institute' + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'recommonmark', + 'sphinx.ext.autodoc', + 'sphinx_markdown_tables', + 'sphinx.ext.todo', + 'sphinx.ext.autosectionlabel' +] + +source_suffix = ['.rst', '.md'] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'sphinx_rtd_theme' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". 
+html_static_path = ['_static'] + +html_theme_options = { + 'logo_only': True, + 'display_version': True, + 'prev_next_buttons_location': 'bottom', + 'style_external_links': True, + # Toc options + 'collapse_navigation': False, + 'sticky_navigation': False, + 'navigation_depth': 3, + 'includehidden': False, + 'titles_only': False +} + +html_title = "DaNLP documentation" +html_logo = "docs/imgs/danlp_logo.png" +html_favicon = "docs/imgs/danlp_logo.png" +#html_style = 'custom.css' + +master_doc = 'index' + +github_doc_root = 'https://github.com/alexandrainst/danlp/tree/master/readthedocs/docs' + + +autosectionlabel_prefix_document = True + +def setup(app): + app.add_config_value('recommonmark_config', { + 'url_resolver': lambda url: github_doc_root + url, + 'auto_toc_tree_section': 'Contents', + }, True) + app.add_transform(AutoStructify) \ No newline at end of file diff --git a/readthedocs/docs b/readthedocs/docs new file mode 120000 index 0000000..6246dff --- /dev/null +++ b/readthedocs/docs @@ -0,0 +1 @@ +../docs/ \ No newline at end of file diff --git a/readthedocs/gettingstarted/contributing.md b/readthedocs/gettingstarted/contributing.md new file mode 100644 index 0000000..5736ab6 --- /dev/null +++ b/readthedocs/gettingstarted/contributing.md @@ -0,0 +1,11 @@ +How do I contribute? +==================== + +If you want to contribute to the DaNLP repository and make it better, your help is very welcome. You can contribute to the project in many ways: + +- Help us write good tutorials on Danish NLP use-cases +- Contribute with your own pretrained NLP models or datasets in Danish +- Notify us of other Danish NLP resources +- Create GitHub issues with questions and bug reports + +You can write us at danlp@alexandra.dk. \ No newline at end of file diff --git a/readthedocs/gettingstarted/installation.md b/readthedocs/gettingstarted/installation.md new file mode 100644 index 0000000..576d998 --- /dev/null +++ b/readthedocs/gettingstarted/installation.md @@ -0,0 +1,52 @@ +Installation +============ + + +To get started using DaNLP in your python project simply install the pip package. However installing the pip package +will not install all NLP libraries because we want you to have the freedom to limit the dependency on what you use. + +### Install with pip + +To get started using DaNLP simply install the project with pip: + +```bash +pip install danlp +``` + +Note that the installation of DaNLP does not install other NLP libraries such as Gensim, SpaCy, flair or Transformers. +This allows the installation to be as minimal as possible and let the user choose to e.g. load word embeddings with either spaCy, flair or Gensim. Therefore, depending on the function you need to use, you should install one or several of the following: `pip install flair`, `pip install spacy ` or/and `pip install gensim `. You can check the `requirements.txt` file to see what version the packages has been tested with. + +### Install from source + +If you want to be able to use the latest developments before they are released in a new pip package, or you want to modify the code yourself, then clone this repo and install from source. + +``` +git clone https://github.com/alexandrainst/danlp.git +cd danlp +pip install . 
+``` + +To install the dependencies used in the package with the tested versions: + +```python +pip install -r requirements.txt +``` + + +### Install from github +Alternatively you can install the latest version from github using: +``` +pip install git+https://github.com/alexandrainst/danlp.git +``` + +### Install with Docker +To quickly get started with DaNLP and to try out the models you can use our Docker image. +To start a ipython session simply run: +```bash +docker run -it --rm alexandrainst/danlp ipython +``` +If you want to run a `` in your current working directory you can run: +```bash +docker run -it --rm -v "$PWD":/usr/src/app -w /usr/src/app alexandrainst/danlp python + +``` \ No newline at end of file diff --git a/readthedocs/index.rst b/readthedocs/index.rst new file mode 100644 index 0000000..503b011 --- /dev/null +++ b/readthedocs/index.rst @@ -0,0 +1,32 @@ + +Welcome to DaNLP's documentation! +================================= + + +DaNLP is a repository for Natural Language Processing resources for the Danish Language. +It is a collection of available datasets and models for a variety of NLP tasks. The aim is to make it easier and more applicable to practitioners in the industry to use Danish NLP and hence this project is licensed to allow commercial use. The project features code examples on how to use the datasets and models in popular NLP frameworks such as spaCy, Transformers and Flair as well as Deep Learning frameworks such as PyTorch. + + + +.. toctree:: + :maxdepth: 1 + :caption: Getting started + + gettingstarted/installation.md + gettingstarted/contributing.md + +.. toctree:: + :maxdepth: 2 + :caption: Documentation + + docs/datasets.md + docs/frameworks.rst + docs/models/models.rst + +.. toctree:: + :maxdepth: 1 + :caption: Library + + library/models.rst + library/datasets.rst + diff --git a/readthedocs/library/datasets.rst b/readthedocs/library/datasets.rst new file mode 100644 index 0000000..f6d568c --- /dev/null +++ b/readthedocs/library/datasets.rst @@ -0,0 +1,33 @@ +Datasets +======== + +Danish Dependency Treebank +-------------------------- + +.. automodule:: danlp.datasets.ddt + :members: + :show-inheritance: + + +Sentiment datasets +------------------ + +.. automodule:: danlp.datasets.sentiment + :members: + :show-inheritance: + + +WikiANN +------- + +.. automodule:: danlp.datasets.wiki_ann + :members: + :show-inheritance: + + +Word similarity datasets +------------------------ + +.. automodule:: danlp.datasets.word_sim + :members: + :show-inheritance: \ No newline at end of file diff --git a/readthedocs/library/download.rst b/readthedocs/library/download.rst new file mode 100644 index 0000000..53fdfc6 --- /dev/null +++ b/readthedocs/library/download.rst @@ -0,0 +1,7 @@ + +Download +======== + +.. automodule:: danlp.download + :members: + :show-inheritance: \ No newline at end of file diff --git a/readthedocs/library/models.rst b/readthedocs/library/models.rst new file mode 100644 index 0000000..9ef4ee3 --- /dev/null +++ b/readthedocs/library/models.rst @@ -0,0 +1,34 @@ +Models +====== + + +Embeddings +---------- + +.. automodule:: danlp.models.embeddings + :members: + :show-inheritance: + + +spaCy models +------------ + +.. automodule:: danlp.models.spacy_models + :members: + :show-inheritance: + + +flair models +------------ + +.. automodule:: danlp.models.flair_models + :members: + :show-inheritance: + + +BERT models +----------- + +.. 
automodule:: danlp.models.bert_models + :members: + :show-inheritance: \ No newline at end of file diff --git a/readthedocs/requirements.txt b/readthedocs/requirements.txt new file mode 100644 index 0000000..ddd0d96 --- /dev/null +++ b/readthedocs/requirements.txt @@ -0,0 +1,5 @@ + +sphinx==3.1.2 +recommonmark==0.6.0 +sphinx_rtd_theme==0.5.0 +sphinx_markdown_tables==0.0.15 \ No newline at end of file From 3eef6a4ae522e8d54c81a12883d8a4f5107582f4 Mon Sep 17 00:00:00 2001 From: ophelielacroix Date: Fri, 6 Nov 2020 15:42:58 +0100 Subject: [PATCH 3/8] Privatize some functions functions that should not be displayed in the documentation are made private --- danlp/datasets/sentiment.py | 11 +++++------ danlp/models/embeddings.py | 12 ++++++------ danlp/models/spacy_models.py | 4 ++-- 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/danlp/datasets/sentiment.py b/danlp/datasets/sentiment.py index 71db90a..254b635 100644 --- a/danlp/datasets/sentiment.py +++ b/danlp/datasets/sentiment.py @@ -101,10 +101,9 @@ class TwitterSent: Class for loading the Twitter Sentiment dataset. :param str cache_dir: the directory for storing cached models - :param bool force: """ - def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR, force: bool =False): + def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR): self.dataset_name = 'twitter.sentiment' self.dataset_dir = download_dataset(self.dataset_name, cache_dir=cache_dir, process_func=_twitter_data_process_func) @@ -121,7 +120,7 @@ def load_with_pandas(self): return df[df['part'] == 'test'].drop(columns=['part']), df[df['part'] == 'train'].drop(columns=['part']) -def lookup_tweets(tweet_ids, api): +def _lookup_tweets(tweet_ids, api): import tweepy full_tweets = [] tweet_count = len(tweet_ids) @@ -143,7 +142,7 @@ def _twitter_data_process_func(tmp_file_path: str, meta_info: dict, verbose: bool = True): from zipfile import ZipFile - twitter_api = construct_twitter_api_connection() + twitter_api = _construct_twitter_api_connection() model_name = meta_info['name'] full_path = os.path.join(cache_dir, model_name) + meta_info['file_extension'] @@ -156,7 +155,7 @@ def _twitter_data_process_func(tmp_file_path: str, meta_info: dict, twitter_ids = list(df['twitterid']) - full_t = lookup_tweets(twitter_ids, twitter_api) + full_t = _lookup_tweets(twitter_ids, twitter_api) tweet_texts = [[tweet.id, tweet.full_text] for tweet in full_t] tweet_ids, t_texts = list(zip(*tweet_texts)) tweet_texts_df = pd.DataFrame({'twitterid': tweet_ids, 'text': t_texts}) @@ -173,7 +172,7 @@ def _twitter_data_process_func(tmp_file_path: str, meta_info: dict, print("Downloaded {} out of {} tweets".format(len(full_t), len(twitter_ids))) -def construct_twitter_api_connection(): +def _construct_twitter_api_connection(): if not('TWITTER_CONSUMER_KEY' in os.environ and 'TWITTER_CONSUMER_SECRET' in os.environ and 'TWITTER_ACCESS_TOKEN' in os.environ diff --git a/danlp/models/embeddings.py b/danlp/models/embeddings.py index 85da97f..f192942 100644 --- a/danlp/models/embeddings.py +++ b/danlp/models/embeddings.py @@ -51,7 +51,7 @@ def load_wv_with_gensim(pretrained_embedding: str, cache_dir=DEFAULT_CACHE_DIR, :param bool verbose: `True` to increase verbosity :return: KeyedVectors or FastTextKeyedVectors """ - word_embeddings_available(pretrained_embedding, can_use_subword=True) + _word_embeddings_available(pretrained_embedding, can_use_subword=True) download_model(pretrained_embedding, cache_dir, _process_downloaded_embeddings, verbose=verbose) wv_path = os.path.join(cache_dir, 
pretrained_embedding + ".bin") @@ -77,7 +77,7 @@ def load_wv_with_spacy(pretrained_embedding: str, import spacy # spaCy does not support subwords - word_embeddings_available(pretrained_embedding, can_use_subword=False) + _word_embeddings_available(pretrained_embedding, can_use_subword=False) spacy_model_dir = os.path.join(cache_dir, pretrained_embedding + ".spacy") @@ -112,7 +112,7 @@ def load_keras_embedding_layer(pretrained_embedding: str, :param kwargs: used to forward arguments to the Keras Embedding layer :return: a Keras Embedding layer and index to word dictionary """ - word_embeddings_available(pretrained_embedding, can_use_subword=False) + _word_embeddings_available(pretrained_embedding, can_use_subword=False) from keras.layers import Embedding wv = load_wv_with_gensim(pretrained_embedding, cache_dir, verbose) @@ -137,7 +137,7 @@ def load_pytorch_embedding_layer(pretrained_embedding: str, :param bool verbose: `True` to increase verbosity :return: a pytorch Embedding module and a list id2word """ - word_embeddings_available(pretrained_embedding, can_use_subword=False) + _word_embeddings_available(pretrained_embedding, can_use_subword=False) import torch from torch.nn import Embedding @@ -166,7 +166,7 @@ def load_context_embeddings_with_flair(direction='bi', word_embeddings=None, embeddings = [] if word_embeddings is not None: - word_embeddings_available(word_embeddings, can_use_subword=False) + _word_embeddings_available(word_embeddings, can_use_subword=False) download_model(word_embeddings, cache_dir, _process_downloaded_embeddings, verbose=verbose) wv_path = os.path.join(cache_dir, word_embeddings + ".bin") @@ -192,7 +192,7 @@ def load_context_embeddings_with_flair(direction='bi', word_embeddings=None, return StackedEmbeddings(embeddings=embeddings) -def word_embeddings_available(pretrained_embedding: str, +def _word_embeddings_available(pretrained_embedding: str, can_use_subword=False): if not can_use_subword and pretrained_embedding in AVAILABLE_SUBWORD_EMBEDDINGS: raise ValueError( diff --git a/danlp/models/spacy_models.py b/danlp/models/spacy_models.py index fd4a4b4..b1e6c8c 100644 --- a/danlp/models/spacy_models.py +++ b/danlp/models/spacy_models.py @@ -176,9 +176,9 @@ def get_bounds(doc, root): is_chunk[j] = False final_chunks = [c for c, ischk in zip(chunks, is_chunk) if ischk] - return chunks2bio(final_chunks, len(spacy_doc)) if bio else final_chunks + return _chunks2bio(final_chunks, len(spacy_doc)) if bio else final_chunks -def chunks2bio(chunks, sent_len): +def _chunks2bio(chunks, sent_len): bio_tags = ['O'] * sent_len for (start, end, label) in chunks: bio_tags[start] = 'B-'+label From 8a691c2b16ac3fd859a89381249e2ebebb439f65 Mon Sep 17 00:00:00 2001 From: ophelielacroix Date: Mon, 9 Nov 2020 16:00:12 +0100 Subject: [PATCH 4/8] Update links in docs using relative path for links to work with sphinx and markdown in github --- docs/datasets.md | 32 ++++++++++----------- docs/models/dependency.md | 14 +++++----- docs/models/embeddings.md | 40 +++++++++++++-------------- docs/models/ner.md | 46 +++++++++++++++---------------- docs/models/pos.md | 18 ++++++------ docs/models/sentiment_analysis.md | 46 +++++++++++++++---------------- docs/spacy.md | 13 ++++----- readthedocs/conf.py | 25 +++++++++++------ readthedocs/requirements.txt | 4 +-- 9 files changed, 123 insertions(+), 115 deletions(-) diff --git a/docs/datasets.md b/docs/datasets.md index 4aea230..043b9b0 100644 --- a/docs/datasets.md +++ b/docs/datasets.md @@ -3,25 +3,25 @@ Datasets This section keeps a list of 
Danish NLP datasets publicly available. -| Dataset | Task | Words | Sents | License | DaNLP | -|---------|------|-------|-------|---------|-----------------| -| [OpenSubtitles2018]() | Translation | 206,700,000 | 30,178,452 |[None](http://opus.nlpl.eu/OpenSubtitles2018.php) | ❌ | -| [EU Bookshop](http://opus.nlpl.eu/EUbookshop-v2.php) | Translation | 208,175,843 | 8,650,537 | - | ❌ | -| [Europarl7](http://www.statmt.org/europarl/) | Translation | 47,761,381 | 2,323,099 | [None](http://www.statmt.org/europarl/) | ❌ | -| [ParaCrawl5](https://paracrawl.eu/) | Translation | - | - | [CC0](https://paracrawl.eu/releases.html) | ❌ | -| [WikiANN](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#wikiann)| NER | 832.901 | 95.924 |[ODC-BY 1.0](http://nlp.cs.rpi.edu/wikiann/)| ✔️ | -| [UD-DDT (DaNE)](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#danish-dependency-treebank-dane) | DEP, POS, NER | 100,733 | 5,512 | [CC BY-SA 4.0](https://github.com/UniversalDependencies/UD_Danish-DDT/blob/master/README.md) | ✔️ | -| [LCC Sentiment](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#lcc-sentiment) | Sentiment | 10.588 | 499 | [CC BY](https://github.com/fnielsen/lcc-sentiment/blob/master/LICENSE) | ✔️ | -| [Europarl Sentiment1](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#europarl-sentiment1) | Sentiment | 3.359 | 184 | None | ✔️ | -| [Europarl Sentiment2](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#europarl-sentiment2) | sentiment | | 957 | CC BY-SA 4.0 | ✔️ | -| [Wikipedia](https://dumps.wikimedia.org/dawiki/latest/) | Raw | - | - | [CC BY-SA 3.0](https://dumps.wikimedia.org/legal.html) | ❌ | -| [WordSim-353](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#wordsim-353) | Word Similarity | 353 | - | [CC BY 4.0](https://github.com/fnielsen/dasem/blob/master/dasem/data/wordsim353-da/LICENSE)| ✔️ | -| [Danish Similarity Dataset](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#danish-similarity-dataset) | Word Similarity | 99 | - | [CC BY 4.0](https://github.com/fnielsen/dasem/blob/master/dasem/data/wordsim353-da/LICENSE)| ✔️ | -| [Twitter Sentiment](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#Twitter-Sentiment) | Sentiment | - | train: 1215, test: 512 | Twitter privacy policy applies | ✔️ | +| Dataset | Task | Words | Sents | License | DaNLP | +|------------------------------------------------------------------|-----------------|-------------|------------------------|----------------------------------------------------------------------------------------------|-------| +| [OpenSubtitles2018]() | Translation | 206,700,000 | 30,178,452 | [None](http://opus.nlpl.eu/OpenSubtitles2018.php) | ❌ | +| [EU Bookshop](http://opus.nlpl.eu/EUbookshop-v2.php) | Translation | 208,175,843 | 8,650,537 | - | ❌ | +| [Europarl7](http://www.statmt.org/europarl/) | Translation | 47,761,381 | 2,323,099 | [None](http://www.statmt.org/europarl/) | ❌ | +| [ParaCrawl5](https://paracrawl.eu/) | Translation | - | - | [CC0](https://paracrawl.eu/releases.html) | ❌ | +| [WikiANN](#wikiann) | NER | 832.901 | 95.924 | [ODC-BY 1.0](http://nlp.cs.rpi.edu/wikiann/) | ✔️ | +| [UD-DDT (DaNE)](#dane) | DEP, POS, NER | 100,733 | 5,512 | [CC BY-SA 4.0](https://github.com/UniversalDependencies/UD_Danish-DDT/blob/master/README.md) | ✔️ | +| [LCC Sentiment](#lcc-sentiment) | Sentiment | 10.588 | 499 | [CC BY](https://github.com/fnielsen/lcc-sentiment/blob/master/LICENSE) | ✔️ | +| 
[Europarl Sentiment1](#europarl-sentiment1) | Sentiment | 3.359 | 184 | None | ✔️ | +| [Europarl Sentiment2](#europarl-sentiment2) | sentiment | | 957 | CC BY-SA 4.0 | ✔️ | +| [Wikipedia](https://dumps.wikimedia.org/dawiki/latest/) | Raw | - | - | [CC BY-SA 3.0](https://dumps.wikimedia.org/legal.html) | ❌ | +| [WordSim-353](#wordsim-353) | Word Similarity | 353 | - | [CC BY 4.0](https://github.com/fnielsen/dasem/blob/master/dasem/data/wordsim353-da/LICENSE) | ✔️ | +| [Danish Similarity Dataset](#danish-similarity-dataset) | Word Similarity | 99 | - | [CC BY 4.0](https://github.com/fnielsen/dasem/blob/master/dasem/data/wordsim353-da/LICENSE) | ✔️ | +| [Twitter Sentiment](#twitter-sentiment) | Sentiment | - | train: 1215, test: 512 | Twitter privacy policy applies | ✔️ | It is also recommend to check out Finn Årup Nielsen's [dasem github](https://github.com/fnielsen/dasem) which also provides script for loading different Danish corpus. -### Danish Dependency Treebank (DaNE) +### Danish Dependency Treebank (DaNE) {#dane} The Danish UD treebank (Johannsen et al., 2015, UD-DDT) is a conversion of the Danish Dependency Treebank (Buch-Kromann et diff --git a/docs/models/dependency.md b/docs/models/dependency.md index 68deb0f..a7c06cd 100644 --- a/docs/models/dependency.md +++ b/docs/models/dependency.md @@ -8,9 +8,9 @@ It is typically represented by a directed graph that depicts the grammatical str A dependency relation is a triplet consisting of: a head (word), a dependent (another word) and a dependency label (describing the type of the relation). -| Model | Train Data | License | Trained by | Tags | DaNLP | -|-------|-------|-------|-------|-------|-------| -| [SpaCy](https://github.com/alexandrainst/danlp/blob/master/docs/models/dependency.md#spacy) | [Danish Dependency Treebank]() | MIT | Alexandra Institute | 17 Universal dependencies | ✔️ | +| Model | Train Data | License | Trained by | Tags | DaNLP | +|-----------------|---------------------------------------------------|---------|---------------------|----------------------------|-------| +| [SpaCy](#spacy) | [Danish Dependency Treebank](../datasets.md#dane) | MIT | Alexandra Institute | 17 Universal dependencies | ✔️ | The model has been trained on the Danish UD treebank which have been annotated with dependencies following the [Universal Dependency](https://universaldependencies.org/u/dep/index.html) scheme. @@ -31,9 +31,9 @@ We provide a convertion function -- from dependencies to NP-chunks -- thus depen -## 🔧 SpaCy +## 🔧 SpaCy {#spacy} -Read more about the SpaCy model in the dedicated [SpaCy docs]() , it has also been trained using the [Danish Dependency Treebank]() dataset. +Read more about the SpaCy model in the dedicated [SpaCy docs](../spacy.md) , it has also been trained using the [Danish Dependency Treebank](../datasets.md#dane) dataset. ### Dependency Parser @@ -121,7 +121,7 @@ for token, nc in zip(doc, np_chunks): See detailed scoring of the benchmarks in the [example]() folder. 
-### Dependency Parsing +### Dependency Parsing Scores Dependency scores — LA (labelled attachment score), UAS (Unlabelled Attachment Score) and LAS (Labelled Attachment Score) — are reported below : @@ -129,7 +129,7 @@ Dependency scores — LA (labelled attachment score), UAS (Unlabelled Attachment |-------|-------|-------|-------| | SpaCy | 87.68 | 81.36 | 77.46 | -### Noun Phrase Chunking +### Noun Phrase Chunking Scores NP chunking scores (F1) are reported below : diff --git a/docs/models/embeddings.md b/docs/models/embeddings.md index 6941988..fbf8f09 100644 --- a/docs/models/embeddings.md +++ b/docs/models/embeddings.md @@ -5,15 +5,15 @@ This repository keeps a list of pretrained word embeddings publicly available in and `load_embeddings.py` provides functions for downloading the embeddings as well as prepare them for use in popular NLP frameworks. -| Name | Model | Tokens | Vocab | Unit | Task | License | DaNLP | -|------|-------|--------|:-----:|------|-------|---------|-------| -| [CoNLL2017](http://vectors.nlpl.eu/repository/#) | word2vec | 1.6B | 1,655,886 | Word | Skipgram | [CC BY-NC-SA 4.0](https://embeddings.sketchengine.co.uk/static/index.html) | ✔️ | -| [Kongelige Bibliotek](https://loar.kb.dk/handle/1902/329) | word2vec | - | 2,404,836 | Word | Skipgram | [CC0 1.0](https://loar.kb.dk/handle/1902/329) | ✔️ | -| [Facebook CC](https://fasttext.cc/docs/en/crawl-vectors.html) | fastText | - | 2,000,000 | Char N-gram | Skipgram | [CC BY-SA 3.0](https://fasttext.cc/docs/en/crawl-vectors.html#license) | ✔️ | -| [Facebook Wiki](https://fasttext.cc/docs/en/pretrained-vectors.html)| fastText | - | 312,956 | Char N-gram | Skipgram | [CC BY-SA 3.0](https://fasttext.cc/docs/en/crawl-vectors.html#license) | ✔️ | -| [SketchEngine](https://embeddings.sketchengine.co.uk/static/index.html) | fastText | 2B | 2,722,811 | Char N-gram | Skipgram | [CC BY-NC-SA 4.0](https://embeddings.sketchengine.co.uk/static/index.html) | ✔️ | -| [DSL Reddit](https://github.com/danish-stance-detectors/RumourResolution) | word2vec | | 178,649 | Word | CBOW | [MIT](https://github.com/danish-stance-detectors/RumourResolution/blob/master/LICENSE) | ✔️ | -| flair | Flair | - | | Char | LM | MIT | ✔️ | +| Name | Model | Tokens | Vocab | Unit | Task | License | DaNLP | +|---------------------------------------------------------------------------|----------|--------|:---------:|-------------|----------|----------------------------------------------------------------------------------------|-------| +| [CoNLL2017](http://vectors.nlpl.eu/repository/#) | word2vec | 1.6B | 1,655,886 | Word | Skipgram | [CC BY-NC-SA 4.0](https://embeddings.sketchengine.co.uk/static/index.html) | ✔️ | +| [Kongelige Bibliotek](https://loar.kb.dk/handle/1902/329) | word2vec | - | 2,404,836 | Word | Skipgram | [CC0 1.0](https://loar.kb.dk/handle/1902/329) | ✔️ | +| [Facebook CC](https://fasttext.cc/docs/en/crawl-vectors.html) | fastText | - | 2,000,000 | Char N-gram | Skipgram | [CC BY-SA 3.0](https://fasttext.cc/docs/en/crawl-vectors.html#license) | ✔️ | +| [Facebook Wiki](https://fasttext.cc/docs/en/pretrained-vectors.html) | fastText | - | 312,956 | Char N-gram | Skipgram | [CC BY-SA 3.0](https://fasttext.cc/docs/en/crawl-vectors.html#license) | ✔️ | +| [SketchEngine](https://embeddings.sketchengine.co.uk/static/index.html) | fastText | 2B | 2,722,811 | Char N-gram | Skipgram | [CC BY-NC-SA 4.0](https://embeddings.sketchengine.co.uk/static/index.html) | ✔️ | +| [DSL Reddit](https://github.com/danish-stance-detectors/RumourResolution) | 
word2vec | | 178,649 | Word | CBOW | [MIT](https://github.com/danish-stance-detectors/RumourResolution/blob/master/LICENSE) | ✔️ | +| flair | Flair | - | | Char | LM | MIT | ✔️ | Embeddings are a way of representing text as numeric vectors, and can be calculated both for chars, subword units [(Sennrich et al. 2016)](https://aclweb.org/anthology/P16-1162), words, sentences or documents. @@ -37,18 +37,18 @@ the next character in a sentence. To evaluate word embeddings it is common to do intrinsic evaluations to directly test for syntactic or semantic relationships between words. The -[Danish Similarity Dataset](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#danish-similarity-dataset) and [WordSim-353](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#wordsim-353) contains word pairs annotated with a similarity score. Calculating the correlation between the word embedding similarity and the similarity score +[Danish Similarity Dataset](../datasets.md#danish-similarity-dataset) and [WordSim-353](../datasets.md#wordsim-353) contains word pairs annotated with a similarity score. Calculating the correlation between the word embedding similarity and the similarity score gives and indication of how well the word embeddings captures relationships between words. -| Model | DSD-ρ | DSD-OOV | WS353-ρ | WS353-OOV | -|--------------------|--------|----------|-----------|-----------| -| wiki.da.wv | 0.205 | 1.01% | **0.639** | 0.85% | -| cc.da.wv | **0.313** | 0.00% | 0.533 | 1.70% | -| conll17.da.wv | 0.150 | 0.00% | 0.549 | 1.70% | -| news.da.wv | 0.306 | 0.00% | 0.541 | 4.25% | -| sketchengine.da.wv | 0.197 | 0.00% | 0.626 | 0.85% | -| dslreddit.da.wv | 0.198 | 0.00% | 0.443 | 1.98% | +| Model | DSD-ρ | DSD-OOV | WS353-ρ | WS353-OOV | +|--------------------|-----------|---------|-----------|-----------| +| wiki.da.wv | 0.205 | 1.01% | **0.639** | 0.85% | +| cc.da.wv | **0.313** | 0.00% | 0.533 | 1.70% | +| conll17.da.wv | 0.150 | 0.00% | 0.549 | 1.70% | +| news.da.wv | 0.306 | 0.00% | 0.541 | 4.25% | +| sketchengine.da.wv | 0.197 | 0.00% | 0.626 | 0.85% | +| dslreddit.da.wv | 0.198 | 0.00% | 0.443 | 1.98% | ## 🐣 Get started using word embeddings @@ -107,7 +107,7 @@ print('{} sentences out of {} is equal'.format(int(sum(sentence2[4].embedding==s -## 🔧 Training details for Flair embeddings +## 🔧 Training details for Flair embeddings {#flair-embeddings} This repository provides pretrained Flair word embeddings trained on Danish data from Wikipedia and EuroParl both forwards and backwards. To see the code for training the Flair embeddings have a look at [Flairs GitHub](https://github.com/zalandoresearch/flair). @@ -115,7 +115,7 @@ The hyperparameter are set as follows: `hidden_size=1032`, `nlayers=1`, `sequenc `max_epochs=5` -The trained Flair word embeddings has been used in training a Part of Speech tagger and Name Entity Recognition tagger with Flair, check it out in the docs for [pos](https://github.com/alexandrainst/danlp/blob/master/docs/models/pos.md) and [ner](https://github.com/alexandrainst/danlp/blob/master/docs/models/ner.md) . +The trained Flair word embeddings has been used in training a Part of Speech tagger and Name Entity Recognition tagger with Flair, check it out in the docs for [pos](pos.md) and [ner](ner.md) . ## 🎓 References diff --git a/docs/models/ner.md b/docs/models/ner.md index 8be9468..8594758 100644 --- a/docs/models/ner.md +++ b/docs/models/ner.md @@ -8,20 +8,20 @@ tools are available for NER in Danish. 
Popular models for NER are continuously trained on the newest available named entity datasets such as DaNE and made available through the DaNLP library. -| Model | Train Data | Maintainer | Tags | DaNLP | -|-------|-------|------------|------|-------| -| [BERT](https://github.com/alexandrainst/danlp/blob/master/docs/models/ner.md#bert) | [DaNE](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#danish-dependency-treebank-dane) | Alexandra Institute | PER, ORG, LOC | ✔ | -| [Flair](https://github.com/alexandrainst/danlp/blob/master/docs/models/ner.md#flair) | [DaNE](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#danish-dependency-treebank-dane) | Alexandra Institute | PER, ORG, LOC | ✔️ | -| [spaCy](https://github.com/alexandrainst/danlp/blob/master/docs/models/ner.md#spacy) | [DaNE](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#danish-dependency-treebank-dane) | Alexandra Institute | PER, ORG, LOC | ✔ | -| [Polyglot](https://polyglot.readthedocs.io/en/latest/POS.html/#) | Wikipedia | Polyglot | PER, ORG, LOC | ❌ | -| [daner](https://github.com/ITUnlp/daner) | [Derczynski et al. (2014)](https://www.aclweb.org/anthology/E14-2016) | [ITU NLP](https://nlp.itu.dk/) | PER, ORG, LOC | ❌ | - -#### 🔧 BERT +| Model | Train Data | Maintainer | Tags | DaNLP | +|-----------------------------------------------------------------------------------|-----------------------------------------------------------------------|--------------------------------|---------------|-------| +| [BERT](#bert) | [DaNE](../datasets.md#dane) | Alexandra Institute | PER, ORG, LOC | ✔ | +| [Flair](#flair) | [DaNE](../datasets.md#dane) | Alexandra Institute | PER, ORG, LOC | ✔ | +| [spaCy](#spacy) | [DaNE](../datasets.md#dane) | Alexandra Institute | PER, ORG, LOC | ✔ | +| [Polyglot](https://polyglot.readthedocs.io/en/latest/NamedEntityRecognition.html) | Wikipedia | Polyglot | PER, ORG, LOC | ❌ | +| [daner](https://github.com/ITUnlp/daner) | [Derczynski et al. (2014)](https://www.aclweb.org/anthology/E14-2016) | [ITU NLP](https://nlp.itu.dk/) | PER, ORG, LOC | ❌ | + +#### 🔧 BERT {#bert} The BERT [(Devlin et al. 2019)](https://www.aclweb.org/anthology/N19-1423/) NER model is based on the pre-trained [Danish BERT](https://github.com/botxo/nordic_bert) representations by BotXO which -has been finetuned on the [DaNE](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#danish-dependency-treebank-dane) +has been finetuned on the [DaNE](../datasets.md#dane) dataset [(Hvingelby et al. 2020)](http://www.lrec-conf.org/proceedings/lrec2020/pdf/2020.lrec-1.565.pdf). The finetuning has been done using the [Transformers](https://github.com/huggingface/transformers) library from HuggingFace. -To use the BERT NER model it can be loaded with the `load_bert_ner_model()` method. Please notice that it can maximum take 512 tokens as input at a time. For longer text sequences split before hand, for example be using sentence boundary detection (eg. by using the [spacy model](https://github.com/alexandrainst/danlp/blob/master/docs/spacy.md ).) +To use the BERT NER model it can be loaded with the `load_bert_ner_model()` method. Please notice that it can maximum take 512 tokens as input at a time. For longer text sequences split before hand, for example be using sentence boundary detection (eg. by using the [spacy model](../spacy.md ).) 
```python
from danlp.models import load_bert_ner_model
@@ -32,11 +32,11 @@ print(" ".join(["{}/{}".format(tok,lbl) for tok,lbl in zip(tokens,labels)]))
```
 
-#### 🔧 Flair
+#### 🔧 Flair {#flair}
The Flair [(Akbik et al. 2018)](https://www.aclweb.org/anthology/C18-1139/) NER model
-uses pretrained [Flair embeddings](https://github.com/alexandrainst/danlp/blob/master/docs/models/embeddings.md#-training-details-for-flair-embeddings)
+uses pretrained [Flair embeddings](embeddings.md#flair-embeddings)
in combination with fastText word embeddings. The model is trained using the [Flair](https://github.com/flairNLP/flair)
- library on the the [DaNE](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#danish-dependency-treebank-dane) dataset.
+ library on the [DaNE](../datasets.md#dane) dataset.
The Flair NER model can be used with DaNLP using the `load_flair_ner_model()` method.
 
```python
@@ -52,8 +52,8 @@ flair_model.predict(sentence)
print(sentence.to_tagged_string())
```
 
-#### 🔧 spaCy
-The [spaCy](https://spacy.io/) model is trained for several NLP tasks [(read more here)](https://github.com/alexandrainst/danlp/blob/master/docs/spacy.md) uing the [DDT and DaNE](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#danish-dependency-treebank-dane) annotations.
+#### 🔧 spaCy {#spacy}
+The [spaCy](https://spacy.io/) model is trained for several NLP tasks [(read more here)](../spacy.md) using the [DDT and DaNE](../datasets.md#dane) annotations.
The spaCy model can be loaded with DaNLP to do NER predictions in the following way.
```python
from danlp.models import load_spacy_model
@@ -80,16 +80,16 @@ The tool is not available through DaNLP but it can be used from the [daner repos
## 📈 Benchmarks
The benchmarks has been performed on the test part of the
-[DaNE](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#danish-dependency-treebank-dane) dataset.
+[DaNE](../datasets.md#dane) dataset.
None of the models have been trained on this test part.
We are only reporting the scores on the `LOC`, `ORG` and `PER` entities as the
`MISC` category has limited practical use.
The table below has the achieved F1 score on the test set:
 
-| Model | LOC | ORG | PER | AVG |
-|-------|-------|-----|-----|-----|
-| BERT | 83.90 | **72.98** | 92.82 | **84.04** |
-| Flair | **84.82** | 62.95 | **93.15** | 81.78 |
-| spaCy | 75.96 | 59.57 | 87.87 | 75.73 |
-| Polyglot | 64.95 | 39.3 | 78.74 | 64.18 |
+| Model    | LOC       | ORG       | PER       | AVG       |
+|----------|-----------|-----------|-----------|-----------|
+| BERT     | 83.90     | **72.98** | 92.82     | **84.04** |
+| Flair    | **84.82** | 62.95     | **93.15** | 81.78     |
+| spaCy    | 75.96     | 59.57     | 87.87     | 75.73     |
+| Polyglot | 64.95     | 39.3      | 78.74     | 64.18     |
 
The evaluation script `ner_benchmarks.py` can be found [here](https://github.com/alexandrainst/danlp/blob/master/examples/benchmarks/ner_benchmarks.py).
diff --git a/docs/models/pos.md b/docs/models/pos.md
index b79b3e5..caaaac6 100644
--- a/docs/models/pos.md
+++ b/docs/models/pos.md
@@ -2,11 +2,11 @@ Part of Speech Tagging
======================
This section is concerned with public available Part of Speech (POS) taggers in Danish.
 
-| Model | Train Data | License | Trained by | Tags | DaNLP |
-|-------|-------|-------|-------|-------|-------|
-| [Polyglot](https://github.com/alexandrainst/danlp/blob/master/docs/models/pos.md#polyglot) | [Danish Dependency Treebank]() [Al-Rfou et al.
(2013)] | GPLv3 license | Polyglot | 17 Universal part of speech | ❌ | -| [Flair](https://github.com/alexandrainst/danlp/blob/master/docs/models/pos.md#flair) | [Danish Dependency Treebank]() | MIT | Alexandra Instittut | 17 Universal part of speech | ✔️ | -| [SpaCy](https://github.com/alexandrainst/danlp/blob/master/docs/models/pos.md#spacy) | [Danish Dependency Treebank]() | MIT | Alexandra Instittut | 17 Universal part of speech | ✔️ | +| Model | Train Data | License | Trained by | Tags | DaNLP | +|-----------------------|---------------------------------------------------------------------------|---------------|---------------------|------------------------------|-------| +| [Polyglot](#polyglot) | [Danish Dependency Treebank](../datasets.md#dane) [Al-Rfou et al. (2013)] | GPLv3 license | Polyglot | 17 Universal part of speech | ❌ | +| [Flair](#flair) | [Danish Dependency Treebank](../datasets.md#dane) | MIT | Alexandra Instittut | 17 Universal part of speech | ✔️ | +| [SpaCy](#spacy) | [Danish Dependency Treebank](../datasets.md#dane) | MIT | Alexandra Instittut | 17 Universal part of speech | ✔️ | The Danish UD treebank uses 17 [universal part of speech tags](): @@ -16,9 +16,9 @@ A medium blog using Part of Speech tagging on Danish, can be found [here]() framework from Zalando, based on the paper [Akbik et. al (2018)](). The model is trained using the data [Danish Dependency Treebank]() and by using FastText word embeddings and Flair contextual word embeddings trained in this project on data from Wikipedia and EuroParl corpus, see [here](). +This project provides a trained part of speech tagging model for Danish using the [Flair]() framework from Zalando, based on the paper [Akbik et. al (2018)](). The model is trained using the data [Danish Dependency Treebank](../datasets.md#dane) and by using FastText word embeddings and Flair contextual word embeddings trained in this project on data from Wikipedia and EuroParl corpus, see [here](embeddings.md). The code for training can be found on Flairs GitHub, and the following parameters are set: `learning_rate=1`, `mini_batch_size=32`, `max_epochs=150`, `hidden_size=256`. @@ -45,9 +45,9 @@ print(sentence.to_tagged_string()) -##### 🔧 SpaCy +##### 🔧 SpaCy {#spacy} -Read more about the spaCy model in the dedicated [spaCy docs]() , it has also been trained using the [Danish Dependency Treebank]() data. +Read more about the spaCy model in the dedicated [spaCy docs](../spacy.md) , it has also been trained using the [Danish Dependency Treebank](../datasets.md#dane) data. 
Below is a small getting started snippet for using the Spacy pos tagger: diff --git a/docs/models/sentiment_analysis.md b/docs/models/sentiment_analysis.md index 0efe2f2..ccb2350 100644 --- a/docs/models/sentiment_analysis.md +++ b/docs/models/sentiment_analysis.md @@ -7,11 +7,11 @@ In this repository we provide an overview of open sentiment analysis models and | Model | Model | License | Trained by | Dimension | Tags | DaNLP | | ------------------------------------------------------------ | -------- | ------------------------------------------------------------ | --------------------------------------------------------- | ------------------ | ------------------------------------------------------------ | ----- | -| [AFINN](https://github.com/alexandrainst/danlp/blob/master/docs/models/sentiment_analysis.md#afinn) | Wordlist | [Apache 2.0](https://github.com/fnielsen/afinn/blob/master/LICENSE) | Finn Årup Nielsen | Polarity | Score (integers) | ❌ | -| [Sentida](https://github.com/alexandrainst/danlp/blob/master/docs/models/sentiment_analysis.md#sentida) | Wordlist | [GPL-3.0](https://github.com/esbenkc/emma/blob/master/LICENSE) | Jacob Dalsgaard, Lars Kjartan Svenden og Gustav Lauridsen | Polarity | Score (continuous) | ❌ | -| [BERT Emotion](https://github.com/alexandrainst/danlp/blob/master/docs/models/sentiment_analysis.md#wrenchbert-emotion) | BERT | CC-BY_4.0 | Alexandra Institute | Emotions | glæde/sindsro, forventning/interesse, tillid/accept, overraskelse/forundring, vrede/irritation, foragt/modvilje, sorg/skuffelse, frygt/bekymring, No emotion | ✔️ | -| [BERT Tone](https://github.com/alexandrainst/danlp/blob/master/docs/models/sentiment_analysis.md#wrenchbert-tone) (beta) | BERT | CC-BY_4.0 | Alexandra Institute | Polarity, Analytic | ['postive', 'neutral', 'negative'] and ['subjective', 'objective] | ✔️ | -| [SpaCy Sentiment](https://github.com/alexandrainst/danlp/blob/master/docs/models/sentiment_analysis.md#wrench-spacy-sentiment) (beta) | spaCy | MIT | Alexandra Institute | Polarity | 'postive', 'neutral', 'negative' | ✔️ | +| [AFINN](#afinn) | Wordlist | [Apache 2.0](https://github.com/fnielsen/afinn/blob/master/LICENSE) | Finn Årup Nielsen | Polarity | Score (integers) | ❌ | +| [Sentida](#sentida) | Wordlist | [GPL-3.0](https://github.com/esbenkc/emma/blob/master/LICENSE) | Jacob Dalsgaard, Lars Kjartan Svenden og Gustav Lauridsen | Polarity | Score (continuous) | ❌ | +| [BERT Emotion](#bert-emotion) | BERT | CC-BY_4.0 | Alexandra Institute | Emotions | glæde/sindsro, forventning/interesse, tillid/accept, overraskelse/forundring, vrede/irritation, foragt/modvilje, sorg/skuffelse, frygt/bekymring, No emotion | ✔ | +| [BERT Tone](#bert-tone) (beta) | BERT | CC-BY_4.0 | Alexandra Institute | Polarity, Analytic | ['postive', 'neutral', 'negative'] and ['subjective', 'objective] | ✔ | +| [SpaCy Sentiment](#spacy-sentiment) (beta) | spaCy | MIT | Alexandra Institute | Polarity | 'postive', 'neutral', 'negative' | ✔ | @@ -25,7 +25,7 @@ The tool scores texts with an integer where scores <0 are negative, =0 are neutr The tool Sentida [(Lauridsen et al. 2019)](https://tidsskrift.dk/lwo/article/view/115711) uses a lexicon based approach to sentiment analysis. The tool scores texts with a continuous value. There exist both an R version and an implementation in Python. In these documentations we evaluate the python version from [sentida](https://github.com/guscode/sentida). 
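Neither wordlist tool is distributed through DaNLP, but AFINN is easy to try on its own. Below is a minimal sketch of wordlist-based scoring, assuming the `afinn` package from PyPI and its Danish wordlist; it is an illustration only and not part of the DaNLP API:

```python
# Minimal sketch of wordlist-based sentiment scoring with the afinn package
# (pip install afinn). Illustration only -- not part of the DaNLP package.
from afinn import Afinn

# load the Danish AFINN wordlist
afinn = Afinn(language='da')

# a score > 0 is read as positive, < 0 as negative and 0 as neutral
print(afinn.score('Det er en rigtig god film'))
```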
-#### 🔧 BERT Emotion
+#### 🔧 BERT Emotion {#bert-emotion}
 
The emotion classifier is developed in a collaboration with Danmarks Radio, which has granted access to a set of social media data. The data has been manual annotated first to distinguish between a binary problem of emotion or no emotion, and afterwards tagged with 8 emotions. The BERT [(Devlin et al. 2019)](https://www.aclweb.org/anthology/N19-1423/) emotion model is finetuned on this data using the [Transformers](https://github.com/huggingface/transformers) library from HuggingFace, and it is based on a pretrained [Danish BERT](https://github.com/botxo/nordic_bert) representations by BotXO . The model to classify the eight emotions achieves an accuracy on 0.65 and a macro-f1 on 0.64 on the social media test set from DR's Facebook containing 999 examples. We do not have permission to distributing the data.
@@ -50,9 +50,9 @@ classifier._classes()
 
 
 
-#### 🔧 BERT Tone
+#### 🔧 BERT Tone {#bert-tone}
 
-The tone analyzer consists of two BERT [(Devlin et al. 2019)](https://www.aclweb.org/anthology/N19-1423/) classification models, and the first is recognizing the following tags positive, neutral and negative and the second model the tags: subjective and objective. This is a first version of the models, and work should be done to improve performance. Both models is finetuned on annotated twitter data using the [Transformers](https://github.com/huggingface/transformers) library from HuggingFace, and it is based on a pretrained [Danish BERT](https://github.com/botxo/nordic_bert) representations by BotXO . The data used is manually annotated data from Twitter Sentiment (train part)([see here](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#twitter-sentiment) ) and EuroParl sentiment 2 ([se here](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#europarl-sentiment2)), both datasets can be loaded with the DaNLP package.
+The tone analyzer consists of two BERT [(Devlin et al. 2019)](https://www.aclweb.org/anthology/N19-1423/) classification models: the first recognizes the tags positive, neutral and negative, and the second the tags subjective and objective. This is a first version of the models, and work should be done to improve performance. Both models are finetuned on annotated Twitter data using the [Transformers](https://github.com/huggingface/transformers) library from HuggingFace, and they are based on pretrained [Danish BERT](https://github.com/botxo/nordic_bert) representations by BotXO. The training data is manually annotated data from [Twitter Sentiment](../datasets.md#twitter-sentiment) (train part) and [EuroParl sentiment 2](../datasets.md#europarl-sentiment2); both datasets can be loaded with the DaNLP package.
 
Below is a small snippet for getting started using the BERT Tone model. Please notice that the BERT model can maximum take 512 tokens as input, however the code allows for overfloating tokens and will therefore not give an error but just a warning.
@@ -73,15 +73,15 @@ classifier._clases()
 
 
 
-#### 🔧 SpaCy Sentiment
+#### 🔧 SpaCy Sentiment {#spacy-sentiment}
 
-SpaCy sentiment is a text classification model trained using spacy built in command line interface. It uses the CoNLL2017 word vectors, read about it [here](https://github.com/alexandrainst/danlp/blob/master/docs/models/embeddings.md) .
+SpaCy sentiment is a text classification model trained using spaCy's built-in command line interface. It uses the CoNLL2017 word vectors (read about it [here](embeddings.md)).
-The model is trained using hard distil of the [BERT Tone](https://github.com/alexandrainst/danlp/blob/master/docs/models/sentiment_analysis.md#wrenchbert-tone) (beta) - Meaning, the BERT Tone model is used to make predictions on 50.000 sentences from Twitter and 50.000 sentences from [Europarl7](http://www.statmt.org/europarl/). These data is then used to trained a spacy model. Notice the dataset has first been balanced between the classes by oversampling. The model recognizes the classses: 'positiv', 'neutral' and 'negative'.
+The model is trained using hard distillation of the [BERT Tone](#bert-tone) model (beta): the BERT Tone model is used to make predictions on 50.000 sentences from Twitter and 50.000 sentences from [Europarl7](http://www.statmt.org/europarl/), and these predictions are then used to train a spaCy model. Notice that the dataset has first been balanced between the classes by oversampling. The model recognizes the classes: 'positiv', 'neutral' and 'negative'.
It is a first version.
 
-Read more about using the Danish spaCy model [here](https://github.com/alexandrainst/danlp/blob/Add_spacy_sentiment/docs/spacy.md)
+Read more about using the Danish spaCy model [here](../spacy.md).
 
Below is a small snippet for getting started using the spaCy sentiment model. Currently the danlp packages provide both a spaCy model which do not provide any classes in the textcat module (so it is empty for you to train from scratch), and the sentiment spacy model which have pretrained the classes 'positiv', 'neutral' and 'negative'. Notice it is possible with the spacy command line interface to continue training of the sentiment classes, or add new tags.
@@ -109,9 +109,9 @@ into the there classes 'positive', 'neutral' and 'negative'.
 
The tools are benchmarked on the following datasets:
 
-- [LCC Sentiment](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#lcc-sentiment) contains 499 sentences from the proceedings of the European Parliament annotated with a sentiment score from -5 to 5 by Finn Årup Nielsen.
-- [Europarl Sentiment](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#europarl-sentiment) contains 184 sentences from news and web pages annotated with sentiment -5 to 5 by Finn Årup Nielsen.
-- [Twitter Sentiment](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#twitter-sentiment) contains annotations for polarity (positive, neutral, negative) and annotations for analytic (subjective, objective) made by Alexandra Institute. 512 examples of the dataset are defined for evaluation.
+- [LCC Sentiment](../datasets.md#lcc-sentiment) contains 499 sentences from news and web pages annotated with a sentiment score from -5 to 5 by Finn Årup Nielsen.
+- [Europarl Sentiment](../datasets.md#europarl-sentiment1) contains 184 sentences from the proceedings of the European Parliament annotated with sentiment -5 to 5 by Finn Årup Nielsen.
+- [Twitter Sentiment](../datasets.md#twitter-sentiment) contains annotations for polarity (positive, neutral, negative) and annotations for analytic (subjective, objective) made by Alexandra Institute. 512 examples of the dataset are defined for evaluation.
 
A conversion of the scores of the LCC and Europarl Sentiment dataset and the Afinn model is done in the following way: a score of zero to be "neutral", a positive score to be "positive" and a negative score to be "negative".
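As an illustration, that conversion amounts to a small helper like the sketch below; the benchmark scripts linked in the next paragraph contain the actual implementation used for evaluation.

```python
# Sketch of the score-to-class conversion described above (illustration only;
# see the linked benchmark scripts for the implementation actually used).
def score_to_class(score: float) -> str:
    if score == 0:
        return "neutral"
    return "positive" if score > 0 else "negative"

assert score_to_class(3.0) == "positive"
assert score_to_class(0.0) == "neutral"
assert score_to_class(-2.0) == "negative"
```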
@@ -120,18 +120,18 @@ A conversion of the continuous scores of the Sentida tool into three classes is The scripts for the benchmarks can be found [here](https://github.com/alexandrainst/danlp/blob/master/examples/benchmarks/). There is one for the europarl sentiment and LCC sentiment data and another one for the twitter sentiment. This is due to the fact that downloading the twitter data requires login to a twitter API account. The scores below for the twitter data is reported for all the data, but if tweets are deleted in the mean time on twitter, not all tweets can be downloaded. In the table we consider the accuracy and macro-f1 in brackets, but to get the scores per class we refer to our benchmark script. -| Tool/Model | Europarl Sentiment | LCC Sentiment | Twitter Sentiment (Polarity) | -| ---- | ------------------ | ------------- | ---- | -| AFINN | 0.68 (0.68) | 0.66 (0.61) | 0.48 (0.46) | -| Sentida (version 0.5.0) | 0.67 (0.65) | 0.58 (0.55) | 0.44 (0.44) | -| BERT Tone (polarity, version 0.0.1) | **0.79** (0.78) | **0.74** (0.67) | **0.73** (0.70) | -| spaCy sentiment (version 0.0.1) | 0.74 (0.73) | 0.66 (0.61) | 0.66 (0.60) | +| Tool/Model | Europarl Sentiment | LCC Sentiment | Twitter Sentiment (Polarity) | +|-------------------------------------|--------------------|-----------------|------------------------------| +| AFINN | 0.68 (0.68) | 0.66 (0.61) | 0.48 (0.46) | +| Sentida (version 0.5.0) | 0.67 (0.65) | 0.58 (0.55) | 0.44 (0.44) | +| BERT Tone (polarity, version 0.0.1) | **0.79** (0.78) | **0.74** (0.67) | **0.73** (0.70) | +| spaCy sentiment (version 0.0.1) | 0.74 (0.73) | 0.66 (0.61) | 0.66 (0.60) | **Benchmark of subjective versus objective classification** The data for benchmark is: -- [Twitter Sentiment](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#twitter-sentiment) contains annotations for polarity (positive, neutral, negative) and annotations for analytic (subjective, objective) made by Alexandra Institute. 512 examples of the dataset are defined for evaluation. +- [Twitter Sentiment](../datasets.md#twitter-sentiment) contains annotations for polarity (positive, neutral, negative) and annotations for analytic (subjective, objective) made by Alexandra Institute. 512 examples of the dataset are defined for evaluation. The script for the benchmarks can be found [here](https://github.com/alexandrainst/danlp/blob/master/examples/benchmarks/) and it provides more detailed scores. Below is accuracy and macro-f1 reported: @@ -152,7 +152,7 @@ So if you manage a site containing user reviews for example movie reviews and wo ## Zero-shot Cross-lingual transfer example An example of utilizing a dataset in another language to be able to make predictions on Danish without seeing Danish training data is shown in this -[notebok](). It is trained on English movie reviews from IMDB, and +[notebook](). It is trained on English movie reviews from IMDB, and it uses multilingual embeddings from [Artetxe et al. 2019](https://arxiv.org/pdf/1812.10464.pdf) called [LASER]()(Language-Agnostic SEntence Representations). diff --git a/docs/spacy.md b/docs/spacy.md index dcab31f..f6da024 100644 --- a/docs/spacy.md +++ b/docs/spacy.md @@ -11,15 +11,15 @@ Note that the two models are not the same, e.g. the spaCy model in DaNLP perform The spaCy model comes with **tokenization**, **dependency parsing**, **part of speech tagging** , **word vectors** and **name entity recognition**. 
-The model is trained on the [Danish Dependency Treebank (DaNe)](), and with additional data for NER which originates from news articles form a collaboration with InfoMedia.
+The model is trained on the [Danish Dependency Treebank (DaNe)](datasets.md#dane), and with additional data for NER which originates from news articles from a collaboration with InfoMedia.
 
-For comparison to other models and additional information of the tasks, check out the task individual pages for [word embeddings](), [named entity recognition](), [part of speech tagging]() and [dependency parsing]().
+For comparison to other models and additional information about the tasks, check out the individual task pages for [word embeddings](models/embeddings.md), [named entity recognition](models/ner.md), [part of speech tagging](models/pos.md) and [dependency parsing](models/dependency.md).
 
-The danlp github also provides a version of the spaCy model which contains a sentiment classifier, read more about it in the [sentiment analysis docs](https://github.com/alexandrainst/danlp/blob/master/docs/models/sentiment_analysis.md).
+The danlp github also provides a version of the spaCy model which contains a sentiment classifier, read more about it in the [sentiment analysis docs](models/sentiment_analysis.md).
 
#### Performance of the spaCy model
 
-The following lists the performance scores of the spaCy model provided in DaNLP pakage on the [Danish Dependency Treebank (DaNe)]() test set. The scores and elaborating scores can be found in the file meta.json that is shipped with the model when it is downloaded.
+The following lists the performance scores of the spaCy model provided in the DaNLP package on the [Danish Dependency Treebank (DaNe)](datasets.md#dane) test set. The scores and elaborating scores can be found in the file meta.json that is shipped with the model when it is downloaded.
 
| Task                     | Measures | Scores |
| ----------------------- | -------- | :----- |
@@ -27,7 +27,6 @@ The following lists the performance scores of the spaCy model provided in DaNLP
| Dependency parsing      | las      | 77.22  |
| Part of speech tags     | accuracy | 96.40  |
| Named entity recognition| f1       | 80.50  |
-|                          |          |        |
 
 
 
@@ -82,7 +81,7 @@ displacy.serve(doc, style='dep')
 
![](imgs/dep.PNG)
 
-Here is an example of using Named entity recognitions . You can read more about [NER](https://github.com/alexandrainst/danlp/blob/master/docs/models/ner.md#named-entity-recognition) in the specific doc.
+Here is an example of using Named Entity Recognition. You can read more about [NER](models/ner.md#named-entity-recognition) in the specific doc.
 
```python
doc = nlp('Jens Peter Hansen kommer fra Danmark og arbejder hos Alexandra Instituttet')
@@ -109,7 +108,7 @@ Instituttet ORG
 
The spaCy framework provides an easy command line tool for training an existing model, for example by adding a text classifier. This short example shows how to do so using your own annotated data. It is also possible to use any static embedding provided in the DaNLP wrapper.
 
-As an example we will use a small dataset for sentiment classification on twitter. The dataset is under development and will be added in the DaNLP package when ready, and the spacy model will be updated with the classification model as well. A first verison of a spacy model with a sentiment classifier can be load with the danlp wrapper, read more about it in the sentiment analysis [docs](https://github.com/alexandrainst/danlp/blob/master/docs/models/sentiment_analysis.md).
+As an example we will use a small dataset for sentiment classification on twitter. The dataset is under development and will be added in the DaNLP package when ready, and the spacy model will be updated with the classification model as well. A first verison of a spacy model with a sentiment classifier can be load with the danlp wrapper, read more about it in the sentiment analysis [docs](models/sentiment_analysis.md). **The first thing is to convert the annotated data into a data format readable by spaCy** diff --git a/readthedocs/conf.py b/readthedocs/conf.py index 29126ca..33d2e74 100644 --- a/readthedocs/conf.py +++ b/readthedocs/conf.py @@ -14,7 +14,9 @@ import sys sys.path.insert(0, os.path.abspath('..')) -from recommonmark.transform import AutoStructify +import sphinx_markdown_parser +from sphinx_markdown_parser.parser import MarkdownParser +from sphinx_markdown_parser.transform import AutoStructify # -- Project information ----------------------------------------------------- @@ -29,11 +31,12 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'recommonmark', 'sphinx.ext.autodoc', 'sphinx_markdown_tables', 'sphinx.ext.todo', - 'sphinx.ext.autosectionlabel' + 'sphinx.ext.autosectionlabel', + 'sphinx.ext.napoleon', + 'sphinx.ext.mathjax' ] source_suffix = ['.rst', '.md'] @@ -85,8 +88,14 @@ autosectionlabel_prefix_document = True def setup(app): - app.add_config_value('recommonmark_config', { - 'url_resolver': lambda url: github_doc_root + url, - 'auto_toc_tree_section': 'Contents', - }, True) - app.add_transform(AutoStructify) \ No newline at end of file + app.add_source_suffix('.md', 'markdown') + app.add_source_parser(MarkdownParser) + app.add_config_value('markdown_parser_config', { + "extensions": ["tables","extra"], + 'auto_toc_tree_section': 'Content', + 'enable_auto_toc_tree': True, + 'enable_eval_rst': True, + 'enable_inline_math': True, + 'enable_math': True, + }, True) + app.add_transform(AutoStructify) diff --git a/readthedocs/requirements.txt b/readthedocs/requirements.txt index ddd0d96..541700a 100644 --- a/readthedocs/requirements.txt +++ b/readthedocs/requirements.txt @@ -1,5 +1,5 @@ sphinx==3.1.2 -recommonmark==0.6.0 sphinx_rtd_theme==0.5.0 -sphinx_markdown_tables==0.0.15 \ No newline at end of file +sphinx_markdown_tables==0.0.15 +sphinx_markdown_parser==0.2.4 \ No newline at end of file From 06b5f66737c38e6d7a947ca67395b68b46a1b323 Mon Sep 17 00:00:00 2001 From: ophelielacroix Date: Fri, 13 Nov 2020 11:07:01 +0100 Subject: [PATCH 5/8] Add and update documentation add: - a quick start guide - framework documentation (bert and flair) --- docs/bert.md | 48 +++++ docs/datasets.md | 8 +- docs/flair.md | 125 ++++++++++++ docs/frameworks.rst | 4 +- docs/spacy.md | 17 +- readthedocs/gettingstarted/contributing.md | 4 +- readthedocs/gettingstarted/installation.md | 7 +- readthedocs/gettingstarted/quickstart.md | 226 +++++++++++++++++++++ readthedocs/index.rst | 11 +- 9 files changed, 427 insertions(+), 23 deletions(-) create mode 100644 docs/bert.md create mode 100644 docs/flair.md create mode 100644 readthedocs/gettingstarted/quickstart.md diff --git a/docs/bert.md b/docs/bert.md new file mode 100644 index 0000000..77b3f3b --- /dev/null +++ b/docs/bert.md @@ -0,0 +1,48 @@ +BERT +==== + +BERT (Bidirectional Encoder Representations from Transformers) [(Devlin et al. 2019)](https://www.aclweb.org/anthology/N19-1423/) is a deep neural network model used in Natural Language Processing. 
+ +The BERT models provided with DaNLP are based on the pre-trained [Danish BERT](https://github.com/botxo/nordic_bert) representations by BotXO which has been finetuned on several tasks using the [Transformers](https://github.com/huggingface/transformers) library from HuggingFace. + +Through DaNLP, we provide fine-tuned BERT models for the following tasks: + +* Named Entity Recognition +* Emotion detection +* Tone and polarity detection + +Please note that the BERT models can take a maximum of 512 tokens as input at a time. For longer text sequences, you should split the text before hand -- for example by using sentence boundary detection (e.g. with the [spaCy model](spacy.md)). + +See our [getting started guides](../gettingstarted/quickstart.md#bert) for examples on how to use the BERT models. + +### Named Entity Recognition + +The BERT NER model has been finetuned on the [DaNE](../datasets.md#dane) dataset [(Hvingelby et al. 2020)](http://www.lrec-conf.org/proceedings/lrec2020/pdf/2020.lrec-1.565.pdf). +It can be loaded with the `load_bert_ner_model()` method. + +### Emotion detection + +The emotion classifier is developed in a collaboration with Danmarks Radio, which has granted access to a set of social media data. The data has been manually annotated first to distinguish between a binary problem of emotion or no emotion, and afterwards tagged with 8 emotions. The BERT emotion model is finetuned on this data. + +The model can detect the eight following emotions: + +* `Glæde/Sindsro` +* `Tillid/Accept` +* `Forventning/Interrese` +* `Overasket/Målløs` +* `Vrede/Irritation` +* `Foragt/Modvilje` +* `Sorg/trist` +* `Frygt/Bekymret` + +The model achieves an accuracy of 0.65 and a macro-f1 of 0.64 on the social media test set from DR's Facebook containing 999 examples. We do not have permission to distributing the data. + +### Tone and polarity detection + +The tone analyzer consists of two BERT classification models. +The first model detects the polarity of a sentence, i.e. whether it is perceived as `positive`, `neutral` or `negative`. +The second model detects the tone of a sentence, between `subjective` and `objective`. + +The models are finetuned on manually annotated Twitter data from [Twitter Sentiment](datasets.md#twitter-sentiment) (train part) and [EuroParl sentiment 2](datasets.md#europarl-sentiment2)). +Both datasets can be loaded with the DaNLP package. + diff --git a/docs/datasets.md b/docs/datasets.md index 043b9b0..538428b 100644 --- a/docs/datasets.md +++ b/docs/datasets.md @@ -26,7 +26,7 @@ It is also recommend to check out Finn Årup Nielsen's [dasem github](https://gi The Danish UD treebank (Johannsen et al., 2015, UD-DDT) is a conversion of the Danish Dependency Treebank (Buch-Kromann et al. 2003) based on texts from Parole (Britt, 1998). -UD-DDT has annotations for dependency parsing and POS. +UD-DDT has annotations for dependency parsing and part-of-speech (POS) tagging. The dataset was annotated with Named Entities for **PER**, **ORG** and **LOC** by the Alexandra Institute in the DaNE dataset (Hvingelby et al. 2020). To read more about how the dataset was annotated with POS and DEP tags we refer to the @@ -74,14 +74,14 @@ It is constructed with frequently used Danish words. The Twitter sentiment is a small manually annotated dataset by the Alexandra Institute. It contains tags in two sentiment dimension: analytic: ['subjective' , 'objective'] and polarity: ['positive', 'neutral', 'negative' ]. It is split in train and test part. 
Due to Twitters privacy policy, it is only allowed to display the "tweet ID" and not the actually text. This allows people to delete their tweets. Therefore, to download the actual tweet text one need a Twitter development account and to generate the sets of login keys, read how to get started [here](https://python-twitter.readthedocs.io/en/latest/getting_started.html). Then the dataset can be loaded with the DaNLP package by setting the following environment variable for the keys: -``` TWITTER_CONSUMER_KEY, TWITTER_CONSUMER_SECRET, TWITTER_ACCESS_TOKEN, TWITTER_ACCESS_SECRET```| +``` TWITTER_CONSUMER_KEY, TWITTER_CONSUMER_SECRET, TWITTER_ACCESS_TOKEN, TWITTER_ACCESS_SECRET``` - ```python +```python from danlp.datasets import TwitterSent twitSent = TwitterSent() df_test, df_train = twitSent.load_with_pandas() - ``` +``` The dataset can also be downloaded directly with the labels and tweet id: diff --git a/docs/flair.md b/docs/flair.md new file mode 100644 index 0000000..d9590cb --- /dev/null +++ b/docs/flair.md @@ -0,0 +1,125 @@ +Flair +===== + +The [flair](https://github.com/flairNLP/flair) framework from Zalando is based on the paper [Akbik et. al (2018)](). + + +Through the DaNLP package, we provide a pre-trained Part-of-Speech tagger and Named Entity recognizer using the flair framework. +The models have been trained on the [Danish Dependency Treebank](datasets.md#dane) and use fastText word embeddings and [flair contextual word embeddings](models/embeddings.md#flair-embeddings) trained on data from Wikipedia and EuroParl corpus. +The code for training can be found on flair's GitHub, and the following parameters are set: +`learning_rate=1`, `mini_batch_size=32`, `max_epochs=150`, `hidden_size=256`. + + +## One sentence at a time + +For Part-of-Speech tagging and Named Entity Recognition, it is possible to analyse one sentence at a time using the `Sentence` class of the flair framework. + +Please note that the text should be tokenized before hand. + +Here is a snippet for using the part-of-speech tagger which can be loaded using the DaNLP `load_flair_pos_model` function. + +```python +from danlp.models import load_flair_pos_model +from flair.data import Sentence + +text = "Morten bor i København tæt på Kongens Nytorv" +sentence = Sentence(text) + +tagger = load_flair_pos_model() + +tagger.predict(sentence) + +for tok in sentence.tokens: + print(tok.text, tok.get_tag('upos').value) + +``` + +In a similar way, you can load and use the DaNLP Named Entity Recognition model using the `load_flair_ner_model` function. + +```python +from danlp.models import load_flair_ner_model +from flair.data import Sentence + +text = "Morten bor i København tæt på Kongens Nytorv" +sentence = Sentence(text) + +tagger = load_flair_ner_model() + +tagger.predict(sentence) + +for tok in sentence.tokens: + print(tok.text, tok.get_tag('ner').value) + +``` + + +## Dataset analysis + + +If you want to analyze an entire dataset you can either use one of the DaNLP functions to load the DDT or the WikiAnn, or create a list of flair `Sentence`. 
+ +### DaNLP datasets + +You can load the DDT as follow: + +```python +from danlp.datasets import DDT +ddt = DDT() +# load the DDT +flair_corpus = ddt.load_with_flair() + +# you can access the train, test or dev part of the dataset +flair_train = flair_corpus.train +flair_test = flair_corpus.test +flair_dev = flair_corpus.dev + +# to get the list of UPOS tags for each sentence +pos_tags = [[tok.tags['upos'].value for tok in fs] for fs in flair_test] +# to get the list of NER tags for each sentence (BIO format) +ner_tags = [[tok.tags['ner'].value for tok in fs] for fs in flair_test] +# to get the list of tokens for each sentence +tokens = [[tok.text for tok in fs] for fs in flair_test] + +# you can use the loaded datasets +# to parse with the danlp POS or NER models +tagger.predict(flair_test) + +``` + +Or the WikiAnn: + +```python +from danlp.datasets import WikiAnn +wikiann = WikiAnn() +# load the WikiAnn dataset +flair_corpus = wikiann.load_with_flair() +``` + +### Your dataset + +From your own list of sentences (pre-tokenized) you can build a list of flair `Sentence` -- to use as previously described with the DaNLP datasets. + +Here is an example with the POS model: + +```python +from danlp.models import load_flair_pos_model +from flair.data import Sentence, Token + +# loading the POS flair model +tagger = load_flair_pos_model() + +# your sentences (list of lists of tokens) +my_sentences = [[...], [...], ...] + +flair_sentences = [] +for sent in my_sentences: + flair_sent = Sentence() + for tok in sent: + flair_sent.add_token(Token(tok)) + flair_sentences.append(flair_sent) + +tagger.predict(flair_sentences) + +for sentence in flair_sentences: + print(" ".join(["{}/{}".format(t.text, t.get_tag('upos').value) for t in sentence.tokens])) +``` \ No newline at end of file diff --git a/docs/frameworks.rst b/docs/frameworks.rst index dc39889..014ac29 100644 --- a/docs/frameworks.rst +++ b/docs/frameworks.rst @@ -6,4 +6,6 @@ Frameworks :maxdepth: 1 :caption: Frameworks - spacy.md \ No newline at end of file + spacy.md + flair.md + bert.md \ No newline at end of file diff --git a/docs/spacy.md b/docs/spacy.md index f6da024..e1ac005 100644 --- a/docs/spacy.md +++ b/docs/spacy.md @@ -1,5 +1,5 @@ -SpaCy model in Danish -===================== +SpaCy +===== SpaCy is an industrial friendly open source framework for doing NLP, and you can read more about it on their [homesite](https://spacy.io/) or [gitHub](https://github.com/explosion/spaCy). @@ -15,19 +15,18 @@ The model is trained on the [Danish Dependency Treebank (DaNe)](datasets.md#dane For comparison to other models and additional information of the tasks, check out the task individual pages for [word embeddings](models/embeddings.md), [named entity recognition](models/ner.md), [part of speech tagging](models/pos.md) and [dependency parsing](models/dependency.md). -The danlp github also provides a version of the spaCy model which contains a sentiment classifier, read more about it in the [sentiment analysis docs](models/sentiment_analysis.md). +The DaNLP github also provides a version of the spaCy model which contains a sentiment classifier, read more about it in the [sentiment analysis docs](models/sentiment_analysis.md). -#### Performance of the spaCy model +### Performance of the spaCy model The following lists the performance scores of the spaCy model provided in DaNLP pakage on the [Danish Dependency Treebank (DaNe)](datasets.md#dane) test set. 
The scores and elaborating scores can be found in the file meta.json that is shipped with the model when it is downloaded. | Task | Measures | Scores | | ----------------------- | -------- | :----- | -| Dependency parsing | uas | 81.63 | -| Dependency parsing | las | 77.22 | +| Dependency parsing | UAS | 81.63 | +| Dependency parsing | LAS | 77.22 | | Part of speech tags | accuracy | 96.40 | -| Named entity recognition| f1 | 80.50 | - +| Named entity recognition| F1 | 80.50 | @@ -69,7 +68,7 @@ for token in doc: ![](imgs/ling_feat.PNG) -**Visualizing the dependency tree:** +**Visualizing the dependency tree** ```python # the spaCy framework provides a nice visualization tool! diff --git a/readthedocs/gettingstarted/contributing.md b/readthedocs/gettingstarted/contributing.md index 5736ab6..cbb8df2 100644 --- a/readthedocs/gettingstarted/contributing.md +++ b/readthedocs/gettingstarted/contributing.md @@ -1,9 +1,9 @@ How do I contribute? ==================== -If you want to contribute to the DaNLP repository and make it better, your help is very welcome. You can contribute to the project in many ways: +If you want to contribute to the [DaNLP repository](https://github.com/alexandrainst/danlp) and make it better, your help is very welcome. You can contribute to the project in many ways: -- Help us write good tutorials on Danish NLP use-cases +- Help us write good [tutorials](https://github.com/alexandrainst/danlp/tree/master/examples) on Danish NLP use-cases - Contribute with your own pretrained NLP models or datasets in Danish - Notify us of other Danish NLP resources - Create GitHub issues with questions and bug reports diff --git a/readthedocs/gettingstarted/installation.md b/readthedocs/gettingstarted/installation.md index 576d998..60346cd 100644 --- a/readthedocs/gettingstarted/installation.md +++ b/readthedocs/gettingstarted/installation.md @@ -33,8 +33,8 @@ pip install -r requirements.txt ``` -### Install from github -Alternatively you can install the latest version from github using: +### Install from GitHub +Alternatively you can install the latest version from GitHub using: ``` pip install git+https://github.com/alexandrainst/danlp.git ``` @@ -49,4 +49,5 @@ If you want to run a `` in your current working directory you can run ```bash docker run -it --rm -v "$PWD":/usr/src/app -w /usr/src/app alexandrainst/danlp python -``` \ No newline at end of file +``` + diff --git a/readthedocs/gettingstarted/quickstart.md b/readthedocs/gettingstarted/quickstart.md new file mode 100644 index 0000000..3d6f023 --- /dev/null +++ b/readthedocs/gettingstarted/quickstart.md @@ -0,0 +1,226 @@ +Quick start +=========== + +Once you have [installed](installation.md) the DaNLP package, you can use it in your python project using `import danlp`. + +You will find the main functions through the `models` and `datasets` modules -- see the library documentation for more details about how to use the different functions for loading models and datasets. +For analysing texts in Danish, you will primarily need to import functions from `danlp.models` in order to load and use our pre-trained models. + +The DaNLP package provides you with several models for different NLP tasks using different frameworks. +On this section, you will have a quick tour of the main functions of the DaNLP package. 
+For a more detailed description of the tasks and frameworks, follow the links to the documentation: + +* [Embedding of text](../docs/models/embeddings.md) with flair, spaCy or Gensim +* [Part of speech tagging](../docs/models/pos.md) (POS) with spaCy or flair +* [Named Entity Recognition](../docs/models/ner.md) (NER) with spaCy, flair or BERT +* [Sentiment Analysis](../docs/models/sentiment_analysis.md) with spaCy or BERT +* [Dependency parsing and NP-chunking](../docs/models/dependency.md) with spaCy + + +## All-in-one with the spaCy models + +To quickly get started with DaNLP and try out different NLP tasks, you can use the spaCy model ([see also](../docs/spacy.md)). The main advantages of the spaCy model is that it is fast and it includes most of the basic NLP tasks that you need for pre-processing texts in Danish. + +The main functions are: + +* `load_spacy_model` for loading a spaCy model for POS, NER and dependency parsing or a spaCy sentiment model +* `load_spacy_chunking_model` for loading a wrapper around the spaCy model with which you can deduce NP-chunks from dependency parses + +### Pre-processing tasks + +Perform [Part-of-Speech tagging](../docs/models/pos.md), [Named Entity Recognition](../docs/models/ner.md) and [dependency parsing](../docs/models/dependency.md) at the same time with the DaNLP spaCy model. +Here is a snippet to quickly getting started: + +```python +# Import the load function +from danlp.models import load_spacy_model + +# Download and load the spaCy model using the DaNLP wrapper function +nlp = load_spacy_model() + +# Parse the text using the spaCy model +# it creates a spaCy Doc object +doc = nlp("Jeg er en sætning, der skal analyseres") + +# prepare some pretty printing +features = ['Text','POS', 'Dep'] +head_format ="\033[1m{!s:>11}\033[0m" * (len(features) ) +row_format ="{!s:>11}" * (len(features) ) + +print(head_format.format(*features)) +# printing for each token in the docs the pos and dep features +for token in doc: + print(row_format.format(token.text, token.pos_, token.dep_)) + +``` + +For NP-chunking you can use the `load_spacy_chunking_model`. +The spaCy chunking model includes the spaCy model -- which can be used as previously described. + +```python +from danlp.models import load_spacy_chunking_model + +# text to process +text = 'Et syntagme er en gruppe af ord, der hænger sammen' + +# Load the chunker using the DaNLP wrapper +chunker = load_spacy_chunking_model() +# Applying the spaCy model for parsing the sentence +# and deducing NP-chunks +np_chunks = chunker.predict(text, bio=False) + +nlp = chunker.model +doc = nlp(text) + +# print the chunks +for (start_id, end_id, _) in np_chunks: + print(doc[start_id:end_id]) +``` + + +### Sentiment analysis + +With the spaCy sentiment model, you can predict whether a sentence is perceived positive, negative or neutral. +For loading and using the spaCy sentiment analyser, follow these steps: + +```python +from danlp.models import load_spacy_model + +# Download and load the spaCy sentiment model using the DaNLP wrapper function +nlpS = load_spacy_model(textcat='sentiment', vectorError=True) + +text = "Jeg er meget glad med DaNLP" + +# analyse the text using the spaCy sentiment analyser +doc = nlpS(text) + +# print the most probable category among 'positiv', 'negativ' or 'neutral' +print(max(doc.cats)) +``` + + +## Sequence labelling with flair + +For part-of-speech tagging and named entity recognition, you also have the possibility to use flair. 
+If you value precision rather than speed, we would recommend you to use the flair models (or BERT NER, next section). + +Perform POS tagging or NER using the DaNLP flair models that you can load through the following functions: + +* `load_flair_pos_model` +* `load_flair_ner_model` + +Use the following snippet to try out the flair POS model. Note that the text should be pre-tokenized. + +```python +from danlp.models import load_flair_pos_model +from flair.data import Sentence + +text = "Hans har en lille sort kat ." +sentence = Sentence(text) + +tagger = load_flair_pos_model() + +tagger.predict(sentence) + +for tok in sentence.tokens: + print(tok.text, tok.get_tag('upos').value) + +``` + +You can use the flair NER model in a similar way. + + +```python +from danlp.models import load_flair_ner_model +from flair.data import Sentence + +text = "Hans bor i København" +sentence = Sentence(text) + +tagger = load_flair_ner_model() + +tagger.predict(sentence) + +for tok in sentence.tokens: + print(tok.text, tok.get_tag('ner').value) +``` + +## Deep NLP with BERT {#bert} + +### NER with BERT + +You can also perform NER with BERT. Load the DaNLP model with `load_bert_ner_model` and try out the following snippet: + +```python +from danlp.models import load_bert_ner_model + +bert = load_bert_ner_model() +tokens, labels = bert.predict("Jens Peter Hansen kommer fra Danmark") + +print(" ".join(["{}/{}".format(tok,lbl) for tok,lbl in zip(tokens,labels)])) + +``` + + +### Classification with BERT + +BERT is well suited for classification tasks. You can load the DaNLP sentiment classification BERT models with: + +* `load_bert_emotion_model` +* `load_bert_tone_model` + + +With the BERT Emotion model you can classify sentences among eight emotions: + +* `Glæde/Sindsro` +* `Tillid/Accept` +* `Forventning/Interrese` +* `Overasket/Målløs` +* `Vrede/Irritation` +* `Foragt/Modvilje` +* `Sorg/trist` +* `Frygt/Bekymret` + +Following is an example of how to use the BERT Emotion model: + +```python +from danlp.models import load_bert_emotion_model +classifier = load_bert_emotion_model() + +# using the classifier +classifier.predict('jeg ejer en bil') +''''No emotion'''' +classifier.predict('jeg ejer en rød bil og det er en god bil') +''''Tillid/Accept'''' +classifier.predict('jeg ejer en rød bil men den er gået i stykker') +''''Sorg/trist'''' + +# Get probabilities and matching classes +probas = classifier.predict_proba('jeg ejer en rød bil men den er gået i stykker', no_emotion=False)[0] +classes = classifier._classes()[0] + +for c, p in zip(classes, probas): + print(c, ':', p) +``` + +With the BERT Tone model, you can predict the tone (`objective` or `subjective`) or the polarity (`positive`, `negative` or `neutral`) of sentences. + + +```python +from danlp.models import load_bert_tone_model +classifier = load_bert_tone_model() + +text = 'Analysen viser, at økonomien bliver forfærdelig dårlig' + +# using the classifier +prediction = classifier.predict(text) +print("Tone: ", prediction['analytic']) +print("Polarity: ", prediction['polarity']) + +# Get probabilities and matching classes +probas = classifier.predict_proba(text)[0] +classes = classifier._classes()[0] + +for c, p in zip(classes, probas): + print(c, ':', p) +``` \ No newline at end of file diff --git a/readthedocs/index.rst b/readthedocs/index.rst index 503b011..c951d47 100644 --- a/readthedocs/index.rst +++ b/readthedocs/index.rst @@ -4,27 +4,30 @@ Welcome to DaNLP's documentation! 
DaNLP is a repository for Natural Language Processing resources for the Danish Language. -It is a collection of available datasets and models for a variety of NLP tasks. The aim is to make it easier and more applicable to practitioners in the industry to use Danish NLP and hence this project is licensed to allow commercial use. The project features code examples on how to use the datasets and models in popular NLP frameworks such as spaCy, Transformers and Flair as well as Deep Learning frameworks such as PyTorch. +It is a collection of available datasets and models for a variety of NLP tasks. +The aim is to make it easier and more applicable to practitioners in the industry to use Danish NLP and hence this project is licensed to allow commercial use. The project features code examples on how to use the datasets and models in popular NLP frameworks such as spaCy, Transformers and Flair as well as Deep Learning frameworks such as PyTorch. +If you are new to NLP or want to know more about the project in a broader perspective, you can have a look at our `microsite `_ (in Danish). .. toctree:: :maxdepth: 1 :caption: Getting started gettingstarted/installation.md + gettingstarted/quickstart.md gettingstarted/contributing.md .. toctree:: :maxdepth: 2 :caption: Documentation - docs/datasets.md - docs/frameworks.rst docs/models/models.rst + docs/frameworks.rst + docs/datasets.md .. toctree:: - :maxdepth: 1 + :maxdepth: 2 :caption: Library library/models.rst From 0a443ad1a77470c663cb4aac0a5394a1a973f5fc Mon Sep 17 00:00:00 2001 From: ophelielacroix Date: Fri, 13 Nov 2020 13:46:02 +0100 Subject: [PATCH 6/8] Move documentation and update links --- docs/frameworks.rst | 11 ---------- docs/{ => frameworks}/bert.md | 2 +- docs/{ => frameworks}/flair.md | 2 +- docs/{ => frameworks}/spacy.md | 20 +++++++++---------- .../gettingstarted/contributing.md | 0 .../gettingstarted/installation.md | 0 .../gettingstarted/quickstart.md | 14 ++++++------- docs/models/dependency.md | 2 +- docs/models/models.rst | 13 ------------ docs/models/ner.md | 4 ++-- docs/models/pos.md | 2 +- docs/models/sentiment_analysis.md | 4 ++-- readthedocs/frameworks.rst | 11 ++++++++++ readthedocs/index.rst | 10 +++++----- readthedocs/library/download.rst | 7 ------- readthedocs/models.rst | 13 ++++++++++++ 16 files changed, 54 insertions(+), 61 deletions(-) delete mode 100644 docs/frameworks.rst rename docs/{ => frameworks}/bert.md (94%) rename docs/{ => frameworks}/flair.md (94%) rename docs/{ => frameworks}/spacy.md (90%) rename {readthedocs => docs}/gettingstarted/contributing.md (100%) rename {readthedocs => docs}/gettingstarted/installation.md (100%) rename {readthedocs => docs}/gettingstarted/quickstart.md (88%) delete mode 100644 docs/models/models.rst create mode 100644 readthedocs/frameworks.rst delete mode 100644 readthedocs/library/download.rst create mode 100644 readthedocs/models.rst diff --git a/docs/frameworks.rst b/docs/frameworks.rst deleted file mode 100644 index 014ac29..0000000 --- a/docs/frameworks.rst +++ /dev/null @@ -1,11 +0,0 @@ -Frameworks -========== - - -.. toctree:: - :maxdepth: 1 - :caption: Frameworks - - spacy.md - flair.md - bert.md \ No newline at end of file diff --git a/docs/bert.md b/docs/frameworks/bert.md similarity index 94% rename from docs/bert.md rename to docs/frameworks/bert.md index 77b3f3b..ab2e811 100644 --- a/docs/bert.md +++ b/docs/frameworks/bert.md @@ -43,6 +43,6 @@ The tone analyzer consists of two BERT classification models. The first model detects the polarity of a sentence, i.e. 
whether it is perceived as `positive`, `neutral` or `negative`. The second model detects the tone of a sentence, between `subjective` and `objective`. -The models are finetuned on manually annotated Twitter data from [Twitter Sentiment](datasets.md#twitter-sentiment) (train part) and [EuroParl sentiment 2](datasets.md#europarl-sentiment2)). +The models are finetuned on manually annotated Twitter data from [Twitter Sentiment](../datasets.md#twitter-sentiment) (train part) and [EuroParl sentiment 2](../datasets.md#europarl-sentiment2)). Both datasets can be loaded with the DaNLP package. diff --git a/docs/flair.md b/docs/frameworks/flair.md similarity index 94% rename from docs/flair.md rename to docs/frameworks/flair.md index d9590cb..e713b5f 100644 --- a/docs/flair.md +++ b/docs/frameworks/flair.md @@ -5,7 +5,7 @@ The [flair](https://github.com/flairNLP/flair) framework from Zalando is based o Through the DaNLP package, we provide a pre-trained Part-of-Speech tagger and Named Entity recognizer using the flair framework. -The models have been trained on the [Danish Dependency Treebank](datasets.md#dane) and use fastText word embeddings and [flair contextual word embeddings](models/embeddings.md#flair-embeddings) trained on data from Wikipedia and EuroParl corpus. +The models have been trained on the [Danish Dependency Treebank](../datasets.md#dane) and use fastText word embeddings and [flair contextual word embeddings](../models/embeddings.md#flair-embeddings) trained on data from Wikipedia and EuroParl corpus. The code for training can be found on flair's GitHub, and the following parameters are set: `learning_rate=1`, `mini_batch_size=32`, `max_epochs=150`, `hidden_size=256`. diff --git a/docs/spacy.md b/docs/frameworks/spacy.md similarity index 90% rename from docs/spacy.md rename to docs/frameworks/spacy.md index e1ac005..a147876 100644 --- a/docs/spacy.md +++ b/docs/frameworks/spacy.md @@ -11,15 +11,15 @@ Note that the two models are not the same, e.g. the spaCy model in DaNLP perform The spaCy model comes with **tokenization**, **dependency parsing**, **part of speech tagging** , **word vectors** and **name entity recognition**. -The model is trained on the [Danish Dependency Treebank (DaNe)](datasets.md#dane), and with additional data for NER which originates from news articles form a collaboration with InfoMedia. +The model is trained on the [Danish Dependency Treebank (DaNe)](../datasets.md#dane), and with additional data for NER which originates from news articles form a collaboration with InfoMedia. -For comparison to other models and additional information of the tasks, check out the task individual pages for [word embeddings](models/embeddings.md), [named entity recognition](models/ner.md), [part of speech tagging](models/pos.md) and [dependency parsing](models/dependency.md). +For comparison to other models and additional information of the tasks, check out the task individual pages for [word embeddings](../models/embeddings.md), [named entity recognition](../models/ner.md), [part of speech tagging](../models/pos.md) and [dependency parsing](../models/dependency.md). -The DaNLP github also provides a version of the spaCy model which contains a sentiment classifier, read more about it in the [sentiment analysis docs](models/sentiment_analysis.md). +The DaNLP github also provides a version of the spaCy model which contains a sentiment classifier, read more about it in the [sentiment analysis docs](../models/sentiment_analysis.md). 
### Performance of the spaCy model -The following lists the performance scores of the spaCy model provided in DaNLP pakage on the [Danish Dependency Treebank (DaNe)](datasets.md#dane) test set. The scores and elaborating scores can be found in the file meta.json that is shipped with the model when it is downloaded. +The following lists the performance scores of the spaCy model provided in DaNLP pakage on the [Danish Dependency Treebank (DaNe)](../datasets.md#dane) test set. The scores and elaborating scores can be found in the file meta.json that is shipped with the model when it is downloaded. | Task | Measures | Scores | | ----------------------- | -------- | :----- | @@ -66,7 +66,7 @@ for token in doc: ``` -![](imgs/ling_feat.PNG) +![](../imgs/ling_feat.PNG) **Visualizing the dependency tree** @@ -78,9 +78,9 @@ displacy.serve(doc, style='dep') -![](imgs/dep.PNG) +![](../imgs/dep.PNG) -Here is an example of using Named entity recognitions . You can read more about [NER](models/ner.md#named-entity-recognition) in the specific doc. +Here is an example of using Named entity recognitions . You can read more about [NER](../models/ner.md#named-entity-recognition) in the specific doc. ```python doc = nlp('Jens Peter Hansen kommer fra Danmark og arbejder hos Alexandra Instituttet') @@ -107,13 +107,13 @@ Instituttet ORG The spaCy framework provides an easy command line tool for training an existing model, for example by adding a text classifier. This short example shows how to do so using your own annotated data. It is also possible to use any static embedding provided in the DaNLP wrapper. -As an example we will use a small dataset for sentiment classification on twitter. The dataset is under development and will be added in the DaNLP package when ready, and the spacy model will be updated with the classification model as well. A first verison of a spacy model with a sentiment classifier can be load with the danlp wrapper, read more about it in the sentiment analysis [docs](models/sentiment_analysis.md). +As an example we will use a small dataset for sentiment classification on twitter. The dataset is under development and will be added in the DaNLP package when ready, and the spacy model will be updated with the classification model as well. A first verison of a spacy model with a sentiment classifier can be load with the danlp wrapper, read more about it in the sentiment analysis [docs](../models/sentiment_analysis.md). **The first thing is to convert the annotated data into a data format readable by spaCy** Imagine you have the data in an e.g csv format and have it split in development and training part. Our twitter data has (in time of creating this snippet) 973 training examples and 400 evaluation examples, with the following labels : 'positive' marked by 0, 'neutral' marked by 1, and 'negative' marked by 2. 
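Reading such a split in with pandas could look like the following sketch (the file names `train.csv` and `dev.csv` are placeholders for wherever your own annotated data lives):

```python
import pandas as pd

# Hypothetical file names -- replace with the paths to your own annotated data
df_train = pd.read_csv('train.csv', encoding='utf-8')
df_dev = pd.read_csv('dev.csv', encoding='utf-8')

# For the twitter data described above, expect roughly 973 training and
# 400 evaluation examples, labelled 0 (positive), 1 (neutral) or 2 (negative)
print(len(df_train), len(df_dev))
print(df_train.head())
```
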
Loaded with pandas dataFrame it looks like this: -![](imgs/data_head.PNG) +![](../imgs/data_head.PNG) It needs to be converted into the format expected by spaCy for training the model, which can be done as follows: @@ -159,7 +159,7 @@ prepare_data(df_dev, 'eval_dev.json') The data now looks like this cutted snippet: -![](imgs/snippet_json.PNG) +![](../imgs/snippet_json.PNG) **Ensure you have the models and embeddings downloaded** diff --git a/readthedocs/gettingstarted/contributing.md b/docs/gettingstarted/contributing.md similarity index 100% rename from readthedocs/gettingstarted/contributing.md rename to docs/gettingstarted/contributing.md diff --git a/readthedocs/gettingstarted/installation.md b/docs/gettingstarted/installation.md similarity index 100% rename from readthedocs/gettingstarted/installation.md rename to docs/gettingstarted/installation.md diff --git a/readthedocs/gettingstarted/quickstart.md b/docs/gettingstarted/quickstart.md similarity index 88% rename from readthedocs/gettingstarted/quickstart.md rename to docs/gettingstarted/quickstart.md index 3d6f023..f167e29 100644 --- a/readthedocs/gettingstarted/quickstart.md +++ b/docs/gettingstarted/quickstart.md @@ -10,16 +10,16 @@ The DaNLP package provides you with several models for different NLP tasks using On this section, you will have a quick tour of the main functions of the DaNLP package. For a more detailed description of the tasks and frameworks, follow the links to the documentation: -* [Embedding of text](../docs/models/embeddings.md) with flair, spaCy or Gensim -* [Part of speech tagging](../docs/models/pos.md) (POS) with spaCy or flair -* [Named Entity Recognition](../docs/models/ner.md) (NER) with spaCy, flair or BERT -* [Sentiment Analysis](../docs/models/sentiment_analysis.md) with spaCy or BERT -* [Dependency parsing and NP-chunking](../docs/models/dependency.md) with spaCy +* [Embedding of text](../models/embeddings.md) with flair, spaCy or Gensim +* [Part of speech tagging](../models/pos.md) (POS) with spaCy or flair +* [Named Entity Recognition](../models/ner.md) (NER) with spaCy, flair or BERT +* [Sentiment Analysis](../models/sentiment_analysis.md) with spaCy or BERT +* [Dependency parsing and NP-chunking](../models/dependency.md) with spaCy ## All-in-one with the spaCy models -To quickly get started with DaNLP and try out different NLP tasks, you can use the spaCy model ([see also](../docs/spacy.md)). The main advantages of the spaCy model is that it is fast and it includes most of the basic NLP tasks that you need for pre-processing texts in Danish. +To quickly get started with DaNLP and try out different NLP tasks, you can use the spaCy model ([see also](../frameworks/spacy.md)). The main advantages of the spaCy model is that it is fast and it includes most of the basic NLP tasks that you need for pre-processing texts in Danish. The main functions are: @@ -28,7 +28,7 @@ The main functions are: ### Pre-processing tasks -Perform [Part-of-Speech tagging](../docs/models/pos.md), [Named Entity Recognition](../docs/models/ner.md) and [dependency parsing](../docs/models/dependency.md) at the same time with the DaNLP spaCy model. +Perform [Part-of-Speech tagging](../models/pos.md), [Named Entity Recognition](../models/ner.md) and [dependency parsing](../models/dependency.md) at the same time with the DaNLP spaCy model. 
Here is a snippet to quickly getting started: ```python diff --git a/docs/models/dependency.md b/docs/models/dependency.md index a7c06cd..fc83184 100644 --- a/docs/models/dependency.md +++ b/docs/models/dependency.md @@ -33,7 +33,7 @@ We provide a convertion function -- from dependencies to NP-chunks -- thus depen ## 🔧 SpaCy {#spacy} -Read more about the SpaCy model in the dedicated [SpaCy docs](../spacy.md) , it has also been trained using the [Danish Dependency Treebank](../datasets.md#dane) dataset. +Read more about the SpaCy model in the dedicated [SpaCy docs](../frameworks/spacy.md) , it has also been trained using the [Danish Dependency Treebank](../datasets.md#dane) dataset. ### Dependency Parser diff --git a/docs/models/models.rst b/docs/models/models.rst deleted file mode 100644 index 4643291..0000000 --- a/docs/models/models.rst +++ /dev/null @@ -1,13 +0,0 @@ -Models -====== - - -.. toctree:: - :maxdepth: 1 - :caption: Models - - embeddings.md - pos.md - ner.md - dependency.md - sentiment_analysis.md \ No newline at end of file diff --git a/docs/models/ner.md b/docs/models/ner.md index 8594758..2e2cba9 100644 --- a/docs/models/ner.md +++ b/docs/models/ner.md @@ -21,7 +21,7 @@ The BERT [(Devlin et al. 2019)](https://www.aclweb.org/anthology/N19-1423/) NER has been finetuned on the [DaNE](../datasets.md#dane) dataset [(Hvingelby et al. 2020)](http://www.lrec-conf.org/proceedings/lrec2020/pdf/2020.lrec-1.565.pdf). The finetuning has been done using the [Transformers](https://github.com/huggingface/transformers) library from HuggingFace. -To use the BERT NER model it can be loaded with the `load_bert_ner_model()` method. Please notice that it can maximum take 512 tokens as input at a time. For longer text sequences split before hand, for example be using sentence boundary detection (eg. by using the [spacy model](../spacy.md ).) +To use the BERT NER model it can be loaded with the `load_bert_ner_model()` method. Please notice that it can maximum take 512 tokens as input at a time. For longer text sequences split before hand, for example be using sentence boundary detection (eg. by using the [spacy model](../frameworks/spacy.md ).) ```python from danlp.models import load_bert_ner_model @@ -53,7 +53,7 @@ print(sentence.to_tagged_string()) ``` #### 🔧 spaCy {#spacy} -The [spaCy](https://spacy.io/) model is trained for several NLP tasks [(read more here)](../spacy.md) uing the [DDT and DaNE](../datasets.md#dane) annotations. +The [spaCy](https://spacy.io/) model is trained for several NLP tasks [(read more here)](../frameworks/spacy.md) uing the [DDT and DaNE](../datasets.md#dane) annotations. The spaCy model can be loaded with DaNLP to do NER predictions in the following way. ```python from danlp.models import load_spacy_model diff --git a/docs/models/pos.md b/docs/models/pos.md index caaaac6..9ac615b 100644 --- a/docs/models/pos.md +++ b/docs/models/pos.md @@ -47,7 +47,7 @@ print(sentence.to_tagged_string()) ##### 🔧 SpaCy {#spacy} -Read more about the spaCy model in the dedicated [spaCy docs](../spacy.md) , it has also been trained using the [Danish Dependency Treebank](../datasets.md#dane) data. +Read more about the spaCy model in the dedicated [spaCy docs](../frameworks/spacy.md) , it has also been trained using the [Danish Dependency Treebank](../datasets.md#dane) data. 
Below is a small getting started snippet for using the Spacy pos tagger: diff --git a/docs/models/sentiment_analysis.md b/docs/models/sentiment_analysis.md index ccb2350..2745d5b 100644 --- a/docs/models/sentiment_analysis.md +++ b/docs/models/sentiment_analysis.md @@ -77,11 +77,11 @@ classifier._clases() SpaCy sentiment is a text classification model trained using spacy built in command line interface. It uses the CoNLL2017 word vectors (read about it [here](embeddings.md)). -The model is trained using hard distil of the [BERT Tone](#wrenchbert-tone) (beta) - Meaning, the BERT Tone model is used to make predictions on 50.000 sentences from Twitter and 50.000 sentences from [Europarl7](http://www.statmt.org/europarl/). These data is then used to trained a spacy model. Notice the dataset has first been balanced between the classes by oversampling. The model recognizes the classses: 'positiv', 'neutral' and 'negative'. +The model is trained using hard distil of the [BERT Tone](#bert-tone) (beta) - Meaning, the BERT Tone model is used to make predictions on 50.000 sentences from Twitter and 50.000 sentences from [Europarl7](http://www.statmt.org/europarl/). These data is then used to trained a spacy model. Notice the dataset has first been balanced between the classes by oversampling. The model recognizes the classses: 'positiv', 'neutral' and 'negative'. It is a first version. -Read more about using the Danish spaCy model [here](../spacy.md). +Read more about using the Danish spaCy model [here](../frameworks/spacy.md). Below is a small snippet for getting started using the spaCy sentiment model. Currently the danlp packages provide both a spaCy model which do not provide any classes in the textcat module (so it is empty for you to train from scratch), and the sentiment spacy model which have pretrained the classes 'positiv', 'neutral' and 'negative'. Notice it is possible with the spacy command line interface to continue training of the sentiment classes, or add new tags. diff --git a/readthedocs/frameworks.rst b/readthedocs/frameworks.rst new file mode 100644 index 0000000..a59a32a --- /dev/null +++ b/readthedocs/frameworks.rst @@ -0,0 +1,11 @@ +Frameworks +========== + + +.. toctree:: + :maxdepth: 1 + :caption: Frameworks + + docs/frameworks/spacy.md + docs/frameworks/flair.md + docs/frameworks/bert.md \ No newline at end of file diff --git a/readthedocs/index.rst b/readthedocs/index.rst index c951d47..bba71fd 100644 --- a/readthedocs/index.rst +++ b/readthedocs/index.rst @@ -14,16 +14,16 @@ If you are new to NLP or want to know more about the project in a broader perspe :maxdepth: 1 :caption: Getting started - gettingstarted/installation.md - gettingstarted/quickstart.md - gettingstarted/contributing.md + docs/gettingstarted/installation.md + docs/gettingstarted/quickstart.md + docs/gettingstarted/contributing.md .. toctree:: :maxdepth: 2 :caption: Documentation - docs/models/models.rst - docs/frameworks.rst + models.rst + frameworks.rst docs/datasets.md .. toctree:: diff --git a/readthedocs/library/download.rst b/readthedocs/library/download.rst deleted file mode 100644 index 53fdfc6..0000000 --- a/readthedocs/library/download.rst +++ /dev/null @@ -1,7 +0,0 @@ - -Download -======== - -.. automodule:: danlp.download - :members: - :show-inheritance: \ No newline at end of file diff --git a/readthedocs/models.rst b/readthedocs/models.rst new file mode 100644 index 0000000..f95b352 --- /dev/null +++ b/readthedocs/models.rst @@ -0,0 +1,13 @@ +Models +====== + + +.. 
toctree:: + :maxdepth: 1 + :caption: Models + + docs/models/embeddings.md + docs/models/pos.md + docs/models/ner.md + docs/models/dependency.md + docs/models/sentiment_analysis.md \ No newline at end of file From c69b04fca87b176d042d81ffc9c51c0b3e9043bc Mon Sep 17 00:00:00 2001 From: ophelielacroix Date: Fri, 13 Nov 2020 15:39:05 +0100 Subject: [PATCH 7/8] Rename "Models" to "Tasks" --- docs/frameworks/flair.md | 2 +- docs/frameworks/spacy.md | 8 ++++---- docs/gettingstarted/quickstart.md | 12 ++++++------ docs/{models => tasks}/dependency.md | 0 docs/{models => tasks}/embeddings.md | 0 docs/{models => tasks}/ner.md | 0 docs/{models => tasks}/pos.md | 0 docs/{models => tasks}/sentiment_analysis.md | 0 readthedocs/index.rst | 2 +- readthedocs/models.rst | 13 ------------- readthedocs/tasks.rst | 13 +++++++++++++ 11 files changed, 25 insertions(+), 25 deletions(-) rename docs/{models => tasks}/dependency.md (100%) rename docs/{models => tasks}/embeddings.md (100%) rename docs/{models => tasks}/ner.md (100%) rename docs/{models => tasks}/pos.md (100%) rename docs/{models => tasks}/sentiment_analysis.md (100%) delete mode 100644 readthedocs/models.rst create mode 100644 readthedocs/tasks.rst diff --git a/docs/frameworks/flair.md b/docs/frameworks/flair.md index e713b5f..7dcc1aa 100644 --- a/docs/frameworks/flair.md +++ b/docs/frameworks/flair.md @@ -5,7 +5,7 @@ The [flair](https://github.com/flairNLP/flair) framework from Zalando is based o Through the DaNLP package, we provide a pre-trained Part-of-Speech tagger and Named Entity recognizer using the flair framework. -The models have been trained on the [Danish Dependency Treebank](../datasets.md#dane) and use fastText word embeddings and [flair contextual word embeddings](../models/embeddings.md#flair-embeddings) trained on data from Wikipedia and EuroParl corpus. +The models have been trained on the [Danish Dependency Treebank](../datasets.md#dane) and use fastText word embeddings and [flair contextual word embeddings](../tasks/embeddings.md#flair-embeddings) trained on data from Wikipedia and EuroParl corpus. The code for training can be found on flair's GitHub, and the following parameters are set: `learning_rate=1`, `mini_batch_size=32`, `max_epochs=150`, `hidden_size=256`. diff --git a/docs/frameworks/spacy.md b/docs/frameworks/spacy.md index a147876..9c02f2e 100644 --- a/docs/frameworks/spacy.md +++ b/docs/frameworks/spacy.md @@ -13,9 +13,9 @@ The spaCy model comes with **tokenization**, **dependency parsing**, **part of s The model is trained on the [Danish Dependency Treebank (DaNe)](../datasets.md#dane), and with additional data for NER which originates from news articles form a collaboration with InfoMedia. -For comparison to other models and additional information of the tasks, check out the task individual pages for [word embeddings](../models/embeddings.md), [named entity recognition](../models/ner.md), [part of speech tagging](../models/pos.md) and [dependency parsing](../models/dependency.md). +For comparison to other models and additional information of the tasks, check out the task individual pages for [word embeddings](../tasks/embeddings.md), [named entity recognition](../tasks/ner.md), [part of speech tagging](../tasks/pos.md) and [dependency parsing](../tasks/dependency.md). -The DaNLP github also provides a version of the spaCy model which contains a sentiment classifier, read more about it in the [sentiment analysis docs](../models/sentiment_analysis.md). 
+The DaNLP github also provides a version of the spaCy model which contains a sentiment classifier, read more about it in the [sentiment analysis docs](../tasks/sentiment_analysis.md). ### Performance of the spaCy model @@ -80,7 +80,7 @@ displacy.serve(doc, style='dep') ![](../imgs/dep.PNG) -Here is an example of using Named entity recognitions . You can read more about [NER](../models/ner.md#named-entity-recognition) in the specific doc. +Here is an example of using Named entity recognitions . You can read more about [NER](../tasks/ner.md#named-entity-recognition) in the specific doc. ```python doc = nlp('Jens Peter Hansen kommer fra Danmark og arbejder hos Alexandra Instituttet') @@ -107,7 +107,7 @@ Instituttet ORG The spaCy framework provides an easy command line tool for training an existing model, for example by adding a text classifier. This short example shows how to do so using your own annotated data. It is also possible to use any static embedding provided in the DaNLP wrapper. -As an example we will use a small dataset for sentiment classification on twitter. The dataset is under development and will be added in the DaNLP package when ready, and the spacy model will be updated with the classification model as well. A first verison of a spacy model with a sentiment classifier can be load with the danlp wrapper, read more about it in the sentiment analysis [docs](../models/sentiment_analysis.md). +As an example we will use a small dataset for sentiment classification on twitter. The dataset is under development and will be added in the DaNLP package when ready, and the spacy model will be updated with the classification model as well. A first verison of a spacy model with a sentiment classifier can be load with the danlp wrapper, read more about it in the sentiment analysis [docs](../tasks/sentiment_analysis.md). **The first thing is to convert the annotated data into a data format readable by spaCy** diff --git a/docs/gettingstarted/quickstart.md b/docs/gettingstarted/quickstart.md index f167e29..5da0011 100644 --- a/docs/gettingstarted/quickstart.md +++ b/docs/gettingstarted/quickstart.md @@ -10,11 +10,11 @@ The DaNLP package provides you with several models for different NLP tasks using On this section, you will have a quick tour of the main functions of the DaNLP package. For a more detailed description of the tasks and frameworks, follow the links to the documentation: -* [Embedding of text](../models/embeddings.md) with flair, spaCy or Gensim -* [Part of speech tagging](../models/pos.md) (POS) with spaCy or flair -* [Named Entity Recognition](../models/ner.md) (NER) with spaCy, flair or BERT -* [Sentiment Analysis](../models/sentiment_analysis.md) with spaCy or BERT -* [Dependency parsing and NP-chunking](../models/dependency.md) with spaCy +* [Embedding of text](../tasks/embeddings.md) with flair, spaCy or Gensim +* [Part of speech tagging](../tasks/pos.md) (POS) with spaCy or flair +* [Named Entity Recognition](../tasks/ner.md) (NER) with spaCy, flair or BERT +* [Sentiment Analysis](../tasks/sentiment_analysis.md) with spaCy or BERT +* [Dependency parsing and NP-chunking](../tasks/dependency.md) with spaCy ## All-in-one with the spaCy models @@ -28,7 +28,7 @@ The main functions are: ### Pre-processing tasks -Perform [Part-of-Speech tagging](../models/pos.md), [Named Entity Recognition](../models/ner.md) and [dependency parsing](../models/dependency.md) at the same time with the DaNLP spaCy model. 
+Perform [Part-of-Speech tagging](../tasks/pos.md), [Named Entity Recognition](../tasks/ner.md) and [dependency parsing](../tasks/dependency.md) at the same time with the DaNLP spaCy model. Here is a snippet to quickly getting started: ```python diff --git a/docs/models/dependency.md b/docs/tasks/dependency.md similarity index 100% rename from docs/models/dependency.md rename to docs/tasks/dependency.md diff --git a/docs/models/embeddings.md b/docs/tasks/embeddings.md similarity index 100% rename from docs/models/embeddings.md rename to docs/tasks/embeddings.md diff --git a/docs/models/ner.md b/docs/tasks/ner.md similarity index 100% rename from docs/models/ner.md rename to docs/tasks/ner.md diff --git a/docs/models/pos.md b/docs/tasks/pos.md similarity index 100% rename from docs/models/pos.md rename to docs/tasks/pos.md diff --git a/docs/models/sentiment_analysis.md b/docs/tasks/sentiment_analysis.md similarity index 100% rename from docs/models/sentiment_analysis.md rename to docs/tasks/sentiment_analysis.md diff --git a/readthedocs/index.rst b/readthedocs/index.rst index bba71fd..cf04f3d 100644 --- a/readthedocs/index.rst +++ b/readthedocs/index.rst @@ -22,7 +22,7 @@ If you are new to NLP or want to know more about the project in a broader perspe :maxdepth: 2 :caption: Documentation - models.rst + tasks.rst frameworks.rst docs/datasets.md diff --git a/readthedocs/models.rst b/readthedocs/models.rst deleted file mode 100644 index f95b352..0000000 --- a/readthedocs/models.rst +++ /dev/null @@ -1,13 +0,0 @@ -Models -====== - - -.. toctree:: - :maxdepth: 1 - :caption: Models - - docs/models/embeddings.md - docs/models/pos.md - docs/models/ner.md - docs/models/dependency.md - docs/models/sentiment_analysis.md \ No newline at end of file diff --git a/readthedocs/tasks.rst b/readthedocs/tasks.rst new file mode 100644 index 0000000..7772f15 --- /dev/null +++ b/readthedocs/tasks.rst @@ -0,0 +1,13 @@ +Models +====== + + +.. 
toctree:: + :maxdepth: 1 + :caption: Models + + docs/tasks/embeddings.md + docs/tasks/pos.md + docs/tasks/ner.md + docs/tasks/dependency.md + docs/tasks/sentiment_analysis.md \ No newline at end of file From 0973450f070bb4e842e52d23464d95c7d636e5cf Mon Sep 17 00:00:00 2001 From: ophelielacroix Date: Fri, 13 Nov 2020 15:52:06 +0100 Subject: [PATCH 8/8] Merge docs and readthedocs folders --- .gitignore | 4 ++-- {readthedocs => docs}/Makefile | 0 {readthedocs => docs}/conf.py | 0 docs/{ => docs}/datasets.md | 0 docs/{ => docs}/frameworks/bert.md | 0 docs/{ => docs}/frameworks/flair.md | 0 docs/{ => docs}/frameworks/spacy.md | 0 docs/{ => docs}/gettingstarted/contributing.md | 0 docs/{ => docs}/gettingstarted/installation.md | 0 docs/{ => docs}/gettingstarted/quickstart.md | 0 docs/{ => docs}/imgs/alexandra_logo.png | Bin docs/{ => docs}/imgs/chunk_features.png | Bin docs/{ => docs}/imgs/danlp_logo.png | Bin docs/{ => docs}/imgs/data_head.PNG | Bin docs/{ => docs}/imgs/dep.PNG | Bin docs/{ => docs}/imgs/dep_example.png | Bin docs/{ => docs}/imgs/dep_features.png | Bin docs/{ => docs}/imgs/ent.PNG | Bin docs/{ => docs}/imgs/ling_feat.PNG | Bin docs/{ => docs}/imgs/postag_eksempel.gif | Bin docs/{ => docs}/imgs/snippet_json.PNG | Bin docs/{ => docs}/tasks/dependency.md | 0 docs/{ => docs}/tasks/embeddings.md | 0 docs/{ => docs}/tasks/ner.md | 0 docs/{ => docs}/tasks/pos.md | 0 docs/{ => docs}/tasks/sentiment_analysis.md | 0 {readthedocs => docs}/frameworks.rst | 0 {readthedocs => docs}/index.rst | 0 {readthedocs => docs}/library/datasets.rst | 0 {readthedocs => docs}/library/models.rst | 0 {readthedocs => docs}/requirements.txt | 0 {readthedocs => docs}/tasks.rst | 0 readthedocs/docs | 1 - 33 files changed, 2 insertions(+), 3 deletions(-) rename {readthedocs => docs}/Makefile (100%) rename {readthedocs => docs}/conf.py (100%) rename docs/{ => docs}/datasets.md (100%) rename docs/{ => docs}/frameworks/bert.md (100%) rename docs/{ => docs}/frameworks/flair.md (100%) rename docs/{ => docs}/frameworks/spacy.md (100%) rename docs/{ => docs}/gettingstarted/contributing.md (100%) rename docs/{ => docs}/gettingstarted/installation.md (100%) rename docs/{ => docs}/gettingstarted/quickstart.md (100%) rename docs/{ => docs}/imgs/alexandra_logo.png (100%) rename docs/{ => docs}/imgs/chunk_features.png (100%) rename docs/{ => docs}/imgs/danlp_logo.png (100%) rename docs/{ => docs}/imgs/data_head.PNG (100%) rename docs/{ => docs}/imgs/dep.PNG (100%) rename docs/{ => docs}/imgs/dep_example.png (100%) rename docs/{ => docs}/imgs/dep_features.png (100%) rename docs/{ => docs}/imgs/ent.PNG (100%) rename docs/{ => docs}/imgs/ling_feat.PNG (100%) rename docs/{ => docs}/imgs/postag_eksempel.gif (100%) rename docs/{ => docs}/imgs/snippet_json.PNG (100%) rename docs/{ => docs}/tasks/dependency.md (100%) rename docs/{ => docs}/tasks/embeddings.md (100%) rename docs/{ => docs}/tasks/ner.md (100%) rename docs/{ => docs}/tasks/pos.md (100%) rename docs/{ => docs}/tasks/sentiment_analysis.md (100%) rename {readthedocs => docs}/frameworks.rst (100%) rename {readthedocs => docs}/index.rst (100%) rename {readthedocs => docs}/library/datasets.rst (100%) rename {readthedocs => docs}/library/models.rst (100%) rename {readthedocs => docs}/requirements.txt (100%) rename {readthedocs => docs}/tasks.rst (100%) delete mode 120000 readthedocs/docs diff --git a/.gitignore b/.gitignore index 5dbe340..035cbb2 100644 --- a/.gitignore +++ b/.gitignore @@ -127,5 +127,5 @@ dmypy.json .idea # readthedocs -readthedocs/_build 
-readthedocs/make.bat +docs/_build +docs/make.bat diff --git a/readthedocs/Makefile b/docs/Makefile similarity index 100% rename from readthedocs/Makefile rename to docs/Makefile diff --git a/readthedocs/conf.py b/docs/conf.py similarity index 100% rename from readthedocs/conf.py rename to docs/conf.py diff --git a/docs/datasets.md b/docs/docs/datasets.md similarity index 100% rename from docs/datasets.md rename to docs/docs/datasets.md diff --git a/docs/frameworks/bert.md b/docs/docs/frameworks/bert.md similarity index 100% rename from docs/frameworks/bert.md rename to docs/docs/frameworks/bert.md diff --git a/docs/frameworks/flair.md b/docs/docs/frameworks/flair.md similarity index 100% rename from docs/frameworks/flair.md rename to docs/docs/frameworks/flair.md diff --git a/docs/frameworks/spacy.md b/docs/docs/frameworks/spacy.md similarity index 100% rename from docs/frameworks/spacy.md rename to docs/docs/frameworks/spacy.md diff --git a/docs/gettingstarted/contributing.md b/docs/docs/gettingstarted/contributing.md similarity index 100% rename from docs/gettingstarted/contributing.md rename to docs/docs/gettingstarted/contributing.md diff --git a/docs/gettingstarted/installation.md b/docs/docs/gettingstarted/installation.md similarity index 100% rename from docs/gettingstarted/installation.md rename to docs/docs/gettingstarted/installation.md diff --git a/docs/gettingstarted/quickstart.md b/docs/docs/gettingstarted/quickstart.md similarity index 100% rename from docs/gettingstarted/quickstart.md rename to docs/docs/gettingstarted/quickstart.md diff --git a/docs/imgs/alexandra_logo.png b/docs/docs/imgs/alexandra_logo.png similarity index 100% rename from docs/imgs/alexandra_logo.png rename to docs/docs/imgs/alexandra_logo.png diff --git a/docs/imgs/chunk_features.png b/docs/docs/imgs/chunk_features.png similarity index 100% rename from docs/imgs/chunk_features.png rename to docs/docs/imgs/chunk_features.png diff --git a/docs/imgs/danlp_logo.png b/docs/docs/imgs/danlp_logo.png similarity index 100% rename from docs/imgs/danlp_logo.png rename to docs/docs/imgs/danlp_logo.png diff --git a/docs/imgs/data_head.PNG b/docs/docs/imgs/data_head.PNG similarity index 100% rename from docs/imgs/data_head.PNG rename to docs/docs/imgs/data_head.PNG diff --git a/docs/imgs/dep.PNG b/docs/docs/imgs/dep.PNG similarity index 100% rename from docs/imgs/dep.PNG rename to docs/docs/imgs/dep.PNG diff --git a/docs/imgs/dep_example.png b/docs/docs/imgs/dep_example.png similarity index 100% rename from docs/imgs/dep_example.png rename to docs/docs/imgs/dep_example.png diff --git a/docs/imgs/dep_features.png b/docs/docs/imgs/dep_features.png similarity index 100% rename from docs/imgs/dep_features.png rename to docs/docs/imgs/dep_features.png diff --git a/docs/imgs/ent.PNG b/docs/docs/imgs/ent.PNG similarity index 100% rename from docs/imgs/ent.PNG rename to docs/docs/imgs/ent.PNG diff --git a/docs/imgs/ling_feat.PNG b/docs/docs/imgs/ling_feat.PNG similarity index 100% rename from docs/imgs/ling_feat.PNG rename to docs/docs/imgs/ling_feat.PNG diff --git a/docs/imgs/postag_eksempel.gif b/docs/docs/imgs/postag_eksempel.gif similarity index 100% rename from docs/imgs/postag_eksempel.gif rename to docs/docs/imgs/postag_eksempel.gif diff --git a/docs/imgs/snippet_json.PNG b/docs/docs/imgs/snippet_json.PNG similarity index 100% rename from docs/imgs/snippet_json.PNG rename to docs/docs/imgs/snippet_json.PNG diff --git a/docs/tasks/dependency.md b/docs/docs/tasks/dependency.md similarity index 100% rename from 
docs/tasks/dependency.md rename to docs/docs/tasks/dependency.md diff --git a/docs/tasks/embeddings.md b/docs/docs/tasks/embeddings.md similarity index 100% rename from docs/tasks/embeddings.md rename to docs/docs/tasks/embeddings.md diff --git a/docs/tasks/ner.md b/docs/docs/tasks/ner.md similarity index 100% rename from docs/tasks/ner.md rename to docs/docs/tasks/ner.md diff --git a/docs/tasks/pos.md b/docs/docs/tasks/pos.md similarity index 100% rename from docs/tasks/pos.md rename to docs/docs/tasks/pos.md diff --git a/docs/tasks/sentiment_analysis.md b/docs/docs/tasks/sentiment_analysis.md similarity index 100% rename from docs/tasks/sentiment_analysis.md rename to docs/docs/tasks/sentiment_analysis.md diff --git a/readthedocs/frameworks.rst b/docs/frameworks.rst similarity index 100% rename from readthedocs/frameworks.rst rename to docs/frameworks.rst diff --git a/readthedocs/index.rst b/docs/index.rst similarity index 100% rename from readthedocs/index.rst rename to docs/index.rst diff --git a/readthedocs/library/datasets.rst b/docs/library/datasets.rst similarity index 100% rename from readthedocs/library/datasets.rst rename to docs/library/datasets.rst diff --git a/readthedocs/library/models.rst b/docs/library/models.rst similarity index 100% rename from readthedocs/library/models.rst rename to docs/library/models.rst diff --git a/readthedocs/requirements.txt b/docs/requirements.txt similarity index 100% rename from readthedocs/requirements.txt rename to docs/requirements.txt diff --git a/readthedocs/tasks.rst b/docs/tasks.rst similarity index 100% rename from readthedocs/tasks.rst rename to docs/tasks.rst diff --git a/readthedocs/docs b/readthedocs/docs deleted file mode 120000 index 6246dff..0000000 --- a/readthedocs/docs +++ /dev/null @@ -1 +0,0 @@ -../docs/ \ No newline at end of file