Commit

Fix minor typos in docs
ophelielacroix committed Nov 25, 2020
1 parent 093e57d commit 3e715fb
Showing 6 changed files with 23 additions and 24 deletions.
18 changes: 9 additions & 9 deletions README.md
@@ -147,19 +147,19 @@ To help you navigate we provide you with an overview of the structure in the git

.
├── danlp                     # Source files
-│   ├── datasets             # Code to load dataset with different frameworks
-│   └── models               # Code to load models with different frameworks
+│   ├── datasets             # Code to load datasets with different frameworks
+│   └── models               # Code to load models with different frameworks
├── docker                    # Docker image
-├── docs                     # Documentation and files for setting up for Read The Docs
-│   ├── docs                 # Documentation for tasks, dataset and frameworks
+├── docs                     # Documentation and files for setting up Read The Docs
+│   ├── docs                 # Documentation for tasks, datasets and frameworks
│   │   ├── tasks             # Documentation for nlp tasks with benchmark results
│   │   ├── frameworks        # Overview over different frameworks used
-│   │   ├── Gettingstarted   # Guides for installation and getting started
-│   │   └── imgs             # Images used in documentation
-│   └── libary               # Files used for Read the Docs
+│   │   ├── gettingstarted   # Guides for installation and getting started
+│   │   └── imgs             # Images used in documentation
+│   └── library              # Files used for Read the Docs
├── examples                  # Examples, tutorials and benchmark scripts
-│   └── benchmarks           # Scripts for reproducing benchmarks results
-└── tests                    # Test for continous integration with travis
+│   └── benchmarks           # Scripts for reproducing benchmarks results
+└── tests                    # Test for continous integration with travis

## How do I contribute?

14 changes: 7 additions & 7 deletions danlp/models/bert_models.py
@@ -35,8 +35,8 @@ def predict(self, text: Union[str, List[str]], IOBformat=True):
BERTs subword tokens.
:param text: can either be a raw text or a list of tokens
-:param IOBformat: can either be TRUE or FASE, but can only be Flase if text input is a list of tokens. Specifify if output should be in IOB format or a dictionary
-:return: the tokenized text and the predicted labels in IOB format, or a dictonary with the tags and position
+:param IOBformat: can either be TRUE or FALSE, but can only be False if text input is a list of tokens. Specifify if output should be in IOB format or a dictionary
+:return: the tokenized text and the predicted labels in IOB format, or a dictionary with the tags and position
:Example:
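(The docstring's own example lies outside this hunk. For context, a minimal sketch of the two output modes described above; the exact shape of the dictionary return value is not shown in this diff and is an assumption.)

```python
from danlp.models import load_bert_ner_model

bert = load_bert_ner_model()

# IOBformat=True (default): parallel lists of tokens and IOB labels
tokens, labels = bert.predict("Jens Peter Hansen kommer fra Danmark")

# IOBformat=False requires pre-tokenized input and returns a dictionary
# with the tags and their positions (exact keys assumed, not shown here)
entities = bert.predict(["Jens", "Peter", "Hansen", "kommer", "fra", "Danmark"], IOBformat=False)
```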
@@ -360,7 +360,7 @@ class BertBase:
The Model is trained by BotXO: https://github.com/botxo/nordic_bert
The Bert model is transformed into pytorch version
-Credit for code eksempel: https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/
+Credit for code example: https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/
:param str cache_dir: the directory for storing cached models
:param bool verbose: `True` to increase verbosity
@@ -382,9 +382,9 @@ def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):

def embed_text(self, text):
"""
-Calcualte the embeedings for each token in a sentence ant the emebedding for the sentence based on a BERT language model.
-The embedding for a token is chossen to be the concatenated last four layers, and the sentece embeddings to be the mean of the second to last layer of all tokens in the sentence
-The BERT tokenixer splits in subword for UNK word. The tokinized sentences is therefore returned as well. The embeddings for the special tokens is not returned.
+Calculate the embeddings for each token in a sentence ant the embedding for the sentence based on a BERT language model.
+The embedding for a token is chosen to be the concatenated last four layers, and the sentence embeddings to be the mean of the second to last layer of all tokens in the sentence
+The BERT tokenizer splits in subword for UNK word. The tokenized sentence is therefore returned as well. The embeddings for the special tokens are not returned.
:param str sentence: raw text
@@ -424,7 +424,7 @@ def embed_text(self, text):
token_vecs_cat=token_vecs_cat[1:-1]
tokenized_text =tokenized_text[1:-1]

-# choos to summarize the last four layers
+# chose to summarize the last four layers
#token_vecs_sum=[torch.sum(token[-4:], dim=0) for token in token_embeddings]

# sentence embedding
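(The layer choices this docstring describes can be reproduced directly with the HuggingFace Transformers API. A hedged sketch, not DaNLP's exact code; the checkpoint name is a stand-in for the BotXO Danish BERT that the wrapper actually loads.)

```python
import torch
from transformers import AutoModel, AutoTokenizer

# Stand-in checkpoint; the DaNLP wrapper loads the BotXO Danish BERT instead
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
model = AutoModel.from_pretrained("bert-base-multilingual-cased", output_hidden_states=True)

inputs = tokenizer("Jens kommer fra Danmark", return_tensors="pt")
with torch.no_grad():
    # 13 tensors for BERT base: the embedding layer plus 12 encoder layers
    hidden_states = model(**inputs).hidden_states

# Token embeddings: concatenate the last four layers (4 * 768 = 3072 dims),
# dropping the special [CLS]/[SEP] tokens as the docstring describes
token_vecs_cat = torch.cat(hidden_states[-4:], dim=-1).squeeze(0)[1:-1]

# Sentence embedding: mean over tokens of the second-to-last layer
sentence_embedding = hidden_states[-2].squeeze(0)[1:-1].mean(dim=0)
```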
4 changes: 2 additions & 2 deletions docs/docs/tasks/embeddings.md
@@ -101,7 +101,7 @@ from danlp.models.embeddings import load_context_embeddings_with_flair
from flair.data import Sentence

# Use the wrapper from DaNLP to download and load embeddings with Flair
-# You can combine it with on of the static emebdings
+# You can combine it with the static embeddings
stacked_embeddings = load_context_embeddings_with_flair(word_embeddings='wiki.da.wv')

# Embed two different sentences
@@ -119,7 +119,7 @@ print('{} sentences out of {} is equal'.format(int(sum(sentence2[4].embedding==s

##### 🔧 BERT embeddings {#bert-embeddings}

-BERT is a language model but the different layers can be used as embeddings of tokens or sentences. This code loads a pytorch version using the [Transformers](https://github.com/huggingface/transformers) library from HuggingFace of pre-trained [Danish BERT](https://github.com/botxo/nordic_bert) representations by BotXO model. Since the models is not a designated models for embeddings, some choices is made of what layers to use. For each tokens in a sentence there is 13 layers of dim 768. Based on the [blogpost](https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/), it has been choice to concatenate the four last layer to use as token embeddings, which gives a dimension of 4*768=3072. For sentence embeddings the second last layers is used and the mean across all tokens in the sentence is calculated.
+BERT is a language model but the different layers can be used as embeddings of tokens or sentences. This code loads a pytorch version using the [Transformers](https://github.com/huggingface/transformers) library from HuggingFace of pre-trained [Danish BERT](https://github.com/botxo/nordic_bert) representations by BotXO model. Since the models is not a designated models for embeddings, some choices is made of what layers to use. For each tokens in a sentence there is 13 layers of dim 768. Based on the [blogpost](https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/), it has been chosen to concatenate the four last layer to use as token embeddings, which gives a dimension of 4*768=3072. For sentence embeddings the second last layers is used and the mean across all tokens in the sentence is calculated.

Note, BERT tokenize out of vocabulary words into sub words.

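(A short usage sketch of the wrapper this paragraph describes, assuming the return order given in the embed_text docstring above: per-token vectors, one sentence vector, then the subword tokens.)

```python
from danlp.models import load_bert_base_model

model = load_bert_base_model()
# Return order assumed from the embed_text docstring:
# token vectors (3072 dims each), sentence vector, subword tokens
vecs_token, vec_sentence, tokenized_text = model.embed_text("Han sælger frugt")
```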
5 changes: 3 additions & 2 deletions docs/docs/tasks/ner.md
@@ -21,11 +21,12 @@ The BERT [(Devlin et al. 2019)](https://www.aclweb.org/anthology/N19-1423/) NER
has been finetuned on the [DaNE](../datasets.md#dane)
dataset [(Hvingelby et al. 2020)](http://www.lrec-conf.org/proceedings/lrec2020/pdf/2020.lrec-1.565.pdf). The finetuning has been done using the [Transformers](https://github.com/huggingface/transformers) library from HuggingFace.

-To use the BERT NER model it can be loaded with the `load_bert_ner_model()` method. Please notice that it can maximum take 512 tokens as input at a time. For longer text sequences split before hand, for example be using sentence boundary detection (eg. by using the [spacy model](../frameworks/spacy.md ).)
+The BERT NER model can be loaded with the `load_bert_ner_model()` method. Please notice that it can maximum take 512 tokens as input at a time. For longer text sequences split before hand, for example using sentence boundary detection (e.g. by using the [spacy model](../frameworks/spacy.md ).)

```python
from danlp.models import load_bert_ner_model
bert = load_bert_ner_model()
-# Get lists of tokens and labesl in IBO format
+# Get lists of tokens and labels in BIO format
tokens, labels = bert.predict("Jens Peter Hansen kommer fra Danmark")
print(" ".join(["{}/{}".format(tok,lbl) for tok,lbl in zip(tokens,labels)]))

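(Regarding the 512-token limit mentioned above, a hedged sketch of splitting longer text with the DaNLP spaCy model before tagging, assuming its parser provides sentence boundaries.)

```python
from danlp.models import load_bert_ner_model, load_spacy_model

nlp = load_spacy_model()   # used here only for sentence boundary detection
bert = load_bert_ner_model()

long_text = "Jens Peter Hansen kommer fra Danmark. Han bor i København."
for sent in nlp(long_text).sents:  # each sentence stays well under 512 tokens
    tokens, labels = bert.predict(sent.text)
    print(" ".join("{}/{}".format(tok, lbl) for tok, lbl in zip(tokens, labels)))
```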
1 change: 0 additions & 1 deletion docs/frameworks.rst
@@ -4,7 +4,6 @@ Frameworks

.. toctree::
:maxdepth: 1
-:caption: Frameworks

docs/frameworks/spacy.md
docs/frameworks/flair.md
5 changes: 2 additions & 3 deletions docs/tasks.rst
@@ -1,10 +1,9 @@
-Models
-======
+Tasks
+=====


.. toctree::
:maxdepth: 1
-:caption: Models

docs/tasks/embeddings.md
docs/tasks/pos.md
