Merge pull request #66 from alexandrainst/pos-benchmark

POS benchmarks - accuracy scores
alexandrainst · Sep 14, 2020 · 6a9596f · 6a9596f
2 parents 2ea1950 + a7c4f35
commit 6a9596f
Show file tree

Hide file tree

Showing 2 changed files with 91 additions and 42 deletions.
diff --git a/docs/models/pos.md b/docs/models/pos.md
@@ -1,6 +1,6 @@
 Part of Speech Tagging
 ======================
-This section is concerned with public available Part of Speech taggers in Danish. 
+This section is concerned with publicly available Part of Speech (POS) taggers in Danish. 
 
 | Model | Train Data | License | Trained by | Tags | DaNLP |
 |-------|-------|-------|-------|-------|-------|
@@ -10,7 +10,7 @@ This section is concerned with public available Part of Speech taggers in Danish
 
 The Danish UD treebank  uses 17 [universal part of speech tags](<https://universaldependencies.org/u/pos/index.html>):
 
-`ADJ`: Adjective, `ADP`: Adposition , `ADV`: Adverb, `AUX`: Auxiliary verb, `CONJ`: Coordinating conjunction, `DET`: Determiner, `INTJ`: Interjection, `NOUN`: Noun, `NUM`: Numeral, `PART`: Particle `PRON`: Pronoun `PROPN`: Proper noun `PUNCT`: Punctuation `SCONJ`: Subordinating conjunction `SYM`: Symbol `VERB`: Verb `X`: Other
+`ADJ`: Adjective, `ADP`: Adposition, `ADV`: Adverb, `AUX`: Auxiliary verb, `CCONJ`: Coordinating conjunction, `DET`: Determiner, `INTJ`: Interjection, `NOUN`: Noun, `NUM`: Numeral, `PART`: Particle, `PRON`: Pronoun, `PROPN`: Proper noun, `PUNCT`: Punctuation, `SCONJ`: Subordinating conjunction, `SYM`: Symbol, `VERB`: Verb, `X`: Other
 
 A Medium blog post using Part of Speech tagging on Danish can be found [here](<https://medium.com/danlp/i-klasse-med-kierkegaard-eller-historien-om-det-fede-ved-at-en-computer-kan-finde-ordklasser-189774695f3b>).
 
@@ -23,7 +23,7 @@ This project provides a trained part of speech tagging model for Danish using th
 The code for training can be found on Flair's GitHub, and the following parameters are set:
 `learning_rate=1`, `mini_batch_size=32`, `max_epochs=150`, `hidden_size=256`.
 
-The flair pos tagger can be used by loading  it with the  `load_flair_pos_model` method. Please note the the text should be tokenized before hand, this can for example be done using spacy. 
+The flair pos tagger can be used by loading it with the `load_flair_pos_model` method. Please note that the text should be tokenized beforehand; this can for example be done using spaCy. 
 
 ```python
 from danlp.models import load_flair_pos_model
@@ -47,7 +47,7 @@ print(sentence.to_tagged_string())
 
 ##### :wrench:SpaCy
 
-Read more about the spaCy model in the dedicated [spaCy docs](<https://github.com/alexandrainst/danlp/blob/master/docs/spacy.md>) , it has also been trained using the the data [Danish Dependency Treebank](<https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#danish-dependency-treebank-dane>) . 
+Read more about the spaCy model in the dedicated [spaCy docs](<https://github.com/alexandrainst/danlp/blob/master/docs/spacy.md>); it has also been trained using the [Danish Dependency Treebank](<https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#danish-dependency-treebank-dane>) data. 
 
 Below is a small getting started snippet for using the Spacy pos tagger:
 
@@ -75,13 +75,28 @@ Read more about the polyglot model [here](<https://polyglot.readthedocs.io/en/la
 
 ## 📈 Benchmarks
 
-F1 scores is reported below and can be reproduced using `pos_benchmarks.py` in the [example](<https://github.com/alexandrainst/danlp/tree/master/examples>) folder, where the details score from each class is calculated.
+Accuracy scores are reported below and can be reproduced using `pos_benchmarks.py` in the [example](<https://github.com/alexandrainst/danlp/tree/master/examples>) folder, where the detailed scores for each class are calculated.
+
+#### DaNLP
+
+| Model                       | Accuracy   |
+|-----------------------------|------------|
+| Flair                       | **97.97**  |
+| SpaCy                       | 96.15      |
+
+#### Polyglot
+
+The tags predicted with the polyglot model differ slightly from the universal PoS-tags. The model predicts:
+* `CONJ` instead of `CCONJ`
+* `VERB` instead of `AUX` for the auxiliary and modal verbs (i.e. `være`, `have`, `kunne`, `ville`, `skulle`, `måtte`, `burde`)
+
+We calculated the scores for the original predictions and for the corrected version.
+
+| Model                       | Accuracy   |
+| --------                    | ---------- |
+| Polyglot                    | 76.76      |
+| Polyglot (corrected output) | 83.4       |
 
-| Model    | Micro-F1   |
-| -------- | ---------- |
-| Polyglot | 0.7380     |
-| Flair    | **0.9667** |
-| SpaCy    | 0.9550     |
 
 
 

diff --git a/examples/benchmarks/pos_benchmarks.py b/examples/benchmarks/pos_benchmarks.py
@@ -6,9 +6,9 @@
 from danlp.datasets import DDT
 from danlp.models import load_spacy_model, load_flair_pos_model
 
-from seqeval.metrics import classification_report
+from tabulate import tabulate
 
-# bechmarking polyglotmodel requires
+# benchmarking polyglotmodel requires
 from polyglot.tag import POSTagger
 from polyglot.text import WordList
 
@@ -27,8 +27,38 @@
 sentences_tokens = []
 for sent in ccorpus_conll[2]:
     sentences_tokens.append([token.form for token in sent._tokens])
-
-
+
+
+
+def print_accuracy_scores(tags_true, tags_pred):
+
+    # flatening tags lists
+    tags_true = [tag for sent in tags_true for tag in sent]
+    tags_pred = [tag for sent in tags_pred for tag in sent]
+
+    # list of all tags
+    labels = sorted(list(set(tags_true)))
+
+    headers = ["label", "accuracy", "support"]
+    tab = []
+    correct_tags = {l:[] for l in labels}
+    # counting correct predictions per tag
+    for label in labels:
+        correct_tags[label] = [t == p for t, p in zip(tags_true, tags_pred) if t == label]
+        acc = round(sum(correct_tags[label])/len(correct_tags[label])*100, 2) if len(correct_tags[label])>0 else 0
+        tab.append([label, acc, len(correct_tags[label])])
+    tab.append(['', '', ''])
+    total_examples = sum(len(correct_tags[l]) for l in correct_tags)
+
+    micro_acc = round( sum(sum(correct_tags[l]) for l in correct_tags) / total_examples *100, 2)
+    tab.append(["micro average", micro_acc, total_examples])
+
+    macro_acc = round( sum([sum(correct_tags[l])/len(correct_tags[l]) for l in labels if len(correct_tags[l])>0]) / len(labels) *100, 2)
+    tab.append(["macro average", macro_acc, total_examples])
+
+    print("\n", tabulate(tab, headers=headers, colalign=["right", "decimal", "right"]), "\n")
+
+
 def benchmark_flair_mdl():
     tagger = load_flair_pos_model()
 
@@ -43,11 +73,7 @@ def benchmark_flair_mdl():
     assert len(tags_pred)==num_sentences
     assert sum([len(s) for s in tags_pred])==num_tokens
 
-
-    print(classification_report(tags_true, tags_pred,
-                                    digits=4))
-
-
+    print_accuracy_scores(tags_true, tags_pred)
 
 
 def benchmark_spacy_mdl():
@@ -74,48 +100,56 @@ def benchmark_spacy_mdl():
     assert len(tags_pred)==num_sentences
     assert sum([len(s) for s in tags_pred])==num_tokens
 
-
-    print(classification_report(tags_true, tags_pred,
-                                    digits=4))
-
+    print_accuracy_scores(tags_true, tags_pred)
 
-
-def benchmark_polyglot_mdl():
+
+auxiliary_verbs = ["være", "er", "var", "været"]  # forms of "være" (to be); udify_tag relabels a VERB prediction as AUX when the word is in this list (exact string match)
+auxiliary_verbs += ["have", "har", "havde", "haft"]  # forms of "have" (to have)
+auxiliary_verbs += ["kunne", "kan", "kunnet"]  # forms of "kunne" (can)
+auxiliary_verbs += ["ville", "vil", "villet"]  # forms of "ville" (will)
+auxiliary_verbs += ["skulle", "skal", "skullet"]  # forms of "skulle" (shall)
+auxiliary_verbs += ["måtte", "må", "måttet"]  # forms of "måtte" (may/must)
+auxiliary_verbs += ["burde", "bør", "burdet"]  # forms of "burde" (ought to)
+
+def benchmark_polyglot_mdl(corrected_output=False):
     """
-    Running ployglot requires these packages:
+    Running polyglot requires these packages:
     # Morfessor==2.0.6
     # PyICU==2.4.2
     # pycld2==0.41
     # polyglot
     
     """
+
+    def udify_tag(tag, word):
+        if tag == "CONJ":
+            return "CCONJ"
+        if tag == "VERB" and word in auxiliary_verbs:
+            return "AUX"
+        return tag
 
     start = time.time()
 
     tags_pred = []
     for tokens in  sentences_tokens:
         word_list = WordList(tokens, language='da')
-        ne_chunker =  POSTagger(lang='da')
-        word_ent_tuples = list(ne_chunker.annotate(word_list))
-
-        tags_pred.append([entity for word, entity in word_ent_tuples])
-    print('**Polyglot model**')
+        tagger =  POSTagger(lang='da')
+        word_tag_tuples = list(tagger.annotate(word_list))
+        tags_pred.append([udify_tag(tag, word) if corrected_output else tag for word, tag in word_tag_tuples])
+    print('**Polyglot model'+(' (corrected output) ' if corrected_output else '')+'**')
     print("Made predictions on {} sentences and {} tokens in {}s".format(
     num_sentences, num_tokens, time.time() - start))
-    
+
     assert len(tags_pred)==num_sentences
     assert sum([len(s) for s in tags_pred])==num_tokens
-
-
-    print(classification_report(tags_true, tags_pred,
-                                    digits=4))
-
-
-
-
-
-
+
+    print_accuracy_scores(tags_true, tags_pred)
+
+
+
+
 if __name__ == '__main__':
     benchmark_polyglot_mdl()
+    benchmark_polyglot_mdl(corrected_output=True)
     benchmark_spacy_mdl()
     benchmark_flair_mdl()