Commit
feat: Added scandi-ner model to dacy
KennethEnevoldsen committed Jan 5, 2023
1 parent a86b0c8 commit a2bec3b
Showing 8 changed files with 124 additions and 32 deletions.
38 changes: 33 additions & 5 deletions docs/using_dacy.getting_started.rst
@@ -54,8 +54,8 @@ Using this we can now apply DaCy to text with conventional SpaCy syntax:
DaCy is built on SpaCy, so you will find most of what you need for working with
the pipeline in SpaCy's very well written `documentation <https://spacy.io>`__.

Tagging named entities
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Named Entity Recognition
====================================
A named entity is a “real-world object” that's assigned a name - for example, a person, a country, a product or a book title.
DaCy can recognize organizations, persons, and locations, as well as other miscellaneous entities.

@@ -77,15 +77,43 @@ We can also plot these using:
.. seealso::


For more on named entity recognition, see SpaCy's `documentation <https://spacy.io/usage/linguistic-features#named-entities>`__.


.. image:: _static/ner.png
:width: 800
:alt: Named entity recognition using DaCy

Tagging parts-of-speech
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Since its release, DaCy has been outperformed on NER by `the state-of-the-art model for NER <https://huggingface.co/saattrupdan/nbailab-base-ner-scandi>`__
by Dan Nielsen. To give users access to the best model for their use case, DaCy lets you easily
swap in this state-of-the-art model as the NER component.
To do this, simply load the model using:

.. code-block:: python

   import dacy
   import spacy

   # load the small DaCy model, excluding the NER component
   nlp = dacy.load("da_dacy_small_trf-0.1.0", exclude=["ner"])
   # or use a blank SpaCy model if you only want to do NER
   # nlp = spacy.blank("da")

   # add the NER component from the state-of-the-art model
   nlp.add_pipe("dacy/ner")

   doc = nlp("Denne NER model er trænet af Dan fra Alexandra Instituttet")
   for entity in doc.ents:
       print(entity, ":", entity.label_)
   # Dan : PER
   # Alexandra Instituttet : ORG
Do note that this adds an additional model to your pipeline, which will slow down inference.
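
You can confirm that the component was added using standard SpaCy introspection; a
minimal sketch, assuming the pipeline built above:

.. code-block:: python

   print(nlp.pipe_names)
   # [..., 'dacy/ner']  (the exact list depends on the model you loaded)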

Parts-of-speech Tagging
====================================

.. code-block:: python
@@ -114,7 +142,7 @@ Tagging parts-of-speech


Dependency parsing
^^^^^^^^^^^^^^^^^^^^^^
====================================
DaCy features a fast and accurate syntactic dependency parser. The dependency parse is also
used for sentence segmentation and for detecting noun chunks, as shown in the sketch below.
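
A minimal sketch of how this looks with SpaCy's standard API (the example sentence is
illustrative, and ``nlp`` is assumed to be the pipeline loaded above):

.. code-block:: python

   doc = nlp("DaCy er en dansk pipeline. Den kan også segmentere sætninger.")
   print(list(doc.sents))        # sentences found by the dependency parser
   print(list(doc.noun_chunks))  # noun chunks derived from the parse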

10 changes: 6 additions & 4 deletions pyproject.toml
@@ -29,7 +29,7 @@ keywords = [
]

dependencies = [
"spacy-wrap>=1.0.2,<1.1.0",
"spacy-wrap>=1.2.1,<1.3.0",
"spacy>=3.2.0,<3.5.0",
"pandas>=1.0.0,<2.0.0",
"wasabi>=0.8.2,<0.11.0",
@@ -85,9 +85,11 @@ tutorials = [
"scikit-learn",
]

[project.scripts]
emotion = "dacy.sentiment.wrapped_model:make_emotion_transformer"
hatespeech_classification = "dacy.hate_speech.wrapped_model:make_offensive_transformer"

[project.entry-points.spacy_factories]
emotion = "dacy.sentiment.wrapped_models:make_emotion_transformer"
hatespeech_classification = "dacy.hate_speech.wrapped_models:make_offensive_transformer"
ner = "dacy.ner.wrapped_models"

[build-system]
requires = ["setuptools>=61.0.0", "wheel", "setuptools_scm"]
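
Because the factories are now registered as spacy_factories entry points, SpaCy can
resolve the DaCy components without an explicit import of dacy; the new test added at
the bottom of this commit relies on exactly this. A minimal sketch, assuming dacy is
installed:

import spacy

nlp = spacy.blank("da")
nlp.add_pipe("dacy/ner")  # factory discovered via the spacy_factories entry point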
20 changes: 12 additions & 8 deletions src/dacy/hate_speech/wrapped_models.py
@@ -5,7 +5,10 @@
from spacy.language import Language
from spacy.tokens import Doc
from spacy_transformers.data_classes import FullTransformerBatch
from spacy_wrap import ClassificationTransformer, make_classification_transformer
from spacy_wrap import (
SequenceClassificationTransformer,
make_sequence_classification_transformer,
)
from thinc.api import Config, Model

DEFAULT_CONFIG_STR = """
@@ -19,8 +22,8 @@
@annotation_setters = "spacy-transformers.null_annotation_setter.v1"
[hatespeech_detection.model]
@architectures = "spacy-wrap.ClassificationTransformerModel.v1"
name = "alexandrainst/da-bert-hatespeech-detection"
@architectures = "spacy-wrap.SequenceClassificationTransformerModel.v1"
name = "alexandrainst/da-hatespeech-detection-base"
tokenizer_config = {"use_fast": true}
transformer_config = {}
mixed_precision = false
@@ -42,8 +45,8 @@
@annotation_setters = "spacy-transformers.null_annotation_setter.v1"
[hatespeech_classification.model]
@architectures = "spacy-wrap.ClassificationTransformerModel.v1"
name = "alexandrainst/da-bert-hatespeech-classification"
@architectures = "spacy-wrap.SequenceClassificationTransformerModel.v1"
name = "alexandrainst/da-hatespeech-classification-base"
tokenizer_config = {"use_fast": true}
transformer_config = {}
mixed_precision = false
@@ -62,7 +65,7 @@
Danish.factory(
"dacy/hatespeech_detection",
default_config=DEFAULT_CONFIG["hatespeech_detection"],
)(make_classification_transformer)
)(make_sequence_classification_transformer)


@Danish.factory(
@@ -78,7 +81,7 @@ def make_offensive_transformer(
doc_extension_trf_data: str,
doc_extension_prediction: str,
labels: List[str],
) -> ClassificationTransformer:
) -> SequenceClassificationTransformer:

if not Doc.has_extension("is_offensive"):
warn(
@@ -88,7 +91,7 @@

# TODO: Add a conditional forward such that the model isn't run if the document is not
# emotionally laden
clf_mdl = ClassificationTransformer(
clf_mdl = SequenceClassificationTransformer(
vocab=nlp.vocab,
model=model,
set_extra_annotations=set_extra_annotations,
@@ -97,6 +100,7 @@
labels=labels,
doc_extension_trf_data=doc_extension_trf_data,
doc_extension_prediction=doc_extension_prediction,
assign_to_cats=True,
)

# overwrite extension such that it returns not offensive if the document is not
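
A minimal usage sketch for the renamed wrappers (the is_offensive extension name comes
from the guard in the code above; the example text is purely illustrative):

import dacy  # noqa: F401  (ensures the Danish.factory registrations run)
import spacy

nlp = spacy.blank("da")
nlp.add_pipe("dacy/hatespeech_detection")
doc = nlp("senile gamle idiot")
print(doc._.is_offensive)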
4 changes: 3 additions & 1 deletion src/dacy/load.py
@@ -11,6 +11,7 @@ def load(
model: str,
path: Optional[str] = None,
force_download: bool = False,
**kwargs,
) -> Language:
"""Load a DaCy model as a SpaCy text processing pipeline. If the model is
not downloaded it will also download the model.
@@ -22,6 +23,7 @@
corresponds to the path obtained using dacy.where_is_my_dacy().
force_download (bool, optional): Should the model be redownloaded even if
it is already downloaded? Defaults to False.
kwargs: additional arguments passed to spacy.load()
Returns:
Language: a SpaCy text-preprocessing pipeline
@@ -38,7 +40,7 @@
path = DEFAULT_CACHE_DIR

path = download_model(model, path, force=force_download)
return spacy.load(path)
return spacy.load(path, **kwargs)


def where_is_my_dacy(verbose: bool = True) -> str:
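
The forwarded kwargs make it possible to configure the pipeline at load time, e.g. the
exclude argument used in the updated documentation above. A minimal sketch:

import dacy

# exclude is passed straight through to spacy.load()
nlp = dacy.load("da_dacy_small_trf-0.1.0", exclude=["ner"])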
Empty file added src/dacy/ner/__init__.py
39 changes: 39 additions & 0 deletions src/dacy/ner/wrapped_models.py
@@ -0,0 +1,39 @@
from spacy.lang.da import Danish
from spacy_wrap.pipeline_component_tok_clf import make_token_classification_transformer
from thinc.api import Config

DEFAULT_CONFIG_STR = """
[token_classification_transformer]
max_batch_items = 4096
doc_extension_trf_data = "tok_clf_trf_data"
doc_extension_prediction = "tok_clf_predictions"
predictions_to = null
labels = null
aggregation_strategy = "average"
[token_classification_transformer.set_extra_annotations]
@annotation_setters = "spacy-transformers.null_annotation_setter.v1"
[token_classification_transformer.model]
@architectures = "spacy-wrap.TokenClassificationTransformerModel.v1"
name="saattrupdan/nbailab-base-ner-scandi"
tokenizer_config = {"use_fast": true}
transformer_config = {}
mixed_precision = false
grad_scaler_config = {}
[token_classification_transformer.model.get_spans]
@span_getters = "spacy-transformers.strided_spans.v1"
window = 128
stride = 96
"""


DEFAULT_CONFIG = Config().from_str(DEFAULT_CONFIG_STR)


Danish.factory(
"dacy/ner",
default_config=DEFAULT_CONFIG["token_classification_transformer"],
)(make_token_classification_transformer)
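
Individual settings in the default config can be overridden when the component is added,
via SpaCy's standard config mechanism. A minimal sketch (aggregation_strategy is a key
defined in the config above; "first" is one of the standard Hugging Face strategies):

import spacy

nlp = spacy.blank("da")
nlp.add_pipe("dacy/ner", config={"aggregation_strategy": "first"})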
32 changes: 18 additions & 14 deletions src/dacy/sentiment/wrapped_models.py
@@ -5,7 +5,10 @@
from spacy.language import Language
from spacy.tokens import Doc
from spacy_transformers.data_classes import FullTransformerBatch
from spacy_wrap import ClassificationTransformer, make_classification_transformer
from spacy_wrap import (
SequenceClassificationTransformer,
make_sequence_classification_transformer,
)
from thinc.api import Config, Model

DEFAULT_CONFIG_STR = """
@@ -19,8 +22,8 @@
@annotation_setters = "spacy-transformers.null_annotation_setter.v1"
[subjectivity.model]
@architectures = "spacy-wrap.ClassificationTransformerModel.v1"
name = "alexandrainst/da-bert-tone-subjective-objective"
@architectures = "spacy-wrap.SequenceClassificationTransformerModel.v1"
name = "alexandrainst/da-subjectivivity-classification-base"
tokenizer_config = {"use_fast": true}
transformer_config = {}
mixed_precision = false
@@ -42,8 +45,8 @@
@annotation_setters = "spacy-transformers.null_annotation_setter.v1"
[polarity.model]
@architectures = "spacy-wrap.ClassificationTransformerModel.v1"
name = "alexandrainst/da-bert-tone-sentiment-polarity"
@architectures = "spacy-wrap.SequenceClassificationTransformerModel.v1"
name = "alexandrainst/da-sentiment-base"
tokenizer_config = {"use_fast": true}
transformer_config = {}
mixed_precision = false
@@ -65,8 +68,8 @@
@annotation_setters = "spacy-transformers.null_annotation_setter.v1"
[emotionally_laden.model]
@architectures = "spacy-wrap.ClassificationTransformerModel.v1"
name = "alexandrainst/da-bert-emotion-binary"
@architectures = "spacy-wrap.SequenceClassificationTransformerModel.v1"
name = "alexandrainst/da-binary-emotion-classification-base"
tokenizer_config = {"use_fast": true}
transformer_config = {}
mixed_precision = false
@@ -89,8 +92,8 @@
@annotation_setters = "spacy-transformers.null_annotation_setter.v1"
[emotion.model]
@architectures = "spacy-wrap.ClassificationTransformerModel.v1"
name = "alexandrainst/da-bert-emotion-classification"
@architectures = "spacy-wrap.SequenceClassificationTransformerModel.v1"
name = "alexandrainst/da-emotion-classification-base"
tokenizer_config = {"use_fast": true}
transformer_config = {}
mixed_precision = false
@@ -108,18 +111,18 @@
Danish.factory(
"dacy/subjectivity",
default_config=DEFAULT_CONFIG["subjectivity"],
)(make_classification_transformer)
)(make_sequence_classification_transformer)


Danish.factory(
"dacy/polarity",
default_config=DEFAULT_CONFIG["polarity"],
)(make_classification_transformer)
)(make_sequence_classification_transformer)

Danish.factory(
"dacy/emotionally_laden",
default_config=DEFAULT_CONFIG["emotionally_laden"],
)(make_classification_transformer)
)(make_sequence_classification_transformer)


@Danish.factory(
@@ -135,7 +138,7 @@ def make_emotion_transformer(
doc_extension_trf_data: str,
doc_extension_prediction: str,
labels: List[str],
) -> ClassificationTransformer:
) -> SequenceClassificationTransformer:

if not Doc.has_extension("dacy/emotionally_laden"):
warn(
@@ -145,7 +148,7 @@

# TODO: Add a conditional forward such that the model isn't run if the document is not
# emotionally laden
clf_mdl = ClassificationTransformer(
clf_mdl = SequenceClassificationTransformer(
vocab=nlp.vocab,
model=model,
set_extra_annotations=set_extra_annotations,
@@ -154,6 +157,7 @@
labels=labels,
doc_extension_trf_data=doc_extension_trf_data,
doc_extension_prediction=doc_extension_prediction,
assign_to_cats=True,
)

# overwrite extension such that it returns no emotion if the document does not have
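
A minimal usage sketch for the emotion components (factory names come from the
registrations above; the extension names emotionally_laden and emotion are assumptions
based on the doc_extension_prediction defaults):

import dacy  # noqa: F401  (ensures the Danish.factory registrations run)
import spacy

nlp = spacy.blank("da")
nlp.add_pipe("dacy/emotionally_laden")  # emotionally laden vs. neutral
nlp.add_pipe("dacy/emotion")            # fine-grained emotion, gated on the former
doc = nlp("jeg er så glad for min nye bil")
print(doc._.emotionally_laden, doc._.emotion)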
13 changes: 13 additions & 0 deletions tests/test_ner.py
@@ -0,0 +1,13 @@
import spacy


def test_ner():
nlp = spacy.blank("da")
# or nlp = dacy.load("da_dacy_small_trf-0.1.0", exclude=["ner"])
nlp.add_pipe("dacy/ner")

doc = nlp("Jeg hedder Peter og bor i København")
assert doc.ents[0].text == "Peter"
assert doc.ents[0].label_ == "PER"
assert doc.ents[1].text == "København"
assert doc.ents[1].label_ == "LOC"
