Commit
feat: Added scandi-ner model to dacy
KennethEnevoldsen committed Jan 5, 2023
1 parent a86b0c8 commit a2bec3b
Showing 8 changed files with 124 additions and 32 deletions.
38 changes: 33 additions & 5 deletions docs/using_dacy.getting_started.rst
@@ -54,8 +54,8 @@ Using this we can now apply DaCy to text with conventional SpaCy syntax:
DaCy is built on SpaCy, so you will find most of what you need for working with
the pipeline in SpaCy's very well written `documentation <https://spacy.io>`__.

Tagging named entities
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Named Entity Recognition
====================================
A named entity is a “real-world object” that's assigned a name - for example, a person, a country, a product or a book title.
DaCy can recognize organizations, persons, and locations, as well as other miscellaneous entities.

@@ -77,15 +77,43 @@ We can also plot these using:
.. seealso::


For more on named entity recognition, see SpaCy's `documentation <https://spacy.io/usage/linguistic-features#named-entities>`__.


.. image:: _static/ner.png
:width: 800
:alt: Named entity recognition using DaCy

Tagging parts-of-speech
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Since its release, DaCy has been outperformed on NER by `the state-of-the-art model for NER <https://huggingface.co/saattrupdan/nbailab-base-ner-scandi>`__
by Dan Nielsen. To give users access to the best model for their use case, DaCy lets you easily
swap in this state-of-the-art model as the NER component.
To do this, simply load the model using:

.. code-block:: python

   import dacy
   import spacy

   # load the small DaCy model, excluding the NER component
   nlp = dacy.load("da_dacy_small_trf-0.1.0", exclude=["ner"])
   # or use a blank SpaCy model if you only want to do NER
   # nlp = spacy.blank("da")

   # add the NER component from the state-of-the-art model
   nlp.add_pipe("dacy/ner")

   doc = nlp("Denne NER model er trænet af Dan fra Alexandra Instituttet")
   for entity in doc.ents:
       print(entity, ":", entity.label_)
   # Dan : PER
   # Alexandra Instituttet : ORG
Do note that this adds an additional model to your pipeline, which will slow down inference.
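
You can confirm that the component was added using standard SpaCy introspection; a
minimal sketch, assuming the pipeline built above:

.. code-block:: python

   print(nlp.pipe_names)
   # [..., 'dacy/ner']  (the exact list depends on the model you loaded)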

Parts-of-speech Tagging
====================================

.. code-block:: python
@@ -114,7 +142,7 @@ Tagging parts-of-speech


Dependency parsing
^^^^^^^^^^^^^^^^^^^^^^
====================================
DaCy features a fast and accurate syntactic dependency parser. The dependency parse is also
used for sentence segmentation and for detecting noun chunks, as shown in the sketch below.
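
A minimal sketch of how this looks with SpaCy's standard API (the example sentence is
illustrative, and ``nlp`` is assumed to be the pipeline loaded above):

.. code-block:: python

   doc = nlp("DaCy er en dansk pipeline. Den kan også segmentere sætninger.")
   print(list(doc.sents))        # sentences found by the dependency parser
   print(list(doc.noun_chunks))  # noun chunks derived from the parse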

10 changes: 6 additions & 4 deletions pyproject.toml
@@ -29,7 +29,7 @@ keywords = [
]

dependencies = [
"spacy-wrap>=1.0.2,<1.1.0",
"spacy-wrap>=1.2.1,<1.3.0",
"spacy>=3.2.0,<3.5.0",
"pandas>=1.0.0,<2.0.0",
"wasabi>=0.8.2,<0.11.0",
@@ -85,9 +85,11 @@ tutorials = [
"scikit-learn",
]

[project.scripts]
emotion = "dacy.sentiment.wrapped_model:make_emotion_transformer"
hatespeech_classification = "dacy.hate_speech.wrapped_model:make_offensive_transformer"

[project.entry-points.spacy_factories]
emotion = "dacy.sentiment.wrapped_models:make_emotion_transformer"
hatespeech_classification = "dacy.hate_speech.wrapped_models:make_offensive_transformer"
ner = "dacy.ner.wrapped_models"

[build-system]
requires = ["setuptools>=61.0.0", "wheel", "setuptools_scm"]
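
Because the factories are now registered as spacy_factories entry points, SpaCy can
resolve the DaCy components without an explicit import of dacy; the new test added at
the bottom of this commit relies on exactly this. A minimal sketch, assuming dacy is
installed:

import spacy

nlp = spacy.blank("da")
nlp.add_pipe("dacy/ner")  # factory discovered via the spacy_factories entry point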
20 changes: 12 additions & 8 deletions src/dacy/hate_speech/wrapped_models.py
@@ -5,7 +5,10 @@
from spacy.language import Language
from spacy.tokens import Doc
from spacy_transformers.data_classes import FullTransformerBatch
from spacy_wrap import ClassificationTransformer, make_classification_transformer
from spacy_wrap import (
SequenceClassificationTransformer,
make_sequence_classification_transformer,
)
from thinc.api import Config, Model

DEFAULT_CONFIG_STR = """
@@ -19,8 +22,8 @@
@annotation_setters = "spacy-transformers.null_annotation_setter.v1"
[hatespeech_detection.model]
@architectures = "spacy-wrap.ClassificationTransformerModel.v1"
name = "alexandrainst/da-bert-hatespeech-detection"
@architectures = "spacy-wrap.SequenceClassificationTransformerModel.v1"
name = "alexandrainst/da-hatespeech-detection-base"
tokenizer_config = {"use_fast": true}
transformer_config = {}
mixed_precision = false
@@ -42,8 +45,8 @@
@annotation_setters = "spacy-transformers.null_annotation_setter.v1"
[hatespeech_classification.model]
@architectures = "spacy-wrap.ClassificationTransformerModel.v1"
name = "alexandrainst/da-bert-hatespeech-classification"
@architectures = "spacy-wrap.SequenceClassificationTransformerModel.v1"
name = "alexandrainst/da-hatespeech-classification-base"
tokenizer_config = {"use_fast": true}
transformer_config = {}
mixed_precision = false
@@ -62,7 +65,7 @@
Danish.factory(
"dacy/hatespeech_detection",
default_config=DEFAULT_CONFIG["hatespeech_detection"],
)(make_classification_transformer)
)(make_sequence_classification_transformer)


@Danish.factory(
@@ -78,7 +81,7 @@ def make_offensive_transformer(
doc_extension_trf_data: str,
doc_extension_prediction: str,
labels: List[str],
) -> ClassificationTransformer:
) -> SequenceClassificationTransformer:

if not Doc.has_extension("is_offensive"):
warn(
@@ -88,7 +91,7 @@

# TODO: Add a conditional forward such that the model isn't run if the document is not
# emotionally laden
clf_mdl = ClassificationTransformer(
clf_mdl = SequenceClassificationTransformer(
vocab=nlp.vocab,
model=model,
set_extra_annotations=set_extra_annotations,
@@ -97,6 +100,7 @@
labels=labels,
doc_extension_trf_data=doc_extension_trf_data,
doc_extension_prediction=doc_extension_prediction,
assign_to_cats=True,
)

# overwrite extension such that it returns not offensive if the document is not
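
A minimal usage sketch for the renamed wrappers (the is_offensive extension name comes
from the guard in the code above; the example text is purely illustrative):

import dacy  # noqa: F401  (ensures the Danish.factory registrations run)
import spacy

nlp = spacy.blank("da")
nlp.add_pipe("dacy/hatespeech_detection")
doc = nlp("senile gamle idiot")
print(doc._.is_offensive)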
4 changes: 3 additions & 1 deletion src/dacy/load.py
@@ -11,6 +11,7 @@ def load(
model: str,
path: Optional[str] = None,
force_download: bool = False,
**kwargs,
) -> Language:
"""Load a DaCy model as a SpaCy text processing pipeline. If the model is
not downloaded it will also download the model.
@@ -22,6 +23,7 @@
corresponds to the path obtained using dacy.where_is_my_dacy().
force_download (bool, optional): Should the model be redownloaded even if
it is already downloaded? Defaults to False.
kwargs: additional arguments passed to spacy.load()
Returns:
Language: a SpaCy text-preprocessing pipeline
@@ -38,7 +40,7 @@
path = DEFAULT_CACHE_DIR

path = download_model(model, path, force=force_download)
return spacy.load(path)
return spacy.load(path, **kwargs)


def where_is_my_dacy(verbose: bool = True) -> str:
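
The forwarded kwargs make it possible to configure the pipeline at load time, e.g. the
exclude argument used in the updated documentation above. A minimal sketch:

import dacy

# exclude is passed straight through to spacy.load()
nlp = dacy.load("da_dacy_small_trf-0.1.0", exclude=["ner"])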
Empty file added src/dacy/ner/__init__.py
39 changes: 39 additions & 0 deletions src/dacy/ner/wrapped_models.py
@@ -0,0 +1,39 @@
from spacy.lang.da import Danish
from spacy_wrap.pipeline_component_tok_clf import make_token_classification_transformer
from thinc.api import Config

DEFAULT_CONFIG_STR = """
[token_classification_transformer]
max_batch_items = 4096
doc_extension_trf_data = "tok_clf_trf_data"
doc_extension_prediction = "tok_clf_predictions"
predictions_to = null
labels = null
aggregation_strategy = "average"
[token_classification_transformer.set_extra_annotations]
@annotation_setters = "spacy-transformers.null_annotation_setter.v1"
[token_classification_transformer.model]
@architectures = "spacy-wrap.TokenClassificationTransformerModel.v1"
name="saattrupdan/nbailab-base-ner-scandi"
tokenizer_config = {"use_fast": true}
transformer_config = {}
mixed_precision = false
grad_scaler_config = {}
[token_classification_transformer.model.get_spans]
@span_getters = "spacy-transformers.strided_spans.v1"
window = 128
stride = 96
"""


DEFAULT_CONFIG = Config().from_str(DEFAULT_CONFIG_STR)


Danish.factory(
"dacy/ner",
default_config=DEFAULT_CONFIG["token_classification_transformer"],
)(make_token_classification_transformer)
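
Individual settings in the default config can be overridden when the component is added,
via SpaCy's standard config mechanism. A minimal sketch (aggregation_strategy is a key
defined in the config above; "first" is one of the standard Hugging Face strategies):

import spacy

nlp = spacy.blank("da")
nlp.add_pipe("dacy/ner", config={"aggregation_strategy": "first"})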
32 changes: 18 additions & 14 deletions src/dacy/sentiment/wrapped_models.py
@@ -5,7 +5,10 @@
from spacy.language import Language
from spacy.tokens import Doc
from spacy_transformers.data_classes import FullTransformerBatch
from spacy_wrap import ClassificationTransformer, make_classification_transformer
from spacy_wrap import (
SequenceClassificationTransformer,
make_sequence_classification_transformer,
)
from thinc.api import Config, Model

DEFAULT_CONFIG_STR = """
@@ -19,8 +22,8 @@
@annotation_setters = "spacy-transformers.null_annotation_setter.v1"
[subjectivity.model]
@architectures = "spacy-wrap.ClassificationTransformerModel.v1"
name = "alexandrainst/da-bert-tone-subjective-objective"
@architectures = "spacy-wrap.SequenceClassificationTransformerModel.v1"
name = "alexandrainst/da-subjectivivity-classification-base"
tokenizer_config = {"use_fast": true}
transformer_config = {}
mixed_precision = false
@@ -42,8 +45,8 @@
@annotation_setters = "spacy-transformers.null_annotation_setter.v1"
[polarity.model]
@architectures = "spacy-wrap.ClassificationTransformerModel.v1"
name = "alexandrainst/da-bert-tone-sentiment-polarity"
@architectures = "spacy-wrap.SequenceClassificationTransformerModel.v1"
name = "alexandrainst/da-sentiment-base"
tokenizer_config = {"use_fast": true}
transformer_config = {}
mixed_precision = false
@@ -65,8 +68,8 @@
@annotation_setters = "spacy-transformers.null_annotation_setter.v1"
[emotionally_laden.model]
@architectures = "spacy-wrap.ClassificationTransformerModel.v1"
name = "alexandrainst/da-bert-emotion-binary"
@architectures = "spacy-wrap.SequenceClassificationTransformerModel.v1"
name = "alexandrainst/da-binary-emotion-classification-base"
tokenizer_config = {"use_fast": true}
transformer_config = {}
mixed_precision = false
@@ -89,8 +92,8 @@
@annotation_setters = "spacy-transformers.null_annotation_setter.v1"
[emotion.model]
@architectures = "spacy-wrap.ClassificationTransformerModel.v1"
name = "alexandrainst/da-bert-emotion-classification"
@architectures = "spacy-wrap.SequenceClassificationTransformerModel.v1"
name = "alexandrainst/da-emotion-classification-base"
tokenizer_config = {"use_fast": true}
transformer_config = {}
mixed_precision = false
@@ -108,18 +111,18 @@
Danish.factory(
"dacy/subjectivity",
default_config=DEFAULT_CONFIG["subjectivity"],
)(make_classification_transformer)
)(make_sequence_classification_transformer)


Danish.factory(
"dacy/polarity",
default_config=DEFAULT_CONFIG["polarity"],
)(make_classification_transformer)
)(make_sequence_classification_transformer)

Danish.factory(
"dacy/emotionally_laden",
default_config=DEFAULT_CONFIG["emotionally_laden"],
)(make_classification_transformer)
)(make_sequence_classification_transformer)


@Danish.factory(
@@ -135,7 +138,7 @@ def make_emotion_transformer(
doc_extension_trf_data: str,
doc_extension_prediction: str,
labels: List[str],
) -> ClassificationTransformer:
) -> SequenceClassificationTransformer:

if not Doc.has_extension("dacy/emotionally_laden"):
warn(
@@ -145,7 +148,7 @@

# TODO: Add a conditional forward such that the model isn't run if the document is not
# emotionally laden
clf_mdl = ClassificationTransformer(
clf_mdl = SequenceClassificationTransformer(
vocab=nlp.vocab,
model=model,
set_extra_annotations=set_extra_annotations,
@@ -154,6 +157,7 @@
labels=labels,
doc_extension_trf_data=doc_extension_trf_data,
doc_extension_prediction=doc_extension_prediction,
assign_to_cats=True,
)

# overwrite extension such that it returns no emotion if the document does not have
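
A minimal usage sketch for the emotion components (factory names come from the
registrations above; the extension names emotionally_laden and emotion are assumptions
based on the doc_extension_prediction defaults):

import dacy  # noqa: F401  (ensures the Danish.factory registrations run)
import spacy

nlp = spacy.blank("da")
nlp.add_pipe("dacy/emotionally_laden")  # emotionally laden vs. neutral
nlp.add_pipe("dacy/emotion")            # fine-grained emotion, gated on the former
doc = nlp("jeg er så glad for min nye bil")
print(doc._.emotionally_laden, doc._.emotion)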
13 changes: 13 additions & 0 deletions tests/test_ner.py
@@ -0,0 +1,13 @@
import spacy


def test_ner():
nlp = spacy.blank("da")
# or nlp = dacy.load("da_dacy_small_trf-0.1.0", exclude=["ner"])
nlp.add_pipe("dacy/ner")

doc = nlp("Jeg hedder Peter og bor i København")
assert doc.ents[0].text == "Peter"
assert doc.ents[0].label_ == "PER"
assert doc.ents[1].text == "København"
assert doc.ents[1].label_ == "LOC"
