Add spans in spacy benchmark (explosion#12575)

* Add spans in spacy benchmark

  The current implementation of `spacy benchmark accuracy` / `spacy evaluate` doesn't handle the "spans" type, so calling the command doesn't render the displaCy HTML file for spans. This PR fixes that by adding a new `spans` parameter and rendering with the appropriate displaCy style.
* Reformat file with black
* Add tests for evaluate
* Fix spans -> span for displacy style
* Update test to check render instead
* Update source so mypy passes
* Add parser information to avoid warnings
adrianeboyd · May 12, 2023 · 9ec12fc · 9ec12fc
1 parent 139368d
commit 9ec12fc
Show file tree

Hide file tree

Showing 2 changed files with 75 additions and 0 deletions.
diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py
@@ -122,13 +122,16 @@ def evaluate(
         docs = list(nlp.pipe(ex.reference.text for ex in dev_dataset[:displacy_limit]))
         render_deps = "parser" in factory_names
         render_ents = "ner" in factory_names
+        render_spans = "spancat" in factory_names
+
         render_parses(
             docs,
             displacy_path,
             model_name=model,
             limit=displacy_limit,
             deps=render_deps,
             ents=render_ents,
+            spans=render_spans,
         )
         msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path)
 
@@ -182,6 +185,7 @@ def render_parses(
     limit: int = 250,
     deps: bool = True,
     ents: bool = True,
+    spans: bool = True,
 ):
     docs[0].user_data["title"] = model_name
     if ents:
@@ -195,6 +199,11 @@ def render_parses(
         with (output_path / "parses.html").open("w", encoding="utf8") as file_:
             file_.write(html)
 
+    if spans:
+        html = displacy.render(docs[:limit], style="span", page=True)
+        with (output_path / "spans.html").open("w", encoding="utf8") as file_:
+            file_.write(html)
+
 
 def print_prf_per_type(
     msg: Printer, scores: Dict[str, Dict[str, float]], name: str, type: str

diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
@@ -12,6 +12,7 @@
 from click import NoSuchOption
 from packaging.specifiers import SpecifierSet
 from thinc.api import Config, ConfigValidationError
+from spacy.tokens import DocBin
 
 from spacy import about
 from spacy.cli import info
@@ -27,6 +28,7 @@
 from spacy.cli.debug_data import _print_span_characteristics
 from spacy.cli.debug_data import _get_spans_length_freq_dist
 from spacy.cli.download import get_compatibility, get_version
+from spacy.cli.evaluate import render_parses
 from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
 from spacy.cli.init_pipeline import _init_labels
 from spacy.cli.package import get_third_party_dependencies
@@ -144,6 +146,70 @@ def test_issue11235():
     assert cfg["commands"][0]["script"][0] == f"hello {lang_var}"
 
 
+@pytest.mark.issue(12566)
+@pytest.mark.parametrize(
+    "factory,output_file",
+    [("deps", "parses.html"), ("ents", "entities.html"), ("spans", "spans.html")],
+)
+def test_issue12566(factory: str, output_file: str):
+    """
+    Test that render_parses writes an HTML file for each displaCy type (deps, ents, spans)
+    """
+    with make_tempdir() as tmp_dir:
+        # Sample document in spaCy's JSON format (ents, spans, tokens with parse info)
+        doc_json = {
+            "ents": [
+                {"end": 54, "label": "nam_adj_country", "start": 44},
+                {"end": 83, "label": "nam_liv_person", "start": 69},
+                {"end": 100, "label": "nam_pro_title_book", "start": 86},
+            ],
+            "spans": {
+                "sc": [
+                    {"end": 54, "kb_id": "", "label": "nam_adj_country", "start": 44},
+                    {"end": 83, "kb_id": "", "label": "nam_liv_person", "start": 69},
+                    {
+                        "end": 100,
+                        "kb_id": "",
+                        "label": "nam_pro_title_book",
+                        "start": 86,
+                    },
+                ]
+            },
+            "text": "Niedawno czytał em nową książkę znakomitego szkockiego medioznawcy , "
+            "Briana McNaira - Cultural Chaos .",
+            "tokens": [
+                # fmt: off
+                {"id": 0, "start": 0, "end": 8, "tag": "ADV", "pos": "ADV", "morph": "Degree=Pos", "lemma": "niedawno", "dep": "advmod", "head": 1, },
+                {"id": 1, "start": 9, "end": 15, "tag": "PRAET", "pos": "VERB", "morph": "Animacy=Hum|Aspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act", "lemma": "czytać", "dep": "ROOT", "head": 1, },
+                {"id": 2, "start": 16, "end": 18, "tag": "AGLT", "pos": "NOUN", "morph": "Animacy=Inan|Case=Ins|Gender=Masc|Number=Sing", "lemma": "em", "dep": "iobj", "head": 1, },
+                {"id": 3, "start": 19, "end": 23, "tag": "ADJ", "pos": "ADJ", "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Sing", "lemma": "nowy", "dep": "amod", "head": 4, },
+                {"id": 4, "start": 24, "end": 31, "tag": "SUBST", "pos": "NOUN", "morph": "Case=Acc|Gender=Fem|Number=Sing", "lemma": "książka", "dep": "obj", "head": 1, },
+                {"id": 5, "start": 32, "end": 43, "tag": "ADJ", "pos": "ADJ", "morph": "Animacy=Nhum|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing", "lemma": "znakomit", "dep": "acl", "head": 4, },
+                {"id": 6, "start": 44, "end": 54, "tag": "ADJ", "pos": "ADJ", "morph": "Animacy=Hum|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing", "lemma": "szkockiy", "dep": "amod", "head": 7, },
+                {"id": 7, "start": 55, "end": 66, "tag": "SUBST", "pos": "NOUN", "morph": "Animacy=Hum|Case=Gen|Gender=Masc|Number=Sing", "lemma": "medioznawca", "dep": "iobj", "head": 5, },
+                {"id": 8, "start": 67, "end": 68, "tag": "INTERP", "pos": "PUNCT", "morph": "PunctType=Comm", "lemma": ",", "dep": "punct", "head": 9, },
+                {"id": 9, "start": 69, "end": 75, "tag": "SUBST", "pos": "PROPN", "morph": "Animacy=Hum|Case=Gen|Gender=Masc|Number=Sing", "lemma": "Brian", "dep": "nmod", "head": 4, },
+                {"id": 10, "start": 76, "end": 83, "tag": "SUBST", "pos": "PROPN", "morph": "Animacy=Hum|Case=Gen|Gender=Masc|Number=Sing", "lemma": "McNair", "dep": "flat", "head": 9, },
+                {"id": 11, "start": 84, "end": 85, "tag": "INTERP", "pos": "PUNCT", "morph": "PunctType=Dash", "lemma": "-", "dep": "punct", "head": 12, },
+                {"id": 12, "start": 86, "end": 94, "tag": "SUBST", "pos": "PROPN", "morph": "Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing", "lemma": "Cultural", "dep": "conj", "head": 4, },
+                {"id": 13, "start": 95, "end": 100, "tag": "SUBST", "pos": "NOUN", "morph": "Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing", "lemma": "Chaos", "dep": "flat", "head": 12, },
+                {"id": 14, "start": 101, "end": 102, "tag": "INTERP", "pos": "PUNCT", "morph": "PunctType=Peri", "lemma": ".", "dep": "punct", "head": 1, },
+                # fmt: on
+            ],
+        }
+
+        # Build a Doc from the JSON above using a blank Polish pipeline's vocab
+        nlp = spacy.blank("pl")
+        doc = Doc(nlp.vocab).from_json(doc_json)
+
+        # Call render_parses directly and check that the expected HTML file exists
+        render_parses(
+            docs=[doc], output_path=tmp_dir, model_name="", limit=1, **{factory: True}
+        )
+
+        assert (tmp_dir / output_file).is_file()
+
+
 def test_cli_info():
     nlp = Dutch()
     nlp.add_pipe("textcat")