In [None]:
@Language.component("entity_ruler")
def custom_sentencizer(doc):
    for i, token in enumerate(doc[:-2]):
        # Define sentence start if pipe + titlecase token
        if token.text == "|" and doc[i + 1].is_title:
            doc[i + 1].is_sent_start = True
        else:
            # Explicitly set sentence start to False otherwise, to tell
            # the parser to leave those tokens alone
            doc[i + 1].is_sent_start = False
    return doc

In [None]:
 def to_disk(
        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
    ) -> None:
        """Save the entity ruler patterns to a directory. The patterns will be
        saved as newline-delimited JSON (JSONL).
        path (str / Path): The JSONL file to save.
        DOCS: https://spacy.io/api/entityruler#to_disk
        """
        path = ensure_path(path)
        cfg = {
            "overwrite": self.overwrite,
            "phrase_matcher_attr": self.phrase_matcher_attr,
            "ent_id_sep": self.ent_id_sep,
        }
        serializers = {
            "patterns": lambda p: srsly.write_jsonl(
                p.with_suffix(".jsonl"), self.patterns
            ),
            "cfg": lambda p: srsly.write_json(p, cfg),
        }
        if path.suffix == ".jsonl":  # user wants to save only JSONL
            srsly.write_jsonl(path, self.patterns)
        else:
            to_disk(path, serializers, {})

In [None]:
from spacy.scorer import Scorer
from spacy.tokens import Doc
from spacy.training import Example

# Training data for an entity recognizer (option 2)
doc = nlp("Laura flew to Silicon Valley.")
gold_dict = {"entities": [(0, 5, "PRS"), (14, 28, "LOC")]}
example = Example.from_dict(doc, gold_dict)

print(example.text)
examples = []
examples.append(example)

# Spacy V3 The Language.evaluate method now takes a batch of Example objects instead of tuples of Doc and GoldParse objects.
scores = nlp.evaluate(examples)
print(scores["ents_p"])


# Default scoring pipeline
#scorer = Scorer()

# Provided scoring pipeline
scores = Scorer.score_tokenization(examples)
print(scores)


# Returns A dictionary containing the PRF scores under the keys {attr}_p, {attr}_r, {attr}_f and the per-type PRF scores under {attr}_per_type
spans = Scorer.score_spans(examples, "ents")
print(spans["ents_per_type"])

from spacy import displacy

colors = {
          "SYM": "linear-gradient(90deg, #99154e, #99154e)",
          "NEG": "linear-gradient(90deg, #ffc93c, #ffc93c)"
          }
 

options = {"compact": False, 
           "bg": "#09a3d5",
           "color": "white",
           "font": "Source Sans Pro"}

displacy.render(doc, style="ent", jupyter=True, options=options)

print(example.to_dict())

In [None]:
    ''' 
    Add matcher from dictionaries 

    To create the patterns, each phrase has to be processed with the nlp object. If you have a trained pipeline loaded, doing this in a loop or list comprehension can easily become inefficient and slow. If you only need the tokenization and lexical attributes, you can run nlp.make_doc instead, which will only run the tokenizer. For an additional speed boost, you can also use the nlp.tokenizer.pipe method, which will process the texts as a stream.
    from spacy.matcher import PhraseMatcher
    '''

matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
# Only run nlp.make_doc to speed things up

icd_patterns = [nlp.make_doc(text) for text in ICD]
neg_patterns = [nlp.make_doc(text) for text in NEG]

matcher.add("SYM", icd_patterns)
matcher.add("NEG", neg_patterns)

# This is not what we want to do, as this is not integrated as a pipe in the pipeline and is therfore harder to analyse
matches = matcher(doc)
entities = []
for match_id, start, end in matches:
    span = doc[start:end]
    class_id = nlp.vocab.strings[match_id]
    print(span.text, span.start_char, span.end_char, class_id)
    ent = {"start": span.start_char, "end": span.end_char, "label": class_id}
    entities.append(ent)
print(entities)

'''
 Changed in v3.0
As of spaCy v3.0, PhraseMatcher.add takes a list of patterns as the second argument (instead of a variable number of arguments). The on_match callback becomes an optional keyword argument.

```
patterns = [nlp("health care reform"), nlp("healthcare reform")]
- matcher.add("HEALTH", on_match, *patterns)
+ matcher.add("HEALTH", patterns, on_match=on_match) 
``` 
'''

# ANALYSIS eXPERIMENTS

In [None]:
# https://support.prodi.gy/t/evaluation-of-rule-based-matching/1431
true_positives = guesses.intersection(truth)
false_positives = guesses - truth
false_negatives = truth - guesses

precision = len(true_positives) / len(guesses)
recall = len(true_positives) / len(truth)
fscore = 2 * ((p * r) / (p + r + 1e-100))

When you make your sets, make sure that you’re representing the spans by the start and end offsets with the label, instead of just the text. It’s not so relevant in your case, but it covers you if you do have inputs with multiple annotations that have the same text content. A tuple (start, end, label) will be hashable, so you can store it in a set.

If you’re making the set over a whole dataset, you’ll also want to add in the input hash, to make sure you’re referring to the right examples. All up, it should be as easy as this:

In [None]:
def get_annotations(dataset):
    annotations = set()
    for eg in dataset:
        for span in eg["spans"]:
            annotations.add((span["start"], span["end"], span["label"]))
    return annotations

# DB = connect()
truth = get_annotations(DB.get_dataset(gold_annotations))
guesses =  get_annotations(matcher_output)
scores = spacy.scorer.PRFScore()
scores.score_set(guesses, truth)
print(scores.precision, scores.recall, scores.fscore)

In [None]:
# load an example dataset
from vega_datasets import data
cars = data.cars()

import altair as alt

points = alt.Chart(cars).mark_point().encode(
  x='Year:T',
  y='Miles_per_Gallon',
  color='Origin'
).properties(
  width=800
)

lines = alt.Chart(cars).mark_line().encode(
  x='Year:T',
  y='mean(Miles_per_Gallon)',
  color='Origin'
).properties(
  width=800
).interactive(bind_y=False)
              
points + lines

In [None]:
print("Create a new set:")
x = set()
print(x)
print(type(x))
print("\nCreate a non empty set:")
n = set([0, 1, 2, 3, 4])
print(n)
print(type(n))
print("\nUsing a literal:")
a = {1,2,3,'foo','bar'}
print(type(a))
print(a)

A set is an unordered collection of items. Every set element is unique (no duplicates) and must be immutable (cannot be changed).

However, a set itself is mutable. We can add or remove items from it.

Sets can also be used to perform mathematical set operations like union, intersection, symmetric difference, etc.

In [None]:
# Training data for a part-of-speech tagger
doc = Doc(vocab, words=["I", "like", "stuff"])
gold_dict = {"tags": ["NOUN", "VERB", "NOUN"]}
example = Example.from_dict(doc, gold_dict)

# Training data for an entity recognizer (option 1)
doc = nlp("Laura flew to Silicon Valley.")
gold_dict = {"entities": ["U-PERS", "O", "O", "B-LOC", "L-LOC"]}
example = Example.from_dict(doc, gold_dict)

# Training data for an entity recognizer (option 2)
doc = nlp("Laura flew to Silicon Valley.")
gold_dict = {"entities": [(0, 5, "PERSON"), (14, 28, "LOC")]}
example = Example.from_dict(doc, gold_dict)

# Training data for text categorization
doc = nlp("I'm pretty happy about that!")
gold_dict = {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}
example = Example.from_dict(doc, gold_dict)

# Training data for an Entity Linking component (also requires entities & sentences)
doc = nlp("Russ Cochran his reprints include EC Comics.")
gold_dict = {"entities": [(0, 12, "PERSON")],
             "links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}},
             "sent_starts": [1, -1, -1, -1, -1, -1, -1, -1]}
example = Example.from_dict(doc, gold_dict)

In [None]:
%cd clinical_NLP_SE/
!git pull

In [33]:
!python -m spacy convert --converter ner /content/clinical_NLP_SE/data/raw/corpus/conll2003/chart1.txt/admin.conll /content/clinical_NLP_SE/data/interim

In [None]:
python -m spacy debug-data de /content/clinical_NLP_SE/data/interim/admin.spacy -p ner -b de_core_news_md

In [None]:
!pip install conllu

In [None]:
 from conllu import parse
 from io import open
from conllu import parse_incr

data_file = open("/content/clinical_NLP_SE/data/raw/corpus/mockup-patient-records/chart1.txt/admin.conllu", "r", encoding="utf-8")
for tokenlist in parse_incr(data_file):
    print(tokenlist)

In [None]:
from spacy.tokens import DocBin
from spacy.training import Corpus

doc_bin = DocBin(docs=docs)
doc_bin.to_disk("./data.spacy")
reader = Corpus("./data.spacy")

In [None]:

doc = nlp(/content/clinical_NLP_SE/data/interim/admin.spacy)

In [None]:
!python -m spacy debug-data --help

In [None]:
import pandas as pd

cols = ("space", "text", "lemma", "normalization", "POS", "explain", "stopword", "dep","NE", "sentiment")
rows = []

for t in doc:
      if not t.is_space:
        row = [t.is_space, t.text, t.lemma_, t.norm_, t.pos_, spacy.explain(t.pos_), t.is_stop, t.dep_, t.ent_type_, t.sentiment]
        rows.append(row)

df = pd.DataFrame(rows, columns=cols)
    
df


Unnamed: 0,space,text,lemma,normalization,POS,explain,stopword,dep,NE,sentiment
0,False,data_path,data_path,data_path,NOUN,noun,False,ROOT,,0.0
1,False,/,/,/,SYM,symbol,False,cc,,0.0
2,False,interim,interim,interim,NOUN,noun,False,conj,,0.0
3,False,/,/,/,SYM,symbol,False,cc,,0.0
4,False,admin.spacy,admin.spacy,admin.spacy,NOUN,noun,False,conj,,0.0


In [None]:
!python -m spacy convert /content/clinical_NLP_SE/data/raw/corpus/mockup-patient-records/chart1.txt/admin.conllu /path/to/output/doc.jsonl -c conllu.