In [4]:
import spacy
from spacy.tokens import Token, Doc
from spacy import displacy
from app.train import main
from app.classifiy.main import HealthTokenizer

nlp = main.load_spacy()


Loading SPACY with model=nb_core_news_sm from file=combined.jsonl
Loading EntityRuler with patterns from file=combined.jsonl
Loading custom patterns of size=18
Finished loading SPACY


In [27]:
class HealthTokenizer:
    def __init__(self, doc: Doc):
        super().__init__()
        self.doc = doc

    def tokenize(self):
        for token in self.doc:
            self.handle_token(token)


    def handle_token(self, token: Token):
        if token.ent_iob:
            """IOB code of named entity tag. 
            3 means the token begins an entity, 
            2 means it is outside an entity, 
            1 means it is inside an entity, and 
            0 means no entity tag is set."""
            print(f"{token} is ENTITY {token.ent_iob} --> {token.ent_kb_id_}")

In [35]:
doc = nlp("BT: 180 / 80 mmHg. Puls : 44 . Redd for å dø")
displacy.render(doc, style="ent") # dep | ent
ht = HealthTokenizer(doc)
ht.tokenize()



BT is ENTITY 3 --> 
: is ENTITY 2 --> 
180 is ENTITY 2 --> 
/ is ENTITY 2 --> 
80 is ENTITY 2 --> 
mmHg is ENTITY 3 --> 
. is ENTITY 2 --> 
Puls is ENTITY 3 --> 
: is ENTITY 2 --> 
44 is ENTITY 2 --> 
. is ENTITY 2 --> 
Redd is ENTITY 2 --> 
for is ENTITY 2 --> 
å is ENTITY 3 --> 
dø is ENTITY 1 --> 


In [36]:
for token in doc:
    print(f"{token} -->  {token.like_num} {token.pos_}")

BT -->  False PROPN
: -->  False PUNCT
180 -->  True NUM
/ -->  False SYM
80 -->  True NUM
mmHg -->  False NOUN
. -->  False PUNCT
Puls -->  False NOUN
: -->  False PUNCT
44 -->  True NUM
. -->  False PUNCT
Redd -->  False ADJ
for -->  False ADP
å -->  False PART
dø -->  False VERB


In [12]:
for token in doc:
    print(token.text, token.dep_, token.head.text, token.head.pos_,
            [child for child in token.children])

BT ROOT BT PROPN [:]
: punct BT PROPN []
180/80 nummod mmHg NOUN []
mmHg ROOT mmHg NOUN [180/80, .]
. punct mmHg NOUN []
Puls ROOT Puls NOUN [:]
: punct Puls NOUN []
44 ROOT 44 NUM []


In [13]:
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

BT 0 2 OBSERVATION
mmHg 11 15 UNIT
Puls 17 21 OBSERVATION
