In [6]:
import spacy
import random
# from spacy import util
from spacy.tokens import Doc
from spacy.training import Example
# from spacy.language import Language
from pathlib import Path


In [7]:
train_data = [
    ("Type II diabetes are the most common ones.", [(8,16, "DISEASE")]),
    ("In type 2 diabetes, there are primarily two interrelated problems at work.", [(10,18, "DISEASE")]),
    ("You can lower your cholesterol by eating healthily and getting more exercise.", [(19,30, "DISEASE")]),
    ("Treatment for this anemia can include blood transfusions to boost levels of red blood cells.", [(19,25, "DISEASE")]),
    ("When Sebastian Thrun started working on self-driving cars at", [(5,20, "PERSON")]), 
    ("Google in 2007, few people outside of the company took him", [(0, 6, 'ORG'), (10, 13, 'DATE')])
    ]

### Load an existent spacy model

In [8]:
nlp = spacy.load('en_core_web_md')
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [9]:
# Check the entities that are recognized by the ner model
ori_ents = nlp.get_pipe('ner').labels
print(ori_ents)

('CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART')


### Select ner pipeline

In [10]:


# Pipelines in core pretrained model are tagger, parser, ner. Create new if blank model is to be trained using `spacy.blank('en')` else get the existing one.
if "ner" not in nlp.pipe_names:
    ner = nlp.create_pipe("ner") # Creating the pipeline component
    nlp.add_pipe(ner)
else:
    ner = nlp.get_pipe("ner") # Getting the pipeline component

### Add the new entity
add_ents = ['DISEASE'] # The new entity
ori_ents = nlp.get_pipe('ner').labels # All the existing entities recognised by the model
print('[Existing Entities] = ', nlp.get_pipe('ner').labels)
for ent in add_ents:
    ner.add_label(ent)
new_ents = nlp.get_pipe('ner').labels
print('\n[All Entities] = ', nlp.get_pipe('ner').labels)

[Existing Entities] =  ('CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART')

[All Entities] =  ('CARDINAL', 'DATE', 'DISEASE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART')


### Train the model

In [11]:
# DISABLE THE OTHER PIPES THAT DONT NEED TO CHANGE
disabled_pipes = []
for pipe_name in nlp.pipe_names:
    if pipe_name != 'ner':
        nlp.disable_pipes(pipe_name)
        disabled_pipes.append(pipe_name)

# train ner
print("  Starting the Training ...")
optimizer = nlp.create_optimizer()
for _ in range(25):
    random.shuffle(train_data)
    for raw_text, entity_offsets in train_data:
        doc = nlp.make_doc(raw_text)
        example = Example.from_dict(doc, {"entities": entity_offsets})
        nlp.update([example], sgd=optimizer)


# Enable all previously disabled pipe components
for pipe_name in disabled_pipes:
    nlp.enable_pipe(pipe_name)



  Starting the Training ...




In [12]:
### function to print the entities
def print_doc_entities(_doc: Doc):
    if _doc.ents:
        for _ent in _doc.ents:
            print(f"     {_ent.text} {_ent.label_}")
    else:
        print("     NONE")

### Predict on new texts

In [13]:

# Result after training for cholesterol
print(f"Result AFTER training for cholesterol:")
doc = nlp(u'cholesterol')
print_doc_entities(doc)

# Result after training for diabetes
print(f"Result AFTER training  for diabetes:")
doc = nlp(u'The main difference between the two types of diabetes is that type 1 diabetes is a genetic disorder that often shows up early in life, and type 2 is largely diet-related and develops over time. ')
print_doc_entities(doc)

# Result after training for DATE
print(f"Result AFTER training  for DATE:")
doc = nlp(u'Google in 2007')
print_doc_entities(doc)

Result AFTER training for cholesterol:
     cholesterol DISEASE
Result AFTER training  for diabetes:
     diabetes DISEASE
     diabetes DISEASE
Result AFTER training  for DATE:
     Google ORG
     2007 DATE


### Save the model

In [14]:
### Save the model
# Save the  model to directory

output_dir = Path('/content/')
nlp.to_disk(output_dir)
print("Saved model to", output_dir)



Saved model to \content


### Upload the saved model

In [18]:
# Load the saved model and predict
print("Loading from", output_dir)
nlp_updated = spacy.load(output_dir)
doc = nlp_updated("Several signs and symptoms occur in all types of anemia, such as fatigue, shortness of breath and feeling cold." )
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

Loading from \content
Entities [('anemia', 'DISEASE')]


In [19]:
doc = nlp_updated("The fiber and potassium in bananas can help lower cholesterol and blood pressure. Coronavirus disease 2019 (COVID-19) dominated 2020. This is a look back at how the pandemic evolved and progressed through the year, which closed with the arrival of vaccines, but also continued challenges." )
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

Entities [('cholesterol', 'DISEASE'), ('Coronavirus', 'ORG'), ('2019', 'DISEASE'), ('COVID-19', 'ORG'), ('2020', 'DATE')]


In [20]:
print(nlp_updated.pipe_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [21]:
print(nlp_updated.get_pipe('ner').labels)

('CARDINAL', 'DATE', 'DISEASE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART')


In [22]:
nlp_updated = spacy.load(output_dir)
from spacy.training import biluo_tags_to_spans
from spacy.language import Language

@Language.component("custom_ner_wrapper")
def custom_ner_wrapper(doc):
    words = [token.text for token in doc]
    custom_entities = nlp_updated(words)
    doc.ents = biluo_tags_to_spans(doc, custom_entities)
    return doc

In [23]:
a = custom_ner_wrapper(doc)

ValueError: [E866] Expected a string or 'Doc' as input, but got: <class 'list'>.