In [None]:
# Load the pre-existing spaCy model to build on, then fetch its NER
# pipeline component through the get_pipe() method.
import spacy

# Pretrained pipeline that will be updated with the new label
nlp = spacy.load("en_core_web_sm")

# Handle to the named-entity-recognition component of that pipeline
ner = nlp.get_pipe('ner')

The format of the training data is a list of tuples. Each tuple contains the example text and a dictionary. The dictionary has the key `entities`, which stores a list of (start index, end index, label) tuples, one for each entity present in the text.


For example, to pass “Pizza is a common fast food” as a training example, the format will be: ("Pizza is a common fast food",{"entities" : [(0, 5, "FOOD")]})

In [None]:
# New label to add
LABEL = "FOOD"

# Training examples in the required format: (text, {"entities": [(start, end, label), ...]}).
# start/end are character offsets into the text (end is exclusive) and must
# line up exactly with the annotated word(s), e.g. text[0:5] == "Pizza".
TRAIN_DATA =[ ("Pizza is a common fast food.", {"entities": [(0, 5, "FOOD")]}),
              ("Pasta is an italian recipe", {"entities": [(0, 5, "FOOD")]}),
              # "noodles" occupies offsets 8-15; (8, 14) would cut it to "noodle"
              ("China's noodles are very famous", {"entities": [(8, 15, "FOOD")]}),
              ("Shrimps are famous in China too", {"entities": [(0, 7, "FOOD")]}),
              ("Lasagna is another classic of Italy", {"entities": [(0, 7, "FOOD")]}),
              ("Sushi is extemely famous and expensive Japanese dish", {"entities": [(0, 5, "FOOD")]}),
              ("Unagi is a famous seafood of Japan", {"entities": [(0, 5, "FOOD")]}),
              # Both dishes named in the sentence are annotated ("Soba" is at 10-14)
              ("Tempura , Soba are other famous dishes of Japan", {"entities": [(0, 7, "FOOD"), (10, 14, "FOOD")]}),
              # Udon is a dish, so it carries the new FOOD label (was mislabelled "ORG")
              ("Udon is a healthy type of noodles", {"entities": [(0, 4, "FOOD")]}),
              ("Chocolate soufflé is extremely famous french cuisine", {"entities": [(0, 17, "FOOD")]}),
              ("Flamiche is french pastry", {"entities": [(0, 8, "FOOD")]}),
              # NOTE(review): this example appears twice - confirm the duplicate is intentional
              ("Burgers are the most commonly consumed fastfood", {"entities": [(0, 7, "FOOD")]}),
              ("Burgers are the most commonly consumed fastfood", {"entities": [(0, 7, "FOOD")]}),
              ("Frenchfries are considered too oily", {"entities": [(0, 11, "FOOD")]})
           ]

In [None]:
# Register the new entity label on the ner component
ner.add_label(LABEL)

# Resume training; the returned optimizer is handed to nlp.update() later on
optimizer = nlp.resume_training()
move_names = list(ner.move_names)

# Pipes that should be trained
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]

# Every remaining pipe in the pipeline is left untouched during training
other_pipes = [
    name
    for name in nlp.pipe_names
    if name not in pipe_exceptions
]

For each iteration, the model (here, the ner component) is updated through the nlp.update() command. The parameters of nlp.update() are:

- docs : This expects a batch of texts as input. Unpacking each batch with zip(*batch) separates it into the texts and their annotations.

- sgd : You have to pass the optimizer that was returned by resume_training() here.

- golds : Pass the annotations obtained from the zip call here.

- drop : This represents the dropout rate.

- losses: A dictionary to hold the losses against each pipeline component. Create an empty dictionary and pass it here.




In [None]:
# Importing requirements
from spacy.util import minibatch, compounding
import random

# Begin training by disabling other pipeline components, so that only the
# pipes listed in pipe_exceptions are affected by the updates
with nlp.disable_pipes(*other_pipes) :

  # Batch-size schedule, created once and shared by every iteration
  sizes = compounding(1.0, 4.0, 1.001)
  # Training for 30 iterations
  for itn in range(30):
    # shuffle examples before training
    random.shuffle(TRAIN_DATA)
    # batch up the examples using spaCy's minibatch
    batches = minibatch(TRAIN_DATA, size=sizes)
    # Dictionary to store losses; recreated each iteration so the totals
    # do not carry over from the previous pass
    losses = {}
    for batch in batches:
      # Split the batch of (text, annotation) pairs into two parallel tuples
      texts, annotations = zip(*batch)
      # Calling update() over the iteration
      nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
    # Report once per iteration: losses accumulates over the batches, so
    # printing inside the batch loop would only show running partial sums
    print("Losses", losses)

Losses {'ner': 10.305896391102578}
Losses {'ner': 15.879611501062755}
Losses {'ner': 22.412719465258622}
Losses {'ner': 26.601724364264417}
Losses {'ner': 36.18511471099268}
Losses {'ner': 40.138153814711444}
Losses {'ner': 43.623866257155825}
Losses {'ner': 52.20846163936942}
Losses {'ner': 56.53697416909692}
Losses {'ner': 60.478709640858355}
Losses {'ner': 65.12382746411241}
Losses {'ner': 73.78020515019684}
Losses {'ner': 76.73062296290789}
Losses {'ner': 79.79987325598393}
Losses {'ner': 6.229953083391138}
Losses {'ner': 12.403696040804334}
Losses {'ner': 18.41340605467076}
Losses {'ner': 21.374699432879567}
Losses {'ner': 26.974140422509805}
Losses {'ner': 32.38149624420126}
Losses {'ner': 35.26578252364636}
Losses {'ner': 39.089769519254624}
Losses {'ner': 43.62643986227369}
Losses {'ner': 51.38964396514377}
Losses {'ner': 55.76489518777846}
Losses {'ner': 60.985744701786025}
Losses {'ner': 69.13893448011396}
Losses {'ner': 71.58105947761243}
Losses {'ner': 3.2458255933597684}
L

In [None]:
# Testing the NER: run the pipeline on a sample sentence and list
# every entity it detects, one per line

test_text = "I ate Sushi yesterday. Maggi is a common fast food "
doc = nlp(test_text)
print("Entities in '%s'" % test_text)
for entity in doc.ents:
  print(entity)

Entities in 'I ate Sushi yesterday. Maggi is a common fast food '
I
Sushi


In [None]:
from spacy import displacy
# Visualise the entities found in `doc` inline in the notebook
# (style='ent' selects the entity view; jupyter=True requests notebook rendering)
displacy.render(doc,style='ent',jupyter=True)