In [90]:
import spacy
import pickle 
import random
from tqdm.notebook import tqdm
from spacy.util import minibatch, compounding
from spacy.training.example import Example
from spacy.tokens import DocBin

### Train NER model on Data of 200 Resume
Data Source: https://github.com/laxmimerit/Resume-and-CV-Summarization-and-Parsing-with-Spacy-in-Python/

In [6]:
!ls ../data/

train_data.pkl


In [8]:
TRAIN_DATA = pickle.load(open("../data/train_data.pkl","rb"))
type(TRAIN_DATA)

list

In [9]:
len(TRAIN_DATA)

200

In [100]:
nlp = spacy.blank("en") # load a new spacy model

In [99]:
?doc.spans

In [101]:
db = DocBin() # create a DocBin object

for text, annot in tqdm(TRAIN_DATA): # data in previous format
    doc = nlp.make_doc(text) # create doc object from text
    ents = []
    for start, end, label in annot["entities"]: # add character indexes
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents # label the text with the ents
    db.add(doc)

db.to_disk("../data/train.spacy")

  0%|          | 0/200 [00:00<?, ?it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity


ValueError: [E1010] Unable to set entity information for token 226 which is included in more than one span in entities, blocked, missing or outside.

### Initialise blank ner model, and add entities from training data to it

In [23]:
ner_model = spacy.blank("en")
ner_pipe = ner_model.add_pipe("ner",last=True)

In [24]:
for text,annotation in tran_data_org:
    for entity in annotation["entities"]:
        ner_pipe.add_label(entity[2])

In [44]:
TRAIN_DATA = tran_data_org.copy()

In [84]:
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))

In [89]:
for batch in batches:
    for text, annotations in batch:
        doc = ner_model.make_doc(text)
        example = Example.from_dict(doc, annotations)

ValueError: [E103] Trying to set conflicting doc.ents: '(2128, 2144, 'Companies worked at')' and '(2116, 2143, 'College Name')'. A token can only be part of one entity, so make sure the entities you're setting don't overlap. To work with overlapping entities, consider using doc.spans instead.

In [87]:
annotations

({'entities': [(1894, 2173, 'Skills'),
   (1726, 1851, 'Skills'),
   (1711, 1716, 'Graduation Year'),
   (1643, 1678, 'College Name'),
   (1610, 1642, 'Degree'),
   (1433, 1468, 'College Name'),
   (1385, 1390, 'Graduation Year'),
   (1359, 1363, 'Location'),
   (1327, 1356, 'College Name'),
   (1276, 1325, 'Degree'),
   (269, 274, 'Graduation Year'),
   (242, 246, 'Location'),
   (204, 238, 'Companies worked at'),
   (181, 202, 'Designation'),
   (145, 149, 'Location'),
   (33, 37, 'Location'),
   (17, 31, 'Designation'),
   (0, 16, 'Name')]},
 {'entities': [(1520, 1524, 'Graduation Year'),
   (1507, 1518, 'Degree'),
   (1496, 1500, 'Graduation Year'),
   (1488, 1492, 'Graduation Year'),
   (1440, 1463, 'College Name'),
   (1415, 1439, 'Degree'),
   (1409, 1413, 'Graduation Year'),
   (1334, 1366, 'College Name'),
   (1291, 1333, 'Degree'),
   (917, 940, 'Skills'),
   (413, 417, 'Graduation Year'),
   (405, 409, 'Graduation Year'),
   (356, 368, 'Skills'),
   (313, 318, 'Companies wor

In [77]:
def train_ner(model,TRAIN_DATA,n_iter=20):
    model.begin_training()
    for itn in tqdm(range(n_iter)):
        random.shuffle(TRAIN_DATA)
        losses = {}
        # batch up the examples using spaCy's minibatch
        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            doc = model.make_doc(text)
            example = Example.from_dict(doc, annotations)
            model.update(
                example,
                drop=0.5,  # dropout - make it harder to memorise data
                losses=losses,
            )
        print(f"For itr {itn}, Losses", losses)

In [69]:
?spacy.training.example

In [78]:
train_ner(ner_model,tran_data_org.copy())

  0%|          | 0/20 [00:00<?, ?it/s]

TypeError: Argument 'example_dict' has incorrect type (expected dict, got tuple)

In [62]:
?ner_model.update

In [30]:
def main(model=None, output_dir=None, n_iter=100):
    """Load the model, set up the pipeline and train the entity recognizer."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    # only train NER
    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        # show warnings for misaligned entity spans once
        warnings.filterwarnings("once", category=UserWarning, module='spacy')

        # reset and initialize the weights randomly – but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)

    # test the trained model
    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        for text, _ in TRAIN_DATA:
            doc = nlp2(text)
            print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
            print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])


False

In [None]:
import spacy_streamlit
import typer


def main(models: str, default_text: str):
    models = [name.strip() for name in models.split(",")]
    spacy_streamlit.visualize(models, default_text, visualizers=["ner"])