## spaCy Model Customization

### Create Custom Text Categorizer

Documentation: https://spacy.io/api/textcategorizer

In [3]:
#typically a couple hundred examples are required for accurate results 
train_data = [
    (u"Malta", {'cats': {'EUROPEAN': 1.0, 'NA': 0.0}}),
    (u"Crete", {'cats': {'EUROPEAN': 1.0, 'NA': 0.0}}),
    (u"USA", {'cats': {'EUROPEAN': 0.0, 'NA': 1.0}}),
    (u"British", {'cats': {'EUROPEAN': 1.0, 'NA': 0.0}}),
    (u"Brazil", {'cats': {'EUROPEAN': 0.0, 'NA': 1.0}}),
    (u"Canadian", {'cats': {'EUROPEAN': 0.0, 'NA': 1.0}}),
    (u"Sweden", {'cats': {'EUROPEAN': 1.0, 'NA': 0.0}})]

In [5]:
import spacy
import plac
import random
from pathlib import Path

#create blank nlp model in english
nlp = spacy.blank('en')

#initialize nlp pipeline
textcat = nlp.create_pipe('textcat')
nlp.add_pipe(textcat, last=True)

#add custom categories
textcat.add_label('EUROPEAN')
textcat.add_label('NA')

#begin training
nlp.vocab.vectors.name = 'spacy_pretrained_vectors'
optimizer = nlp.begin_training()
for itn in range(10):
    for doc, gold in train_data:
        nlp.update([doc], [gold], sgd=optimizer)
doc = nlp(u'United States')
print(doc.cats)
doc = nlp(u'Britain')
print(doc.cats)

#output model to be used later 
output_dir = r'file path'
output_dir = Path(output_dir) 
if not output_dir.exists():
    output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

{'EUROPEAN': 0.8027938604354858, 'NA': 0.13836686313152313}
{'EUROPEAN': 0.5758070945739746, 'NA': 0.6145948171615601}


In [None]:
#load the saved model
output_dir = r'file path'
print("Loading from", output_dir)
nlp = spacy.load(output_dir)

#test the model
doc = nlp(u'Italy')
print(doc.cats)

### Create Custom Entity Recognition & Update Pre-Existing Entities

In [11]:
import spacy
import random
from pathlib import Path

# new entity label
LABEL = 'ROLL_TIDE'

TRAIN_DATA = [
    ("Feb", {
        'entities': [(0, 2, 'DATE')]
    }),

    ("15/16E", {
        'entities': [(0, 5, 'DATE')]
    }),

    ("Apr 20", {
        'entities': [(0, 6, 'DATE')]
    }),
   
    ("Nick Saban", {
        'entities': [(0, 10, 'ROLL_TIDE')]
    }),
    ("Crimson Tide", {
        'entities': [(0, 12, 'ROLL_TIDE')]
    }),

    ("Big Al", {
        'entities': [(0, 6, 'ROLL_TIDE')]
    }),
    ("Tua 4 Heisman", {
        'entities': [(0, 13, 'ROLL_TIDE')]
    })]

In [12]:
def main(model=None, new_model_name='ROLL_TIDE', output_dir=r'file_path', n_iter=100):
    
    """Set up the pipeline and entity recognizer, and train the new entity."""
    if model is not None:
        #loads pre-trained spacy en model - other options available here: https://spacy.io/usage/models
        #nlp = spacy.load('en_core_web_sm') ~ set model in function to the string value
        
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:        
        # create blank Language class
        nlp = spacy.blank('en') 
        print("Created'en' model")
        
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')
    
    # add new entity label to entity recognizer
    ner.add_label(LABEL)   
    
    #prepares model for training
    if model is None:
        nlp.vocab.vectors.name = 'spacy_pretrained_vectors'
        optimizer = nlp.begin_training()
    else:
        # Note that 'begin_training' initializes the models, so it'll zero out
        # existing entity types.
        optimizer = nlp.entity.create_optimizer()

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
                           losses=losses)
            print(losses)

    # test the trained model
    test_text = "Tua will win the heisman for Nick Saban"
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)
        
    return nlp

In [13]:
bama_nlp = main()

Created blank 'en' model
{'ner': 15.334686279296875}
{'ner': 12.651902079582477}
{'ner': 7.685541325721033}
{'ner': 9.873688644726956}
{'ner': 4.876519405549766}
{'ner': 8.090691986142303}
{'ner': 3.625201584482973}
{'ner': 3.729167474906981}
{'ner': 1.998796757055175}
{'ner': 3.689720597178588}
{'ner': 3.245556431308616}
{'ner': 5.1487211471227035}
{'ner': 2.5634499418059926}
{'ner': 1.8287298862904928}
{'ner': 1.5394464414217934}
{'ner': 1.926327181039602}
{'ner': 5.9955483556850275}
{'ner': 1.4431897458790681}
{'ner': 1.4667095744657224}
{'ner': 2.874917328417063}
{'ner': 0.5000000151990281}
{'ner': 6.506236419112741e-06}
{'ner': 1.4483597933900536}
{'ner': 3.3520205829331924}
{'ner': 0.9349699296397354}
{'ner': 4.331620216369806}
{'ner': 0.9444444180519406}
{'ner': 1.8620462580376957}
{'ner': 0.9375544141773794}
{'ner': 3.269705060213063}
{'ner': 2.876355238258844}
{'ner': 2.862998170264444}
{'ner': 2.78917238676702}
{'ner': 1.1039174944177994}
{'ner': 0.9182705085061468}
{'ner': 1