# POS Tagging

In [1]:
import spacy
# Load the pipeline registered under the name 'en'; the resulting `nlp`
# object is reused by the tagging cells further down in this notebook.
# NOTE(review): 'en' looks like a spaCy shortcut link rather than a full
# package name -- confirm it resolves with the installed spaCy version.
nlp = spacy.load('en')

In [3]:
# Run the loaded pipeline over four example sentences, in order, and bind each
# parsed result to its own name. "refuse" and "fish" occur several times so
# the tags assigned to the same surface form can be compared between contexts.
sent_0, sent_1, sent_2, sent_3 = (
    nlp(sentence)
    for sentence in (
        u'Mathieu and I went to the park.',
        u'If Clement was asked to take out the garbage, he would refuse.',
        u'Baptiste was in charge of the refuse treatment center.',
        u'Marie took out her rather suspicious and fishy cat to go fish for fish.',
    )
)

In [5]:
# One line per token of sent_0: (surface text, pos_ label, tag_ label).
for token in sent_0:
    features = token.text, token.pos_, token.tag_
    print(features)

('Mathieu', 'PROPN', 'NNP')
('and', 'CCONJ', 'CC')
('I', 'PRON', 'PRP')
('went', 'VERB', 'VBD')
('to', 'ADP', 'IN')
('the', 'DET', 'DT')
('park', 'NOUN', 'NN')
('.', 'PUNCT', '.')


In [4]:
# One line per token of sent_1: (surface text, pos_ label, tag_ label).
for token in sent_1:
    features = token.text, token.pos_, token.tag_
    print(features)

('If', 'ADP', 'IN')
('Clement', 'PROPN', 'NNP')
('was', 'VERB', 'VBD')
('asked', 'VERB', 'VBN')
('to', 'PART', 'TO')
('take', 'VERB', 'VB')
('out', 'PART', 'RP')
('the', 'DET', 'DT')
('garbage', 'NOUN', 'NN')
(',', 'PUNCT', ',')
('he', 'PRON', 'PRP')
('would', 'VERB', 'MD')
('refuse', 'VERB', 'VB')
('.', 'PUNCT', '.')


In [5]:
# One line per token of sent_2: (surface text, pos_ label, tag_ label).
for token in sent_2:
    features = token.text, token.pos_, token.tag_
    print(features)

('Baptiste', 'PROPN', 'NNP')
('was', 'VERB', 'VBD')
('in', 'ADP', 'IN')
('charge', 'NOUN', 'NN')
('of', 'ADP', 'IN')
('the', 'DET', 'DT')
('refuse', 'ADJ', 'JJ')
('treatment', 'NOUN', 'NN')
('center', 'NOUN', 'NN')
('.', 'PUNCT', '.')


In [7]:
# One line per token of sent_3: (surface text, pos_ label, tag_ label).
for token in sent_3:
    features = token.text, token.pos_, token.tag_
    print(features)

('Marie', 'PROPN', 'NNP')
('took', 'VERB', 'VBD')
('out', 'PART', 'RP')
('her', 'PRON', 'PRP')
('rather', 'ADV', 'RB')
('suspicious', 'ADJ', 'JJ')
('and', 'CCONJ', 'CC')
('fishy', 'ADJ', 'JJ')
('cat', 'NOUN', 'NN')
('to', 'PART', 'TO')
('go', 'VERB', 'VB')
('fish', 'NOUN', 'NN')
('for', 'ADP', 'IN')
('fish', 'NOUN', 'NN')
('.', 'PUNCT', '.')


# Training your own POS-tagger

In [2]:
import random
from pathlib import Path
import spacy

# Custom tag set for the toy tagger: each single-letter tag maps to a
# dictionary holding the part-of-speech label it stands for.
TAG_MAP = {
    tag: {'pos': pos}
    for tag, pos in (('N', 'NOUN'), ('V', 'VERB'), ('J', 'ADJ'))
}

In [3]:
# Toy training set: (raw text, annotations) pairs where the annotations carry
# one tag per whitespace-separated word, using the keys of the custom tag map.
TRAIN_DATA = [
    (text, {'tags': tags})
    for text, tags in (
        ("I like green eggs", ['N', 'V', 'J', 'N']),
        ("Eat blue ham", ['V', 'J', 'N']),
    )
]

In [4]:
# NOTE(review): leftover IPython debugging magic. In the recorded run it only
# printed "No traceback available to show."; consider removing it.
%tb

def main(lang='en', output_dir=None, n_iter=25):
    """Train a tagger with the custom tag map on TRAIN_DATA and try it out.

    A blank pipeline is created for ``lang``, a new tagger component is given
    every label from ``TAG_MAP`` and added to the pipeline, and the pipeline is
    then updated one example at a time for ``n_iter`` passes over the training
    examples. The accumulated losses are printed after every pass, and the
    tags predicted for a fixed test sentence are printed once training ends.

    Args:
        lang: Language code handed to ``spacy.blank``.
        output_dir: Optional directory (string or path). When given, the
            trained pipeline is written there, loaded back, and run on the
            same test sentence so both results can be compared.
        n_iter: Number of passes over the training examples.
    """
    nlp = spacy.blank(lang)
    # add the tagger to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    tagger = nlp.create_pipe('tagger')
    # Add the tags. This needs to be done before you start training.
    for tag, values in TAG_MAP.items():
        tagger.add_label(tag, values)
    nlp.add_pipe(tagger)

    # Shuffle a private copy: shuffling TRAIN_DATA itself would reorder the
    # module-level list, so every other cell (and every re-run of this one)
    # would see a different order than the one written in the notebook.
    train_data = list(TRAIN_DATA)

    optimizer = nlp.begin_training()
    for _ in range(n_iter):
        random.shuffle(train_data)
        losses = {}
        for text, annotations in train_data:
            nlp.update([text], [annotations], sgd=optimizer, losses=losses)
        print(losses)

    # test the trained model
    test_text = "I like blue eggs"
    doc = nlp(test_text)
    print('Tags', [(t.text, t.tag_, t.pos_) for t in doc])

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            # parents=True so a nested target such as "models/tagger" works
            # even when the intermediate directories do not exist yet.
            output_dir.mkdir(parents=True)
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc = nlp2(test_text)
        print('Tags', [(t.text, t.tag_, t.pos_) for t in doc])


No traceback available to show.


In [5]:
# Run the training routine with its default arguments (lang='en', no output
# directory, 25 iterations). The guard keeps the call from firing if this
# code is ever imported as a module instead of being executed directly.
if __name__ == '__main__':
    main()

{'tagger': 0.5731045007705688}
{'tagger': 0.5470398366451263}
{'tagger': 0.4321519583463669}
{'tagger': 0.26743578910827637}
{'tagger': 0.12172483652830124}
{'tagger': 0.03240921627730131}
{'tagger': 0.0039986324263736606}
{'tagger': 0.00036780487425858155}
{'tagger': 4.438006453710841e-05}
{'tagger': 8.323409701915807e-06}
{'tagger': 1.8317383023713774e-06}
{'tagger': 6.832339920492814e-07}
{'tagger': 2.2298168289580644e-07}
{'tagger': 9.801566491773883e-08}
{'tagger': 5.375977885080374e-08}
{'tagger': 2.9275055268840333e-08}
{'tagger': 1.755438905348683e-08}
{'tagger': 1.1347002004669093e-08}
{'tagger': 7.573777383029778e-09}
{'tagger': 5.780358192097879e-09}
{'tagger': 4.297178834988813e-09}
{'tagger': 3.438658691123919e-09}
{'tagger': 2.829821377225983e-09}
{'tagger': 2.4201857184991127e-09}
{'tagger': 2.1093340407674077e-09}
Tags [('I', 'N', 'NOUN'), ('like', 'V', 'VERB'), ('blue', 'J', 'ADJ'), ('eggs', 'N', 'NOUN')]


# POS-tagging code examples

In [6]:
def make_verb_upper(text, pos):
    """Return ``text`` upper-cased when ``pos`` is exactly "VERB".

    For any other ``pos`` value the text is returned unchanged.
    """
    if pos != "VERB":
        return text
    return text.upper()

# Rebuild the sentence from its tokens, upper-casing the ones tagged as verbs.
# text_with_ws keeps each token's trailing whitespace, so joining on the empty
# string restores the original spacing.
doc = nlp(u'Tom ran swiftly and walked slowly')
pieces = [make_verb_upper(w.text_with_ws, w.pos_) for w in doc]
text = ''.join(pieces)
print(text)

Tom RAN swiftly and WALKED slowly
