In [8]:
import spacy
# Load the English pipeline registered under the 'en' shortcut name.
# NOTE(review): 'en' shortcut links are a spaCy v2 feature — confirm the
# installed version, or load the full package name (e.g. 'en_core_web_sm').
nlp = spacy.load('en')

In [9]:
# Run three example sentences through the pipeline; the resulting parses
# are inspected (noun chunks, then per-token dependencies) in the cells below.
sent_0 = nlp(u'Myriam saw Clement with a telescope.')
sent_1 = nlp(u'Self-driving cars shift insurance liability toward manufacturers.')
sent_2 = nlp(u'I shot the elephant in my pyjamas.')

In [11]:
# For every noun chunk of sent_0: the chunk text, its root token, the root's
# dependency label and the text of the root's head.
for noun_chunk in sent_0.noun_chunks:
    chunk_root = noun_chunk.root
    print((noun_chunk.text, chunk_root.text, chunk_root.dep_, chunk_root.head.text))

('Myriam', 'Myriam', 'nsubj', 'saw')
('Clement', 'Clement', 'dobj', 'saw')
('a telescope', 'telescope', 'pobj', 'with')


In [17]:
# For every token of sent_0: text, dependency label, head text, head
# part-of-speech tag and the token's direct children.
for tok in sent_0:
    head = tok.head
    print((tok.text, tok.dep_, head.text, head.pos_, list(tok.children)))

('Myriam', 'nsubj', 'saw', 'VERB', [])
('saw', 'ROOT', 'saw', 'VERB', [Myriam, Clement, with, .])
('Clement', 'dobj', 'saw', 'VERB', [])
('with', 'prep', 'saw', 'VERB', [telescope])
('a', 'det', 'telescope', 'NOUN', [])
('telescope', 'pobj', 'with', 'ADP', [a])
('.', 'punct', 'saw', 'VERB', [])


In [12]:
# For every noun chunk of sent_1: the chunk text, its root token, the root's
# dependency label and the text of the root's head.
for noun_chunk in sent_1.noun_chunks:
    chunk_root = noun_chunk.root
    print((noun_chunk.text, chunk_root.text, chunk_root.dep_, chunk_root.head.text))

('Self-driving cars', 'cars', 'nsubj', 'shift')
('insurance liability', 'liability', 'dobj', 'shift')
('manufacturers', 'manufacturers', 'pobj', 'toward')


In [18]:
# For every token of sent_1: text, dependency label, head text, head
# part-of-speech tag and the token's direct children.
for tok in sent_1:
    head = tok.head
    print((tok.text, tok.dep_, head.text, head.pos_, list(tok.children)))

('Self', 'npadvmod', 'driving', 'VERB', [])
('-', 'punct', 'driving', 'VERB', [])
('driving', 'amod', 'cars', 'NOUN', [Self, -])
('cars', 'nsubj', 'shift', 'VERB', [driving])
('shift', 'ROOT', 'shift', 'VERB', [cars, liability, toward, .])
('insurance', 'compound', 'liability', 'NOUN', [])
('liability', 'dobj', 'shift', 'VERB', [insurance])
('toward', 'prep', 'shift', 'VERB', [manufacturers])
('manufacturers', 'pobj', 'toward', 'ADP', [])
('.', 'punct', 'shift', 'VERB', [])


In [16]:
# For every noun chunk of sent_2: the chunk text, its root token, the root's
# dependency label and the text of the root's head.
for noun_chunk in sent_2.noun_chunks:
    chunk_root = noun_chunk.root
    print((noun_chunk.text, chunk_root.text, chunk_root.dep_, chunk_root.head.text))

('I', 'I', 'nsubj', 'shot')
('the elephant', 'elephant', 'dobj', 'shot')
('my pyjamas', 'pyjamas', 'pobj', 'in')


In [19]:
# For every token of sent_2: text, dependency label, head text, head
# part-of-speech tag and the token's direct children.
for tok in sent_2:
    head = tok.head
    print((tok.text, tok.dep_, head.text, head.pos_, list(tok.children)))

('I', 'nsubj', 'shot', 'VERB', [])
('shot', 'ROOT', 'shot', 'VERB', [I, elephant, in, .])
('the', 'det', 'elephant', 'NOUN', [])
('elephant', 'dobj', 'shot', 'VERB', [the])
('in', 'prep', 'shot', 'VERB', [pyjamas])
('my', 'poss', 'pyjamas', 'NOUN', [])
('pyjamas', 'pobj', 'in', 'ADP', [my])
('.', 'punct', 'shot', 'VERB', [])


In [22]:
from spacy.symbols import nsubj, VERB

# Collect the head of every token labelled as a nominal subject whose head is
# tagged as a verb. The comparison uses the .dep / .pos attributes against the
# imported spacy.symbols constants rather than the string-valued .dep_ / .pos_.
verbs = {
    tok.head
    for tok in sent_1
    if tok.dep == nsubj and tok.head.pos == VERB
}

print(verbs)

{shift}


In [24]:
# The root is taken to be the first token that is its own head.
self_headed = [tok for tok in sent_1 if tok.head == tok]
root = self_headed[0]

# The subject is the first child to the left of the root.
left_children = list(root.lefts)
subject = left_children[0]

# Walk the subject's subtree; every node must be the subject itself or one of
# its descendants.
for node in subject.subtree:
    assert subject is node or subject.is_ancestor(node)
    ancestor_texts = [anc.text for anc in node.ancestors]
    print((node.text, node.dep_, node.n_lefts, node.n_rights, ancestor_texts))


('Self', 'npadvmod', 0, 0, ['driving', 'cars', 'shift'])
('-', 'punct', 0, 0, ['driving', 'cars', 'shift'])
('driving', 'amod', 2, 0, ['cars', 'shift'])
('cars', 'nsubj', 1, 0, ['shift'])


# Training our dependency parsers

In [27]:
from __future__ import unicode_literals, print_function

import random
from pathlib import Path
import spacy


# training data: a list of (text, annotations) pairs.
#   annotations['heads'] - for each token, the index of its head token
#                          (the ROOT token points at itself)
#   annotations['deps']  - for each token, its dependency label
# Both lists need one entry per token.
# NOTE(review): "mortgage-backed" is presumably split into three tokens
# ('mortgage', '-', 'backed'), which gives the 7 entries below — confirm
# against the tokenizer in use.
TRAIN_DATA = [
    ("They trade mortgage-backed securities.", {
        'heads': [1, 1, 4, 4, 5, 1, 1],
        'deps': ['nsubj', 'ROOT', 'compound', 'punct', 'nmod', 'dobj', 'punct']
    }),
    ("I like London and Berlin.", {
        'heads': [1, 1, 1, 2, 2, 1],
        'deps': ['nsubj', 'ROOT', 'dobj', 'cc', 'conj', 'punct']
    })
]

def main(model=None, output_dir=None, n_iter=10):
    """Train the dependency parser on TRAIN_DATA and print a test parse.

    Args:
        model: Name or path passed to ``spacy.load``. If ``None``, a blank
            'en' pipeline is created instead.
        output_dir: Optional directory in which to save the trained pipeline.
            When given, the pipeline is saved there, loaded back and run on
            the test sentence a second time. ``None`` skips saving.
        n_iter: Number of training passes over the training examples.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # add the parser to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'parser' not in nlp.pipe_names:
        parser = nlp.create_pipe('parser')
        nlp.add_pipe(parser, first=True)
    # otherwise, get it, so we can add labels to it
    else:
        parser = nlp.get_pipe('parser')

    # Shuffle a private copy: shuffling TRAIN_DATA itself would reorder the
    # module-level list in place and leak state into later cells and re-runs.
    train_examples = list(TRAIN_DATA)

    # add labels to the parser
    for _, annotations in train_examples:
        for dep in annotations.get('deps', []):
            parser.add_label(dep)

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'parser']
    with nlp.disable_pipes(*other_pipes):  # only train parser
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(train_examples)
            losses = {}
            for text, annotations in train_examples:
                nlp.update([text], [annotations], sgd=optimizer, losses=losses)
            print(losses)

    # test the trained model
    test_text = "I like securities."
    doc = nlp(test_text)
    print('Dependencies', [(t.text, t.dep_, t.head.text) for t in doc])

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            # parents=True so a nested output path does not fail on a
            # missing intermediate directory
            output_dir.mkdir(parents=True)
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc = nlp2(test_text)
        print('Dependencies', [(t.text, t.dep_, t.head.text) for t in doc])


# Run training with main()'s defaults (blank 'en' model, nothing saved to
# disk) when this code is executed as the main module.
if __name__ == '__main__':
    main()

Created blank 'en' model
{'parser': 11.298212479960057}
{'parser': 5.9547552042770215}
{'parser': 2.917428063696304}
{'parser': 1.1171131859917285}
{'parser': 1.7865017133655645}
{'parser': 0.0010677267878588155}
{'parser': 0.0013433697685028753}
{'parser': 9.223689810625636e-07}
{'parser': 2.986945557667453e-06}
{'parser': 1.4261345918727126e-08}
Dependencies [('I', 'nsubj', 'like'), ('like', 'ROOT', 'like'), ('securities', 'dobj', 'like'), ('.', 'punct', 'like')]


# Using the parser to recognise your own semantics

spaCy's parser component can be trained to predict any type of tree structure over your input text. You can also predict trees over whole documents or chat logs, with connections between the sentence roots used to annotate discourse structure. In this example, we'll build a message parser for a common "chat intent": finding local businesses. Our message semantics will have the following types of relations: ROOT, PLACE, QUALITY, ATTRIBUTE, TIME, LOCATION and PRODUCT.

"show me the best hotel in berlin"
- ('show', 'ROOT', 'show')
- ('best', 'QUALITY', 'hotel') --> hotel with QUALITY best
- ('hotel', 'PLACE', 'show') --> show PLACE hotel
- ('berlin', 'LOCATION', 'hotel') --> hotel with LOCATION berlin
Compatible with: spaCy v2.0.0+

In [36]:
from __future__ import unicode_literals, print_function
import random
import spacy
from pathlib import Path
from pprint import pprint

# training data: texts, heads and dependency labels
# each example is (text, {'heads': [...], 'deps': [...]}) with one entry per
# token; a head index equal to the token's own position marks the ROOT
# for no relation, we simply chose an arbitrary dependency label, e.g. '-'
# labels used below: ROOT, PLACE, QUALITY, ATTRIBUTE, TIME, LOCATION, PRODUCT
TRAIN_DATA = [
    ("find a cafe with great wifi", {
        'heads': [0, 2, 0, 5, 5, 2],  # index of token head
        'deps': ['ROOT', '-', 'PLACE', '-', 'QUALITY', 'ATTRIBUTE']
    }),
    ("find a hotel near the beach", {
        'heads': [0, 2, 0, 5, 5, 2],
        'deps': ['ROOT', '-', 'PLACE', 'QUALITY', '-', 'ATTRIBUTE']
    }),
    ("find me the closest gym that's open late", {
        'heads': [0, 0, 4, 4, 0, 6, 4, 6, 6],
        'deps': ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', '-', 'ATTRIBUTE', 'TIME']
    }),
    ("show me the cheapest store that sells flowers", {
        'heads': [0, 0, 4, 4, 0, 4, 4, 4],  # attach "flowers" to store!
        'deps': ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', '-', 'PRODUCT']
    }),
    ("find a nice restaurant in london", {
        'heads': [0, 3, 3, 0, 3, 3],
        'deps': ['ROOT', '-', 'QUALITY', 'PLACE', '-', 'LOCATION']
    }),
    ("show me the coolest hostel in berlin", {
        'heads': [0, 0, 4, 4, 0, 4, 4],
        'deps': ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', 'LOCATION']
    }),
    ("find a good italian restaurant near work", {
        'heads': [0, 4, 4, 4, 0, 4, 5],
        'deps': ['ROOT', '-', 'QUALITY', 'ATTRIBUTE', 'PLACE', 'ATTRIBUTE', 'LOCATION']
    })
]

def main(model=None, output_dir=None, n_iter=20):
    """Train a fresh parser on the intent-relation TRAIN_DATA and test it.

    Args:
        model: Name or path passed to ``spacy.load``. If ``None``, a blank
            'en' pipeline is created instead.
        output_dir: Optional directory in which to save the trained pipeline.
            When given, the pipeline is saved there, loaded back and passed
            to ``test_model`` a second time. ``None`` skips saving.
        n_iter: Number of training passes over the training examples.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # We'll use the built-in dependency parser class, but we want to create a
    # fresh instance – just in case.
    if 'parser' in nlp.pipe_names:
        nlp.remove_pipe('parser')
    parser = nlp.create_pipe('parser')
    nlp.add_pipe(parser, first=True)

    # Shuffle a private copy: shuffling TRAIN_DATA itself would reorder the
    # module-level list in place and leak state into later cells and re-runs.
    train_examples = list(TRAIN_DATA)

    # register every dependency label that occurs in the training data
    for _, annotations in train_examples:
        for dep in annotations.get('deps', []):
            parser.add_label(dep)

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'parser']
    with nlp.disable_pipes(*other_pipes):  # only train parser
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(train_examples)
            losses = {}
            for text, annotations in train_examples:
                nlp.update([text], [annotations], sgd=optimizer, losses=losses)
            print(losses)

    # test the trained model
    test_model(nlp)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            # parents=True so a nested output path does not fail on a
            # missing intermediate directory
            output_dir.mkdir(parents=True)
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        test_model(nlp2)

        
def test_model(nlp):
    texts = ["find a hotel with good wifi",
             "find me the cheapest gym near work",
             "show me the best hotel in berlin"]
    docs = nlp.pipe(texts)
    for doc in docs:
        print(doc.text, end="\n\n")
        pprint([(t.text, t.dep_, t.head.text) for t in doc if t.dep_ != '-'])
        
# Run training with main()'s defaults (blank 'en' model, nothing saved to
# disk) when this code is executed as the main module.
if __name__ == '__main__':
    main()

Created blank 'en' model
{'parser': 51.407414456036236}
{'parser': 31.480874742963184}
{'parser': 20.12761407093807}
{'parser': 19.185761280429674}
{'parser': 15.29019559702668}
{'parser': 11.334652493114973}
{'parser': 6.416284825391444}
{'parser': 12.452193891421608}
{'parser': 1.953789578566763}
{'parser': 7.76863297926673e-06}
{'parser': 0.009107380910317153}
{'parser': 0.0007426568706164702}
{'parser': 1.6191893502016575e-07}
{'parser': 1.0665281457102804}
{'parser': 2.4679586752613613e-08}
{'parser': 6.584187440729652e-08}
{'parser': 1.5149161229617652e-08}
{'parser': 3.6550350058416912e-09}
{'parser': 3.152468886002731e-09}
{'parser': 1.2200577720914826e-09}
find a hotel with good wifi

[('find', 'ROOT', 'find'),
 ('hotel', 'PLACE', 'find'),
 ('with', 'QUALITY', 'wifi'),
 ('good', 'QUALITY', 'wifi'),
 ('wifi', 'ATTRIBUTE', 'hotel')]
find me the cheapest gym near work

[('find', 'ROOT', 'find'),
 ('cheapest', 'QUALITY', 'gym'),
 ('gym', 'PLACE', 'find'),
 ('near', 'ATTRIBUTE', 'g