In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
doc = nlp('Hello, world. Here are two sentences.')

In [4]:
print([t.text for t in doc])

['Hello', ',', 'world', '.', 'Here', 'are', 'two', 'sentences', '.']


In [5]:
nlp_de = spacy.load('de_core_news_sm')

In [6]:
doc_de = nlp_de('Ich ben ein Berliner.')

In [7]:
print([t.text for t in doc_de])

['Ich', 'ben', 'ein', 'Berliner', '.']


In [9]:
list(doc.noun_chunks)

[two sentences]

In [10]:
doc = nlp('Apple is looking at buying U.K. startup for $1 billion')

In [11]:
apple = doc[0]

In [12]:
print('Fine-grained POS tag', apple.pos_, apple.pos)

Fine-grained POS tag PROPN 95


In [13]:
print('Coarse-grained POS tag', apple.tag_, apple.tag)

Coarse-grained POS tag NNP 15794550382381185553


In [14]:
print('Word shape', apple.shape_, apple.shape)

Word shape Xxxxx 16072095006890171862


In [15]:
print('Alphanumeric characters?', apple.is_alpha)

Alphanumeric characters? True


In [16]:
print('Punctuation mark?', apple.is_punct)

Punctuation mark? False


In [17]:
billion = doc[10]

In [18]:
print('Digit?', billion.is_digit)

Digit? False


In [19]:
print('Like a number?', billion.like_num)

Like a number? True


In [20]:
print('Like an email address?', billion.like_email)

Like an email address? False


In [21]:
doc = nlp('San Francisco considers banning sidewalk delivery robots')

In [22]:
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

San Francisco 0 13 GPE


In [23]:
from spacy.tokens import Span

In [24]:
doc = nlp('FBI is hiring a new VP of global policy')

In [25]:
doc.ents = [Span(doc, 0, 1, label=doc.vocab.strings['ORG'])]

In [26]:
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

FBI 0 3 ORG


In [27]:
# Train and update neural network models

In [28]:
import random

In [29]:
# Load by explicit package name, as the other cells do, rather than through
# the 'en' shortcut link, which only resolves if a link was created separately.
nlp = spacy.load('en_core_web_sm')

In [30]:
train_data = [('Uber blew through $1 million', {'entities': [(0, 4, 'ORG')]})]

In [32]:
# Every pipeline component except the entity recognizer.
other_pipes = [name for name in nlp.pipe_names if name != 'ner']

# Inside this block only 'ner' remains enabled.
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for epoch in range(10):
        # Reorder the training examples in place before each pass.
        random.shuffle(train_data)
        for text, annotations in train_data:
            nlp.update([text], [annotations], sgd=optimizer)

# Runs after the `with` block has exited; saves the pipeline to the 'model' path.
nlp.to_disk('model')

In [33]:
# Visualize a dependency parse
from spacy import displacy

In [34]:
doc_dep = nlp('This is a sentence.')

In [36]:
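# displacy.serve() starts a web server and blocks the kernel, so it is left disabled here.
# To render inline in a notebook instead: displacy.render(doc_dep, style='dep', jupyter=True)
# displacy.serve(doc_dep, style='dep')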

In [38]:
doc_ent = nlp('When Sebastian Thrun started working on self-driving cars at Google '
              'in 2007, few people outside of the company took him seriously.')

In [40]:
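# displacy.serve() starts a web server and blocks the kernel, so it is left disabled here.
# To render inline in a notebook instead: displacy.render(doc_ent, style='ent', jupyter=True)
# displacy.serve(doc_ent, style='ent')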

In [41]:
# Get word vectors and similarity
nlp = spacy.load('en_core_web_md')

In [42]:
doc = nlp('Apple and banana are similar. Pasta and hippo are not.')

In [43]:
apple = doc[0]
banana = doc[2]
pasta = doc[6]
hippo = doc[8]

In [44]:
print('apple <-> banana', apple.similarity(banana))

apple <-> banana 0.5831845


In [45]:
print('pasta <-> hippo', pasta.similarity(hippo))

pasta <-> hippo 0.12069741


In [46]:
print(apple.has_vector, banana.has_vector, pasta.has_vector, hippo.has_vector)

True True True True


In [47]:
# Match text with token rules
from spacy.matcher import Matcher

In [49]:
nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)

In [50]:
def set_sentiment(matcher, doc, i, matches):
    """Match callback that raises ``doc.sentiment`` by 0.1.

    Only ``doc`` is used; ``matcher``, ``i`` and ``matches`` are accepted
    but ignored. Returns ``None``; the document is modified in place.
    """
    doc.sentiment = doc.sentiment + 0.1
    

In [51]:
# One exact-text token spec per piece of "Google I / O".
pattern1 = [{'ORTH': piece} for piece in ('Google', 'I', '/', 'O')]

# A list of patterns, one per emoji; each is a single token spec with 'OP': '+'.
pattern2 = []
for emoji in ['😀', '😂', '🤣', '😍']:
    pattern2.append([{'ORTH': emoji, 'OP': '+'}])

In [52]:
matcher.add('GoogleIO', None, pattern1)
matcher.add('HAPPY', set_sentiment, *pattern2)

In [53]:
doc = nlp('A text about Google I/O 😀😀')
matches = matcher(doc)

In [54]:
# Each match is a (match_id, start, end) triple: resolve the ID to its string
# name and print it next to the text of the matched token slice.
for match_id, start, end in matches:
    print(nlp.vocab.strings[match_id], doc[start:end].text)
print('Sentiment', doc.sentiment)

GoogleIO Google I/O
HAPPY 😀😀
HAPPY 😀
Sentiment 0.20000000298023224


In [55]:
# Get syntactic dependencies
doc = nlp('When Sebastian Thrun started working on self-driving cars at Google '
         'in 2007, few people outside of the company took him seriously.')

In [56]:
dep_labels = []
for token in doc:
    # Climb from the token through successive heads, recording each label.
    # The climb stops at a token that is its own head; that token's label
    # is not recorded.
    node = token
    while node.head != node:
        dep_labels.append(node.dep_)
        node = node.head
print(dep_labels)

['advmod', 'advcl', 'compound', 'nsubj', 'advcl', 'nsubj', 'advcl', 'advcl', 'xcomp', 'advcl', 'prep', 'xcomp', 'advcl', 'npadvmod', 'amod', 'pobj', 'prep', 'xcomp', 'advcl', 'punct', 'amod', 'pobj', 'prep', 'xcomp', 'advcl', 'amod', 'pobj', 'prep', 'xcomp', 'advcl', 'pobj', 'prep', 'xcomp', 'advcl', 'prep', 'xcomp', 'advcl', 'pobj', 'prep', 'xcomp', 'advcl', 'prep', 'xcomp', 'advcl', 'pobj', 'prep', 'xcomp', 'advcl', 'punct', 'amod', 'nsubj', 'nsubj', 'prep', 'nsubj', 'prep', 'prep', 'nsubj', 'det', 'pobj', 'prep', 'prep', 'nsubj', 'pobj', 'prep', 'prep', 'nsubj', 'dobj', 'advmod', 'punct']


In [57]:
# Export to numpy arrays
from spacy.attrs import ORTH, LIKE_URL

In [58]:
doc = nlp('Check our https://spacy.io')
for token in doc:
    print(token.text, token.orth, token.like_url)

Check 8104846059040039827 False
our 17577015140245084783 False
https://spacy.io 17142293684782158888 True


In [59]:
attr_ids = [ORTH, LIKE_URL]
doc_array = doc.to_array(attr_ids)
print(doc_array.shape)
print(len(doc), len(attr_ids))

(3, 2)
3 2


In [60]:
print(list(doc_array[:,1]))

[0, 0, 1]


In [61]:
# Calculate inline markup on original string
def put_spans_around_tokens(doc):
    """Wrap every non-whitespace token of ``doc`` in an HTML ``<span>``.

    Each span carries the classes ``pos-<token.pos_>`` and ``dep-<token.dep_>``
    and is followed by the token's trailing whitespace. Whitespace tokens are
    emitted as-is, outside any span. Token text is HTML-escaped so that
    ``<``, ``>`` and ``&`` in the input cannot break the generated markup.

    Args:
        doc: Iterable of tokens exposing ``is_space``, ``text``, ``pos_``,
            ``dep_`` and ``whitespace_``.

    Returns:
        str: The markup wrapped in ``<pre>...</pre>``, with newlines removed
        and each tab replaced by four spaces.
    """
    # Imported locally: the notebook also binds a variable named ``html``.
    from html import escape

    template = '<span class="{classes}">{word}</span>{space}'
    pieces = []
    for token in doc:
        if token.is_space:
            pieces.append(token.text)
        else:
            classes = 'pos-{} dep-{}'.format(token.pos_, token.dep_)
            # quote=False: the text goes into element content, not an
            # attribute, so only &, < and > need escaping.
            word = escape(token.text, quote=False)
            pieces.append(template.format(classes=classes, word=word, space=token.whitespace_))
    markup = ''.join(pieces)
    markup = markup.replace('\n', '')
    markup = markup.replace('\t', '    ')
    return '<pre>{}</pre>'.format(markup)

doc = nlp('This is a test.\n\nHello  world.')
html = put_spans_around_tokens(doc)
print(html)

<pre><span class="pos-DET dep-nsubj">This</span> <span class="pos-VERB dep-ROOT">is</span> <span class="pos-DET dep-det">a</span> <span class="pos-NOUN dep-attr">test</span><span class="pos-PUNCT dep-punct">.</span><span class="pos-INTJ dep-ROOT">Hello</span>  <span class="pos-NOUN dep-npadvmod">world</span><span class="pos-PUNCT dep-punct">.</span></pre>
