In [149]:
import spacy
from spacy import displacy
from itertools import islice
# NOTE(review): POS_TAGS, DEP_TAGS, doc_to_df and pd are used further down but
# are never defined in this notebook's visible cells — presumably script.py
# provides them; confirm.
%run script.py

# Load the spaCy model named 'en'; `nlp` is the pipeline used by the cells below.
nlp = spacy.load('en')

In [145]:
# Sample text (two paragraphs) run through the pipeline; `nyt` is the Doc that
# the following cells inspect.
nyt = nlp("""Sure, we all wish that Trump treated Justin Trudeau or Angela Merkel with the respect that he now shows Kim Jong-un. Yes, it seems that Trump has been played by Kim. Yet another way of putting it is that Trump is finally investing in the kind of diplomatic engagement that he used to denounce, and that we should all applaud.

Trump’s newfound pragmatism is infinitely preferable to the threat of nuclear war that used to hang over all of us, so it’s mystifying to see Democrats carping about any possible North Korea deal.""")
# Displaying the Doc shows its text.
nyt

Sure, we all wish that Trump treated Justin Trudeau or Angela Merkel with the respect that he now shows Kim Jong-un. Yes, it seems that Trump has been played by Kim. Yet another way of putting it is that Trump is finally investing in the kind of diplomatic engagement that he used to denounce, and that we should all applaud.

Trump’s newfound pragmatism is infinitely preferable to the threat of nuclear war that used to hang over all of us, so it’s mystifying to see Democrats carping about any possible North Korea deal.

# tokenization

The language-specific tokenizer exceptions are available in `nlp.Defaults.tokenizer_exceptions`.

In [150]:
# Show the first ten tokens of the document together with their positions.
for position, token in enumerate(islice(nyt, 10)):
    print(position, token)

0 Sure
1 ,
2 we
3 all
4 wish
5 that
6 Trump
7 treated
8 Justin
9 Trudeau


# sentence segmentation

In [151]:
# List up to five of the sentences found in the document.
for position, sentence in enumerate(islice(nyt.sents, 5)):
    print(position, sentence)

0 Sure, we all wish that Trump treated Justin Trudeau or Angela Merkel with the respect that he now shows Kim Jong-un.
1 Yes, it seems that Trump has been played by Kim.
2 Yet another way of putting it is that Trump is finally investing in the kind of diplomatic engagement that he used to denounce, and that we should all applaud.


3 Trump’s newfound pragmatism is infinitely preferable to the threat of nuclear war that used to hang over all of us, so it’s mystifying to see Democrats carping about any possible North Korea deal.


# noun chunks

In [154]:
# List the first five noun chunks of the document.
for position, chunk in enumerate(islice(nyt.noun_chunks, 5)):
    print(position, chunk)

0 we
1 Trump
2 Justin Trudeau
3 Angela Merkel
4 the respect


# Part Of Speech

The grammatical function of a word within a sentence.

In [173]:
# Token at index 9 is "Trudeau" (see the tokenization output above); it is
# reused as the example token in the following cells.
trudeau = nyt[9]
trudeau

Trudeau

In [174]:
# Part-of-speech tag plus its description looked up in POS_TAGS.
# NOTE(review): POS_TAGS is not defined in the visible cells — presumably it comes from script.py; confirm.
trudeau.pos_, POS_TAGS[trudeau.pos_]

('PROPN', 'proper noun')

# dependency relation

In [160]:
# Dependency label plus its description looked up in DEP_TAGS.
# NOTE(review): DEP_TAGS is not defined in the visible cells — presumably it comes from script.py; confirm.
trudeau.dep_, DEP_TAGS[trudeau.dep_]

('dobj', 'Direct Object')

In [168]:
# Turn the first sentence into a standalone Doc, then draw its dependency tree.
first_sentence = next(nyt.sents)
sen1_as_doc = first_sentence.as_doc()

displacy.render(sen1_as_doc, style='dep', jupyter=True)

# lemmatization

In [175]:
# Token at index 7 is "treated" (see the tokenization output above).
treated = nyt[7]
treated

treated

In [176]:
# Base (dictionary) form of the token; the output shows 'treat' for "treated".
treated.lemma_

'treat'

# shape

In [178]:
# Word shape: uppercase letters map to X and lowercase letters to x
# (the output abbreviates the seven-letter "Trudeau" to 'Xxxxx').
trudeau.shape_

'Xxxxx'

# is alpha

In [179]:
# Whether the token consists of alphabetic characters only.
trudeau.is_alpha

True

# stopword

Very frequent words that carry only a small part of a sentence's information.

The default list is available as `nlp.Defaults.stop_words`.

In [180]:
# Compare a content word with a stop word: token 2 of the document is "we".
we = nyt[2]
trudeau.is_stop, we.is_stop

(False, True)

# named entity recognition

In [181]:
# Print every named entity found in the document with its position in the list.
for position, entity in enumerate(nyt.ents):
    print(position, entity)

0 Trump
1 Justin Trudeau
2 Angela Merkel
3 Kim Jong-un
4 Trump
5 Kim
6 Trump
7 Trump’s
8 Democrats
9 North Korea


In [182]:
# Render the whole document inline in the notebook using displaCy's entity style.
displacy.render(nyt, style='ent', jupyter=True)

# to DataFrame

In [184]:
# First rows of a per-token table of the attributes shown above.
# NOTE(review): doc_to_df is not defined in the visible cells — presumably it comes from script.py; confirm.
doc_to_df(nyt).head()

Unnamed: 0,Text,POS,Dep,Lemma,Tag,Shape,Alpha,Stop,Head,Left,Right,Entity,EntIOB,Lemma.1
0,Sure,interjection,intj,sure,UH,Xxxx,X,--,wish,Sure,Sure,,O,sure
1,",",punctuation,punct,",",",",",",--,--,wish,",",",",,O,","
2,we,pronoun,nsubj,-PRON-,PRP,xx,X,X,wish,we,all,,O,-PRON-
3,all,determiner,appos,all,DT,xxx,X,X,we,all,all,,O,all
4,wish,verb,ROOT,wish,VBP,xxxx,X,--,wish,Sure,.,,O,wish


# word vector

In [194]:
doc = nlp('Paris Brazil mango apple')

from itertools import product
import numpy as np
import pandas as pd

# Pairwise similarity between every two tokens of `doc`.
# The matrix is sized from the Doc itself and labelled with the token texts,
# so the cell keeps working when the word list above is changed.
labels = [token.text for token in doc]
similarities = np.array([w1.similarity(w2) for w1, w2 in product(doc, doc)])

pd.DataFrame(similarities.reshape((len(doc), len(doc))), index=labels, columns=labels)

Unnamed: 0,Paris,Brazil,mango,apple
Paris,1.0,0.436625,0.276051,0.062527
Brazil,0.436625,1.0,0.315326,-0.039096
mango,0.276051,0.315326,1.0,0.400696
apple,0.062527,-0.039096,0.400696,1.0


# Pipeline

https://spacy.io/usage/processing-pipelines

## ['tagger', 'parser', 'ner']

You can disable pipeline steps when you load the model or when you create a Doc.

For example, pass `disable=['parser']` to skip the dependency parser.

https://spacy.io/usage/linguistic-features#disabling

# Rule Based Matching

Available attributes: https://spacy.io/usage/linguistic-features#adding-patterns-attributes

In [87]:
from spacy.matcher import Matcher

def match_results(doc, pattern):
    """Print every match of a single token pattern in `doc`.

    Parameters
    ----------
    doc : spacy.tokens.Doc
        Document to search.
    pattern : list of dict
        One Matcher token pattern (one dict per token to match).

    For each match, prints the match id, its string name and the start/end
    token indices on one line, then the matched text on the next line.
    Returns None.
    """
    # The matcher has to share its vocabulary with the document it scans, so
    # take the vocab from `doc` instead of relying on the notebook-global `nlp`.
    vocab = doc.vocab

    matcher = Matcher(vocab)
    matcher.add('pattern', None, pattern)

    for match_id, start, end in matcher(doc):
        # match_id is the hash of the rule name; resolve it back to the string.
        string_id = vocab.strings[match_id]
        span = doc[start:end]

        print(match_id, string_id, start, end)
        print(span.text)

In [88]:
doc = nlp('My e-mail is abevieiramota@gmail.com')

# An empty dict {} matches any token; the second dict requires LIKE_EMAIL to be
# True. In the output below the match is the e-mail address together with the
# token right before it.
pattern = [{}, {'LIKE_EMAIL': True}]

match_results(doc, pattern)

15329811787164753587 pattern 4 6
is abevieiramota@gmail.com


In [95]:
doc = nlp('My car works now. I fixed it last month.')

# First token pattern: POS NOUN combined with the '!' operator (negation);
# second token pattern: a VERB. In the output below this matches "I fixed"
# but not "car works".
pattern = [{'POS': 'NOUN', 'OP': '!'}, {'POS': 'VERB'}]

match_results(doc, pattern)

15329811787164753587 pattern 5 7
I fixed


# Phrase Based Matching

https://spacy.io/usage/linguistic-features#adding-phrase-patterns

In [108]:
from spacy.matcher import PhraseMatcher

# Phrases to search for, each given as a Doc.
reference = [nlp('Einstein')]
# Text in which the reference phrases are looked up.
candidate = nlp('Ulm is the birth place of Einstein')

matcher = PhraseMatcher(nlp.vocab)
matcher.add('', None, *reference)

# Print the text of every span of `candidate` that equals a reference phrase.
for match_id, start, end in matcher(candidate):
    print(candidate[start:end].text)

In [105]:
# NOTE(review): the output shown below comes from an earlier execution (In [105]);
# the cell above (In [108]) last assigns reference = [nlp('Einstein')], so a
# fresh run would presumably display a single-element list.
reference

[Einstein, was, born, in, Ulm, .]

In [107]:
# Tokens of the candidate sentence.
list(candidate)

[Ulm, is, the, birth, place, of, Einstein]