In [1]:
import spacy
from spacy import displacy
import pandas as pd

spacy.info('en_core_web_lg')


    [93mInfo about model en_core_web_lg[0m

    lang               en             
    pipeline           ['tagger', 'parser', 'ner']
    accuracy           {'token_acc': 99.8890484271, 'ents_p': 85.540697997, 'ents_r': 86.1621863298, 'uas': 91.8900594047, 'tags_acc': 97.2044842264, 'ents_f': 85.8503174073, 'las': 90.0726533777}
    name               core_web_lg    
    license            CC BY-SA 3.0   
    author             Explosion AI   
    url                https://explosion.ai
    vectors            {'keys': 684830, 'width': 300, 'vectors': 684831}
    sources            ['OntoNotes 5', 'Common Crawl']
    version            2.0.0          
    spacy_version      >=2.0.0a18     
    parent_package     spacy          
    speed              {'gpu': None, 'nwords': 291344, 'cpu': 5023.1042787614}
    email              contact@explosion.ai
    description        English multi-task CNN trained on OntoNotes, with GloVe vectors trained on Common Crawl. Assigns word vectors, con

# Pipeline

https://spacy.io/usage/processing-pipelines

## ['tagger', 'parser', 'ner']

You can disable pipeline steps when you load the model or when you create a Doc.

disable=['parser']

https://spacy.io/usage/linguistic-features#disabling

In [2]:
# https://spacy.io/api/annotation#pos-tagging
POS_TAGS = \
{'ADJ': ' adjective',
 'ADP': ' adposition',
 'ADV': ' adverb',
 'AUX': ' auxiliary verb',
 'CONJ': ' coordinating conjunction',
 'DET': ' determiner',
 'INTJ': ' interjection',
 'NOUN': ' noun',
 'NUM': ' numeral',
 'PART': ' particle',
 'PRON': ' pronoun',
 'PROPN': ' proper noun',
 'PUNCT': ' punctuation',
 'SCONJ': ' subordinating conjunction',
 'SYM': ' symbol',
 'VERB': ' verb',
 'X': ' other'}
MAP_0_1 = {0: '--', 1: 'X'}
COLUMNS = ["Text", "POS", "Dep", "Lemma", "Tag", "Shape", "Alpha", "Stop", "Head", "Left", "Right", "Entity", "EntIOB", "Lemma"]

def doc_to_df(doc):
    
    data = []
    for token in doc:
        
        data.append((token.text, POS_TAGS[token.pos_], token.dep_, token.lemma_, token.tag_, 
                         token.shape_, MAP_0_1[token.is_alpha], MAP_0_1[token.is_stop],
                         token.head.text, token.left_edge.text, token.right_edge.text,
                         token.ent_type_, token.ent_iob_, token.lemma_))
        
    return pd.DataFrame(data, columns=COLUMNS)

In [3]:
nlp = spacy.load('en_core_web_lg')

# Steve Jobs ate about 3 burguers!

In [4]:
doc = nlp("Steve Jobs ate about 3 burguers!")

doc_to_df(doc)

Unnamed: 0,Text,POS,Dep,Lemma,Tag,Shape,Alpha,Stop,Head,Left,Right,Entity,EntIOB,Lemma.1
0,Steve,proper noun,compound,steve,NNP,Xxxxx,X,--,Jobs,Steve,Steve,PERSON,B,steve
1,Jobs,proper noun,nsubj,jobs,NNP,Xxxx,X,--,ate,Steve,Jobs,PERSON,I,jobs
2,ate,verb,ROOT,eat,VBD,xxx,X,--,ate,Steve,!,,O,eat
3,about,adverb,advmod,about,RB,xxxx,X,--,3,about,about,CARDINAL,B,about
4,3,numeral,nummod,3,CD,d,--,--,burguers,about,3,CARDINAL,I,3
5,burguers,noun,dobj,burguer,NNS,xxxx,X,--,ate,about,burguers,,O,burguer
6,!,punctuation,punct,!,.,!,--,--,ate,!,!,,O,!


In [5]:
displacy.render(doc, style='dep', jupyter=True, options={"distance": 150})

In [6]:
displacy.render(doc, style='ent', jupyter=True, options={"distance": 150})

In [7]:
doc.ents

(Steve Jobs, about 3)

# Text Similarity

https://spacy.io/usage/spacy-101#vectors-similarity

To make them compact and fast, spaCy's small models (all packages that end in sm) don't ship with word vectors, and only include context-sensitive tensors. This means you can still use the similarity() methods to compare documents, spans and tokens – but the result won't be as good, and individual tokens won't have any vectors assigned. So in order to use real word vectors, you need to download a larger model:

In [8]:
tokens = nlp("cat dog banana war peace")

data = {}
for token1 in tokens:
    data[token1.text] = {}
    for token2 in tokens:
        
        data[token1.text][token2.text] = token1.similarity(token2)
        
pd.DataFrame(data)

Unnamed: 0,cat,dog,banana,war,peace
banana,0.281544,0.243276,1.0,0.11435,0.173628
cat,1.0,0.801686,0.281544,0.189269,0.194324
dog,0.801686,1.0,0.243276,0.246757,0.22431
peace,0.194324,0.22431,0.173628,0.552485,1.0
war,0.189269,0.246757,0.11435,1.0,0.552485


In [9]:
doc_to_df(nlp('Hi! How are you? -_-'))

Unnamed: 0,Text,POS,Dep,Lemma,Tag,Shape,Alpha,Stop,Head,Left,Right,Entity,EntIOB,Lemma.1
0,Hi,interjection,ROOT,hi,UH,Xx,X,--,Hi,Hi,!,,O,hi
1,!,punctuation,punct,!,.,!,--,--,Hi,!,!,,O,!
2,How,adverb,advmod,how,WRB,Xxx,X,--,are,How,How,,O,how
3,are,verb,ROOT,be,VBP,xxx,X,--,are,How,?,,O,be
4,you,pronoun,nsubj,-PRON-,PRP,xxx,X,--,are,you,you,,O,-PRON-
5,?,punctuation,punct,?,.,?,--,--,are,?,?,,O,?
6,-_-,punctuation,ROOT,-_-,.,-_-,--,--,-_-,-_-,-_-,,O,-_-


# Language defaults

In [10]:
nlp.Defaults.stop_words
nlp.Defaults.tokenizer_exceptions

{' ': [{65: ' ', 74: 102}],
 '\t': [{65: '\t', 74: 102}],
 '\\t': [{65: '\\t', 74: 102}],
 '\n': [{65: '\n', 74: 102}],
 '\\n': [{65: '\\n', 74: 102}],
 '—': [{65: '—', 74: 96, 73: '--'}],
 '\xa0': [{65: '\xa0', 74: 102, 73: '  '}],
 "'": [{65: "'"}],
 '\\")': [{65: '\\")'}],
 '<space>': [{65: '<space>'}],
 "''": [{65: "''"}],
 'C++': [{65: 'C++'}],
 'a.': [{65: 'a.'}],
 'b.': [{65: 'b.'}],
 'c.': [{65: 'c.'}],
 'd.': [{65: 'd.'}],
 'e.': [{65: 'e.'}],
 'f.': [{65: 'f.'}],
 'g.': [{65: 'g.'}],
 'h.': [{65: 'h.'}],
 'i.': [{65: 'i.'}],
 'j.': [{65: 'j.'}],
 'k.': [{65: 'k.'}],
 'l.': [{65: 'l.'}],
 'm.': [{65: 'm.'}],
 'n.': [{65: 'n.'}],
 'o.': [{65: 'o.'}],
 'p.': [{65: 'p.'}],
 'q.': [{65: 'q.'}],
 'r.': [{65: 'r.'}],
 's.': [{65: 's.'}],
 't.': [{65: 't.'}],
 'u.': [{65: 'u.'}],
 'v.': [{65: 'v.'}],
 'w.': [{65: 'w.'}],
 'x.': [{65: 'x.'}],
 'y.': [{65: 'y.'}],
 'z.': [{65: 'z.'}],
 'ä.': [{65: 'ä.'}],
 'ö.': [{65: 'ö.'}],
 'ü.': [{65: 'ü.'}],
 '(:': [{65: '(:'}],
 '-_-': [{65: '-_-

# Noun chunks

In [47]:
doc = nlp("The story you told to Peter is false!")

In [48]:
list(doc.noun_chunks)

[The story, you, Peter]

In [49]:
doc[3].text, list(doc[3].lefts), list(doc[3].rights)

('told', [you], [to])

In [50]:
from spacy import displacy

displacy.render(doc, jupyter=True)

In [67]:
[doc[1].left_edge, doc[1].right_edge], list(doc[1].subtree)

([The, Peter], [The, story, you, told, to, Peter])

# Named Entity Recognition

https://spacy.io/usage/linguistic-features#101

A named entity is a "real-world object" that's assigned a name – for example, a person, a country, a product or a book title. 

In [69]:
list(doc.ents)

[Peter]

In [72]:
displacy.render(doc, style='ent', jupyter=True)

# Rule Based Matching

Available attributes: https://spacy.io/usage/linguistic-features#adding-patterns-attributes

In [87]:
from spacy.matcher import Matcher

def match_results(doc, pattern):
    
    matcher = Matcher(nlp.vocab)
    matcher.add('pattern', None, pattern)

    for match_id, start, end in matcher(doc):

        string_id = nlp.vocab.strings[match_id]
        span = doc[start:end]

        print(match_id, string_id, start, end)
        print(span.text)

In [88]:
doc = nlp('My e-mail is abevieiramota@gmail.com')

# {} = any token
pattern = [{}, {'LIKE_EMAIL': True}]

match_results(doc, pattern)

15329811787164753587 pattern 4 6
is abevieiramota@gmail.com


In [95]:
doc = nlp('My car works now. I fixed it last month.')

pattern = [{'POS': 'NOUN', 'OP': '!'}, {'POS': 'VERB'}]

match_results(doc, pattern)

15329811787164753587 pattern 5 7
I fixed


# Phrase Based Matching

https://spacy.io/usage/linguistic-features#adding-phrase-patterns

In [108]:
from spacy.matcher import PhraseMatcher

reference = [nlp(piece.text) for piece in nlp('Einstein was born in Ulm.')]
reference = [nlp('Einstein')]
candidate = nlp('Ulm is the birth place of Einstein')

matcher = PhraseMatcher(nlp.vocab)
matcher.add('', None, *reference)

for match_id, start, end in matcher(candidate):
    
    span = candidate[start:end]
    
    print(span.text)

In [105]:
reference

[Einstein, was, born, in, Ulm, .]

In [107]:
list(candidate)

[Ulm, is, the, birth, place, of, Einstein]