In [146]:
import spacy
from spacy import displacy
import pandas as pd

# Print the metadata of the installed 'en_core_web_lg' model package
# (pipeline components, accuracy figures, vector table size, version, ...).
spacy.info('en_core_web_lg')


    [93mInfo about model en_core_web_lg[0m

    lang               en             
    pipeline           ['tagger', 'parser', 'ner']
    accuracy           {'token_acc': 99.8890484271, 'ents_p': 85.540697997, 'ents_r': 86.1621863298, 'uas': 91.8900594047, 'tags_acc': 97.2044842264, 'ents_f': 85.8503174073, 'las': 90.0726533777}
    name               core_web_lg    
    license            CC BY-SA 3.0   
    author             Explosion AI   
    url                https://explosion.ai
    vectors            {'keys': 684830, 'width': 300, 'vectors': 684831}
    sources            ['OntoNotes 5', 'Common Crawl']
    version            2.0.0          
    spacy_version      >=2.0.0a18     
    parent_package     spacy          
    speed              {'gpu': None, 'nwords': 291344, 'cpu': 5023.1042787614}
    email              contact@explosion.ai
    description        English multi-task CNN trained on OntoNotes, with GloVe vectors trained on Common Crawl. Assigns word vectors, con

In [147]:
# Human-readable names for the coarse-grained POS tags read from ``token.pos_``.
# The values keep their leading space so the strings written to the "POS"
# column are unchanged for the tags that were already covered.
POS_TAGS = {
    'ADJ': ' adjective',
    'ADP': ' adposition',
    'ADV': ' adverb',
    'AUX': ' auxiliary verb',
    # spaCy 2.x labels coordinating conjunctions CCONJ; CONJ is kept for
    # backward compatibility with older tag sets.
    'CCONJ': ' coordinating conjunction',
    'CONJ': ' coordinating conjunction',
    'DET': ' determiner',
    'INTJ': ' interjection',
    'NOUN': ' noun',
    'NUM': ' numeral',
    'PART': ' particle',
    'PRON': ' pronoun',
    'PROPN': ' proper noun',
    'PUNCT': ' punctuation',
    'SCONJ': ' subordinating conjunction',
    # Whitespace tokens (e.g. double spaces, newlines) are tagged SPACE.
    'SPACE': ' space',
    'SYM': ' symbol',
    'VERB': ' verb',
    'X': ' other',
}

# Compact rendering of boolean flags: True hashes like 1 and False like 0,
# so the dict can be indexed directly with ``token.is_alpha`` / ``token.is_stop``.
MAP_0_1 = {0: '--', 1: 'X'}

# NOTE(review): "Lemma" is listed twice, so the resulting frame has two
# identically named columns. Kept as-is so the column layout that existing
# cells rely on does not change.
COLUMNS = ["Text", "POS", "Dep", "Lemma", "Tag", "Shape", "Alpha", "Stop",
           "Head", "Left", "Right", "Entity", "EntIOB", "Lemma"]


def doc_to_df(doc):
    """Tabulate the per-token annotations of a parsed document.

    Parameters
    ----------
    doc : iterable of tokens
        Each token must expose ``text``, ``pos_``, ``dep_``, ``lemma_``,
        ``tag_``, ``shape_``, ``is_alpha``, ``is_stop``, ``head``,
        ``left_edge``, ``right_edge``, ``ent_type_`` and ``ent_iob_``.

    Returns
    -------
    pandas.DataFrame
        One row per token, with the columns listed in ``COLUMNS``.
    """
    rows = [
        (
            token.text,
            # Fall back to the raw tag instead of raising KeyError when the
            # tagger emits a tag that POS_TAGS does not describe.
            POS_TAGS.get(token.pos_, token.pos_),
            token.dep_,
            token.lemma_,
            token.tag_,
            token.shape_,
            MAP_0_1[token.is_alpha],
            MAP_0_1[token.is_stop],
            token.head.text,
            token.left_edge.text,
            token.right_edge.text,
            token.ent_type_,
            token.ent_iob_,
            token.lemma_,
        )
        for token in doc
    ]

    return pd.DataFrame(rows, columns=COLUMNS)

In [149]:
# Load the 'en_core_web_lg' pipeline once; the cells below reuse this `nlp` object.
nlp = spacy.load('en_core_web_lg')

# Steve Jobs ate about 3 burguers!

In [150]:
# Parse the sample sentence; `doc` is reused by the visualisation cells below.
doc = nlp("Steve Jobs ate about 3 burguers!")

# Show the per-token annotations as a table (one row per token).
doc_to_df(doc)

Unnamed: 0,Text,POS,Dep,Lemma,Tag,Shape,Alpha,Stop,Head,Left,Right,Entity,EntIOB,Lemma.1
0,Steve,proper noun,compound,steve,NNP,Xxxxx,X,--,Jobs,Steve,Steve,PERSON,B,steve
1,Jobs,proper noun,nsubj,jobs,NNP,Xxxx,X,--,ate,Steve,Jobs,PERSON,I,jobs
2,ate,verb,ROOT,eat,VBD,xxx,X,--,ate,Steve,!,,O,eat
3,about,adverb,advmod,about,RB,xxxx,X,--,3,about,about,CARDINAL,B,about
4,3,numeral,nummod,3,CD,d,--,--,burguers,about,3,CARDINAL,I,3
5,burguers,noun,dobj,burguer,NNS,xxxx,X,--,ate,about,burguers,,O,burguer
6,!,punctuation,punct,!,.,!,--,--,ate,!,!,,O,!


In [151]:
# Draw the dependency parse of `doc` inline in the notebook.
displacy.render(doc, style='dep', jupyter=True, options={"distance": 150})

In [152]:
# Highlight the named entities of `doc` inline in the notebook.
# The "distance" option is a dependency-visualizer setting and has no
# meaning for the entity view, so it is not passed here.
displacy.render(doc, style='ent', jupyter=True)

In [153]:
# Entity spans recognised in `doc`.
doc.ents

(Steve Jobs, about 3)

# Text Similarity

https://spacy.io/usage/spacy-101#vectors-similarity

To make them compact and fast, spaCy's small models (all packages that end in `sm`) don't ship with word vectors, and only include context-sensitive tensors. This means you can still use the `similarity()` methods to compare documents, spans and tokens – but the result won't be as good, and individual tokens won't have any vectors assigned. So in order to use real word vectors, you need to download a larger model, such as the `en_core_web_lg` model loaded above.

In [154]:
tokens = nlp("cat dog banana war peace")

# Pairwise similarity as a nested mapping: the outer keys become the
# DataFrame columns, the inner keys its row labels.
data = {
    col.text: {row.text: col.similarity(row) for row in tokens}
    for col in tokens
}

pd.DataFrame(data)

Unnamed: 0,cat,dog,banana,war,peace
banana,0.281544,0.243276,1.0,0.11435,0.173628
cat,1.0,0.801686,0.281544,0.189269,0.194324
dog,0.801686,1.0,0.243276,0.246757,0.22431
peace,0.194324,0.22431,0.173628,0.552485,1.0
war,0.189269,0.246757,0.11435,1.0,0.552485


In [157]:
# Tabulate a short multi-sentence text that ends in an emoticon-like token ('-_-').
doc_to_df(nlp('Hi! How are you? -_-'))

Unnamed: 0,Text,POS,Dep,Lemma,Tag,Shape,Alpha,Stop,Head,Left,Right,Entity,EntIOB,Lemma.1
0,Hi,interjection,ROOT,hi,UH,Xx,X,--,Hi,Hi,!,,O,hi
1,!,punctuation,punct,!,.,!,--,--,Hi,!,!,,O,!
2,How,adverb,advmod,how,WRB,Xxx,X,--,are,How,How,,O,how
3,are,verb,ROOT,be,VBP,xxx,X,--,are,How,?,,O,be
4,you,pronoun,nsubj,-PRON-,PRP,xxx,X,--,are,you,you,,O,-PRON-
5,?,punctuation,punct,?,.,?,--,--,are,?,?,,O,?
6,-_-,punctuation,ROOT,-_-,.,-_-,--,--,-_-,-_-,-_-,,O,-_-


# Language defaults

In [None]:
# A cell only echoes its last expression, so the stop-word set has to be
# displayed explicitly; otherwise it is evaluated and silently discarded.
display(nlp.Defaults.stop_words)

# Tokenizer special cases defined by the language defaults.
nlp.Defaults.tokenizer_exceptions

# Noun chunks

In [163]:
# Two-sentence sample text for the noun-chunk example (rebinds `doc`).
doc = nlp("It's been a long time. Nowadays, people eat lots of flour and drink lots of beer!")

In [164]:
# Collect `doc.noun_chunks` into a list so the cell shows every chunk.
list(doc.noun_chunks)

[It, a long time, people, lots, flour, lots, beer]