# Lightning Tour

## Get Tokens, Noun Chunks, and Sentences

- .text
- .noun_chunks
- .sents

In [1]:
import spacy
nlp = spacy.load('en_core_web_sm')

doc = nlp("Huckleberry is the greatest dog in the world. He runs fast, is super chill, and smells like cinnamon.")
print(doc.text, '\n')

# Token text: index the Doc for one token, slice it for a run of tokens.
print(doc[0].text)
print(doc[10:14].text, '\n')

# Noun chunks: wrap .noun_chunks in list() so they print together.
noun_chunks = list(doc.noun_chunks)
print(noun_chunks, '\n')

# Sentences: wrap .sents in list() so a sentence can be picked by index.
sentences = list(doc.sents)
second_sentence = sentences[1]
print(second_sentence)

# A sentence has its own .noun_chunks, covering only that sentence.
print(list(second_sentence.noun_chunks))

Huckleberry is the greatest dog in the world. He runs fast, is super chill, and smells like cinnamon. 

Huckleberry
runs fast, is 

[Huckleberry, the greatest dog, the world, He, cinnamon] 

He runs fast, is super chill, and smells like cinnamon.
[He, cinnamon]


## Parts of Speech Tags

In [2]:
doc = nlp("The sandalwood strapped to the ceiling.")
print(doc.text, '\n')

# Inspect the second token. Attributes ending in an underscore print the
# readable string; the plain attributes print the matching integer ID.
token = doc[1]
print("Word: ", token.text)
print("POS Tag: ", token.pos_, token.pos)
print("Word Shape: ", token.shape_, token.shape)


The sandalwood strapped to the ceiling. 

Word:  sandalwood
POS Tag:  NOUN 91
Word Shape:  xxxx 13110060611322374290


## Recognize Named Entities

In [3]:
from spacy.tokens import Span

# Entities as found by the loaded pipeline.
doc = nlp(u'San Francisco considers banning sidewalk delivery robots')
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

# Entities assigned by hand: label the first token ("FB") as ORG and
# replace whatever doc.ents held with that single span.
doc = nlp(u'FB is hiring a new VP of global policy')
org_label = doc.vocab.strings[u'ORG']
fb_span = Span(doc, 0, 1, label=org_label)
doc.ents = [fb_span]
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

San Francisco 0 13 GPE
FB 0 2 ORG


## Visualize a Dependency Parse

## Word Vectors and Similarity

In [6]:
nlp2 = spacy.load('en_core_web_md')
doc = nlp2("Wood and banana are the fuel of life.")

# Pick tokens out by position. fuel and life are not used in this cell.
wood, banana, fuel, life = doc[0], doc[2], doc[5], doc[7]

# Similarity score between the two tokens, then whether each has a vector.
print('wood : banana ', wood.similarity(banana))
print('wood: ', wood.has_vector, '\n banana: ', banana.has_vector)

wood : banana  0.2815443
wood:  True 
 banana:  True


## Serialization (Saving)


In [9]:
from spacy.tokens import Doc

doc = nlp2("Huckleberry is the best dog in the world")
print(doc.text)

# Serialize the Doc to a byte string, then rebuild a new Doc from those
# bytes on the same vocab. The bytes can be written to a file to save them.
doc_bytes = doc.to_bytes()
restored_doc = Doc(doc.vocab).from_bytes(doc_bytes)

# Round-trip check. It prints nothing, so the cell output is unchanged.
assert restored_doc.text == doc.text

Huckleberry is the best dog in the world


## Match Text with Token Rules
**TODO:** Figure out fuzzy matching. The rule below only matches the exact token text `Woman`, so `woman` and `women` are not matched.

In [11]:
from spacy.matcher import Matcher

# A single one-token rule. As the cell output shows, ORTH only matches the
# token whose text is exactly 'Woman'.
pattern = [{'ORTH': 'Woman'}]
matcher = Matcher(nlp.vocab)
# NOTE(review): the None argument is presumably the on-match callback slot
# of the spaCy 2.x add() signature; confirm against the installed version.
matcher.add("woman", None, pattern)

doc = nlp('woman, Woman, women')
matches = matcher(doc)

# Each match is an (id, start, end) triple: the id is looked up in the
# vocab's string store for printing, and start/end slice the matched span.
for match_id, start, end in matches:
    print(nlp.vocab.strings[match_id], doc[start:end])

woman Woman
