In [None]:
import spacy

In [None]:
nlp = spacy.load('en_core_web_sm')

In [None]:
doc = nlp(u'Tesla is looking at buying U.S. startup for $6 million')

In [None]:
# Iterating over a Doc yields its tokens; print the text of each one.
for token in doc:
  print(token.text)

Tesla
is
looking
at
buying
U.S.
startup
for
$
6
million


In [None]:
# Show each token next to its part-of-speech value.
for token in doc:
  print(token.text, token.pos)  # .pos is the part-of-speech tag as an integer ID

Tesla 96
is 87
looking 100
at 85
buying 100
U.S. 96
startup 92
for 85
$ 99
6 93
million 93


In [None]:
for token in doc:
  print(token.text, token.pos_)

Tesla PROPN
is AUX
looking VERB
at ADP
buying VERB
U.S. PROPN
startup NOUN
for ADP
$ SYM
6 NUM
million NUM


In [None]:
for token in doc:
  print(token.text, token.pos_, token.dep_) # dep : dependency

Tesla PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.S. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
6 NUM compound
million NUM pobj


In [None]:
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x7fe6214b2110>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x7fe62074be50>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7fe62074bc90>)]

In [None]:
nlp.pipe_names

['tagger', 'parser', 'ner']

In [None]:
doc2 = nlp(u"Tesla isn't     looking into startups anymore.")

In [None]:
for token in doc2:
  print(token.text, token.pos_, token.dep_)

Tesla PROPN nsubj
is AUX aux
n't PART neg
     SPACE 
looking VERB ROOT
into ADP prep
startups NOUN pobj
anymore ADV advmod
. PUNCT punct


In [None]:
doc[6].pos_

'NOUN'

In [None]:
# spacy.io

In [None]:
doc2[0].dep_

'nsubj'

In [None]:
doc2[0].text

'Tesla'

In [None]:
doc2[0].lemma_

'Tesla'

In [None]:
doc2[0].tag_

'NNP'

In [None]:
doc2[0].pos_

'PROPN'

In [None]:
doc2[0].shape_

'Xxxxx'

In [None]:
doc2[0].is_alpha

True

In [None]:
doc2[0].is_stop

False

# Spans

In [None]:
doc3 = nlp(u'Although commmonly attributed to John Lennon from his song "Beautiful Boy", \
the phrase "Life is what happens to us while we are making other plans" was written by \
cartoonist Allen Saunders and published in Reader\'s Digest in 1957, when Lennon was 17.')

In [None]:
life_quote = doc3[16:30]

In [None]:
print(life_quote)

"Life is what happens to us while we are making other plans"


In [None]:
type(life_quote)

spacy.tokens.span.Span

In [None]:
type(doc3)

spacy.tokens.doc.Doc

In [None]:
doc4 =nlp(u"This is the first sentence. This is another sentence. This is the last sentence")

In [None]:
for sentence in doc4.sents:
  print(sentence)

This is the first sentence.
This is another sentence.
This is the last sentence


In [None]:
doc4[6]

This

In [None]:
doc4[6].is_sent_start

True

In [None]:
doc4[7].is_sent_start

In [None]:
doc4[8].is_sent_start

In [None]:
doc4[2].is_sent_start

In [None]:
doc4[6].is_sent_start

True

# Tokenization

Tokenization is the process of breaking up the original text into component pieces (tokens).

In [None]:
import spacy

In [None]:
nlp = spacy.load('en_core_web_sm')

In [None]:
mystring = '"We\'re moving to L.A.!"'

In [None]:
print(mystring)

"We're moving to L.A.!"


In [None]:
doc = nlp(mystring)

In [None]:
for token in doc:
  print(token.text)

"
We
're
moving
to
L.A.
!
"


In [None]:
doc2 = nlp(u"We're here to help! Send snail-mail, email support@oursite.com or visit us at http://www.oursite.com!")

In [None]:
for t in doc2:
  print(t)

We
're
here
to
help
!
Send
snail
-
mail
,
email
support@oursite.com
or
visit
us
at
http://www.oursite.com
!


In [None]:
doc3 = nlp(u"A 5km NYC cab ride costs $10.30")

In [None]:
for t in doc3:
  print(t)

A
5
km
NYC
cab
ride
costs
$
10.30


In [None]:
doc4 = nlp(u"Let's visit St. Louis in the U.S. next year.")

In [None]:
for t in doc4:
  print(t)

Let
's
visit
St.
Louis
in
the
U.S.
next
year
.


In [None]:
len(doc4)

11

In [None]:
doc4.vocab

<spacy.vocab.Vocab at 0x7fe61af0fb90>

In [None]:
len(doc4.vocab)

512

In [None]:
doc5 = nlp(u" It is better to give than receive.")

In [None]:
doc5[0]

 

In [None]:
doc5[2:5]

is better to

In [None]:
# Assigning to a Doc item raises TypeError. Catch it and show the message so
# the demonstration stays visible without aborting a Restart & Run All.
try:
  doc5[0] = 'test'
except TypeError as err:
  print(f'TypeError: {err}')

TypeError: ignored

In [None]:
doc8 =nlp(u" Apple to build a Hong Kong factory for $6 million")

In [None]:
for token in doc8:
  print(token.text, end=' | ')

  | Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | million | 

In [None]:
for entity in doc8.ents:
  print(entity)
  print(entity.label_)
  print(str(spacy.explain(entity.label_)))
  print('\n')

Apple
ORG
Companies, agencies, institutions, etc.


Hong Kong
GPE
Countries, cities, states


$6 million
MONEY
Monetary values, including unit




In [None]:
for entity in doc8.ents:
  print(entity)
  print('\n')

Apple


Hong Kong


$6 million




In [None]:
for entity in doc8.ents:
  print(entity)
  print(entity.label_)
  print('\n')

Apple
ORG


Hong Kong
GPE


$6 million
MONEY




In [None]:
for entity in doc8.ents:
  print(entity)
  print(entity.label_)
  print(str(spacy.explain(entity.label_)))
  print('\n')

Apple
ORG
Companies, agencies, institutions, etc.


Hong Kong
GPE
Countries, cities, states


$6 million
MONEY
Monetary values, including unit




In [None]:
doc9 = nlp(u"Autonomous cars shift insurance liability toward manufacturers.")

In [None]:
for chunk in doc9.noun_chunks:
  print(chunk)

Autonomous cars
insurance liability
manufacturers


# Tokenization Visualized

In [None]:
from spacy import displacy

In [None]:
doc = nlp(u"apple is going to build a U.K. factory for $6 million.")

In [None]:
displacy.render(doc,style = 'dep', jupyter = True, options={'distance':100})

In [None]:
doc =nlp(u"Over the last quarter Apple 20 thousand iPods for a profit of $6 million")

In [None]:
displacy.render(doc, style='ent', jupyter = True)

In [None]:
# spaCy library for visualization Options

# Stemming

Stemming is a somewhat crude method for cataloging related words; it essentially chops off letters from the end until the stem is reached.

In [None]:
import nltk

In [None]:
from nltk.stem.porter import PorterStemmer

In [None]:
p_stemmer = PorterStemmer()

In [None]:
words = ['run', 'runner', 'ran', 'runs', 'easily', 'fairly', 'fairness']

In [None]:
for word in words:
  print(word + "----->" + p_stemmer.stem(word))

run----->run
runner----->runner
ran----->ran
runs----->run
easily----->easili
fairly----->fairli
fairness----->fair


In [None]:
from nltk.stem.snowball import SnowballStemmer

In [None]:
s_stemmer = SnowballStemmer(language='english')

In [None]:
for word in words:
  print(word + "----->" + s_stemmer.stem(word))

run----->run
runner----->runner
ran----->ran
runs----->run
easily----->easili
fairly----->fair
fairness----->fair


In [None]:
words = ['generous', 'generation', 'generously', 'generate']

In [None]:
for word in words:
  print(word + "----->" + s_stemmer.stem(word))

generous----->generous
generation----->generat
generously----->generous
generate----->generat


# Lemmatization



In [None]:
import spacy

In [None]:
nlp = spacy.load('en_core_web_sm')

In [None]:
doc1 = nlp(u"I am a runner running in a race because I love to run since I ran today.")

In [None]:
for token in doc1:
  print(token.text, '\t', token.pos_, '\t', token.lemma, '\t', token.lemma_)

I 	 PRON 	 561228191312463089 	 -PRON-
am 	 AUX 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 SCONJ 	 16950148841647037698 	 because
I 	 PRON 	 561228191312463089 	 -PRON-
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 SCONJ 	 10066841407251338481 	 since
I 	 PRON 	 561228191312463089 	 -PRON-
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today
. 	 PUNCT 	 12646065887601541794 	 .


In [None]:
def show_lemmas(text):
    """Print one aligned row per token of ``text``.

    Each row holds four space-separated columns: the token text (padded to
    12 characters), ``pos_`` (padded to 6), the numeric ``lemma`` value
    (left-aligned in 22 characters) and the ``lemma_`` string.

    Args:
        text: An iterable of token-like objects exposing ``text``, ``pos_``,
            ``lemma`` and ``lemma_`` (the notebook passes a spaCy Doc).
    """
    for tok in text:
        columns = (
            f'{tok.text:12}',
            f'{tok.pos_:6}',
            f'{tok.lemma:<22}',
            f'{tok.lemma_}',
        )
        print(' '.join(columns))

In [None]:
doc2 = nlp(u"I saw ten mice today!")

In [None]:
show_lemmas(doc2)

I            PRON   561228191312463089     -PRON-
saw          VERB   11925638236994514241   see
ten          NUM    7970704286052693043    ten
mice         NOUN   1384165645700560590    mouse
today        NOUN   11042482332948150395   today
!            PUNCT  17494803046312582752   !


# Stop Words

In [None]:
import spacy

In [None]:
nlp = spacy.load('en_core_web_sm')

In [None]:
print(nlp.Defaults.stop_words)

{'over', 'onto', 'while', 'whose', 'eight', 'however', 'would', 'just', 'will', 'everything', 'someone', 'us', 'same', 'your', 'top', 'twenty', 'few', 'therein', 'still', '‘re', 'him', 'yet', 'ca', 'mostly', 'always', 'its', 'regarding', 'mine', 'keep', 'in', 'sometime', 'under', 'through', 'after', 'then', 'should', 'whole', 'were', 'by', 'various', 'if', '‘m', 'alone', 'does', 'five', 'full', 'is', 'off', 'has', 'make', 'too', 'since', 'doing', 'within', 'whereas', 'forty', 'take', '‘ve', 'a', '‘ll', 'upon', 'whence', 'may', 'say', 'once', 'hence', 'empty', 'even', 'hereafter', 'except', 'are', 'quite', 'herein', 'also', 'front', 'himself', 'themselves', 'during', 'formerly', 'up', 'together', 'though', 'no', 'all', 'because', 'thence', 'hundred', 'nobody', 'namely', "'re", 'one', 'when', 'twelve', 'back', 'our', 'before', 'of', 'done', 'others', 'must', 'anyone', 'get', 'only', 'not', 'wherein', '’re', 'where', 'towards', 'do', 'bottom', 'perhaps', 'noone', 'four', 'other', 'otherwi

In [None]:
len(nlp.Defaults.stop_words)

326

In [None]:
nlp.vocab['is']

<spacy.lexeme.Lexeme at 0x7fe61a0fb460>

In [None]:
nlp.vocab['is'].is_stop

True

In [None]:
nlp.vocab['mystery'].is_stop

False

In [None]:
nlp.Defaults.stop_words.add('btw')

In [None]:
nlp.vocab['btw'].is_stop = True

In [None]:
len(nlp.Defaults.stop_words)

327

In [None]:
nlp.vocab['btw'].is_stop

True

In [None]:
# discard() is a no-op when the word is already absent, so re-running this
# cell does not raise KeyError the way set.remove() would.
nlp.Defaults.stop_words.discard('beyond')

In [None]:
nlp.vocab['beyond'].is_stop = False

In [None]:
nlp.vocab['beyond'].is_stop

False

# Vocabulary and Matching

In [None]:
import spacy

nlp = spacy.load('en_core_web_sm')

In [None]:
from spacy.matcher import Matcher

In [None]:
matcher = Matcher(nlp.vocab)

In [None]:
# Token patterns for three spellings of "solar power".
# LOWER is compared against the lowercased token text, so every LOWER value
# must itself be all lowercase or it can never match.

# "solarpower" written as a single token
pattern1 = [{'LOWER': 'solarpower'}]
# "solar-power": two words joined by one punctuation token
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True}, {'LOWER': 'power'}]
# "solar power": two consecutive tokens
pattern3 = [{'LOWER': 'solar'}, {'LOWER': 'power'}]

# The positional None fills the on_match callback slot of the spaCy v2
# Matcher.add(key, on_match, *patterns) call used throughout this notebook.
matcher.add('SolarPower', None, pattern1, pattern2, pattern3)

In [None]:
doc = nlp(u'The Solar Power industry continues to grow as demand for solarpower increases. Solar-power cars are gaining popularity.')

In [None]:
found_matches =matcher(doc)

In [None]:
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 10, 11), (8656102463236116519, 13, 16)]


In [None]:
for match_id, start, end in found_matches:
  string_id =nlp.vocab.strings[match_id] # get string representation
  span = doc[start:end]                  # get the matched span
  print(match_id, string_id, start, end, span.text)

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 10 11 solarpower
8656102463236116519 SolarPower 13 16 Solar-power


In [None]:
matcher.remove('SolarPower')

In [None]:
pattern1 = [{'LOWER':'solarpower'}]
pattern2 = [{'LOWER':'solar'}, {'IS_PUNCT':True, 'OP':'*'},{'LOWER':'power'}]

In [None]:
matcher.add('SolarPower', None,pattern1,pattern2)

In [None]:
from spacy.matcher import PhraseMatcher

In [None]:
matcher = PhraseMatcher(nlp.vocab)

In [None]:
# Opening the file with the platform default codec raised UnicodeDecodeError,
# so name the encoding explicitly.
# NOTE(review): cp1252 is an assumption about how reaganomics.txt was saved;
# confirm against the file. errors='replace' keeps the cell runnable if a few
# bytes still have no mapping.
with open('reaganomics.txt', encoding='cp1252', errors='replace') as f:
  doc3 = nlp(f.read())

UnicodeDecodeError: ignored