In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
doc = nlp(u'Tesla is looking at buying U.S. startup for $6 million')

In [8]:
# Per-token view of doc: surface text, part-of-speech as integer ID and as
# string name, then the dependency label.
for tok in doc:
    fields = (tok.text, tok.pos, tok.pos_, tok.dep_)
    print(*fields)

Tesla 96 PROPN nsubj
is 87 AUX aux
looking 100 VERB ROOT
at 85 ADP prep
buying 100 VERB pcomp
U.S. 96 PROPN dobj
startup 100 VERB dep
for 85 ADP prep
$ 99 SYM quantmod
6 93 NUM compound
million 93 NUM pobj


In [9]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x267842687c0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x26782b249a0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x26784262e40>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x2678291c380>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x26784117b80>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x26784262ba0>)]

In [10]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [13]:
doc2 = nlp(u"Tesla isn't looking into startups anymore.")

In [14]:
# Same per-token view for doc2: text, POS integer ID, POS name, dependency label.
for tok in doc2:
    fields = (tok.text, tok.pos, tok.pos_, tok.dep_)
    print(*fields)

Tesla 96 PROPN nsubj
is 87 AUX aux
n't 94 PART neg
looking 100 VERB ROOT
into 85 ADP prep
startups 92 NOUN pobj
anymore 86 ADV advmod
. 97 PUNCT punct


In [15]:
# Long sample text built from adjacent string literals; the trailing space on
# the first two pieces is part of the text.
# NOTE(review): "commmonly" is spelled with three m's in the sample text; left as-is.
doc3 = nlp(
    'Although commmonly attributed to John Lennon from his song "Beautiful Boy", '
    'the phrase "Life is what happens to us while we are making other plans" was written by '
    'cartoonist Allen Saunders and published in Reader\'s Digest in 1957, when Lennon was 17.'
)

In [16]:
life_quote = doc3[16:30]

In [17]:
print(life_quote)

"Life is what happens to us while we are making other plans"


In [18]:
type(life_quote)

spacy.tokens.span.Span

In [19]:
type(doc3)

spacy.tokens.doc.Doc

In [20]:
doc4 = nlp(u"This is the first sentence. This is another sentence. This is the last sentence.")

In [21]:
# Walk the sentence spans exposed by doc4.sents and print each on its own line.
for sent_span in doc4.sents:
    print(sent_span)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [22]:
doc4[6].is_sent_start

True

In [23]:
doc4[8].is_sent_start

False

In [24]:
doc4[5].is_sent_end

True

### Tokenization

In [25]:
# Sample sentence wrapped in literal double quotes and containing an apostrophe.
mystring = "\"We're moving to L.A.!\""

In [26]:
mystring

'"We\'re moving to L.A.!"'

In [27]:
print(mystring)

"We're moving to L.A.!"


In [28]:
doc = nlp(mystring)

In [29]:
for token in doc:
    print(token.text)

"
We
're
moving
to
L.A.
!
"


In [33]:
doc2 = nlp(u"We're here to help! Send snail-mail, email support@gmail.com or visit us at http://gmail.com!")

In [34]:
for t in doc2:
    print(t)

We
're
here
to
help
!
Send
snail
-
mail
,
email
support@gmail.com
or
visit
us
at
http://gmail.com
!


In [35]:
doc3 = nlp(u'A 5km NYC cab ride costs $10.30')

for t in doc3:
    print(t)

A
5
km
NYC
cab
ride
costs
$
10.30


In [36]:
doc4 = nlp(u"Let's visit St. Louis in the U.S. next year.")

for t in doc4:
    print(t)

Let
's
visit
St.
Louis
in
the
U.S.
next
year
.


In [37]:
len(doc4)

11

In [38]:
doc4.vocab

<spacy.vocab.Vocab at 0x26784b6bb80>

In [39]:
len(doc4.vocab)

844

In [40]:
len(doc.vocab)

844

In [42]:
doc5 = nlp(u'It is better to give than to receive.')
doc5[2:5]

better to give

In [43]:
doc8 = nlp(u'Apple to build a Hong Kong factory for $6 million')

In [44]:
# Emit every token of doc8 on a single line, each followed by the ' | ' separator.
print(''.join(f'{tok.text} | ' for tok in doc8), end='')

Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | million | 

In [46]:
# For each named entity in doc8: its text, its label, and spaCy's description
# of that label.
for entity in doc8.ents:
    label = entity.label_
    print(f'{entity.text} {label} {spacy.explain(label)}')

Apple ORG Companies, agencies, institutions, etc.
Hong Kong GPE Countries, cities, states
$6 million MONEY Monetary values, including unit


In [47]:
doc9 = nlp(u"Autonomous cars shift insurance liability toward manufacturers.")

In [48]:
for chunk in doc9.noun_chunks:
    print(chunk.text)

Autonomous cars
insurance liability
manufacturers


In [49]:
from spacy import displacy

In [50]:
doc = nlp(u"Apple is going to build a U.K. factory for $6 million.")

In [51]:
displacy.render(doc, style='dep', jupyter=True, options={'distance':110})

In [52]:
doc = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.')

In [53]:
displacy.render(doc, style='ent', jupyter=True)

In [54]:
doc = nlp(u'This is a sentence.')

In [55]:
# Serve the dependency visualization over HTTP instead of rendering it inline.
# NOTE(review): the recorded output shows a server on port 5000 that runs until
# it is shut down, so this cell appears to hold the kernel while serving —
# confirm before relying on "Restart & Run All".
displacy.serve(doc, style='dep')




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...



127.0.0.1 - - [26/Mar/2022 17:45:01] "GET / HTTP/1.1" 200 3395
127.0.0.1 - - [26/Mar/2022 17:45:01] "GET /favicon.ico HTTP/1.1" 200 3395


Shutting down server on port 5000.


### Stemming

In [56]:
import nltk

In [57]:
from nltk.stem.porter import PorterStemmer

In [58]:
p_stemmer = PorterStemmer()

In [66]:
words = ['run', 'runner', 'ran', 'runs', 'easily', 'fairly', 'fairness']

In [67]:
# Porter stemmer: show each word next to the stem it is reduced to.
for w in words:
    print(f'{w}----->{p_stemmer.stem(w)}')

run----->run
runner----->runner
ran----->ran
runs----->run
easily----->easili
fairly----->fairli
fairness----->fair


In [62]:
from nltk.stem.snowball import SnowballStemmer

In [64]:
s_stemmer = SnowballStemmer(language='english')

In [68]:
# Snowball stemmer: show each word next to the stem it is reduced to.
for w in words:
    print(f'{w}----->{s_stemmer.stem(w)}')

run----->run
runner----->runner
ran----->ran
runs----->run
easily----->easili
fairly----->fair
fairness----->fair


In [69]:
words = ['generous', 'generation', 'generously', 'generate']

In [70]:
# Snowball stemmer on the second word list: word, arrow, stem.
for w in words:
    print(f'{w}---->{s_stemmer.stem(w)}')

generous---->generous
generation---->generat
generously---->generous
generate---->generat


### Lemmatization

In [71]:
doc1 = nlp(u"I am a runner running in a race because I love to run since I ran today")

In [74]:
# One row per token of doc1: text, POS name, integer lemma ID, lemma string.
# Joining with ' \t ' reproduces print()'s single-space separator around each tab.
for tok in doc1:
    row = (tok.text, tok.pos_, tok.lemma, tok.lemma_)
    print(' \t '.join(map(str, row)))

I 	 PRON 	 4690420944186131903 	 I
am 	 AUX 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 SCONJ 	 16950148841647037698 	 because
I 	 PRON 	 4690420944186131903 	 I
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 SCONJ 	 10066841407251338481 	 since
I 	 PRON 	 4690420944186131903 	 I
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today


In [75]:
def show_lemmas(text):
    """Print an aligned table with text, POS tag, lemma ID and lemma per token.

    Args:
        text: Iterable of token-like objects exposing ``text``, ``pos_``,
            ``lemma`` and ``lemma_`` (in this notebook, a processed Doc).

    Returns:
        None. One line per token is written to stdout.
    """
    # Column widths: 12 for the text, 6 for the POS tag, 22 (left-aligned)
    # for the integer lemma ID; the lemma string is printed unpadded.
    row_template = '{:12} {:6} {:<22} {}'
    for tok in text:
        print(row_template.format(tok.text, tok.pos_, tok.lemma, tok.lemma_))

In [76]:
show_lemmas(doc1)

I            PRON   4690420944186131903    I
am           AUX    10382539506755952630   be
a            DET    11901859001352538922   a
runner       NOUN   12640964157389618806   runner
running      VERB   12767647472892411841   run
in           ADP    3002984154512732771    in
a            DET    11901859001352538922   a
race         NOUN   8048469955494714898    race
because      SCONJ  16950148841647037698   because
I            PRON   4690420944186131903    I
love         VERB   3702023516439754181    love
to           PART   3791531372978436496    to
run          VERB   12767647472892411841   run
since        SCONJ  10066841407251338481   since
I            PRON   4690420944186131903    I
ran          VERB   12767647472892411841   run
today        NOUN   11042482332948150395   today


### Stop words

In [77]:
print(nlp.Defaults.stop_words)

{'thru', 'herein', 'get', 'never', 'quite', 'five', 'nothing', 'ourselves', 'had', 'yours', 'other', 'can', '‘s', 'down', 'of', 'thereupon', 'among', 'under', 'last', 'neither', 'no', 'something', 'whether', 'whence', 'whereas', 'across', 'could', 'he', 'us', 'along', 'through', 'however', 'via', '’s', 'third', 'should', 'ca', 'either', 'noone', '’ve', 'every', 'ever', 'also', 'whole', 'seems', 'whenever', 'give', 'less', 'serious', 'whose', 'wherein', 'being', '‘ll', 'two', 'none', 'towards', 'beyond', 'about', 'eight', 'which', 'everything', 'here', 'are', 'therefore', 'when', 'must', 'more', 'anyhow', 'own', 'seem', 'by', 'done', 'you', 'most', 'a', 'empty', 'am', 'one', 'for', 'she', 'see', 'may', 'although', 'twenty', 'up', 'back', 'off', 'front', 'in', 'these', 'once', 'your', 'how', 'since', 'ours', 'have', 'as', 'my', '‘m', 'name', 'is', 'afterwards', 'will', 'becoming', 'hers', 'on', '‘d', 'enough', 'between', 'while', 'doing', 'we', 'made', 'anyone', 'do', 'former', 'whither'

In [78]:
len(nlp.Defaults.stop_words)

326

In [79]:
nlp.vocab['is'].is_stop

True

In [80]:
nlp.vocab['mystery'].is_stop

False

In [81]:
nlp.Defaults.stop_words.add('btw')

In [82]:
nlp.vocab['btw'].is_stop=True

In [83]:
nlp.vocab['btw'].is_stop

True

In [88]:
# Drop 'beyond' from the default stop-word set. discard() keeps the cell
# re-runnable; remove() raises KeyError once the word is already gone.
nlp.Defaults.stop_words.discard('beyond')
# Clear the flag on the vocab entry explicitly as well, the same way the 'btw'
# cell sets it to True, so the result does not depend on whether the vocab
# already holds an entry for 'beyond'.
nlp.vocab['beyond'].is_stop = False

In [90]:
nlp.vocab['beyond'].is_stop

False

### Rule-based Matching

In [91]:
from spacy.matcher import Matcher

In [92]:
matcher = Matcher(nlp.vocab)

In [112]:
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [{'LOWER': 'solar'}, {'LOWER': 'power'}]
pattern3 = [{'LOWER': 'solar'}, {'IS_PUNCT': True}, {'LOWER': 'power'}]


In [113]:
matcher.add('SolarPower', patterns=[pattern1, pattern2, pattern3])

In [114]:
doc = nlp(u'The Solar Power industry continues to grow as demand \
for solarpower increases. Solar-power cars are gaining popularity.')

In [115]:
found_matces = matcher(doc)
print(found_matces)

[(8656102463236116519, 1, 3), (8656102463236116519, 10, 11), (8656102463236116519, 13, 16)]


In [116]:
# Report every hit in doc: rule ID, rule name looked up in the vocab's string
# store, start/end token offsets, and the matched text.
for rule_id, first, last in found_matces:
    rule_name = nlp.vocab.strings[rule_id]
    print(rule_id, rule_name, first, last, doc[first:last].text)

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 10 11 solarpower
8656102463236116519 SolarPower 13 16 Solar-power


In [127]:
matcher.remove('SolarPower')

In [128]:
pattern1 = [{'LOWER': 'solarpowered'}]
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT':True, 'OP':'*'}, {'LOWER': 'powered'}]


In [129]:
matcher.add('SolarPower', patterns=[pattern1, pattern2])

In [130]:
doc2 = nlp(u'Solar-powered energy runs solar-powered cars.')

In [131]:
found_matces = matcher(doc2)
print(found_matces)

[(8656102463236116519, 0, 3), (8656102463236116519, 5, 8)]


In [133]:
# Report every hit in doc2: rule ID, rule name looked up in the vocab's string
# store, start/end token offsets, and the matched text.
for rule_id, first, last in found_matces:
    rule_name = nlp.vocab.strings[rule_id]
    print(rule_id, rule_name, first, last, doc2[first:last].text)

8656102463236116519 SolarPower 0 3 Solar-powered
8656102463236116519 SolarPower 5 8 solar-powered


In [134]:
from spacy.matcher import PhraseMatcher

In [135]:
matcher = PhraseMatcher(nlp.vocab)

In [136]:
# Read the whole file first, then run the text through the pipeline.
# NOTE(review): no encoding is passed to open(), so the platform default is
# used — confirm the file's actual encoding and pass it explicitly.
with open('data/reaganomics.txt') as reagan_file:
    raw_text = reagan_file.read()
doc3 = nlp(raw_text)

In [137]:
phrase_list = ['voodoo economics', 'supply-side economics', 'trickle-down economics', 'free-market economics']

In [138]:
# Build one pattern Doc per phrase using only the tokenizer. The PhraseMatcher
# created above uses its default settings, so tagging/parsing/NER on the
# patterns is unnecessary work; nlp.make_doc() tokenizes without running the
# rest of the pipeline.
phrase_patterns = [nlp.make_doc(text) for text in phrase_list]

In [139]:
phrase_patterns

[voodoo economics,
 supply-side economics,
 trickle-down economics,
 free-market economics]

In [141]:
# Register all phrase Docs under one key. The list is passed as the second
# argument (spaCy v3 signature), matching the list-based Matcher.add calls
# earlier in this notebook, instead of the legacy (key, None, *docs) form.
matcher.add('EconMatcher', phrase_patterns)

In [142]:
found_matches = matcher(doc3)

In [143]:
found_matches

[(3680293220734633682, 41, 45),
 (3680293220734633682, 49, 53),
 (3680293220734633682, 54, 56),
 (3680293220734633682, 61, 65),
 (3680293220734633682, 673, 677),
 (3680293220734633682, 2987, 2991)]

In [145]:
# Report every phrase hit in doc3: rule ID, rule name looked up in the vocab's
# string store, start/end token offsets, and the matched text.
for rule_id, first, last in found_matches:
    rule_name = nlp.vocab.strings[rule_id]
    print(rule_id, rule_name, first, last, doc3[first:last].text)

3680293220734633682 EconMatcher 41 45 supply-side economics
3680293220734633682 EconMatcher 49 53 trickle-down economics
3680293220734633682 EconMatcher 54 56 voodoo economics
3680293220734633682 EconMatcher 61 65 free-market economics
3680293220734633682 EconMatcher 673 677 supply-side economics
3680293220734633682 EconMatcher 2987 2991 trickle-down economics
