In [1]:
import spacy
#import en_core_web_sm

In [2]:
#nlp = en_core_web_sm.load()
nlp=spacy.load('en_core_web_sm')

In [3]:
doc=nlp(u'Tesla is looking at buying U.S. startup for $6 million')

In [4]:
for token in doc:
    print(token.text, token.pos_,token.dep_)

Tesla NOUN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.S. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
6 NUM compound
million NUM pobj


In [5]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x2730eee7fa0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x2730eee7e80>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x2730ec4fba0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x2730ef8ffc0>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x2730ef9f5c0>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x2730ec4fb30>)]

In [6]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [7]:
doc2=nlp(u"Tesla isn't looking into  startups anymore.")

In [8]:
for token in doc2:
    print(token.text, token.pos_,token.dep_)

Tesla NOUN nsubj
is AUX aux
n't PART neg
looking VERB ROOT
into ADP prep
  SPACE dep
startups NOUN pobj
anymore ADV advmod
. PUNCT punct


In [9]:
doc2[0].pos_

'NOUN'

In [10]:
doc2[0].dep_

'nsubj'

In [11]:
doc3=nlp(u'Although commonly attributed to Join Lennon from his song "beautiful Boy", \
the pharase "life is what happens to us while we are making other plans" was written by \
cartoonist allen Saunders and published in Reader\'s digest in 1957, when lennon was 17.')

In [12]:
life_quote= doc3[16:30]

In [13]:
print(life_quote)

"life is what happens to us while we are making other plans"


In [14]:
type(life_quote)

spacy.tokens.span.Span

In [15]:
type(doc3)

spacy.tokens.doc.Doc

In [16]:
doc4=nlp(u"This is the first sentence. This is another sentence. this is the last sentencen")

In [17]:
for sentence in doc4.sents:
    print(sentence)

This is the first sentence.
This is another sentence.
this is the last sentencen


In [18]:
doc4[6].is_sent_start

True

In [19]:
doc4[6]

This

# Tokenization

In [20]:
mystring='"we\'re moving to L.A.!"'

In [21]:
print(mystring)

"we're moving to L.A.!"


In [22]:
doc=nlp(mystring)

In [23]:
for token in doc:
    print(token.text)

"
we
're
moving
to
L.A.
!
"


In [24]:
doc5=nlp(u"we're here to help! Send snall-mail, email support@oursite.com or visti us at http://www.oursite.com!")

In [25]:
for t in doc5:
    print(t)

we
're
here
to
help
!
Send
snall
-
mail
,
email
support@oursite.com
or
visti
us
at
http://www.oursite.com
!


In [26]:
doc6=nlp(u"A 5km NYC cab ride cost $10.30")

In [27]:
for t in doc6:
    print(t)

A
5
km
NYC
cab
ride
cost
$
10.30


In [28]:
doc7=nlp(u"Let's vist St. louis in the U.S next year.")

In [29]:
for t in doc7:
    print(t)

Let
's
vist
St.
louis
in
the
U.S
next
year
.


In [30]:
len(doc7)

11

In [31]:
doc7.vocab

<spacy.vocab.Vocab at 0x2730ec83040>

In [32]:
len(doc7.vocab)

846

In [33]:
doc8=nlp(u"It is better to give than receive.")

In [34]:
doc8[0]

It

In [35]:
doc8[2:5]

better to give

In [36]:
doc9=nlp(u'Apple to build a Hong Kong factory for $6 million')

In [37]:
for token in doc9:
    print(token.text,end=' | ')

Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | million | 

In [38]:
for entity in doc9.ents:
    print(entity)
    print(entity.label_)
    print(str(spacy.explain(entity.label_)))
    print('\n')

Hong Kong
GPE
Countries, cities, states


$6 million
MONEY
Monetary values, including unit




In [39]:
doc10=nlp(u'Autonomonus cars shift insurance libality toward manufacturers.')

In [40]:
for chunk in doc10.noun_chunks:
    print(chunk)

Autonomonus cars
insurance libality
manufacturers


In [41]:
#visualize libaray in spacy
from spacy import displacy

In [42]:
displacy.render(doc9,style='dep',jupyter=True,options={'distance':100})

In [43]:
doc11=nlp(u"over the last quarter Apple sold nearly 20 thousand iPods for a profit $6 million")

In [44]:
# displacy.render(doc11,style='ent',jupyter=True)

In [45]:
doc=nlp(u"This is a sentence.")
#displacy.serve(doc,style='dep')

# Stemming

In [46]:
from nltk.stem.porter import PorterStemmer

In [47]:
p_stemmer=PorterStemmer()

In [48]:
words=['run','runner','ran', 'runs', 'easily', 'fairly','fairness']

In [49]:
for word in words:
    print(word+'----->'+p_stemmer.stem(word))

run----->run
runner----->runner
ran----->ran
runs----->run
easily----->easili
fairly----->fairli
fairness----->fair


In [50]:
from nltk.stem.snowball import SnowballStemmer

In [51]:
s_stemmer=SnowballStemmer(language='english')

In [52]:
for word in words:
    print(word+'----->'+s_stemmer.stem(word))

run----->run
runner----->runner
ran----->ran
runs----->run
easily----->easili
fairly----->fair
fairness----->fair


In [53]:
words=['generous','generation','generously','generate']

In [54]:
for word in words:
    print(word+'----->'+s_stemmer.stem(word))

generous----->generous
generation----->generat
generously----->generous
generate----->generat


# Lemmatization

In [55]:
doc1=nlp(u"I am a runner running in a race because i love to run since I ran faster")

In [56]:
for token in doc1:
    print(token.text,'\t',token.pos_,'\t',token.lemma,'\t', token.lemma_)

I 	 PRON 	 4690420944186131903 	 I
am 	 AUX 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 SCONJ 	 16950148841647037698 	 because
i 	 PRON 	 4690420944186131903 	 I
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 SCONJ 	 10066841407251338481 	 since
I 	 PRON 	 4690420944186131903 	 I
ran 	 VERB 	 12767647472892411841 	 run
faster 	 ADV 	 1826119438242743099 	 fast


In [57]:
def show_lemmas(text):
    for token in text:
        print(f'{token.text:{12}}{token.pos_:{6}}{token.lemma:<{22}}{token.lemma_}')

In [58]:
doc2=nlp(u"I saw ten mice today!")

In [59]:
show_lemmas(doc2)

I           PRON  4690420944186131903   I
saw         VERB  11925638236994514241  see
ten         NUM   7970704286052693043   ten
mice        NOUN  1384165645700560590   mouse
today       NOUN  11042482332948150395  today
!           PUNCT 17494803046312582752  !


# stop_words

In [60]:
print(nlp.Defaults.stop_words)

{'his', 'quite', 'thence', '‘ve', 'under', 'mine', 'where', 'moreover', 'most', 'after', 'everywhere', 'used', 'our', 'latter', '‘s', 'once', 'have', 'eleven', 'whose', 'eight', 'by', 'an', 'regarding', 'namely', 'of', 'had', 'otherwise', 'such', 'least', 'no', 'both', 'empty', 'except', 'toward', 'four', 'give', 'go', 'i', 'perhaps', 'himself', 'hereafter', 'along', 'on', 'hence', 'seemed', 'during', 'get', 'made', 'side', 'front', "'re", 'just', 'third', 'me', 'fifty', 'ten', '‘m', 'in', 'nowhere', 'above', 'elsewhere', 'via', 'before', 'whence', 'your', 'n’t', 'amongst', 'serious', 'hundred', 'but', "'ll", 'any', 'others', 'does', 'thereupon', 'should', '‘re', 'against', 'either', 'twenty', 'hereby', 'anyhow', 'all', 'its', '’s', 'herein', 'off', 'though', 'will', 'is', 'fifteen', 'name', 'part', 'from', '’ll', 'wherein', 'behind', 'this', 'thru', 'below', 'someone', 'why', 'many', 'who', 'wherever', 'top', 'down', 'some', 'only', 'becomes', 'few', 'call', 'are', 'can', 'itself', 'b

In [61]:
len(nlp.Defaults.stop_words)

326

In [62]:
nlp.vocab['is'].is_stop

True

In [63]:
nlp.Defaults.stop_words.add('btw')

In [64]:
nlp.vocab['btw'].is_stop=True

In [65]:
len(nlp.Defaults.stop_words)

327

In [66]:
nlp.vocab['btw'].is_stop

True

In [67]:
nlp.Defaults.stop_words.remove('beyond')

In [68]:
nlp.vocab['beyond'].is_stop

False

# Vocabulary

In [69]:
from spacy.matcher import Matcher

In [70]:
matcher=Matcher(nlp.vocab)

In [71]:
#solarPower
#solar-Power
pattern = [[{'LOWER':'solarpower'}],
           [{'LOWER':'solar'}
            ,{'IS_PUNCT':True},
            {'LOWER':'power'}],
           [{'LOWER':'solar'},
            {'LOWER':'power'}]] 

In [72]:
matcher.add('SolarPower',pattern)

In [73]:
doc=nlp(u"the solar power industy continues to grow a solarpower")

In [74]:
found_match=matcher(doc)

In [75]:
print(found_match)

[(8656102463236116519, 1, 3), (8656102463236116519, 8, 9)]


In [76]:
for match_id,start,end in found_match:
    string_id=nlp.vocab.strings[match_id]
    span=doc[start:end]
    print(match_id,string_id,start, end, span.text)

8656102463236116519 SolarPower 1 3 solar power
8656102463236116519 SolarPower 8 9 solarpower


In [77]:
matcher.remove('SolarPower')

# Part Of Speech Tag


In [78]:
doc=nlp(u"The quick brown fox jumped over the lazy dog's back.")

In [79]:
print(doc.text)

The quick brown fox jumped over the lazy dog's back.


In [80]:
print(doc[4].tag_)

VBD


In [81]:
print(doc[4].pos_)

VERB


In [82]:
for token in doc:
    print(f"{token.text:{10}}{token.pos_:{10}}{token.tag_:{10}},{spacy.explain(token.tag_)}")

The       DET       DT        ,determiner
quick     ADJ       JJ        ,adjective (English), other noun-modifier (Chinese)
brown     ADJ       JJ        ,adjective (English), other noun-modifier (Chinese)
fox       NOUN      NN        ,noun, singular or mass
jumped    VERB      VBD       ,verb, past tense
over      ADP       IN        ,conjunction, subordinating or preposition
the       DET       DT        ,determiner
lazy      ADJ       JJ        ,adjective (English), other noun-modifier (Chinese)
dog       NOUN      NN        ,noun, singular or mass
's        PART      POS       ,possessive ending
back      NOUN      NN        ,noun, singular or mass
.         PUNCT     .         ,punctuation mark, sentence closer


In [83]:
doc=nlp(u"I read books on NLP.")


In [84]:
word=doc[1]

In [85]:
word.text

'read'

In [86]:
token=word

In [87]:
print(f"{token.text:{10}}{token.pos_:{10}}{token.tag_:{10}},{spacy.explain(token.tag_)}")

read      VERB      VBD       ,verb, past tense


In [88]:
doc=nlp(u"I read a book on NLP.")

In [89]:
word=doc[1]
token=word
print(f"{token.text:{10}}{token.pos_:{10}}{token.tag_:{10}},{spacy.explain(token.tag_)}")

read      VERB      VBD       ,verb, past tense


In [90]:
doc=nlp(u"The quick brown fox jumped over the lazy dog's back.")

In [91]:
POS_counts=doc.count_by(spacy.attrs.POS)

In [92]:
POS_counts

{90: 2, 84: 3, 92: 3, 100: 1, 85: 1, 94: 1, 97: 1}

In [93]:
doc.vocab[84].text

'ADJ'

In [94]:
doc[2].pos_

'ADJ'

In [95]:
for k,v in sorted (POS_counts.items()):
    print(f"{k}.{doc.vocab[k].text:{5}}{v}")

84.ADJ  3
85.ADP  1
90.DET  2
92.NOUN 3
94.PART 1
97.PUNCT1
100.VERB 1


In [96]:
tag_counts=doc.count_by(spacy.attrs.TAG)
for k,v in sorted (tag_counts.items()):
    print(f"{k}.{doc.vocab[k].text:{5}}{v}")

74.POS  1
1292078113972184607.IN   1
10554686591937588953.JJ   3
12646065887601541794..    1
15267657372422890137.DT   2
15308085513773655218.NN   3
17109001835818727656.VBD  1


In [97]:
len(doc.vocab)

916

In [98]:
doc=nlp(u"The quick brown fox jumped over the lazy dog's back.")

In [99]:
from spacy import displacy

In [100]:
#displacy.render(doc,style='dep',jupyter=True)

In [101]:
options={'distance':110,'compact':'True','color':'yellow','bg':'#09a3d5','font': 'Times'}

In [102]:
displacy.render(doc,style='dep',jupyter=True,options=options)

In [103]:
doc2=nlp(u"This is a sentence. This is another sentence, possibly longer than the other.")

In [104]:
spans=list(doc2.sents)

In [105]:
#displacy.serve(spans,style='dep',options={'distance':110})

# Name Entity Recognition

In [106]:
def show_ents(doc):
    if doc.ents:
        
        for ent in doc.ents:
            print(ent.text+'-'+ent.label_+'-'+str(spacy.explain(ent.label_)))
    else:
        print("no entities found")

In [107]:
doc=nlp(u"hi how are you")

In [108]:
show_ents(doc)

no entities found


In [109]:
doc=nlp(u"May I go to Washington, Dc next May to see the washington Monument?")

In [110]:
show_ents(doc)

Washington-GPE-Countries, cities, states
Dc-GPE-Countries, cities, states
next May-DATE-Absolute or relative dates or periods
the washington Monument-ORG-Companies, agencies, institutions, etc.


In [111]:
doc=nlp(u" Can I please have 500 dollars in Microsoft company")

In [112]:
show_ents(doc)

500 dollars-MONEY-Monetary values, including unit
Microsoft-ORG-Companies, agencies, institutions, etc.


In [113]:
doc=nlp(u" Tesla to build a U.K factory for $6 million")

In [114]:
show_ents(doc)

U.K-GPE-Countries, cities, states
$6 million-MONEY-Monetary values, including unit


In [115]:
from spacy.tokens import Span

In [116]:
ORG=doc.vocab.strings[u"ORG"]

In [117]:
ORG

383

In [118]:
new_ent=Span(doc,0,1,label=ORG)

In [119]:
doc.ents=list(doc.ents)+[new_ent]

In [120]:
show_ents(doc)

 -ORG-Companies, agencies, institutions, etc.
U.K-GPE-Countries, cities, states
$6 million-MONEY-Monetary values, including unit


In [121]:
doc=nlp(u"Our company created a brand new vacuum cleanner."
        U"This new vacuum-cleanner i sthe best in show.")

In [122]:
show_ents(doc)

no entities found


In [123]:
from spacy.matcher import PhraseMatcher

In [124]:
matcher=PhraseMatcher(nlp.vocab)

In [125]:
phrase_list=['vacuum cleanner','vacuum-cleanner']

In [126]:
phrase_patterns=[nlp(text) for text in phrase_list]

In [127]:
matcher.add('newproduct',phrase_patterns)

In [128]:
found_matches=matcher(doc)

In [129]:
found_matches

[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]

In [130]:
from spacy.tokens import Span

In [131]:
PROD= doc.vocab.strings[u"PRODUCT"]

In [132]:
found_matches

[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]

In [133]:
new_ents=[Span(doc,match[1],match[2],label=PROD) for match in found_matches]

In [134]:
doc.ents=list(doc.ents)+new_ents

In [135]:
show_ents(doc)

vacuum cleanner-PRODUCT-Objects, vehicles, foods, etc. (not services)
vacuum-cleanner-PRODUCT-Objects, vehicles, foods, etc. (not services)


In [136]:
doc=nlp(u"Originally I paid $29.95 for this car to, but now it is maked down")

In [137]:
len([ent for ent in doc.ents if ent.label_=="MONEY"])

1

In [138]:
doc=nlp(u"Over the last quarter Apple sold nearly 20 thousand iPods for a profit"
       u" Sony only sold 1 thousand Walkman music players.")

In [139]:
for sent in doc.sents:
    displacy.render(nlp(sent.text),style='ent', jupyter=True)

In [140]:
colors={'ORG':'radial-gradient(yellow,red)'}
options={'ents': ['PRODUCT','ORG'],'colors':colors}

In [141]:
displacy.render(nlp(sent.text),style='ent', jupyter=True,options=options)

# sentence segmentation

In [142]:
doc=nlp(u'This is the first sentence. This is another sentence.')

In [143]:
for sent in doc.sents:
    print(sent)

This is the first sentence.
This is another sentence.


In [144]:
list(doc.sents)

[This is the first sentence., This is another sentence.]

In [145]:
doc=nlp(u'"management is dong the right thinds; leadership is doing the right thing."-Peter Drucker')

In [146]:
doc.text

'"management is dong the right thinds; leadership is doing the right thing."-Peter Drucker'

In [147]:
for sent in doc.sents:
    print(sent)
    print('\n')

"management is dong the right thinds; leadership is doing the right thing.


"-Peter Drucker




In [148]:
from spacy.language import Language
@Language.component("component")
#ADD a SEGMENTATION Rule 
def set_custom_boundaries(doc):
    for token in doc[:-1]:
        if token.text==';':
            doc[token.i+1].is_sent_start= True
    return doc

#CHANGFE SEGMENTTION RULES

In [149]:
nlp.add_pipe("component",before='parser')
nlp.pipe_names

['tok2vec',
 'tagger',
 'component',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner']

In [150]:
doc[:-1]

"management is dong the right thinds; leadership is doing the right thing."-Peter

In [151]:
doc4=nlp(u'"management is dong the right thinds; leadership is doing the right thing."-Peter Drucker')

In [152]:
for sent in doc4.sents:
    print(sent)

"management is dong the right thinds;
leadership is doing the right thing.
"-Peter Drucker


In [153]:
mystring=u"This is a sentence. This is another.\n\nthis is a \nthird sentence."

In [154]:
print(mystring)

This is a sentence. This is another.

this is a 
third sentence.


In [155]:
doc=nlp(mystring)

In [156]:
for sentence in doc.sents:
    print(sentence)

This is a sentence.
This is another.


this is a 
third sentence.
