# Getting Started with spaCy

In [1]:
import spacy
# Load spaCy's small English model; calling `nlp` on text produces a Doc.
nlp = spacy.load('en_core_web_sm')

In [2]:
# Run the pipeline on a sample sentence and confirm the result is a Doc.
# NOTE(review): 'buting' looks like a typo for 'buying'; it is left untouched
# because the recorded output of the following cell was produced from this text.
doc = nlp(u'Apple is looking at buting a U.K. startup for $10 Billion')
type(doc)

spacy.tokens.doc.Doc

In [5]:
# For every token print its text, part-of-speech tag (pos_) and dependency label (dep_).
for token in doc:
    print(token.text, token.pos_, token.dep_)

Apple PROPN nsubj
is VERB aux
looking VERB ROOT
at ADP prep
buting VERB pcomp
a DET det
U.K. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
10 NUM compound
Billion NUM pobj


In [6]:
# List the processing components of the loaded pipeline as (name, component) pairs.
nlp.pipeline

[('tagger', <spacy.pipeline.Tagger at 0x7fe5e9556dd0>),
 ('parser', <spacy.pipeline.DependencyParser at 0x7fe5fa4f1050>),
 ('ner', <spacy.pipeline.EntityRecognizer at 0x7fe5fa4f15f0>)]

In [7]:
# Second sample sentence, this time containing a contraction ("isn't").
doc2 = nlp("Apple isn't looking into buying startups in U.K. anymore.")

In [10]:
# Tabulate text, POS tag and dependency label; each field is padded to a
# 10-character column so the rows line up.
for token in doc2:
    print(f'{token.text:{10}}{token.pos_:{10}}{token.dep_:{10}}')

Apple     PROPN     nsubj     
is        VERB      aux       
n't       ADV       neg       
looking   VERB      ROOT      
into      ADP       prep      
buying    VERB      compound  
startups  NOUN      pobj      
in        ADP       prep      
U.K.      PROPN     pobj      
anymore   ADV       advmod    
.         PUNCT     punct     


In [11]:
# Indexing a Doc with an integer returns the token at that position.
doc2[0]

Apple

In [12]:
# A single item of a Doc is a Token object.
type(doc2[0])

spacy.tokens.token.Token

In [13]:
# Keep a handle on the first token of doc2 for the inspection cells that follow.
token = doc2[0]

In [14]:
# dep_ holds the token's syntactic dependency label as a string.
token.dep_

'nsubj'

In [15]:
# spacy.explain returns a human-readable description of a tag or label.
spacy.explain('nsubj')

'nominal subject'

In [16]:
# pos_ is already a string (here 'PROPN'), so it is passed to spacy.explain
# directly; the extra str() wrapper was redundant.
spacy.explain(doc[0].pos_)  # -> 'proper noun'

'proper noun'

## spans

In [20]:
# Longer sample text used to demonstrate slicing a Doc into a span.
# NOTE(review): the first line ends with `",\` and the next one starts with `the`,
# so the literal contains `",the` with no space after the comma. Confirm this is
# intended: the slice indices used in the next cell depend on how this text tokenizes.
doc3 = nlp(u'Although commonly attributed to JL from his song "Beautiful boy",\
the phrase "Life is what happens to us while we are making other plans" was written by \
cartoonist AS and published in Reader\'s Digest in 1957, when L was 17.')

In [21]:
# Slice tokens 12..25 out of doc3 to isolate the quoted sentence, then show it.
life_quote = doc3[12:26]
print(life_quote)

"Life is what happens to us while we are making other plans"


**break your text into sentences**

In [23]:
doc4 = nlp(u'This is the first sentence. Hey, second sentence. Third sentence. Fourth sentence. And stupid sentence.')

# doc4.sents yields the detected sentences one by one; print each with its type.
for sentence in doc4.sents:
    print(sentence, type(sentence))

This is the first sentence. <class 'spacy.tokens.span.Span'>
Hey, second sentence. <class 'spacy.tokens.span.Span'>
Third sentence. <class 'spacy.tokens.span.Span'>
Fourth sentence. <class 'spacy.tokens.span.Span'>
And stupid sentence. <class 'spacy.tokens.span.Span'>


In [24]:
# Check whether the very first token is flagged as a sentence start.
# NOTE(review): no output was recorded for this cell — presumably the expression
# evaluated to None here; confirm against the installed spaCy version.
doc4[0].is_sent_start

In [26]:
# Token 6 should be 'Hey', the first word of the second sentence.
doc4[6].is_sent_start

True

# Tokenization

In [27]:
import spacy
# Load the model again — presumably so this section can be run on its own.
nlp = spacy.load('en_core_web_sm')

In [28]:
# Sample text with quotes, a contraction and an abbreviation to challenge the tokenizer.
string = '"We\'re moving to L.A.!"'
print(string)

"We're moving to L.A.!"


In [30]:
doc_new = nlp(string)

# Print all tokens on one line, each followed by ' | ', to make the boundaries visible.
for token in doc_new:
    print(token, end=' | ')

" | We | 're | moving | to | L.A. | ! | " | 

In [31]:
# Text containing a hyphenated word, an e-mail address and a URL; print one token per line.
doc2 = nlp(u"We're here to help! Send snail-mail, email supporot@oursite.com or visit us at http://www.oursite.com!")

for token in doc2:
    print(token)

We
're
here
to
help
!
Send
snail
-
mail
,
email
supporot@oursite.com
or
visit
us
at
http://www.oursite.com
!


In [32]:
# Text containing a currency amount; print one token per line.
doc3 = nlp(u"I paid $9.98 for a used furniture.")

for token in doc3:
    print(token)

I
paid
$
9.98
for
a
used
furniture
.


In [33]:
# Text mixing abbreviations ("St.Louis" written without a space, "U.S."); print one token per line.
doc4 = nlp(u"Let's visit St.Louis in U.S. next month.")

for token in doc4:
    print(token)

Let
's
visit
St
.
Louis
in
U.S.
next
month
.


In [34]:
# Slicing a Doc returns the run of tokens at positions 2, 3 and 4.
doc4[2:5]

visit St.

**A Doc object does not support item assignment** (it is immutable)

In [35]:
# Demonstrate that item assignment on a Doc is rejected.
# The TypeError is the point of this cell, so it is caught and printed instead of
# being left to abort a "Restart & Run All"; the printed line matches the error text.
try:
    doc4[0] = 'Make'
except TypeError as err:
    print(f'{type(err).__name__}: {err}')

TypeError: 'spacy.tokens.doc.Doc' object does not support item assignment

In [36]:
doc5 = nlp('My dinner was horrible.')
doc6 = nlp('Your dinner was great.')

# Assigning a token taken from another Doc is rejected as well.
# The expected TypeError is caught and printed so that the notebook keeps running.
try:
    doc5[3] = doc6[3]
except TypeError as err:
    print(f'{type(err).__name__}: {err}')

TypeError: 'spacy.tokens.doc.Doc' object does not support item assignment

## Named Entities

In [40]:
doc8 = nlp(u"Apple is trying to build a new factory in Hong Kong in 2035.")

# First show the raw tokens on a single line, each followed by ' | '.
for token in doc8:
    print(token, end=' | ')

print('\n--------------------\n')

# Then list the named entities: text, label and the label's description.
for entity in doc8.ents:
    # label_ is already a string, so no str() conversion is needed before spacy.explain.
    print(entity.text, '\t', entity.label_, '\t', spacy.explain(entity.label_))

Apple | is | trying | to | build | a | new | factory | in | Hong | Kong | in | 2035 | . | 
--------------------

Apple 	 ORG 	 Companies, agencies, institutions, etc.
Hong Kong 	 GPE 	 Countries, cities, states
2035 	 DATE 	 Absolute or relative dates or periods


In [41]:
# Inspect the container type that holds a Doc's named entities.
type(doc8.ents)

tuple

### Noun Chunks 名词短语

In [42]:
doc9 = nlp(u"Autonomous cars shift insurance liability toward manufacturers.")

# noun_chunks yields the noun phrases found in the Doc; print the text of each.
for chunk in doc9.noun_chunks:
    print(chunk.text)

Autonomous cars
insurance liability
manufacturers


In [43]:
doc10 = nlp(u"Red cars do not carry higher insurance rates.")

# Same noun-phrase listing for a second example sentence.
for chunk in doc10.noun_chunks:
    print(chunk.text)

Red cars
higher insurance rates


## visualization

In [44]:
from spacy import displacy

In [45]:
# Draw the dependency parse of doc10 inline in the notebook.
displacy.render(
    doc10,
    style='dep',
    jupyter=True,
    options={'distance': 100},
)

In [46]:
# Rebind `doc` to a sentence rich in entities and render them with the 'ent' style.
doc = nlp(u"Over the last quarter Apple sold nearly 20 thousand iPhones for a profit of $6 million.")
displacy.render(doc, style='ent', jupyter=True)

# Stemming 词干提取

In [56]:
# porter stemmer

In [47]:
import nltk
from nltk.stem.porter import PorterStemmer

  LARGE_SPARSE_SUPPORTED = LooseVersion(scipy_version) >= '0.14.0'


In [48]:
# Porter stemmer instance used by the stemming loop below.
p_stemmer = PorterStemmer()

In [51]:
# Word forms to stem; shared by the Porter and Snowball examples.
words = [
    'run',
    'runs',
    'runner',
    'running',
    'ran',
    'easily',
    'fairly',
]

In [52]:
# Show each word next to its Porter stem.
for word in words:
    print(word + '----->' + p_stemmer.stem(word))

run----->run
runs----->run
runner----->runner
running----->run
ran----->ran
easily----->easili
fairly----->fairli


In [53]:
# snowball stemmer

In [54]:
from nltk.stem.snowball import SnowballStemmer

# English Snowball stemmer instance, compared against the Porter stemmer above.
s_stemmer = SnowballStemmer(language='english')

In [55]:
# Show each word next to its Snowball stem.
for word in words:
    print(word + '----->' + s_stemmer.stem(word))

run----->run
runs----->run
runner----->runner
running----->run
ran----->ran
easily----->easili
fairly----->fair


**limitations of stemmers**

In [57]:
phrase = 'I am meeting Raj at the meeting this afternoon.'

# Stem each whitespace-separated word; punctuation stays attached (e.g. 'afternoon.').
for word in phrase.split():
    print(word + '---->' + s_stemmer.stem(word))

I---->i
am---->am
meeting---->meet
Raj---->raj
at---->at
the---->the
meeting---->meet
this---->this
afternoon.---->afternoon.


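The two occurrences of 'meeting' do not mean the same thing (the first is a verb, the second a noun), yet the stemmer reduces both to 'meet'.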

# Lemmatization 词形还原

In [59]:
doc = nlp("I am a runner running in a race because I lover to run and I ran earlier this morning.")

# Print text, POS tag and lemma for every token, separated by tabs.
for token in doc:
    print(token.text, '\t', token.pos_, '\t', token.lemma_)

I 	 PRON 	 -PRON-
am 	 VERB 	 be
a 	 DET 	 a
runner 	 NOUN 	 runner
running 	 VERB 	 run
in 	 ADP 	 in
a 	 DET 	 a
race 	 NOUN 	 race
because 	 ADP 	 because
I 	 PRON 	 -PRON-
lover 	 VERB 	 lover
to 	 PART 	 to
run 	 VERB 	 run
and 	 CCONJ 	 and
I 	 PRON 	 -PRON-
ran 	 VERB 	 run
earlier 	 ADV 	 earlier
this 	 DET 	 this
morning 	 NOUN 	 morning
. 	 PUNCT 	 .


In [60]:
def show_lemmas(doc):
    """Print one aligned row per token: text, POS tag and lemma.

    The text field is padded to 12 characters and the POS field to 6, with a
    single space between fields; the lemma is printed unpadded at the end.

    Parameters
    ----------
    doc : iterable
        Items must expose ``text``, ``pos_`` and ``lemma_`` attributes.
    """
    # Rows are produced lazily, so each one is formatted right before it is printed.
    rows = (f'{token.text:{12}} {token.pos_:{6}} {token.lemma_}' for token in doc)
    for row in rows:
        print(row)

In [61]:
# Same information as the previous loop, now printed in aligned columns.
show_lemmas(doc)

I            PRON   -PRON-
am           VERB   be
a            DET    a
runner       NOUN   runner
running      VERB   run
in           ADP    in
a            DET    a
race         NOUN   race
because      ADP    because
I            PRON   -PRON-
lover        VERB   lover
to           PART   to
run          VERB   run
and          CCONJ  and
I            PRON   -PRON-
ran          VERB   run
earlier      ADV    earlier
this         DET    this
morning      NOUN   morning
.            PUNCT  .


In [63]:
# Irregular plural: check what lemma is produced for 'mice'.
doc2 = nlp(u"I saw 18 mice today and six mice yesterday.")
show_lemmas(doc2)

I            PRON   -PRON-
saw          VERB   see
18           NUM    18
mice         NOUN   mouse
today        NOUN   today
and          CCONJ  and
six          NUM    six
mice         NOUN   mouse
yesterday    NOUN   yesterday
.            PUNCT  .


In [78]:
# Technical terms and a comparative adjective ('4-grams', 'trigrams', 'better').
doc_x = nlp(u"I do not know whether 4-grams or trigrams is better")
show_lemmas(doc_x)

I            PRON   -PRON-
do           VERB   do
not          ADV    not
know         VERB   know
whether      ADP    whether
4-grams      NOUN   4-gram
or           CCONJ  or
trigrams     NOUN   trigram
is           VERB   be
better       ADJ    good


In [64]:
# Contraction "That's": check how the clitic 's is lemmatized.
doc3 = nlp("That's an enormous automobile.")
show_lemmas(doc3)

That         DET    that
's           VERB   be
an           DET    an
enormous     ADJ    enormous
automobile   NOUN   automobile
.            PUNCT  .


In [65]:
# Adverbs ending in -ly: compare their lemmas with the stems produced earlier.
doc4 = nlp(u"That was ridiculously easy and esaily done and fairly.")
show_lemmas(doc4)

That         DET    that
was          VERB   be
ridiculously ADV    ridiculously
easy         ADJ    easy
and          CCONJ  and
esaily       ADV    esaily
done         VERB   do
and          CCONJ  and
fairly       ADV    fairly
.            PUNCT  .


# stop words

In [66]:
# Show the default English stop-word set (a Python set, so display order is arbitrary).
print(nlp.Defaults.stop_words)

{'as', 'beside', 'has', 'until', 'full', 'former', 'itself', 'neither', 'you', 'however', 'again', 'between', 'throughout', 'eleven', 'seems', 'due', 'onto', 'fifty', 'give', 'seemed', 'no', 'formerly', 'please', 'towards', 'another', 'three', 'did', 'became', 'an', 'hereafter', 'off', 'ours', 'latter', 'all', 'are', 'behind', 'now', 'our', 'yourselves', 'ca', 'really', 'go', 'most', 'seem', 'someone', 'above', 'though', 'never', 'one', 'become', 'see', 'yet', 'show', 'name', 'same', 'out', 'when', 'much', 'nevertheless', 'whatever', 'him', 'make', 'first', 'down', 'not', 'then', 'wherever', 'hers', 'into', 'top', 'part', 'rather', 'through', 'your', 'quite', 'yourself', 'whither', 'regarding', 'in', 'but', 'cannot', 'should', 'keep', 'six', 'under', 'bottom', 'perhaps', 'themselves', 'many', 'moreover', 'others', 'indeed', 'even', 'ourselves', 'mostly', 'eight', 'toward', 'us', 'its', 'somewhere', 'such', 'there', 'do', 'third', 'just', 'i', 'were', 'anyway', 'everything', 'else', 'mo

In [67]:
# Look up a word in the vocab and check whether it is flagged as a stop word.
nlp.vocab['whenever'].is_stop

True

In [68]:
# A content word, for contrast with the stop word checked above.
nlp.vocab['teacher'].is_stop

False

In [70]:
# Number of default stop words before any modification.
len(nlp.Defaults.stop_words)

305

**Adding a new stop word takes two steps: add it to the default set, then set `is_stop` on its vocab entry**

In [71]:
# Step 1: add the word to the default stop-word set.
nlp.Defaults.stop_words.add('btw')

In [72]:
# The set has grown by one entry.
len(nlp.Defaults.stop_words)

306

In [73]:
# Step 2: flag the word's vocab entry as a stop word.
nlp.vocab['btw'].is_stop = True

In [74]:
# Confirm that the flag is now set.
nlp.vocab['btw'].is_stop

True

**Removing a stop word works the same way in reverse: remove it from the default set, then clear `is_stop` on its vocab entry**

In [75]:
# Take the word out of the default stop-word set again.
# discard() is used instead of remove() so that re-running this cell after 'btw'
# is already gone is a no-op rather than a KeyError.
nlp.Defaults.stop_words.discard('btw')

In [76]:
# Clear the stop-word flag on the vocab entry as well.
nlp.vocab['btw'].is_stop = False

In [77]:
# The set is back to its previous size.
len(nlp.Defaults.stop_words)

305