1) Tokenization Basics

In [None]:
# Sample sentences used by the tokenization cells below.
s1 = "Apple is looking at Buying U.K. startup for $1 billion !"
s2 = (
    "Hello all, We are here to help you! "
    "email support@udemy.com or visit at http://www.udemy.com!"
)
s3 = "10km cob ride almost costs $20 in NYC"
s4 = "Let's watch a movie together."

In [None]:
import spacy



In [None]:
# Load the small English pipeline; every later cell parses text through `nlp`.
nlp = spacy.load("en_core_web_sm")

In [None]:
!python -m spacy download en_core_web_md

In [None]:
import en_core_web_md

In [None]:
nlp_1 = en_core_web_md.load()

In [None]:
doc1 = nlp(s1)

In [None]:
doc1

Apple is looking at Buying U.K. startup for $1 billion !

In [None]:
# Print every token of doc1, one per line.
for tok in doc1:
    print(tok)

Apple
is
looking
at
Buying
U.K.
startup
for
$
1
billion
!


In [None]:
# Parse s2, echo the raw sentence, then list its tokens one per line.
doc2 = nlp(s2)
print(s2)
for tok in doc2:
    print(tok)

Hello all, We are here to help you! email support@udemy.com or visit at http://www.udemy.com!
Hello
all
,
We
are
here
to
help
you
!
email
support@udemy.com
or
visit
at
http://www.udemy.com
!


In [None]:
type(doc2)

spacy.tokens.doc.Doc

In [None]:
doc2

Hello all, We are here to help you! email support@udemy.com or visit at http://www.udemy.com!

In [None]:
len(doc2)

17

In [None]:
doc2[0]

Hello

In [None]:
doc2[3:10]

We are here to help you!

In [None]:
doc2[-1]

!

In [None]:
# Deliberate failure: item assignment on the Doc is rejected, producing the
# TypeError shown in this cell's output.
doc2[0] = "New"

TypeError: ignored

2) Stemming and Lemmatization

In [None]:
# Word forms fed to the stemmers below.
words = [
    "run",
    "runnner",
    "running",
    "ran",
    "runs",
    "easily",
    "fairly",
]

In [None]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

In [None]:
# One stemmer of each kind so their outputs can be compared side by side.
s_stemmer = SnowballStemmer("english")
p_stemmer = PorterStemmer()

In [None]:
# Show each word next to its Porter stem.
for w in words:
    print(f"{w} --------- {p_stemmer.stem(w)}")

run --------- run
runnner --------- runnner
running --------- run
ran --------- ran
runs --------- run
easily --------- easili
fairly --------- fairli


In [None]:
# Show each word next to its Snowball stem.
for w in words:
    print(f"{w} --------- {s_stemmer.stem(w)}")

run --------- run
runnner --------- runnner
running --------- run
ran --------- ran
runs --------- run
easily --------- easili
fairly --------- fair


In [None]:
document_1 = nlp("The stripped bats are hanging on their feet for best")

In [None]:
# Show each token's surface text next to its lemma.
for tok in document_1:
    print(f"{tok.text} ----- {tok.lemma_}")

The ----- the
stripped ----- strip
bats ----- bat
are ----- be
hanging ----- hang
on ----- on
their ----- their
feet ----- foot
for ----- for
best ----- good


3) Stopwords

In [None]:
print(nlp.Defaults.stop_words)

{'keep', 'this', 'do', 'else', 'everywhere', 'never', 'across', "'ll", 'both', 'perhaps', 'just', 'has', 'sometimes', 'whenever', 'it', 'however', 'its', 'least', 'namely', 'ca', 'an', 'latterly', 'various', 'side', 'eleven', 'myself', 'cannot', 'everything', 'another', 'down', 'could', 'alone', 'above', 'done', 'when', 'seem', 'latter', 'must', 'always', 'became', 'regarding', 'well', 'why', 'noone', 'were', 'quite', 'fifteen', 'much', 'is', 'hers', 'might', 'yours', 'become', 'formerly', 'thereafter', 'made', 'twelve', 'behind', 'anyhow', "n't", 're', 'if', 'them', '’ll', 'less', 'elsewhere', 'should', 'n‘t', 'than', 'how', 'anywhere', 'something', 'by', 'sometime', 'whereupon', 'there', 'mostly', 'neither', 'amount', 'would', 'many', 'six', 'you', 'several', '’d', 'such', 'been', 'one', 'who', 'under', 'seemed', 'becomes', 'after', 'for', 'their', 'whoever', 'two', 'bottom', 'same', 'so', 'am', 'therefore', 'upon', 'nobody', 'indeed', 'not', 'itself', 'himself', 'others', 'everyone'

In [None]:
len(nlp.Defaults.stop_words)

326

In [None]:
nlp.vocab['always'].is_stop

True

In [None]:
nlp.vocab['finance'].is_stop

False

In [None]:
nlp.vocab['tatatata'].is_stop

False

In [None]:
nlp.Defaults.stop_words.add('tatatata')

In [None]:
nlp.vocab['tatatata'].is_stop = True

In [None]:
len(nlp.Defaults.stop_words)

327

In [None]:
print(nlp.vocab)

<spacy.vocab.Vocab object at 0x7f6b0002dd30>


4) Rule Based Matching

In [None]:
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

In [None]:
# Token patterns: "hello" directly followed by "world", and the same pair
# separated by exactly one punctuation token.
pattern_1 = [{"LOWER": "hello"}, {"LOWER": "world"}]
pattern_2 = [
    {"LOWER": "hello"},
    {"IS_PUNCT": True},
    {"LOWER": "world"},
]

In [None]:
# spaCy v3 signature is Matcher.add(key, patterns, *, on_match=None): all
# patterns are passed as ONE list and the v2-style positional `None`
# callback slot no longer exists (passing it is what raised the TypeError).
matcher.add('Hello World', [pattern_1, pattern_2])

In [None]:
# `doc_1` is not defined before this cell, which caused the NameError.
# Parse a short sample containing both pattern shapes (adjacent words, and
# words separated by punctuation) so the matcher has text to search.
doc_1 = nlp("Hello World! hello, world")
find_matches = matcher(doc_1)

5) Phrase Based Matching

6) POS Tagging

In [None]:
s1 = "Apple is looking at buying U.K. startup for $1 buillion"

In [None]:
doc = nlp(s1)

In [None]:
# Per token: text, coarse POS, fine-grained tag, and the tag's description.
for tok in doc:
    print(
        tok.text,
        tok.pos_,
        tok.tag_,
        spacy.explain(tok.tag_),
    )

Apple PROPN NNP noun, proper singular
is AUX VBZ verb, 3rd person singular present
looking VERB VBG verb, gerund or present participle
at ADP IN conjunction, subordinating or preposition
buying VERB VBG verb, gerund or present participle
U.K. PROPN NNP noun, proper singular
startup NOUN NN noun, singular or mass
for ADP IN conjunction, subordinating or preposition
$ SYM $ symbol, currency
1 NUM CD cardinal number
buillion NOUN NN noun, singular or mass


In [None]:
# Tally tokens by coarse POS; print the numeric id, its string name and count.
for pos_id, count in doc.count_by(spacy.attrs.POS).items():
    print(pos_id, doc.vocab[pos_id].text, count)

96 PROPN 2
87 AUX 1
100 VERB 2
85 ADP 2
92 NOUN 2
99 SYM 1
93 NUM 1


In [None]:
from spacy import displacy

In [None]:
displacy.render(docs = doc,style='dep',options={'distance':80},jupyter = True)

7) Named Entity Recognition

In [None]:
s1 = "Apple is looking at buying U.K. startup for $1 billion"
s2 = "San Francisco considers banning sidewalk delivery robots"
s3 = "Facebook is hiring a new vice president in U.S."

In [None]:
   doc1 = nlp(s1)
for ent in doc1.ents:
  print(ent.text, ent.label_, str(spacy.explain(ent.label_)))

Apple ORG Companies, agencies, institutions, etc.
U.K. GPE Countries, cities, states
$1 billion MONEY Monetary values, including unit


In [None]:
# Parse s2 and list each named entity with its label and description.
doc2 = nlp(s2)
for entity in doc2.ents:
    label = entity.label_
    print(entity.text, label, str(spacy.explain(label)))

San Francisco GPE Countries, cities, states


In [None]:
# Parse s3 and list each named entity with its label and description.
doc3 = nlp(s3)
for entity in doc3.ents:
    label = entity.label_
    print(entity.text, label, str(spacy.explain(label)))

Facebook ORG Companies, agencies, institutions, etc.
U.S. GPE Countries, cities, states


Adding a new word to an entity

In [None]:
ORG = doc3.vocab.strings['ORG']

In [None]:
from spacy.tokens import Span
# Build a span over doc3 from token index 0 up to (not including) index 1,
# i.e. the first token only, labelled with the ORG id stored in `ORG`.
new_ent = Span(doc3 , 0, 1, label = ORG)

In [None]:
doc3.ents = list(doc3.ents) + [new_ent]

In [None]:
doc3.ents

(Facebook, U.S.)

In [None]:
# Re-list doc3's entities after the assignment to doc3.ents.
for entity in doc3.ents:
    label = entity.label_
    print(entity.text, label, str(spacy.explain(label)))

Facebook ORG Companies, agencies, institutions, etc.
U.S. GPE Countries, cities, states


In [None]:
displacy.render(docs = doc1, style = 'ent' , jupyter = True)

In [None]:
displacy.render(docs = doc1, style = 'ent' , options = {'ents':['ORG']},jupyter = True)

8) Sentence Segmentation

In [None]:
s1= "This is a sentence. This is second sentence. This is last sentence."
s2= "This is a sentence ; This is second sentence ; This is last sentence."

In [None]:
doc1 = nlp(s1)


In [None]:
doc1.sents

<generator at 0x7f6aec49d7c0>

In [None]:
# Print each detected sentence of doc1 on its own line.
for sentence in doc1.sents:
    print(sentence.text)

This is a sentence.
This is second sentence.
This is last sentence.


In [None]:
s3= "This is a sentence. This is second  U.K. sentence. This is last sentence."


In [None]:
# Parse s3 and print each detected sentence on its own line.
doc3 = nlp(s3)
for sentence in doc3.sents:
    print(sentence.text)

This is a sentence.
This is second  U.K. sentence.
This is last sentence.


In [None]:
# Parse the semicolon-separated s2 and print each detected sentence.
doc2 = nlp(s2)
for sentence in doc2.sents:
    print(sentence.text)

This is a sentence; This is second sentence; This is last sentence.


In [None]:
from spacy.language import Language
@Language.component("set_custom_boundary")
def set_custom_boundary(doc):
    """Mark the token following every ';' as the start of a new sentence.

    Every token except the last is inspected, so a ';' always has a
    successor to flag. For each ';' found, its index is printed and the
    next token's ``is_sent_start`` is set to True.

    Returns the same ``doc`` object that was passed in.
    """
    for tok in doc[:-1]:
        if tok.text != ';':
            continue
        semicolon_index = tok.i
        print(semicolon_index)
        doc[semicolon_index + 1].is_sent_start = True
    return doc

In [None]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [None]:
# Add the component by its registered name, positioned ahead of the parser
# in the pipeline.
nlp.add_pipe('set_custom_boundary',before="parser")


<function __main__.set_custom_boundary(doc)>

In [None]:
nlp.pipe_names

['tok2vec',
 'tagger',
 'set_custom_boundary',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner']

In [None]:
# Parse s2 again and print each detected sentence on its own line.
doc_2 = nlp(s2)
for sentence in doc_2.sents:
    print(sentence.text)

4
9
This is a sentence ;
This is second sentence ;
This is last sentence.
