In [3]:
# Install spaCy (in Colab or your local environment) before running this notebook.

In [4]:
!pip install spacy



In [5]:
import spacy

In [6]:
# Load the small English pipeline by its full package name; the bare 'en'
# shortcut is no longer accepted by spaCy v3 and later.
nlp = spacy.load("en_core_web_sm")

In [7]:
# Run the pipeline on a short text; the result is the processed document.
# (The u"" prefix is redundant on Python 3, where every str is Unicode.)
doc1 = nlp("This is nlp basics with spacy")

In [8]:
doc1

This is nlp basics with spacy

In [9]:
doc1.text

'This is nlp basics with spacy'

### Sentence tokenization

In [10]:
# Three sentences separated by double spaces; the extra space after each
# full stop shows up as its own whitespace token in the word-level output.
sentence = "My name is Prasanna Kumar.  I have nearly 10 years experience in software development.  I have been working as ML Eng for 3 years."

In [11]:
doc2 = nlp(sentence)

In [12]:
# Print every detected sentence prefixed with its 1-based position.
for number, sent_span in enumerate(doc2.sents, start=1):
  print(f'{number}: {sent_span}')

1: My name is Prasanna Kumar.  
2: I have nearly 10 years experience in software development.  
3: I have been working as ML Eng for 3 years.


In [13]:
#word tokenization

In [14]:
# Walk each sentence and print its tokens one per line. No index is needed
# here, so iterate over the sentences directly instead of enumerating them.
for sent in doc2.sents:
  for word in sent:
    print(word)

My
name
is
Prasanna
Kumar
.
 
I
have
nearly
10
years
experience
in
software
development
.
 
I
have
been
working
as
ML
Eng
for
3
years
.


In [15]:
doc = nlp("SpaCy is an amazing tool")

In [16]:
# Show the text of each token on its own line.
for token in doc:
  print(token.text)

SpaCy
is
an
amazing
tool


In [17]:
doc

SpaCy is an amazing tool

In [18]:
# Collect the token texts into a plain list of strings.
[tok.text for tok in doc]

['SpaCy', 'is', 'an', 'amazing', 'tool']

In [19]:
# Naive alternative to the tokenizer: split the raw text on single spaces.
# The result matches the token list above only because this sentence
# contains no punctuation.
doc.text.split(" ")

['SpaCy', 'is', 'an', 'amazing', 'tool']

In [20]:
# For each token show its text, the integer ID of its shape (`shape`)
# and the readable shape pattern (`shape_`).
for token in doc:
  print(token.text, token.shape, token.shape_)

SpaCy 14101195205177134206 XxxXx
is 4370460163704169311 xx
an 4370460163704169311 xx
amazing 13110060611322374290 xxxx
tool 13110060611322374290 xxxx


### Part of Speech Tagging

In [22]:
ex1 = nlp("He drinks a drink")

In [23]:
# Coarse part of speech per token: integer ID (`pos`) and label (`pos_`).
for token in ex1:
  print(token.text, token.pos, token.pos_)

He 95 PRON
drinks 100 VERB
a 90 DET
drink 92 NOUN


In [24]:
ex2 = nlp("I fish a fish")

In [25]:
# Same word form, different roles: print the part of speech of each token
# as integer ID (`pos`) and label (`pos_`).
for token in ex2:
  print(token.text, token.pos, token.pos_)

I 95 PRON
fish 100 VERB
a 90 DET
fish 92 NOUN


In [26]:
# Compare the coarse label (`pos_`) with the fine-grained tag (`tag_`).
for token in ex2:
  print(token.text, token.pos_, token.tag_)

I PRON PRP
fish VERB VBP
a DET DT
fish NOUN NN


In [27]:
spacy.explain("VBP")

'verb, non-3rd person singular present'

In [28]:
# A sentence with three consecutive "had" tokens to exercise the tagger.
# (The u"" prefix is redundant on Python 3 and has been dropped.)
ex1 = nlp("All the faith he had had had no effect on the outcome of his life")

In [29]:
# Print one (text, fine-grained tag, coarse label) tuple per token.
for token in ex1:
  tagged = (token.text, token.tag_, token.pos_)
  print(tagged)

('All', 'PDT', 'DET')
('the', 'DT', 'DET')
('faith', 'NN', 'NOUN')
('he', 'PRP', 'PRON')
('had', 'VBD', 'AUX')
('had', 'VBN', 'VERB')
('had', 'VBN', 'VERB')
('no', 'DT', 'DET')
('effect', 'NN', 'NOUN')
('on', 'IN', 'ADP')
('the', 'DT', 'DET')
('outcome', 'NN', 'NOUN')
('of', 'IN', 'ADP')
('his', 'PRP$', 'DET')
('life', 'NN', 'NOUN')


In [30]:
spacy.explain('ADP')

'adposition'

#### Another example of part of speech tagging

In [65]:
# (Re)load the English pipeline under its full package name.
nlp = spacy.load("en_core_web_sm")

# Sample paragraph, processed as a single document.
text = ("When Sebastian Thrun started working on self-driving cars at "
        "Google in 2007, few people outside of the company took him "
        "seriously. “I can tell you very senior CEOs of major American "
        "car companies would shake my hand and turn away because I wasn’t "
        "worth talking to,” said Thrun, in an interview with Recode earlier "
        "this week.")
doc = nlp(text)

# Syntax: noun phrases first, then the lemma of every token tagged VERB.
noun_phrases = [span.text for span in doc.noun_chunks]
print("Noun phrases:", noun_phrases)

verb_lemmas = [tok.lemma_ for tok in doc if tok.pos_ == "VERB"]
print("Verbs:", verb_lemmas)

# Named entities with their labels, one per line.
for ent in doc.ents:
    print(ent.text, ent.label_)

Noun phrases: ['Sebastian Thrun', 'self-driving cars', 'Google', 'few people', 'the company', 'him', 'I', 'you', 'very senior CEOs', 'major American car companies', 'my hand', 'I', 'Thrun', 'an interview', 'Recode']
Verbs: ['start', 'work', 'drive', 'take', 'can', 'tell', 'would', 'shake', 'turn', 'talk', 'say']
Sebastian NORP
Google ORG
2007 DATE
American NORP
Recode ORG
earlier this week DATE


### Syntactic analysis (dependency parsing)

In [32]:
ex3 = nlp("Sally likes Sam")

In [33]:
# Per token: text, fine-grained tag, coarse label and dependency relation.
for token in ex3:
  print(token.text, token.tag_, token.pos_, token.dep_)

Sally NNP PROPN nsubj
likes VBZ VERB ROOT
Sam NNP PROPN dobj


In [34]:
spacy.explain("NNP")

'noun, proper singular'

In [35]:
spacy.explain("VBZ")

'verb, 3rd person singular present'

In [36]:
from spacy import displacy

In [37]:
# Render the dependency parse of the sentence inline in the notebook.
displacy.render(ex3, style='dep', jupyter=True)

###  **Lemmatization**

In [38]:
ex4 = nlp("study studying studious studio student")

In [39]:
# Show each token with its lemma as integer ID (`lemma`) and as text (`lemma_`).
for token in ex4:
  print(token.text, token.lemma, token.lemma_)

study 4251533498015236010 study
studying 4251533498015236010 study
studious 18244412806944160880 studious
studio 256657426954626641 studio
student 8980609300697527695 student


In [40]:
# Pair every token with its lemma text and coarse part-of-speech label.
for token in ex4:
  print(token.text, token.lemma_, token.pos_)

study study VERB
studying study VERB
studious studious ADJ
studio studio NOUN
student student NOUN


In [41]:
ex4 = nlp("walk walking walker walked walks")

In [42]:
# Lemma and coarse part of speech for each inflected form in the example.
for token in ex4:
  print(token.text, token.lemma_, token.pos_)

walk walk VERB
walking walk VERB
walker walker NOUN
walked walk VERB
walks walk NOUN


## Named Entity Recognition

In [43]:
ex5 = nlp("Rupee is India currency and today is Friday and rupee falls below 60 dollars.")

In [44]:
# Print (text, entity type) per token; the type is an empty string for
# tokens that are not part of any entity.
for token in ex5:
  labelled = (token.text, token.ent_type_)
  print(labelled)

('Rupee', '')
('is', '')
('India', 'GPE')
('currency', '')
('and', '')
('today', 'DATE')
('is', '')
('Friday', 'DATE')
('and', '')
('rupee', '')
('falls', '')
('below', 'MONEY')
('60', 'MONEY')
('dollars', 'MONEY')
('.', '')


In [45]:
spacy.explain('GPE')

'Countries, cities, states'

In [46]:
from spacy import displacy

In [47]:
# Render the sentence inline with its recognised entities highlighted.
displacy.render(ex5, style="ent", jupyter=True)


## Stop words

In [48]:
from spacy.lang.en.stop_words import STOP_WORDS

In [49]:
print(STOP_WORDS)

{'each', 'hereupon', 'there', 'after', 'by', 'everything', '‘m', 'upon', 'both', 'yet', 'amount', 'perhaps', 'had', 'someone', 'among', 'rather', 'i', 'somewhere', 'has', 'every', 'any', 'namely', 'nine', 'twenty', 'whose', 'yours', 'against', 'either', 'least', 'must', 'when', 'because', 'many', 'were', 'anywhere', 'everywhere', 'why', 'above', 'our', 'say', 'seem', 'wherein', 'nothing', 'on', 'some', 'hereby', '’m', 'even', 'formerly', 'however', 'latter', 'may', 'part', 'herein', 'ever', 'side', 'thereby', 'take', 'n’t', '‘s', 'amongst', 'are', 'sometime', 'whereupon', 'serious', 'from', 'not', 'could', '‘ve', 'am', 'beyond', 'six', 'several', 'move', 'then', 'forty', 'well', 'something', 'becoming', 'sometimes', 'always', 'doing', 'most', 'made', 'few', 'be', 'often', 'fifty', 'also', 'show', 'sixty', 'therein', 'regarding', 'back', 'hundred', 'yourselves', 'mostly', 'seems', 'across', 'only', 'whereas', 'all', "'m", 'for', 'eleven', '‘d', 'no', 'its', 'nor', '’re', 'onto', 'althou

In [50]:
ex1 = nlp("He will n't go to market and that's it for today")

In [51]:
# List the tokens; the contractions come out as separate tokens.
for token in ex1:
  print(token)

He
will
n't
go
to
market
and
that
's
it
for
today


In [None]:
# Filter by plain membership in the STOP_WORDS set. The comparison is
# case-sensitive, so the capitalised "He" is still printed here.
for word in ex1:
  if word.text not in STOP_WORDS:
    print(word.text)

He
market
today


In [None]:
len(STOP_WORDS)

326

In [None]:
# Look each token's text up in the vocabulary and skip entries flagged
# as stop words.
for token in ex1:
  lexeme = nlp.vocab[token.text]
  if lexeme.is_stop:
    continue
  print(token.text)

market
today


In [None]:
nlp.vocab["He"].is_stop

True

In [None]:
# Use the token's own stop-word flag; test the boolean directly with `not`
# rather than comparing it to False.
for word in ex1:
  if not word.is_stop:
    print(word)

market
today


In [None]:
ex1 = nlp("This setence has stop words for nlp")

In [None]:
" ".join([word.text for word in ex1 if word.is_stop == False])

'setence stop words nlp'

In [None]:
# Register a custom stop word by mutating the imported STOP_WORDS set in place.
STOP_WORDS.add('lol')

In [None]:
nlp.vocab['lol'].is_stop

True

# Chunking - noun phrases

In [52]:
ex1 = nlp("the man with the axe is a fighter")

In [53]:
# Each item is a noun phrase (a multi-token span), printed as text.
for noun_chunk in ex1.noun_chunks:
  print(noun_chunk.text)

the man
the axe
a fighter


In [54]:
# Print only the root token of every noun phrase.
for noun_chunk in ex1.noun_chunks:
  head = noun_chunk.root
  print(head.text)

man
axe
fighter


In [55]:
ex2 = nlp("This man speaks fine English, well versed with writting.")

In [56]:
# Materialise the document's tokens as a list.
list(ex2)

[This, man, speaks, fine, English, ,, well, versed, with, writting, .]

In [57]:
# Keep tokens that are verbs and are neither stop words nor punctuation.
# The boolean flags are tested with `not` instead of `== False`.
[
  word
  for word in ex2
  if not word.is_stop and not word.is_punct and word.pos_ == "VERB"
]

[speaks, versed, writting]

In [58]:
# 

## Most common words

In [59]:
from collections import Counter

In [60]:
ex1 = nlp("Surveys to check the prevalence of coronavirus among the general population of Delhi will now be held every month since Delhi is most general population of the population")        

In [61]:
# Token texts with stop words and punctuation removed; the boolean flags
# are tested with `not` instead of `== False`.
words_wo_sw = [
  token.text
  for token in ex1
  if not token.is_stop and not token.is_punct
]

In [62]:
counter = Counter(words_wo_sw)

In [63]:
# Ask for the ten most frequent words; fewer entries come back when the
# text has fewer distinct words.
counter.most_common(10)

[('population', 3),
 ('general', 2),
 ('Delhi', 2),
 ('Surveys', 1),
 ('check', 1),
 ('prevalence', 1),
 ('coronavirus', 1),
 ('held', 1),
 ('month', 1)]