In [None]:
import spacy

In [None]:
# Load the small English pipeline once; every later cell reuses `nlp`.
nlp = spacy.load('en_core_web_sm')

In [None]:
def show_Ents(doc):
  """Print each entity of ``doc`` as ``text - label - description``.

  Args:
    doc: Object exposing an ``ents`` sequence whose items carry ``text`` and
      ``label_`` attributes (for example a parsed spaCy ``Doc``).

  Prints "No Entity Found!!!" when ``doc.ents`` is empty.
  """
  if not doc.ents:
    print("No Entity Found!!!")
    return
  for ent in doc.ents:
    # spacy.explain() returns None for labels without a glossary entry
    # (e.g. custom labels); fall back to a placeholder so the string
    # concatenation below cannot raise TypeError.
    description=spacy.explain(ent.label_) or "No description available"
    print(ent.text+" - "+ent.label_+" - "+description)

In [None]:
# A short greeting, used to exercise the "no entities" branch of show_Ents.
doc = nlp(u"How are you?")

In [None]:
# List the entities of the greeting; the recorded output shows none were found.
show_Ents(doc)

No Entity Found!!!


In [None]:
# Rebind `doc` to a sentence that mentions a place, a date, a landmark and money.
doc = nlp(
  u"May i go to New Delhi, to next December see Red Fort? Can you have 600 dollars for software tools?"
)

In [None]:
# Print every entity recognised in the sentence together with its label.
show_Ents(doc)

New Delhi - GPE - Countries, cities, states
next December - DATE - Absolute or relative dates or periods
Red Fort - FAC - Buildings, airports, highways, bridges, etc.
600 dollars - MONEY - Monetary values, including unit


In [None]:
# Look up the ID stored for the "ORG" label in the vocab's string store.
ORG = doc.vocab.strings[u"ORG"]

In [None]:
# Display the integer the string store returned for "ORG".
ORG

383

In [None]:
# Sentence whose first token ("She_tech") will be tagged by hand further down.
newdoc = nlp(u"She_tech is a very small company with respect to Google")

In [None]:
# Entities the model detects in newdoc on its own.
show_Ents(newdoc)

Google - ORG - Companies, agencies, institutions, etc.


In [None]:
from spacy.tokens import Span
# Wrap token 0 ("She_tech") in a span labelled ORG and append it to the
# entities newdoc already has.
newEnt = Span(newdoc, 0, 1, label=ORG)
newdoc.ents = [*newdoc.ents, newEnt]

In [None]:
# Entities of newdoc after the hand-made span was appended.
show_Ents(newdoc)

She_tech - ORG - Companies, agencies, institutions, etc.
Google - ORG - Companies, agencies, institutions, etc.


In [None]:
# Add multiple entities at once using a PhraseMatcher
from spacy.matcher import PhraseMatcher

In [None]:
# The phrase matcher shares the pipeline's vocab.
matcher = PhraseMatcher(nlp.vocab)

In [None]:
# Text containing the two product spellings that the matcher will look for.
article = nlp(
  u"Our company created a brand new vacuum*cleaner The new vacuum$cleaner is the best in show"
)

In [None]:
# Entities the model finds in the product text on its own (none in the recorded output).
show_Ents(article)

No Entity Found!!!


In [None]:
# Surface forms that should be recognised as the product.
phr_list = [
  'vacuum*cleaner',
  'vacuum$cleaner',
]

In [None]:
# Turn each phrase into a Doc and register them all under a single match key.
phr_patt = list(map(nlp, phr_list))
matcher.add('vacuumcleaner', None, *phr_patt)


In [None]:
# Run the phrase matcher over the product text.
found_matches = matcher(article)

In [None]:
# Inspect the raw matches: 3-tuples whose 2nd and 3rd items are later used as
# the start/end token offsets of each span.
found_matches

[(2062361704163117874, 6, 7), (2062361704163117874, 9, 10)]

In [None]:
# Look up the ID of the "PRODUCT" label through the pipeline's own vocab (the
# one the matcher was built with) rather than through `doc`, an unrelated
# document left over from an earlier cell.
PROD=nlp.vocab.strings[u"PRODUCT"]

In [None]:
# Build one PRODUCT span per match. The offsets in found_matches index tokens of
# `article`, so the spans must be created on `article` (not on the unrelated
# `doc`) before they are assigned to article.ents.
new_ents=[Span(article,start,end,label=PROD) for match_id,start,end in found_matches]

In [None]:
# Keep the entities already on the document and add the matched product spans.
article.ents = [*article.ents, *new_ents]

In [None]:
# Entities of the product text after the matched spans were added.
show_Ents(article)

vacuum*cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
vacuum$cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)


In [None]:
# Display entities as colour-highlighted text with displaCy
from spacy import displacy

In [None]:
# Render the entities of `doc` inline in the notebook.
displacy.render(doc, style='ent', jupyter=True)

In [None]:
import requests
from bs4 import BeautifulSoup
import re

In [None]:
def String_to__url(url, timeout=30):
  """Download ``url`` and return the page's text content as a single string.

  ``<script>``, ``<style>`` and ``<aside>`` elements are removed before the
  text is extracted. Every run of newlines/tabs is then replaced by one space
  so that words on adjacent lines are not fused together.

  Args:
    url: Address of the page to fetch.
    timeout: Seconds to wait for the server before giving up.

  Returns:
    The extracted text with newline/tab runs collapsed to single spaces.

  Raises:
    requests.HTTPError: If the server answers with an error status.
    requests.Timeout: If the server does not answer within ``timeout``.
  """
  res=requests.get(url,timeout=timeout)
  # Fail loudly instead of running NER over an error page.
  res.raise_for_status()
  soup=BeautifulSoup(res.text,'html5lib')
  for tag in soup(["script","style","aside"]):
    tag.extract()
  # Join with a space: joining with "" glued the last word of one line to the
  # first word of the next.
  return " ".join(re.split(r'[\n\t]+',soup.get_text()))


In [None]:
# Fetch the news article and reduce it to plain text for the NER pipeline.
web_data_string = String_to__url(
  "https://www.nytimes.com/2018/08/13/us/politics/peter-strzok-fired-fbi.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news"
)

In [None]:
# Parse the downloaded text; this rebinds `article` to the news story.
article = nlp(web_data_string)

In [None]:
# Materialise the sentence spans so they can be indexed.
sentences = list(article.sents)

In [None]:
# Peek at the third sentence of the parsed article.
sentences[2]

Credit...

In [None]:
from collections import Counter

In [None]:
# Gather the label of every entity in the article so they can be counted.
labels = [entity.label_ for entity in article.ents]

In [None]:
# Tally how many entities carry each label.
Counter(labels)

Counter({'CARDINAL': 3,
         'DATE': 23,
         'GPE': 9,
         'LOC': 1,
         'NORP': 2,
         'ORDINAL': 1,
         'ORG': 37,
         'PERSON': 77})

In [None]:
# Gather the surface text of every entity (entity strings, not single tokens)
# so the most frequent ones can be counted.
items = [entity.text for entity in article.ents]

In [None]:
# The four most frequent entity strings together with their counts.
Counter(items).most_common(4)

[('Strzok', 29), ('F.B.I.', 18), ('Trump', 13), ('Russia', 6)]

In [None]:
# Render the entities of the parsed article directly. str(sentences) produced
# the repr of a Python list (surrounding brackets and comma separators became
# part of the text) and forced a second, needless parse of the whole article.
displacy.render(article,jupyter=True,style='ent')

In [None]:
# Print the pipeline's default stop-word set (it is large, so the output is long).
print(nlp.Defaults.stop_words)

{'’d', 'doing', 'becomes', 'five', 'how', "n't", 'mostly', 'besides', 'noone', 'other', 'sixty', '’m', 'while', 'mine', 'nobody', 'whose', 'eleven', 'about', 'alone', 'at', 'call', 'fifty', 'elsewhere', 'front', 'again', 'rather', 'one', 'up', 'using', 'latter', 'with', 'much', 'it', 'which', 'same', 'you', 'amount', 'from', 'well', 'third', 'whenever', 'beforehand', 'nine', 'empty', 'he', 'itself', 'are', 'became', 'here', 'top', 'she', 'will', 'very', 'of', 'out', 'toward', 'were', 'his', 'myself', 'first', 're', 'beyond', 'yourself', 'their', 'anyway', 'used', 'thereupon', 'hers', 'someone', 'next', 'though', 'seem', 'almost', 'off', 'around', 'side', 'name', 'its', 'get', 'more', 'forty', 'hereafter', 'therein', '‘m', '’ve', 'into', 'yours', "'re", 'once', 'have', 'if', 'was', 'been', 'quite', 'made', 'another', 'sometime', 'down', 'thru', 'various', 'for', 'therefore', 'amongst', 'our', 'by', 'really', 'ours', 'nevertheless', 'before', 'did', 'everything', 'had', 'namely', 'less',

In [None]:
# Size of the stop-word set before a custom word is added.
len(nlp.Defaults.stop_words)

326

In [None]:
# Check whether the vocab flags 'Rahul' as a stop word.
nlp.vocab['Rahul'].is_stop

False

In [None]:
# Register 'pls' as a custom stop word. Adding it to the defaults set alone is
# not enough if the lexeme already exists in the vocab, so also set the flag on
# the lexeme; the is_stop checks in later cells read that flag.
nlp.Defaults.stop_words.add('pls')
nlp.vocab['pls'].is_stop=True

In [None]:
# Size of the stop-word set after adding 'pls'.
len(nlp.Defaults.stop_words)

327

In [None]:
# Sample text that will be split into stop words and remaining tokens.
st_doc = nlp(
  u"Hello Mr. Shekhar, how are you, The weather is great, and city is very awesome the sky is pinkish blue. You should'nt eat pizza"
)

In [None]:
# Two independent, empty accumulators: filter_data receives the tokens that are
# not stop words, tokenize_doc receives the stop words.
filter_data, tokenize_doc = [], []


In [None]:
# Split st_doc into stop words (tokenize_doc) and everything else (filter_data).
# Both lists are reset here so that re-running this cell rebuilds them instead
# of appending a second copy of every token.
filter_data=[]
tokenize_doc=[]
for sw in st_doc:
  if not nlp.vocab[sw.text].is_stop:
    filter_data.append(sw)
  else:
    tokenize_doc.append(sw)
print(tokenize_doc)
print(filter_data)

[how, are, you, The, is, and, is, very, the, is, You]
[Hello, Mr., Shekhar, ,, ,, weather, great, ,, city, awesome, sky, pinkish, blue, ., should'nt, eat, pizza]
