In [16]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [17]:
ex = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'


In [18]:
def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent

In [19]:
sent = preprocess(ex)
sent

[('European', 'JJ'),
 ('authorities', 'NNS'),
 ('fined', 'VBD'),
 ('Google', 'NNP'),
 ('a', 'DT'),
 ('record', 'NN'),
 ('$', '$'),
 ('5.1', 'CD'),
 ('billion', 'CD'),
 ('on', 'IN'),
 ('Wednesday', 'NNP'),
 ('for', 'IN'),
 ('abusing', 'VBG'),
 ('its', 'PRP$'),
 ('power', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mobile', 'JJ'),
 ('phone', 'NN'),
 ('market', 'NN'),
 ('and', 'CC'),
 ('ordered', 'VBD'),
 ('the', 'DT'),
 ('company', 'NN'),
 ('to', 'TO'),
 ('alter', 'VB'),
 ('its', 'PRP$'),
 ('practices', 'NNS')]

In [20]:
pattern = 'NP: {<DT>?<JJ>*<NN>}'

In [21]:
cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)
print(cs)

(S
  European/JJ
  authorities/NNS
  fined/VBD
  Google/NNP
  (NP a/DT record/NN)
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  (NP power/NN)
  in/IN
  (NP the/DT mobile/JJ phone/NN)
  (NP market/NN)
  and/CC
  ordered/VBD
  (NP the/DT company/NN)
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


In [22]:
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

[('European', 'JJ', 'O'),
 ('authorities', 'NNS', 'O'),
 ('fined', 'VBD', 'O'),
 ('Google', 'NNP', 'O'),
 ('a', 'DT', 'B-NP'),
 ('record', 'NN', 'I-NP'),
 ('$', '$', 'O'),
 ('5.1', 'CD', 'O'),
 ('billion', 'CD', 'O'),
 ('on', 'IN', 'O'),
 ('Wednesday', 'NNP', 'O'),
 ('for', 'IN', 'O'),
 ('abusing', 'VBG', 'O'),
 ('its', 'PRP$', 'O'),
 ('power', 'NN', 'B-NP'),
 ('in', 'IN', 'O'),
 ('the', 'DT', 'B-NP'),
 ('mobile', 'JJ', 'I-NP'),
 ('phone', 'NN', 'I-NP'),
 ('market', 'NN', 'B-NP'),
 ('and', 'CC', 'O'),
 ('ordered', 'VBD', 'O'),
 ('the', 'DT', 'B-NP'),
 ('company', 'NN', 'I-NP'),
 ('to', 'TO', 'O'),
 ('alter', 'VB', 'O'),
 ('its', 'PRP$', 'O'),
 ('practices', 'NNS', 'O')]


In [23]:
ne_tree = ne_chunk(pos_tag(word_tokenize(ex)))
print(ne_tree)


NameError: name 'ne_chunk' is not defined

In [24]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

In [25]:
doc = nlp('European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices')
pprint([(X.text, X.label_) for X in doc.ents])

[('European', 'NORP'), ('Google', 'ORG'), ('Wednesday', 'DATE')]


In [26]:
pprint([(X, X.ent_iob_, X.ent_type_) for X in doc])


[(European, 'B', 'NORP'),
 (authorities, 'O', ''),
 (fined, 'O', ''),
 (Google, 'B', 'ORG'),
 (a, 'O', ''),
 (record, 'O', ''),
 ($, 'O', ''),
 (5.1, 'O', ''),
 (billion, 'O', ''),
 (on, 'O', ''),
 (Wednesday, 'B', 'DATE'),
 (for, 'O', ''),
 (abusing, 'O', ''),
 (its, 'O', ''),
 (power, 'O', ''),
 (in, 'O', ''),
 (the, 'O', ''),
 (mobile, 'O', ''),
 (phone, 'O', ''),
 (market, 'O', ''),
 (and, 'O', ''),
 (ordered, 'O', ''),
 (the, 'O', ''),
 (company, 'O', ''),
 (to, 'O', ''),
 (alter, 'O', ''),
 (its, 'O', ''),
 (practices, 'O', '')]


In [27]:
from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url):
    res = requests.get(url)
    html = res.text
    soup = BeautifulSoup(html, 'html5lib')
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))
ny_bb = url_to_string('https://www.nytimes.com/2018/08/13/us/politics/peter-strzok-fired-fbi.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news')
article = nlp(ny_bb)
len(article.ents)

156

In [28]:
labels = [x.label_ for x in article.ents]
Counter(labels)

Counter({'ORG': 34,
         'PERSON': 78,
         'DATE': 21,
         'GPE': 13,
         'CARDINAL': 5,
         'PRODUCT': 2,
         'NORP': 2,
         'ORDINAL': 1})

In [29]:
items = [x.text for x in article.ents]
Counter(items).most_common(3)

[('Strzok', 32), ('F.B.I.', 17), ('Trump', 13)]

In [30]:
sentences = [x for x in article.sents]
print(sentences[20])

In one, Ms. Page asks: Trump is “not ever going to become president, right?


In [31]:
displacy.render(nlp(str(sentences[20])), jupyter=True, style='ent')


In [32]:
displacy.render(nlp(str(sentences[20])), style='dep', jupyter = True, options = {'distance': 120})


In [33]:
[(x.orth_,x.pos_, x.lemma_) for x in [y 
                                      for y
                                      in nlp(str(sentences[20])) 
                                      if not y.is_stop and y.pos_ != 'PUNCT']]

[('Ms.', 'PROPN', 'Ms.'),
 ('Page', 'PROPN', 'Page'),
 ('asks', 'VERB', 'ask'),
 ('Trump', 'PROPN', 'Trump'),
 ('going', 'VERB', 'go'),
 ('president', 'NOUN', 'president'),
 ('right', 'ADJ', 'right')]

In [34]:
dict([(str(x), x.label_) for x in nlp(str(sentences[20])).ents])


{'one': 'CARDINAL', 'Page': 'PERSON', 'Trump': 'ORG'}

In [35]:
print([(x, x.ent_iob_, x.ent_type_) for x in sentences[20]])


[(In, 'O', ''), (one, 'B', 'CARDINAL'), (,, 'O', ''), (Ms., 'O', ''), (Page, 'B', 'PERSON'), (asks, 'O', ''), (:, 'O', ''), (Trump, 'B', 'ORG'), (is, 'O', ''), (“, 'O', ''), (not, 'O', ''), (ever, 'O', ''), (going, 'O', ''), (to, 'O', ''), (become, 'O', ''), (president, 'O', ''), (,, 'O', ''), (right, 'O', ''), (?, 'O', '')]


In [36]:
dict([(str(x), x.label_) for x in nlp(str(sentences[20]))])

AttributeError: 'spacy.tokens.token.Token' object has no attribute 'label_'

In [37]:
displacy.render(nlp(str(sentences)), jupyter=True, style='ent')


In [41]:
my_example = "My name is Aryan Jain. I live in Evanston, 60201 and I am 21 years old and I like Lewis Hamilton and Kobe Bryant and Tom Holland the famous f1 driver "
displacy.render(nlp(my_example), jupyter=True, style='ent')

In [42]:
updated = [(x.orth_,x.pos_, x.lemma_, x.ent_type_ ) for x in [y 
                                      for y
                                      in nlp(my_example) ]]
#                                       if not y.is_stop and y.pos_ != 'PUNCT']]
print(updated)
updated[0]

[('My', 'PRON', 'my', ''), ('name', 'NOUN', 'name', ''), ('is', 'AUX', 'be', ''), ('Aryan', 'PROPN', 'Aryan', 'PERSON'), ('Jain', 'PROPN', 'Jain', 'PERSON'), ('.', 'PUNCT', '.', ''), ('I', 'PRON', 'I', ''), ('live', 'VERB', 'live', ''), ('in', 'ADP', 'in', ''), ('Evanston', 'PROPN', 'Evanston', 'GPE'), (',', 'PUNCT', ',', ''), ('60201', 'NUM', '60201', 'DATE'), ('and', 'CCONJ', 'and', ''), ('I', 'PRON', 'I', ''), ('am', 'AUX', 'be', ''), ('21', 'NUM', '21', 'DATE'), ('years', 'NOUN', 'year', 'DATE'), ('old', 'ADJ', 'old', 'DATE'), ('and', 'CCONJ', 'and', ''), ('I', 'PRON', 'I', ''), ('like', 'VERB', 'like', ''), ('Lewis', 'PROPN', 'Lewis', 'PERSON'), ('Hamilton', 'PROPN', 'Hamilton', 'PERSON'), ('and', 'CCONJ', 'and', ''), ('Kobe', 'PROPN', 'Kobe', 'PERSON'), ('Bryant', 'PROPN', 'Bryant', 'PERSON'), ('and', 'CCONJ', 'and', ''), ('Tom', 'PROPN', 'Tom', 'PERSON'), ('Holland', 'PROPN', 'Holland', 'PERSON'), ('the', 'DET', 'the', ''), ('famous', 'ADJ', 'famous', ''), ('f1', 'NOUN', 'f1', '

('My', 'PRON', 'my', '')

In [44]:
import requests
n = len(updated)
practice = [0 for _ in range(n)]
i = 0
while i < n:
    if updated[i][3] == 'PERSON':
        person_name = updated[i][0].capitalize()
        k = 0
        for j in range(i+1,len(updated)):
            if updated[j][3] == 'PERSON':
                person_name += "_" + updated[j][0].capitalize()
                k += 1
            else:
                i = j-1
                break
        
        response = requests.get(url="https://en.wikipedia.org/wiki/" + person_name,)
        if response.status_code == 200:
            print("Celeb: ", person_name)
        else:
            print("Normal: ", person_name)
            nn = person_name.replace("_"," ")
            my_example = my_example.replace(nn, "REDACT")
    elif updated[i][3] == 'GPE' or updated[i][1] == 'NUM':
            my_example = my_example.replace(updated[i][0], "REDACT")
    i+= 1
print("final: ", my_example)

Normal:  Aryan_Jain
Celeb:  Lewis_Hamilton
Celeb:  Kobe_Bryant
Celeb:  Tom_Holland
final:  My name is REDACT. I live in REDACT, REDACT and I am REDACT years old and I like Lewis Hamilton and Kobe Bryant and Tom Holland the famous f1 driver 


In [69]:
import requests

response = requests.get(
	url="https://en.wikipedia.org/wiki/" + "Lewis_Hamilton",
)
print(response.status_code)

200
