In [1]:
#Named entity recognition (NER) is probably the first step towards information extraction; it seeks to 
#locate and classify named entities in text into pre-defined categories such as the names of persons, 
#organizations, locations, expressions of times, quantities, monetary values, percentages, etc. NER is 
#used in many fields in Natural Language Processing (NLP), and it can help answer many real-world 
#questions, such as:

#1.Which companies were mentioned in the news article?
#2.Were specified products mentioned in complaints or reviews?
#3.Does the tweet contain the name of a person? Does the tweet contain this person’s location?

#Now we will see how to build a named entity recognizer with NLTK and spaCy, to identify the names of things,
#such as persons, organizations, or locations in the raw text.

#Importing the NLTK library

import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [2]:
#Information Extraction

#We took a sentence from The New York Times, “European authorities fined Google a record $5.1 billion 
#on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its 
#practices.”

#Adjacent string literals are concatenated, so this is still one unbroken sentence.
ex = (
    'European authorities fined Google a record $5.1 billion on Wednesday '
    'for abusing its power in the mobile phone market and ordered the '
    'company to alter its practices'
)

In [3]:
#Now we are applying word tokenization and part-of-speech tagging to the sentence.

def preprocess(sent):
    """Tokenize a raw sentence and tag every token with its part of speech.

    Parameters
    ----------
    sent : str
        The sentence to process.

    Returns
    -------
    The value returned by ``nltk.pos_tag`` for the word tokens of ``sent``
    (for the sample sentence in this notebook: a list of ``(word, tag)``
    tuples).
    """
    tokens = nltk.word_tokenize(sent)
    return nltk.pos_tag(tokens)

In [4]:
#Now see what we get:
#a list of tuples, each pairing one word of the sentence with its associated 
#part-of-speech tag.

sent = preprocess(ex)
sent

[('European', 'JJ'),
 ('authorities', 'NNS'),
 ('fined', 'VBD'),
 ('Google', 'NNP'),
 ('a', 'DT'),
 ('record', 'NN'),
 ('$', '$'),
 ('5.1', 'CD'),
 ('billion', 'CD'),
 ('on', 'IN'),
 ('Wednesday', 'NNP'),
 ('for', 'IN'),
 ('abusing', 'VBG'),
 ('its', 'PRP$'),
 ('power', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mobile', 'JJ'),
 ('phone', 'NN'),
 ('market', 'NN'),
 ('and', 'CC'),
 ('ordered', 'VBD'),
 ('the', 'DT'),
 ('company', 'NN'),
 ('to', 'TO'),
 ('alter', 'VB'),
 ('its', 'PRP$'),
 ('practices', 'NNS')]

In [5]:
#Our chunk pattern consists of one rule: a noun phrase, NP, should be formed whenever the chunker
#finds an optional determiner, DT, followed by any number of adjectives, JJ, and then a noun, NN.
#Note that <NN> matches only the tag NN, so tokens tagged NNS or NNP (e.g. "authorities", "Google")
#are left outside the chunks in the output of the next cell.

pattern = 'NP: {<DT>?<JJ>*<NN>}'

In [6]:
#Chunking

#Using this pattern, we create a chunk parser and test it on our tagged sentence.
#The output can be read as a tree or a hierarchy with S as the first level, denoting the sentence. 
#We can also display it graphically.

cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)
print(cs)

(S
  European/JJ
  authorities/NNS
  fined/VBD
  Google/NNP
  (NP a/DT record/NN)
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  (NP power/NN)
  in/IN
  (NP the/DT mobile/JJ phone/NN)
  (NP market/NN)
  and/CC
  ordered/VBD
  (NP the/DT company/NN)
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


In [7]:
#IOB[Inside_Outside_Beginning] tags have become the standard way to represent chunk structures in files,
#and we will also be using this format.

#The IOB format (short for inside, outside, beginning) is a common tagging format for tagging tokens in 
#a chunking task in computational linguistics (ex. named-entity recognition).

#In this representation, there is one token per line, each with its part-of-speech tag and its named entity 
#tag. Based on this training corpus, we can construct a tagger that can be used to label new sentences, and 
#use the nltk.chunk.conlltags2tree() function to convert the tag sequences into a chunk tree.

from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint
#Flatten the chunk tree into (word, part-of-speech tag, IOB chunk tag) triples.
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

[('European', 'JJ', 'O'),
 ('authorities', 'NNS', 'O'),
 ('fined', 'VBD', 'O'),
 ('Google', 'NNP', 'O'),
 ('a', 'DT', 'B-NP'),
 ('record', 'NN', 'I-NP'),
 ('$', '$', 'O'),
 ('5.1', 'CD', 'O'),
 ('billion', 'CD', 'O'),
 ('on', 'IN', 'O'),
 ('Wednesday', 'NNP', 'O'),
 ('for', 'IN', 'O'),
 ('abusing', 'VBG', 'O'),
 ('its', 'PRP$', 'O'),
 ('power', 'NN', 'B-NP'),
 ('in', 'IN', 'O'),
 ('the', 'DT', 'B-NP'),
 ('mobile', 'JJ', 'I-NP'),
 ('phone', 'NN', 'I-NP'),
 ('market', 'NN', 'B-NP'),
 ('and', 'CC', 'O'),
 ('ordered', 'VBD', 'O'),
 ('the', 'DT', 'B-NP'),
 ('company', 'NN', 'I-NP'),
 ('to', 'TO', 'O'),
 ('alter', 'VB', 'O'),
 ('its', 'PRP$', 'O'),
 ('practices', 'NNS', 'O')]


In [8]:
#With the function nltk.ne_chunk(), we can recognize named entities using a classifier, the classifier 
#adds category labels such as PERSON, ORGANIZATION, and GPE.

#Google is recognized as a person. That is quite disappointing, don’t you think?

#Tokenize and tag the sentence with the preprocess() helper, then label its named entities.
ne_tree = nltk.ne_chunk(preprocess(ex))
print(ne_tree)

(S
  (GPE European/JJ)
  authorities/NNS
  fined/VBD
  (PERSON Google/NNP)
  a/DT
  record/NN
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  power/NN
  in/IN
  the/DT
  mobile/JJ
  phone/NN
  market/NN
  and/CC
  ordered/VBD
  the/DT
  company/NN
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


In [9]:
#SpaCy

#SpaCy’s named entity recognition has been trained on the OntoNotes 5 corpus and it supports different
#entity types.
#Note: if you get a "module not found" error for en_core_web_sm, download the model from the command line with
#python -m spacy download en_core_web_sm

#Entity

import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
#Load the en_core_web_sm pipeline once; the nlp object is reused by the spaCy cells that follow.
nlp = en_core_web_sm.load()

In [10]:
#We are using the same sentence, “European authorities fined Google a record $5.1 billion on Wednesday 
#for abusing its power in the mobile phone market and ordered the company to alter its practices.”

#One of the nice things about spaCy is that we only need to apply nlp once; the entire background pipeline
#runs and returns the processed objects.

#European is NORP (nationalities or religious or political groups), Google is an organization, 
#$5.1 billion is a monetary value and Wednesday is a date object. They are all correct.

#ex already holds exactly this sentence, so it is passed to the pipeline instead of repeating the literal.
doc = nlp(ex)
pprint([(ent.text, ent.label_) for ent in doc.ents])

[('European', 'NORP'),
 ('Google', 'ORG'),
 ('$5.1 billion', 'MONEY'),
 ('Wednesday', 'DATE')]


In [11]:
#Token

#In the example above we were working at the entity level; in the following example we demonstrate
#token-level entity annotation, using a tagging scheme to describe the entity boundaries.

#Based on an issue and a patch in ClearTK, it seems that BILUO (also written BILOU) stands for "Beginning, Inside and Last 
#tokens of multi-token chunks, Unit-length chunks and Outside". For instance, the 
#chunking denoted by brackets (foo foo foo) (bar) no no no (bar bar) would be tagged B I L U O O O B L.

#The ent_iob_ attribute printed below uses the simpler IOB codes:
#"B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity,
#and "" means no entity tag is set.

pprint([(token, token.ent_iob_, token.ent_type_) for token in doc])

[(European, 'B', 'NORP'),
 (authorities, 'O', ''),
 (fined, 'O', ''),
 (Google, 'B', 'ORG'),
 (a, 'O', ''),
 (record, 'O', ''),
 ($, 'B', 'MONEY'),
 (5.1, 'I', 'MONEY'),
 (billion, 'I', 'MONEY'),
 (on, 'O', ''),
 (Wednesday, 'B', 'DATE'),
 (for, 'O', ''),
 (abusing, 'O', ''),
 (its, 'O', ''),
 (power, 'O', ''),
 (in, 'O', ''),
 (the, 'O', ''),
 (mobile, 'O', ''),
 (phone, 'O', ''),
 (market, 'O', ''),
 (and, 'O', ''),
 (ordered, 'O', ''),
 (the, 'O', ''),
 (company, 'O', ''),
 (to, 'O', ''),
 (alter, 'O', ''),
 (its, 'O', ''),
 (practices, 'O', '')]


In [12]:
#Extracting named entity from an article

#Now let’s get serious with spaCy and extract named entities from a New York Times article: 
#“F.B.I. Agent Peter Strzok, Who Criticized Trump in Texts, Is Fired.”

from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url, timeout=30):
    """Download a web page and return its text content as a single string.

    The page is fetched with ``requests``, parsed with BeautifulSoup's
    ``html5lib`` parser, stripped of ``<script>``, ``<style>`` and ``<aside>``
    elements, and every run of newlines/tabs in the remaining text is
    replaced by one space.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Defaults to 30 so a stalled connection cannot hang the notebook.

    Returns
    -------
    str
        The text of the page.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or times out, or the server answers with a
        4xx/5xx status.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of running NER on an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    # Calling the soup with a list of tag names finds every matching element.
    for unwanted in soup(["script", "style", 'aside']):
        unwanted.extract()
    return re.sub(r'[\n\t]+', ' ', soup.get_text())
#Download the article, run the spaCy pipeline over its text and count the entities found.
#NOTE(review): the page is fetched live, so the count shown below may differ on a re-run.
ny_bb = url_to_string('https://www.nytimes.com/2018/08/13/us/politics/peter-strzok-fired-fbi.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news')
article = nlp(ny_bb)
len(article.ents)

158

In [13]:
#The article contains 158 entities, spread over 10 unique labels.

labels = [ent.label_ for ent in article.ents]
Counter(labels)

Counter({'PERSON': 81,
         'GPE': 13,
         'ORG': 32,
         'CARDINAL': 5,
         'DATE': 19,
         'PRODUCT': 2,
         'NORP': 3,
         'ORDINAL': 1,
         'EVENT': 1,
         'FAC': 1})

In [14]:
#The following are the three most frequent entities.

items = [ent.text for ent in article.ents]
Counter(items).most_common(3)

[('Strzok', 32), ('F.B.I.', 17), ('Trump', 13)]

In [15]:
#Let’s pick one sentence (index 13) to learn more.

sentences = list(article.sents)
print(sentences[13])

Firing Mr. Strzok, however, removes a favorite target of Mr. Trump from the ranks of the F.B.I. and gives Mr. Bowdich and the F.B.I. director, Christopher A. Wray, a chance to move beyond the president’s ire.


In [16]:
#Let’s run displacy.render with style='ent' to highlight the entities of this sentence;
#jupyter=True asks displaCy to render the markup inside the notebook.

displacy.render(nlp(str(sentences[13])), jupyter=True, style='ent')

In [17]:
#Using spaCy’s built-in displaCy visualizer with style='dep', here’s what the above sentence and its
#dependencies look like:

displacy.render(nlp(str(sentences[13])), style='dep', jupyter = True, options = {'distance': 120})

In [18]:
#Next, we extract the verbatim text, the part-of-speech and the lemma of every token in this sentence,
#skipping stop words and punctuation.

[(token.orth_, token.pos_, token.lemma_)
 for token in nlp(str(sentences[13]))
 if not (token.is_stop or token.pos_ == 'PUNCT')]

[('Firing', 'VERB', 'fire'),
 ('Mr.', 'PROPN', 'Mr.'),
 ('Strzok', 'PROPN', 'Strzok'),
 ('removes', 'VERB', 'remove'),
 ('favorite', 'ADJ', 'favorite'),
 ('target', 'NOUN', 'target'),
 ('Mr.', 'PROPN', 'Mr.'),
 ('Trump', 'PROPN', 'Trump'),
 ('ranks', 'NOUN', 'rank'),
 ('F.B.I.', 'PROPN', 'F.B.I.'),
 ('gives', 'VERB', 'give'),
 ('Mr.', 'PROPN', 'Mr.'),
 ('Bowdich', 'PROPN', 'Bowdich'),
 ('F.B.I.', 'PROPN', 'F.B.I.'),
 ('director', 'NOUN', 'director'),
 ('Christopher', 'PROPN', 'Christopher'),
 ('A.', 'PROPN', 'A.'),
 ('Wray', 'PROPN', 'Wray'),
 ('chance', 'NOUN', 'chance'),
 ('president', 'NOUN', 'president'),
 ('ire', 'NOUN', 'ire')]

In [19]:
#Map the text of each entity found in the sentence to its label.
{str(ent): ent.label_ for ent in nlp(str(sentences[13])).ents}

{'Strzok': 'PERSON',
 'Trump': 'PERSON',
 'F.B.I.': 'ORG',
 'Bowdich': 'PERSON',
 'Christopher A. Wray': 'PERSON'}

In [20]:
#Finally, we visualize the entities of the entire article.

displacy.render(article, jupyter=True, style='ent')