In [1]:
#Implementation of NER using NLTK 
#Let’s start by importing the required libraries.
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [2]:
# NLTK ships a treebank corpus whose sentences are already POS-tagged.
# Fetch it together with the resources downloaded for the named-entity chunker,
# then run the chunker on the first tagged sentence and print the resulting tree.
for package in ('maxent_ne_chunker', 'words', 'treebank'):
    nltk.download(package)
sent = nltk.corpus.treebank.tagged_sents()
first_tagged_sentence = sent[0]
print(nltk.ne_chunk(first_tagged_sentence))

(S
  (PERSON Pierre/NNP)
  (ORGANIZATION Vinken/NNP)
  ,/,
  61/CD
  years/NNS
  old/JJ
  ,/,
  will/MD
  join/VB
  the/DT
  board/NN
  as/IN
  a/DT
  nonexecutive/JJ
  director/NN
  Nov./NNP
  29/CD
  ./.)


[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/anisha/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /Users/anisha/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package treebank to /Users/anisha/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


In [3]:
#Information Extraction
# Sample passage (about the BCCI) used as input for the tokenize -> POS-tag -> NE-chunk steps below.
# NOTE(review): the text still contains a "[2]" citation marker and has no space after
# some full stops ("reforms.Vinod", "called.Presently", "BCCI.On"). The "[2]" marker
# appears as the stray tokens '[', '2', ']' in the tagged output; consider cleaning the
# text before tokenizing.
raw_text="""The Board of Control for Cricket in India (BCCI) is the governing body for cricket in India and is under the jurisdiction of Ministry of Youth Affairs and Sports, Government of India.[2] The board was formed in December 1928 as a society, registered under the Tamil Nadu Societies Registration Act. It is a consortium of state cricket associations and the state associations select their representatives who in turn elect the BCCI Chief. Its headquarters are in Wankhede Stadium, Mumbai. Grant Govan was its first president and Anthony De Mello its first secretary. With the surge of cricket in India, BCCI was criticised for its monopolistic practices and has suffered from corruption allegations. The Supreme Court on 30 January 2017 nominated a four-member panel Committee of Administrators:- Vinod Rai, Ramachandra Guha, Vikaram Limaye and Diana Edulji to look after the administration of the BCCI in order to implement Lodha Committee reforms.Vinod Rai, ex-CAG of India heads the four members panel to look after the administrative duties of the board until the fresh elections are called.Presently, Sourav Ganguly is the president of BCCI.On 9 August 2019, the BCCI agreed to adhere to the anti-doping mechanisms governed by the National Anti-Doping Agency. Sunil Joshi, former Indian cricket team spinner was named as Chairman of the national selection panel by the Cricket Advisory Committee (CAC) of BCCI replacing MSK prasad in that role."""

In [4]:
# Named-entity extraction works on POS-tagged tokens, so first split the raw text
# into word tokens and then attach a part-of-speech tag to every token.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead of the names below - confirm against
# the installed NLTK version.

for resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(resource)
raw_words = word_tokenize(raw_text)
tags = pos_tag(raw_words)

[nltk_data] Downloading package punkt to /Users/anisha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/anisha/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [5]:
# Run NLTK's named-entity chunker over the POS-tagged tokens.
# With binary=True the chunker only marks whether a span is a named entity (label NE)
# rather than assigning a category such as PERSON or ORGANIZATION.
# The result is a tree: tokens outside any entity stay as (word, tag) leaves under S,
# tokens inside an entity are grouped under an NE subtree.

for resource in ('maxent_ne_chunker', 'words'):
    nltk.download(resource)
ne = nltk.ne_chunk(tags, binary=True)
print(ne)

(S
  The/DT
  (NE Board/NNP)
  of/IN
  (NE Control/NNP)
  for/IN
  Cricket/NNP
  in/IN
  (NE India/NNP)
  (/(
  (NE BCCI/NNP)
  )/)
  is/VBZ
  the/DT
  governing/VBG
  body/NN
  for/IN
  cricket/NN
  in/IN
  (NE India/NNP)
  and/CC
  is/VBZ
  under/IN
  the/DT
  jurisdiction/NN
  of/IN
  (NE Ministry/NNP)
  of/IN
  (NE Youth/NNP Affairs/NNPS)
  and/CC
  (NE Sports/NNP)
  ,/,
  Government/NNP
  of/IN
  (NE India/NNP)
  ./.
  [/CC
  2/CD
  ]/VBP
  The/DT
  board/NN
  was/VBD
  formed/VBN
  in/IN
  December/NNP
  1928/CD
  as/IN
  a/DT
  society/NN
  ,/,
  registered/VBN
  under/IN
  the/DT
  (NE Tamil/NNP Nadu/NNP Societies/NNP)
  Registration/NNP
  Act/NNP
  ./.
  It/PRP
  is/VBZ
  a/DT
  consortium/NN
  of/IN
  state/NN
  cricket/NN
  associations/NNS
  and/CC
  the/DT
  state/NN
  associations/NNS
  select/VBP
  their/PRP$
  representatives/NNS
  who/WP
  in/IN
  turn/NN
  elect/VBP
  the/DT
  (NE BCCI/NNP Chief/NNP)
  ./.
  Its/PRP$
  headquarters/NNS
  are/VBP
  in/IN
  (NE Wankhede

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/anisha/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /Users/anisha/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [6]:
# For a flat, easier-to-scan view, convert the chunk tree to the IOB tagging format.
# Each token keeps its POS tag and gains a third tag giving its position relative to a chunk:

# B-{CHUNK_TYPE} - the first word of a chunk (Beginning)
# I-{CHUNK_TYPE} - a later word Inside the same chunk
# O - a word Outside any chunk

from nltk.chunk import tree2conlltags
# Flatten the NE tree into a list of (word, pos_tag, iob_tag) triples.
iob = tree2conlltags(ne)
# Bare expression so the notebook displays the list.
iob

[('The', 'DT', 'O'),
 ('Board', 'NNP', 'B-NE'),
 ('of', 'IN', 'O'),
 ('Control', 'NNP', 'B-NE'),
 ('for', 'IN', 'O'),
 ('Cricket', 'NNP', 'O'),
 ('in', 'IN', 'O'),
 ('India', 'NNP', 'B-NE'),
 ('(', '(', 'O'),
 ('BCCI', 'NNP', 'B-NE'),
 (')', ')', 'O'),
 ('is', 'VBZ', 'O'),
 ('the', 'DT', 'O'),
 ('governing', 'VBG', 'O'),
 ('body', 'NN', 'O'),
 ('for', 'IN', 'O'),
 ('cricket', 'NN', 'O'),
 ('in', 'IN', 'O'),
 ('India', 'NNP', 'B-NE'),
 ('and', 'CC', 'O'),
 ('is', 'VBZ', 'O'),
 ('under', 'IN', 'O'),
 ('the', 'DT', 'O'),
 ('jurisdiction', 'NN', 'O'),
 ('of', 'IN', 'O'),
 ('Ministry', 'NNP', 'B-NE'),
 ('of', 'IN', 'O'),
 ('Youth', 'NNP', 'B-NE'),
 ('Affairs', 'NNPS', 'I-NE'),
 ('and', 'CC', 'O'),
 ('Sports', 'NNP', 'B-NE'),
 (',', ',', 'O'),
 ('Government', 'NNP', 'O'),
 ('of', 'IN', 'O'),
 ('India', 'NNP', 'B-NE'),
 ('.', '.', 'O'),
 ('[', 'CC', 'O'),
 ('2', 'CD', 'O'),
 (']', 'VBP', 'O'),
 ('The', 'DT', 'O'),
 ('board', 'NN', 'O'),
 ('was', 'VBD', 'O'),
 ('formed', 'VBN', 'O'),
 ('in', '