# Named Entity Recognition(NER)

# Three approaches


1) Basic NLTK algorithm

.with word segmentation

.with sentence segmentation



2) Using Spacy

In [1]:
import nltk
import pandas as pd

In [2]:
text = "Apple acquired Zoom in China on Wednesday 6th May 2020.\
This news has made Apple and Google stock jump by 5% on Dow Jones Index in the \
United States of America"

# Basic Named Entity (NE) tagging using NLTK - Word based

In [3]:
#tokenize to words
words = nltk.word_tokenize(text)
words

['Apple',
 'acquired',
 'Zoom',
 'in',
 'China',
 'on',
 'Wednesday',
 '6th',
 'May',
 '2020.This',
 'news',
 'has',
 'made',
 'Apple',
 'and',
 'Google',
 'stock',
 'jump',
 'by',
 '5',
 '%',
 'on',
 'Dow',
 'Jones',
 'Index',
 'in',
 'the',
 'United',
 'States',
 'of',
 'America']

In [5]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\91799\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [6]:
#Part of speech tagging
pos_tags = nltk.pos_tag(words)
pos_tags

[('Apple', 'NNP'),
 ('acquired', 'VBD'),
 ('Zoom', 'NNP'),
 ('in', 'IN'),
 ('China', 'NNP'),
 ('on', 'IN'),
 ('Wednesday', 'NNP'),
 ('6th', 'CD'),
 ('May', 'NNP'),
 ('2020.This', 'CD'),
 ('news', 'NN'),
 ('has', 'VBZ'),
 ('made', 'VBN'),
 ('Apple', 'NNP'),
 ('and', 'CC'),
 ('Google', 'NNP'),
 ('stock', 'NN'),
 ('jump', 'NN'),
 ('by', 'IN'),
 ('5', 'CD'),
 ('%', 'NN'),
 ('on', 'IN'),
 ('Dow', 'NNP'),
 ('Jones', 'NNP'),
 ('Index', 'NNP'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('United', 'NNP'),
 ('States', 'NNPS'),
 ('of', 'IN'),
 ('America', 'NNP')]

In [8]:
nltk.download('tagsets')

[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\91799\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping help\tagsets.zip.


True

In [9]:

#check nltk help for description of the tag
nltk.help.upenn_tagset('NNP')

NNP: noun, proper, singular
    Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos
    Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA
    Shannon A.K.C. Meltex Liverpool ...


# ne_chunk

This task is called "chunk parsing" or "chunking", and the identified groups are called "chunks"

In [12]:
 nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\91799\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping chunkers\maxent_ne_chunker.zip.


True

In [14]:
nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\91799\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


True

In [15]:
chunks = nltk.ne_chunk(pos_tags, binary=True) #either NE or not NE
for chunk in chunks:
    print(chunk)

(NE Apple/NNP)
('acquired', 'VBD')
('Zoom', 'NNP')
('in', 'IN')
(NE China/NNP)
('on', 'IN')
('Wednesday', 'NNP')
('6th', 'CD')
('May', 'NNP')
('2020.This', 'CD')
('news', 'NN')
('has', 'VBZ')
('made', 'VBN')
(NE Apple/NNP)
('and', 'CC')
(NE Google/NNP)
('stock', 'NN')
('jump', 'NN')
('by', 'IN')
('5', 'CD')
('%', 'NN')
('on', 'IN')
('Dow', 'NNP')
('Jones', 'NNP')
('Index', 'NNP')
('in', 'IN')
('the', 'DT')
(NE United/NNP States/NNPS)
('of', 'IN')
(NE America/NNP)


In [17]:

#only taking the Nes

entities =[]
labels =[]
for chunk in chunks:
    if hasattr(chunk,'label'):
        #print(chunk)
        entities.append(' '.join(c[0] for c in chunk))
        labels.append(chunk.label())
        
entities_labels = list(set(zip(entities, labels)))
entities_df = pd.DataFrame(entities_labels)
entities_df.columns = ["Entities","Labels"]
entities_df

Unnamed: 0,Entities,Labels
0,America,NE
1,Google,NE
2,China,NE
3,United States,NE
4,Apple,NE


In [None]:
here united states of america is not properly seen , so we are going to arrange

In [18]:
#by putting Binary = False

chunks = nltk.ne_chunk(pos_tags, binary=False) #either NE or not NE
for chunk in chunks:
    print(chunk)
    
entities =[]
labels =[]
for chunk in chunks:
    if hasattr(chunk,'label'):
        #print(chunk)
        entities.append(' '.join(c[0] for c in chunk))
        labels.append(chunk.label())
        
entities_labels = list(set(zip(entities, labels)))
entities_df = pd.DataFrame(entities_labels)
entities_df.columns = ["Entities","Labels"]
entities_df

(PERSON Apple/NNP)
('acquired', 'VBD')
(PERSON Zoom/NNP)
('in', 'IN')
(GPE China/NNP)
('on', 'IN')
('Wednesday', 'NNP')
('6th', 'CD')
('May', 'NNP')
('2020.This', 'CD')
('news', 'NN')
('has', 'VBZ')
('made', 'VBN')
(PERSON Apple/NNP)
('and', 'CC')
(ORGANIZATION Google/NNP)
('stock', 'NN')
('jump', 'NN')
('by', 'IN')
('5', 'CD')
('%', 'NN')
('on', 'IN')
(PERSON Dow/NNP Jones/NNP Index/NNP)
('in', 'IN')
('the', 'DT')
(GPE United/NNP States/NNPS)
('of', 'IN')
(GPE America/NNP)


Unnamed: 0,Entities,Labels
0,China,GPE
1,United States,GPE
2,America,GPE
3,Google,ORGANIZATION
4,Apple,PERSON
5,Zoom,PERSON
6,Dow Jones Index,PERSON


# Basic Named Entity (NE) tagging using NLTK - Sentence based

In [19]:
entities = []
labels = []

sentence = nltk.sent_tokenize(text)
for sent in sentence:
    for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent)),binary=False):
        if hasattr(chunk,'label'):
            entities.append(' '.join(c[0] for c in chunk))
            labels.append(chunk.label())
            
entities_labels = list(set(zip(entities,labels)))

entities_df = pd.DataFrame(entities_labels)
entities_df.columns = ["Entities","Labels"]
entities_df

Unnamed: 0,Entities,Labels
0,China,GPE
1,United States,GPE
2,America,GPE
3,Google,ORGANIZATION
4,Apple,PERSON
5,Zoom,PERSON
6,Dow Jones Index,PERSON


# Using spacy

In [21]:
import spacy 
from spacy import displacy
#SpaCy 2.x brough significant speed and accuracy improvements
spacy.__version__

'3.6.1'

In [23]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
Installing collected packages: en-core-web-sm
  Attempting uninstall: en-core-web-sm
    Found existing installation: en-core-web-sm 3.3.0
    Uninstalling en-core-web-sm-3.3.0:
      Successfully uninstalled en-core-web-sm-3.3.0
Successfully installed en-core-web-sm-3.6.0
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


2024-05-07 22:53:41.738464: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2024-05-07 22:53:41.738511: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2024-05-07 22:53:47.604357: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'nvcuda.dll'; dlerror: nvcuda.dll not found
2024-05-07 22:53:47.604388: W tensorflow/stream_executor/cuda/cuda_driver.cc:263] failed call to cuInit: UNKNOWN ERROR (303)
2024-05-07 22:53:47.607298: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: LAPTOP-DQM1PJAK
2024-05-07 22:53:47.607392: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: LAPTOP-DQM1PJAK


In [24]:
nlp = spacy.load("en_core_web_sm")

In [25]:
doc = nlp(text)

entities = []
labels = []
position_start = []
position_end = []

for ent in doc.ents:
    entities.append(ent)
    labels.append(ent.label_)
    position_start.append(ent.start_char)
    position_end.append(ent.end_char)
    
df = pd.DataFrame({'Entities':entities,'Labels':labels,'Position_Start':position_start, 'Position_End':position_end})

df

Unnamed: 0,Entities,Labels,Position_Start,Position_End
0,(Apple),ORG,0,5
1,(Zoom),GPE,15,19
2,(China),GPE,23,28
3,"(Wednesday, 6th, May)",DATE,32,49
4,(Apple),ORG,74,79
5,(Google),ORG,84,90
6,"(5, %)",PERCENT,105,107
7,"(Dow, Jones, Index)",ORG,111,126
8,"(the, United, States, of, America)",GPE,130,158


In [26]:
spacy.explain("ORG")

'Companies, agencies, institutions, etc.'

Spacy works the best

What you can build with this?

A bot that can analyze financial news and extract information about entities that are mentioned in a
given article along with location, dates and numeric information. This information can be further utilized
in building algorithmic trading bots

Analyze research papers produced everyday on COVID19 and find out any significant developments
1
​