In [1]:
# Standard library
from pathlib import Path

# Third-party: spaCy (pipeline + entity visualizer) and NLTK (tokenizers, tagger, chunker)
import spacy
from spacy import displacy
from nltk import ne_chunk, pos_tag
from nltk.chunk import tree2conlltags
from nltk.tokenize import sent_tokenize, word_tokenize

# Load the small English pipeline once, after all imports; the spaCy cells reuse `nlp`.
nlp = spacy.load("en_core_web_sm")

In [2]:
# https://www.nytimes.com/2021/12/02/us/politics/biden-omicron-covid-testing.html

text = "Mr. Biden's announcement came as several new cases of the Omicron variant were reported in the United States, including five people in New York State, a Minnesota resident who had recently traveled to New York City and a Colorado resident who had recently returned from southern Africa. Hawaii also reported its first known case, and California its second."

# Run the spaCy pipeline over the excerpt; entities are then available on the Doc.
doc = nlp(text)

# jupyter=False makes displacy return the markup as a string instead of displaying it,
# and page=True wraps it in a complete page so it can be saved as a standalone file.
# (Despite the variable name, the "ent" style produces HTML markup.)
svg = displacy.render(doc, style="ent", jupyter=False, page=True, minify=True)
output_path = Path("ner_nyt.html")
# write_text opens, writes and closes the file in one call, so no handle is left open.
output_path.write_text(svg, encoding="utf-8")

# Render the same entity highlighting inline in the notebook.
displacy.render(doc, style="ent", jupyter=True)

In [3]:
# https://www.cnn.com/2021/12/07/tech/elon-musk-wsj-government/index.html

text = "Today there are about 45,000 charging stations in the US. The infrastructure bill allocates $7.5 billion for charging infrastructure, which the Biden administration hopes will reach its goal of 500,000 charging stations. Tesla's charging stations in the US can currently only be used by Tesla vehicles. The company has said it will open them to all automakers, but non-Tesla owners will likely need to purchase an adapter, as Tesla uses a distinct plug. New charging stations will likely be immediately more accessible and affordable for non-Tesla owners."

# Run the spaCy pipeline over the excerpt; entities are then available on the Doc.
doc = nlp(text)

# jupyter=False makes displacy return the markup as a string instead of displaying it,
# and page=True wraps it in a complete page so it can be saved as a standalone file.
# (Despite the variable name, the "ent" style produces HTML markup.)
svg = displacy.render(doc, style="ent", jupyter=False, page=True, minify=True)
output_path = Path("ner_cnn.html")
# write_text opens, writes and closes the file in one call, so no handle is left open.
output_path.write_text(svg, encoding="utf-8")

# Render the same entity highlighting inline in the notebook.
displacy.render(doc, style="ent", jupyter=True)

In [4]:
# https://www.teslarati.com/tesla-new-york-city-12-million-model-3-order/

text = "IN THE MATTER OF a proposed contract between the Department of Citywide Administrative Services of the City of New York and Tesla, Inc., located at 3500 Deer Creek Rd., Palo Alto, CA 94304, for procuring Tesla Model 3 All-Electric Sedans. The contract is in the amount of $12,360,000.00. The term of the contract shall be five years from date of Notice of Award. The proposed contractor has been selected by Sole Source Procurement Method, pursuant to Section 3-05 of the Procurement Policy Board Rules. If the plan does go through, the $12.36 million could effectively purchase about 274 units of the base Model 3 Rear-Wheel-Drive, which cost $44,990 under Tesla's current pricing structure."

# Run the spaCy pipeline over the excerpt; entities are then available on the Doc.
doc = nlp(text)

# jupyter=False makes displacy return the markup as a string instead of displaying it,
# and page=True wraps it in a complete page so it can be saved as a standalone file.
# (Despite the variable name, the "ent" style produces HTML markup.)
svg = displacy.render(doc, style="ent", jupyter=False, page=True, minify=True)
output_path = Path("ner_teslarati.html")
# write_text opens, writes and closes the file in one call, so no handle is left open.
output_path.write_text(svg, encoding="utf-8")

# Render the same entity highlighting inline in the notebook.
displacy.render(doc, style="ent", jupyter=True)

In [5]:
# https://apple.news/AEWDShyRQTiSa9vn7ATGPjQ

text = "Elon Musk resumed his Tesla Inc. stock selloff Thursday, cashing in nearly $1 billion. In fillings with the Securities and Exchange Commission, the Tesla chief executive disclosed he sold another 934,091 Tesla shares, for about $963.2 million. He also exercised options to buy 2.17 million shares at a price of $6.24. After a flurry of stock sales in November, Musk has sold just two tranches of stock since Nov. 23 — the previous one on Dec. 2. In total, Musk has sold about 11.03 million shares worth about $11.82 billion since Nov. 8, a day after he said he would abide by a Twitter poll he posted in which users declared he should sell 10% of his Tesla stake. Some of the stock sales had been put into motion well before the poll was posted."

# Run the spaCy pipeline over the excerpt; entities are then available on the Doc.
doc = nlp(text)

# jupyter=False makes displacy return the markup as a string instead of displaying it,
# and page=True wraps it in a complete page so it can be saved as a standalone file.
# (Despite the variable name, the "ent" style produces HTML markup.)
svg = displacy.render(doc, style="ent", jupyter=False, page=True, minify=True)
output_path = Path("ner_marketwatch.html")
# write_text opens, writes and closes the file in one call, so no handle is left open.
output_path.write_text(svg, encoding="utf-8")

# Render the same entity highlighting inline in the notebook.
displacy.render(doc, style="ent", jupyter=True)

## Updating the NER Model

In [14]:
# https://apple.news/AEWDShyRQTiSa9vn7ATGPjQ
# NOTE(review): the URL above is identical to the one on the Musk/Tesla stock-sale cell
# and does not match the text below, which is a question about the Forge viewer.
# TODO: replace it with the real source of this text.

text = "When using forge viewer for PDF's, is it possible to enable the Snap feature for measuring (same as when viewing models). Also, can the zoom level be changed i.e. we want to zoom in closer than the default maximum? Both of these are possible in BIM360 so I hope it is also possible in our application, can you please advise. Currently we are used v7 viewer. We are added the snap extension and setZoomInLimitFactor method also. But still we are unable to get snapping feature and zoom in function at PDF files."

# Run the unchanged spaCy pipeline over this domain-specific text.
doc = nlp(text)

# jupyter=False makes displacy return the markup as a string instead of displaying it,
# and page=True wraps it in a complete page so it can be saved as a standalone file.
# (Despite the variable name, the "ent" style produces HTML markup.)
svg = displacy.render(doc, style="ent", jupyter=False, page=True, minify=True)
output_path = Path("ner_forge.html")
# write_text opens, writes and closes the file in one call, so no handle is left open.
output_path.write_text(svg, encoding="utf-8")

# Render the same entity highlighting inline in the notebook.
displacy.render(doc, style="ent", jupyter=True)

## Named Entity Recognition with NLTK

In [7]:
# Split the current `text` into sentences with NLTK.
# NOTE: `text` is reassigned by every spaCy cell, so this uses whichever of those cells
# ran last; the recorded outputs in this section (BIM360, PDF, Snap) correspond to the
# Forge viewer text.
tokenized_sent = sent_tokenize(text)

In [8]:
# Collect NLTK named-entity tokens per entity type.
# Each collected item is the triple yielded by tree2conlltags; its third element is the
# IOB label (e.g. 'B-ORGANIZATION'), with 'O' marking tokens outside any entity.
person = []
org = []
gpe = []
ner_types = set()  # every distinct non-'O' label seen, to spot types no list collects

for sentence in tokenized_sent:
    # Tokenize -> POS-tag -> chunk into a named-entity tree.
    tagged_tokens = pos_tag(word_tokenize(sentence))
    chunk_tree = ne_chunk(tagged_tokens)

    # Flatten the tree into one triple per token.
    for conll_triple in tree2conlltags(chunk_tree):
        iob_label = conll_triple[2]
        if iob_label == 'O':  # skip tokens outside any entity
            continue
        ner_types.add(iob_label)
        # Substring test so both the B- and I- prefixed variants match.
        if 'PERSON' in iob_label:
            person.append(conll_triple)
        elif 'ORGANIZATION' in iob_label:
            org.append(conll_triple)
        elif 'GPE' in iob_label:
            gpe.append(conll_triple)

In [9]:
# Display every distinct entity label that was encountered, to check whether any label
# falls outside the PERSON / ORGANIZATION / GPE lists.
ner_types

{'B-ORGANIZATION'}

In [10]:
# Report how many tokens landed in each entity list, one line per list.
for heading, bucket in (('PERSON:', person), ('ORGANIZATION:', org), ('GPE:', gpe)):
    print(heading, len(bucket))

PERSON: 0
ORGANIZATION: 4
GPE: 0


In [11]:
# Distinct triples among the first 10 PERSON tokens collected.
set(person[:10])

set()

In [12]:
# Distinct triples among the first 15 ORGANIZATION tokens collected; the set removes
# repeats, so it can be smaller than the ORGANIZATION count.
set(org[:15])

{('BIM360', 'NNP', 'B-ORGANIZATION'),
 ('PDF', 'NNP', 'B-ORGANIZATION'),
 ('Snap', 'NNP', 'B-ORGANIZATION')}

In [13]:
# Distinct triples among the first 15 GPE tokens collected.
set(gpe[:15])

set()