In [1]:
import pandas as pd
import random

import spacy
from spacy.training.example import Example
from spacy import displacy
from pathlib import Path
nlp = spacy.load("en_core_web_sm")


from spacy.util import minibatch, compounding

from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.chunk import tree2conlltags

In [2]:
# https://www.nytimes.com/2021/12/02/us/politics/biden-omicron-covid-testing.html

text = "Mr. Biden's announcement came as several new cases of the Omicron variant were reported in the United States, including five people in New York State, a Minnesota resident who had recently traveled to New York City and a Colorado resident who had recently returned from southern Africa. Hawaii also reported its first known case, and California its second."

doc = nlp(text)
# Render the entity markup as a full HTML page (string) so it can be saved to disk.
svg = displacy.render(doc, style="ent", jupyter=False, page=True, minify=True)
output_path = Path("ner_nyt.html")
# write_text opens, writes and closes the file in one call; the previous
# open(...).write(...) left closing the handle to the garbage collector.
output_path.write_text(svg, encoding="utf-8")
# Show the same entities inline in the notebook.
displacy.render(doc, style="ent", jupyter=True)

In [3]:
# https://www.cnn.com/2021/12/07/tech/elon-musk-wsj-government/index.html

text = "Today there are about 45,000 charging stations in the US. The infrastructure bill allocates $7.5 billion for charging infrastructure, which the Biden administration hopes will reach its goal of 500,000 charging stations. Tesla's charging stations in the US can currently only be used by Tesla vehicles. The company has said it will open them to all automakers, but non-Tesla owners will likely need to purchase an adapter, as Tesla uses a distinct plug. New charging stations will likely be immediately more accessible and affordable for non-Tesla owners."

doc = nlp(text)
# Render the entity markup as a full HTML page (string) so it can be saved to disk.
svg = displacy.render(doc, style="ent", jupyter=False, page=True, minify=True)
output_path = Path("ner_cnn.html")
# write_text opens, writes and closes the file in one call; the previous
# open(...).write(...) left closing the handle to the garbage collector.
output_path.write_text(svg, encoding="utf-8")
# Show the same entities inline in the notebook.
displacy.render(doc, style="ent", jupyter=True)

In [4]:
# https://www.teslarati.com/tesla-new-york-city-12-million-model-3-order/

text = "IN THE MATTER OF a proposed contract between the Department of Citywide Administrative Services of the City of New York and Tesla, Inc., located at 3500 Deer Creek Rd., Palo Alto, CA 94304, for procuring Tesla Model 3 All-Electric Sedans. The contract is in the amount of $12,360,000.00. The term of the contract shall be five years from date of Notice of Award. The proposed contractor has been selected by Sole Source Procurement Method, pursuant to Section 3-05 of the Procurement Policy Board Rules. If the plan does go through, the $12.36 million could effectively purchase about 274 units of the base Model 3 Rear-Wheel-Drive, which cost $44,990 under Tesla's current pricing structure."

doc = nlp(text)
# Render the entity markup as a full HTML page (string) so it can be saved to disk.
svg = displacy.render(doc, style="ent", jupyter=False, page=True, minify=True)
output_path = Path("ner_teslarati.html")
# write_text opens, writes and closes the file in one call; the previous
# open(...).write(...) left closing the handle to the garbage collector.
output_path.write_text(svg, encoding="utf-8")
# Show the same entities inline in the notebook.
displacy.render(doc, style="ent", jupyter=True)

In [5]:
# https://apple.news/AEWDShyRQTiSa9vn7ATGPjQ

text = "Elon Musk resumed his Tesla Inc. stock selloff Thursday, cashing in nearly $1 billion. In fillings with the Securities and Exchange Commission, the Tesla chief executive disclosed he sold another 934,091 Tesla shares, for about $963.2 million. He also exercised options to buy 2.17 million shares at a price of $6.24. After a flurry of stock sales in November, Musk has sold just two tranches of stock since Nov. 23 — the previous one on Dec. 2. In total, Musk has sold about 11.03 million shares worth about $11.82 billion since Nov. 8, a day after he said he would abide by a Twitter poll he posted in which users declared he should sell 10% of his Tesla stake. Some of the stock sales had been put into motion well before the poll was posted."

doc = nlp(text)
# Render the entity markup as a full HTML page (string) so it can be saved to disk.
svg = displacy.render(doc, style="ent", jupyter=False, page=True, minify=True)
output_path = Path("ner_marketwatch.html")
# write_text opens, writes and closes the file in one call; the previous
# open(...).write(...) left closing the handle to the garbage collector.
output_path.write_text(svg, encoding="utf-8")
# Show the same entities inline in the notebook.
displacy.render(doc, style="ent", jupyter=True)

## Updating the NER Model

In [6]:
# Keep a handle on the loaded pipeline's "ner" component; later cells call
# ner.add_label(...) on it before training.
ner = nlp.get_pipe("ner")

In [7]:
def built_spacy_ner(text: str, target: str, type: str):
    """Build one spaCy-style NER training example for ``target`` inside ``text``.

    Only the first occurrence of ``target`` is annotated.

    Args:
        text: The full sentence the example is built from.
        target: The exact substring to mark as an entity.
        type: The entity label to assign (e.g. "PRODUCT", "ORG").

    Returns:
        ``(text, {"entities": [(start, end, type)]})`` with character offsets
        where ``end`` is exclusive, or ``None`` when ``target`` is empty or
        does not occur in ``text``.
    """
    if not target:
        # "".find-style matching succeeds at offset 0 for an empty target and
        # would produce a zero-length (0, 0) span, which is not a usable
        # entity annotation; treat it like "not found".
        return None

    start = text.find(target)
    if start == -1:
        return None

    return (text, {"entities": [(start, start + len(target), type)]})

In [8]:
# Load the CSV export and keep only the "body" column as a one-column frame.
df = pd.read_csv("so.csv")[["body"]]

In [9]:
# Boolean mask of rows whose body mentions "Revit"; missing bodies count as
# no match. Named `mentions_revit` so the builtin `filter` is not shadowed
# for the rest of the kernel session.
mentions_revit = df['body'].str.contains("Revit", na=False)

# reset_index() returns a new frame with a fresh 0..n-1 index and keeps the
# original row labels in an "index" column, so df_revit.body[k] selects the
# k-th matching row.
df_revit = df[mentions_revit].reset_index()

In [10]:
# Hand-written training examples: each spec is (sentence, entity text, label)
# and is converted to spaCy's (text, {"entities": [...]}) form.
TRAIN_DATA = [
    built_spacy_ner(*spec)
    for spec in (
        ("I want to create a cloud-based service that can connect to a Revit Server.", "Revit Server", "PRODUCT"),
        ("I'm new to the Forge API and not sure where a design parameter is required", "Forge API", "PRODUCT"),
        ("I've uploaded a Revit model to my OSS bucket.", "OSS", "PRODUCT"),
        ("Changes are sent to a central BIM 360 server.", "BIM 360", "PRODUCT"),
        ("All of this is possible on IFC.", "IFC", "ORG"),
        ("I work for Autodesk.", "Autodesk", "ORG"),
    )
]
TRAIN_DATA

[('I want to create a cloud-based service that can connect to a Revit Server.',
  {'entities': [(61, 73, 'PRODUCT')]}),
 ("I'm new to the Forge API and not sure where a design parameter is required",
  {'entities': [(15, 24, 'PRODUCT')]}),
 ("I've uploaded a Revit model to my OSS bucket.",
  {'entities': [(34, 37, 'PRODUCT')]}),
 ('Changes are sent to a central BIM 360 server.',
  {'entities': [(30, 37, 'PRODUCT')]}),
 ('All of this is possible on IFC.', {'entities': [(27, 30, 'ORG')]}),
 ('I work for Autodesk.', {'entities': [(11, 19, 'ORG')]})]

In [11]:
# Baseline: entities the stock model finds in one Revit-related post,
# rendered before any additional training.
text = df_revit.body[4]

doc = nlp(text)
# Render the entity markup as a full HTML page (string) so it can be saved to disk.
svg = displacy.render(doc, style="ent", jupyter=False, page=True, minify=True)
output_path = Path("ner_forge_before.html")
# write_text opens, writes and closes the file in one call; the previous
# open(...).write(...) left closing the handle to the garbage collector.
output_path.write_text(svg, encoding="utf-8")
# Show the same entities inline in the notebook.
displacy.render(doc, style="ent", jupyter=True)

In [12]:
# Register every label that appears in the training annotations with the
# NER component (position 2 of each entity tuple is the label).
for _, annotations in TRAIN_DATA:
    for label in (entity[2] for entity in annotations.get("entities")):
        ner.add_label(label)

In [13]:
# Fine-tune the NER component on TRAIN_DATA.
# Create an optimizer and list every pipe that must NOT be trained.
optimizer = nlp.create_optimizer()
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

# Re-fetch the NER component so this cell does not depend on `ner` still
# pointing at the pipe from an earlier cell.
ner = nlp.get_pipe('ner')

# select_pipes(disable=...) is the spaCy v3 spelling of the deprecated
# disable_pipes(*names); the disabled pipes are restored when the block exits.
with nlp.select_pipes(disable=other_pipes):
    for itn in range(10):
        # No seed is set in the visible notebook, so the shuffle order (and
        # therefore the reported loss) varies between runs.
        random.shuffle(TRAIN_DATA)
        losses = {}

        # Batch the examples and run ONE update per batch. Updating once per
        # example inside the batch loop made the minibatching a no-op.
        for batch in spacy.util.minibatch(TRAIN_DATA, size=2):
            examples = [
                Example.from_dict(nlp.make_doc(text), annotations)
                for text, annotations in batch
            ]
            nlp.update(examples, drop=0.35, sgd=optimizer, losses=losses)
# `losses` holds the loss accumulated over the last pass only.
print("Final loss: ", losses)

Final loss:  {'ner': 0.0037505520229520733}


In [14]:
# Same post as the "before" cell, rendered again after the NER update so the
# two HTML files can be compared.
text = df_revit.body[4]

doc = nlp(text)
# Render the entity markup as a full HTML page (string) so it can be saved to disk.
svg = displacy.render(doc, style="ent", jupyter=False, page=True, minify=True)
output_path = Path("ner_forge_after.html")
# write_text opens, writes and closes the file in one call; the previous
# open(...).write(...) left closing the handle to the garbage collector.
output_path.write_text(svg, encoding="utf-8")
# Show the same entities inline in the notebook.
displacy.render(doc, style="ent", jupyter=True)

## Named Entity Recognition with NLTK

In [15]:
# Input for the NLTK pipeline: the Elon Musk / Tesla stock-sale excerpt that
# the spaCy section also uses, so the two taggers can be compared.
text = "Elon Musk resumed his Tesla Inc. stock selloff Thursday, cashing in nearly $1 billion. In fillings with the Securities and Exchange Commission, the Tesla chief executive disclosed he sold another 934,091 Tesla shares, for about $963.2 million. He also exercised options to buy 2.17 million shares at a price of $6.24. After a flurry of stock sales in November, Musk has sold just two tranches of stock since Nov. 23 — the previous one on Dec. 2. In total, Musk has sold about 11.03 million shares worth about $11.82 billion since Nov. 8, a day after he said he would abide by a Twitter poll he posted in which users declared he should sell 10% of his Tesla stake. Some of the stock sales had been put into motion well before the poll was posted."

In [16]:
# Split the excerpt into sentences; the tagging loop below processes one
# sentence at a time.
tokenized_sent = sent_tokenize(text)

In [17]:
# Buckets of (token, POS tag, IOB tag) rows, one list per entity type of interest.
person = []
org = []
gpe = []
ner_types = set()

for sentence in tokenized_sent:
    tagged_tokens = pos_tag(word_tokenize(sentence))
    chunk_tree = ne_chunk(tagged_tokens)

    for conll_row in tree2conlltags(chunk_tree):
        # Deliberately NOT named `ner`: that global holds the spaCy NER pipe
        # in the cells above, and overwriting it with a string breaks those
        # cells if they are re-run.
        iob_tag = conll_row[2]
        if iob_tag == 'O':  # skip Outside labels
            continue
        ner_types.add(iob_tag)  # track every tag seen so none is silently dropped
        if 'PERSON' in iob_tag:   # matches both B-PERSON and I-PERSON
            person.append(conll_row)
        elif 'ORGANIZATION' in iob_tag:
            org.append(conll_row)
        elif 'GPE' in iob_tag:
            gpe.append(conll_row)

In [18]:
# Display the distinct IOB tags collected above, to check that every entity
# type found has a matching bucket.
ner_types

{'B-GPE', 'B-ORGANIZATION', 'B-PERSON', 'I-ORGANIZATION'}

In [19]:
# Number of tagged tokens collected per entity type.
for _heading, _rows in (('PERSON:', person), ('ORGANIZATION:', org), ('GPE:', gpe)):
    print(_heading, len(_rows))

PERSON: 4
ORGANIZATION: 6
GPE: 1


In [20]:
# Unique (token, POS tag, IOB tag) rows among the first 10 PERSON hits.
set(person[:10])

{('Elon', 'NNP', 'B-PERSON'), ('Musk', 'NNP', 'B-PERSON')}

In [21]:
# Unique (token, POS tag, IOB tag) rows among the first 15 ORGANIZATION hits.
set(org[:15])

{('Commission', 'NNP', 'I-ORGANIZATION'),
 ('Exchange', 'NNP', 'B-ORGANIZATION'),
 ('Inc.', 'NNP', 'I-ORGANIZATION'),
 ('Securities', 'NNPS', 'B-ORGANIZATION'),
 ('Tesla', 'NNP', 'B-ORGANIZATION')}

In [22]:
# Unique (token, POS tag, IOB tag) rows among the first 15 GPE hits.
set(gpe[:15])

{('Tesla', 'NNP', 'B-GPE')}