In [40]:
import pandas as pd
import random

import spacy
from spacy.training.example import Example
from spacy import displacy
from pathlib import Path
# Load the pretrained `en_core_web_sm` pipeline once; every spaCy cell below
# reuses this `nlp` object.
nlp = spacy.load("en_core_web_sm")

from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.chunk import tree2conlltags
from nltk.tree import Tree

In [41]:
# https://www.nytimes.com/2021/12/02/us/politics/biden-omicron-covid-testing.html

text = "Mr. Biden's announcement came as several new cases of the Omicron variant were reported in the United States, including five people in New York State, a Minnesota resident who had recently traveled to New York City and a Colorado resident who had recently returned from southern Africa. Hawaii also reported its first known case, and California its second."

# Run the pipeline, then render the recognized entities twice: once as a
# standalone HTML page saved to disk, once inline in the notebook.
doc = nlp(text)
html = displacy.render(doc, style="ent", jupyter=False, page=True, minify=True)
output_path = Path("ner_nyt.html")
# write_text opens, writes, and closes the file in one call, so no file
# handle is left open.
output_path.write_text(html, encoding="utf-8")
displacy.render(doc, style="ent", jupyter=True)

In [42]:
# https://www.teslarati.com/tesla-new-york-city-12-million-model-3-order/

text = "IN THE MATTER OF a proposed contract between the Department of Citywide Administrative Services of the City of New York and Tesla, Inc., located at 3500 Deer Creek Rd., Palo Alto, CA 94304, for procuring Tesla Model 3 All-Electric Sedans. The contract is in the amount of $12,360,000.00. The term of the contract shall be five years from date of Notice of Award. The proposed contractor has been selected by Sole Source Procurement Method, pursuant to Section 3-05 of the Procurement Policy Board Rules. If the plan does go through, the $12.36 million could effectively purchase about 274 units of the base Model 3 Rear-Wheel-Drive, which cost $44,990 under Tesla's current pricing structure."

# Run the pipeline, then render the recognized entities twice: once as a
# standalone HTML page saved to disk, once inline in the notebook.
doc = nlp(text)
html = displacy.render(doc, style="ent", jupyter=False, page=True, minify=True)
output_path = Path("ner_teslarati.html")
# write_text opens, writes, and closes the file in one call, so no file
# handle is left open.
output_path.write_text(html, encoding="utf-8")
displacy.render(doc, style="ent", jupyter=True)

## Updating the NER Model

In [44]:
def built_spacy_ner(text, target, type):
    """Build one spaCy-style NER training example for a single entity.

    The entity span is the first occurrence of ``target`` inside ``text``,
    expressed as character offsets.

    Args:
        text: The full sentence containing the entity.
        target: The exact substring to annotate.
        type: The entity label to attach to the span (e.g. ``"PRODUCT"``).
            The parameter name shadows the builtin but is kept so existing
            callers keep working.

    Returns:
        A tuple ``(text, {"entities": [(start, end, type)]})`` where
        ``start`` is inclusive and ``end`` is exclusive.

    Raises:
        ValueError: If ``target`` does not occur in ``text``. Without this
            check ``find`` returns -1 and a bogus span is produced.
    """
    start = text.find(target)
    if start == -1:
        raise ValueError(f"target {target!r} not found in text {text!r}")
    end = start + len(target)

    return (text, {"entities": [(start, end, type)]})

In [45]:
# Hand-labelled samples as (sentence, entity substring, entity label) triples;
# each one is converted to spaCy's (text, {"entities": [...]}) training format.
TRAIN_DATA = [
    built_spacy_ner(sentence, phrase, label)
    for sentence, phrase, label in (
        ("I want to create a cloud-based service connected to Revit Server.", "Revit Server", "PRODUCT"),
        ("I'm new to the Forge API unsure where a design parameter is required", "Forge API", "API"),
        ("Autodesk Forge is my Platform of choice", "Autodesk Forge", "PRODUCT"),
        ("I've uploaded a Revit model to my OSS bucket.", "OSS", "SERVICE"),
        ("Changes are sent to a central BIM 360 server.", "BIM 360", "PRODUCT"),
        ("All of this is possible on the IFC format.", "IFC", "FORMAT"),
        ("The native file format for Revit is RVT.", "RVT", "FORMAT"),
        ("I work for Autodesk.", "Autodesk", "ORG"),
        ("Model Derivative API provides translation", "Model Derivative API", "API"),
        ("The Model Derivative API used in conjunction with the Viewer", "Model Derivative API", "API"),
        ("I would like to automate Revit with the Design Automation API", "Design Automation API", "API"),
    )
]


In [46]:
# Inspect the assembled examples: each is (sentence, {"entities": [(start, end, label)]}).
TRAIN_DATA

[('I want to create a cloud-based service connected to Revit Server.',
  {'entities': [(52, 64, 'PRODUCT')]}),
 ("I'm new to the Forge API unsure where a design parameter is required",
  {'entities': [(15, 24, 'API')]}),
 ('Autodesk Forge is my Platform of choice',
  {'entities': [(0, 14, 'PRODUCT')]}),
 ("I've uploaded a Revit model to my OSS bucket.",
  {'entities': [(34, 37, 'SERVICE')]}),
 ('Changes are sent to a central BIM 360 server.',
  {'entities': [(30, 37, 'PRODUCT')]}),
 ('All of this is possible on the IFC format.',
  {'entities': [(31, 34, 'FORMAT')]}),
 ('The native file format for Revit is RVT.',
  {'entities': [(36, 39, 'FORMAT')]}),
 ('I work for Autodesk.', {'entities': [(11, 19, 'ORG')]}),
 ('Model Derivative API provides translation', {'entities': [(0, 20, 'API')]}),
 ('The Model Derivative API used in conjunction with the Viewer',
  {'entities': [(4, 24, 'API')]}),
 ('I would like to automate Revit with the Design Automation API',
  {'entities': [(40, 61, 'API')]})]

In [47]:
# https://stackoverflow.com/questions/66779014/use-the-revit-native-file-language-instead-of-english-when-converting-properties 

text = "I've been using for a long time the Model Derivative API from Autodesk Forge to (successfully) export Revit files to IFC. However, I notice that even when the original Revit files are saved with the French version of the software (namely, Revit FRA), the properties (e.g. ) are exported in English ( ), and I see no option in the Model Derivative API to force using the native language. Does someone have an idea on how to do that (if it is feasible)? I have searched on the official documentation and tried modifying the parameters mentioned for the conversion (see ), but with no success so far. Of course the same issue can be of interest for those exporting to other formats than IFC, or other languages than French. Thanks!"

# Baseline: entities recognized on this text before the NER component is
# updated, saved as a standalone HTML page and also shown inline.
doc = nlp(text)
html = displacy.render(doc, style="ent", jupyter=False, page=True, minify=True)
output_path = Path("ner_forge_before.html")
# write_text opens, writes, and closes the file in one call, so no file
# handle is left open.
output_path.write_text(html, encoding="utf-8")
displacy.render(doc, style="ent", jupyter=True)

In [48]:
# Fetch the NER component so new entity labels can be registered on it.
ner = nlp.get_pipe('ner')

# Walk every annotated span in the training data and register its label
# (the third element of each (start, end, label) tuple).
for _, annotations in TRAIN_DATA:
    for span in annotations.get("entities"):
        label = span[2]
        ner.add_label(label)

In [49]:
# Create an optimizer and list every pipe except 'ner'; those pipes are
# disabled during training so that only the NER component is updated.
optimizer = nlp.create_optimizer()
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

# select_pipes() is the spaCy v3 replacement for the deprecated disable_pipes().
with nlp.select_pipes(disable=other_pipes):
    for itn in range(10):
        # Shuffle a copy each epoch so the shared TRAIN_DATA list keeps the
        # order shown when it was displayed earlier in the notebook.
        epoch_data = random.sample(TRAIN_DATA, len(TRAIN_DATA))
        losses = {}

        # Update once per minibatch; updating example by example would make
        # the batching pointless.
        for batch in spacy.util.minibatch(epoch_data, size=2):
            examples = [
                Example.from_dict(nlp.make_doc(sample_text), sample_annotations)
                for sample_text, sample_annotations in batch
            ]
            nlp.update(examples, drop=0.35, sgd=optimizer, losses=losses)

# `losses` holds the loss accumulated over the last epoch only.
print("Final loss: ", losses)

Final loss:  {'ner': 27.724280745607512}


In [50]:
text = "I've been using for a long time the Model Derivative API from Autodesk Forge to (successfully) export Revit files to IFC. However, I notice that even when the original Revit files are saved with the French version of the software (namely, Revit FRA), the properties (e.g. ) are exported in English ( ), and I see no option in the Model Derivative API to force using the native language. Does someone have an idea on how to do that (if it is feasible)? I have searched on the official documentation and tried modifying the parameters mentioned for the conversion (see ), but with no success so far. Of course the same issue can be of interest for those exporting to other formats than IFC, or other languages than French. Thanks!"

# Re-run the updated pipeline on the same text and render the entities to a
# separate HTML file, for comparison with ner_forge_before.html.
doc = nlp(text)
html = displacy.render(doc, style="ent", jupyter=False, page=True, minify=True)
output_path = Path("ner_forge_after.html")
# write_text opens, writes, and closes the file in one call, so no file
# handle is left open.
output_path.write_text(html, encoding="utf-8")
displacy.render(doc, style="ent", jupyter=True)

## Named Entity Recognition with NLTK

In [51]:
# Same Teslarati contract passage that was run through spaCy earlier
# (https://www.teslarati.com/tesla-new-york-city-12-million-model-3-order/),
# reused here as input for the NLTK chunker.
text = "IN THE MATTER OF a proposed contract between the Department of Citywide Administrative Services of the City of New York and Tesla, Inc., located at 3500 Deer Creek Rd., Palo Alto, CA 94304, for procuring Tesla Model 3 All-Electric Sedans. The contract is in the amount of $12,360,000.00. The term of the contract shall be five years from date of Notice of Award. The proposed contractor has been selected by Sole Source Procurement Method, pursuant to Section 3-05 of the Procurement Policy Board Rules. If the plan does go through, the $12.36 million could effectively purchase about 274 units of the base Model 3 Rear-Wheel-Drive, which cost $44,990 under Tesla's current pricing structure."

In [52]:
# For each sentence: tokenize, POS-tag, and chunk, then print the label and
# the joined words of every chunk that carries a label.
for sentence in sent_tokenize(text):
    tagged_tokens = pos_tag(word_tokenize(sentence))
    for node in ne_chunk(tagged_tokens):
        # Nodes without a `label` attribute are skipped.
        if not hasattr(node, 'label'):
            continue
        entity_words = ' '.join(leaf[0] for leaf in node)
        print(node.label(), entity_words)

ORGANIZATION Department
ORGANIZATION Citywide Administrative Services
ORGANIZATION City
GPE New York
PERSON Tesla
PERSON Palo Alto
PERSON Tesla Model
PERSON Award
PERSON Sole Source
ORGANIZATION Procurement Policy Board Rules
PERSON Tesla
