In [1]:
from typing import Union, List, Any, Optional, Dict

import os
import re
import torch
import time
import json
import glob
import pickle
import random
import urllib
import requests

import pandas as pd

from bs4 import BeautifulSoup
from tqdm import tqdm
from pathlib import Path
from textblob import TextBlob
from collections import Counter
from transformers import pipeline

from utilities import cleaning_utils
from utilities.spar_utils import NER

### Set up Named Entity Recognition (NER) 
We'll use: 
* TextBlob (NLTK's implementation of punkttokenizer under the hood) to split into sentences.
* SPaR.txt to predict which objects occur, some cleaning on top of this. 
  * We'll assume you have docker installed on your machine


In [2]:
# download SPaR.txt if required
from pathlib import Path
spartxt_path = Path("SPaR.txt/")
if not spartxt_path.exists():
    !git clone https://github.com/rubenkruiper/SPaR.txt.git

In [3]:
try: 
    ### Start our `SPaR_API` container if it exists
    !docker start SPaR_API
except:
    ### Else, set up the `SPaR_API` container
    # build the SPaR.txt image, call it `spar`
    !docker build -t spar SPaR.txt/.
    # Run the image called `spar` in a container that we will call `SPaR_API`, with the API port at localhost:8501
    # NOTE: this will train a SPaR.txt model locally, which takes about 20 minutes on a CPU 
    !docker run --name SPaR_API -p 8501:8501 spar

SPaR_API


In [4]:
ner_api = "http://localhost:8501/predict_objects/"

In [5]:
example = "Thermoplastic materials in ceilings, rooflights and lighting diffusers provide a significant hazard in a fire."
response = requests.post(ner_api,  json={"sentence": example}).json()
response

{'prediction': {'obj': ['Thermoplastic materials',
   'ceilings',
   'rooflights',
   'lighting diffusers',
   'a hazard',
   'a fire']},
 'num_input_tokens': 26,
 'num_output_tokens': 17}

### Parsing an example IFC class and description
* source: https://search-test.bsdd.buildingsmart.org/uri/buildingsmart/ifc-4.3/class/IfcWindow

In [6]:
label = "IfcWindow"

In [7]:
description = """
The window is a building element that is predominately used to provide natural light and fresh air. 
It includes vertical opening but also horizontal opening such as skylights or light domes. 
It includes constructions with swinging, pivoting, sliding, or revolving panels and fixed panels. 
A window consists of a lining and one or several panels. 
A window can:be a "free standing" window, contained in an IfcSpatialElement such as an IfcBuildingStorey. 
fill an opening, typically in a wall. 
The window will then have a FillsVoids attribute which uses the IfcRelFillsElement relationship to relate the IfcWindow with the IfcOpeningElement; 
be part of an element assembly, typically an IfcCurtainWall. 
The window will then have a Decomposes attribute which uses the the IfcRelAggregates relationship to relate the window with the assembly of elements;
There are two main representations for window occurrences:
IfcWindow entities that have a 3D rectangle 'Profile' shape representation defined. 
This profile can then be used to parametrically generate the geometry of a window. 
If not provided, the profile of the IfcOpeningElement can be used if the window fills an opening. 
The parameters are specified on the relating IfcWindowType that references IfcWindowLiningProperties and 
IfcWindowPanelProperties for each panel in the window; 
IfcWindow entities that are not parametrically generated and have only 'Brep', or 'SurfaceModel' geometry.
In addition, an IfcWindow may commonly include a 'FootPrint' representation defining the 2D shape of the window and its swing.
the window width and height the window opening direction (by the positive y-axis of the ObjectPlacement)
The IfcWindowType specifies parameters which are common to all of its occurrences of IfcWindow:
the partitioning type (single panel, double panel, tripel panel, more panels) the operation type 
(swing, tilt and turn, pivot revolve, fixed casement, etc.) 
the window panel hinge side (by using two different styles for right and left opening windows) 
the particular attributes for the lining by the IfcWindowLiningProperties the particular attributes 
for the panels by the  IfcWindowPanelPropertiesREFERENCE Definition according to ISO 6707-1 Construction 
for closing a vertical or near vertical opening in a wall or pitched roof that will admit light and 
may admit fresh air. NOTE The entity IfcWindowStandardCase has been deleted. Use an IfcWindow with 
a 'Profile' representation instead. The IfcWindow should also have an IfcWindowType with 
ParameterTakesPrecedence set to 'TRUE'. IFC4 CHANGE The attributes PredefinedType and OperationType are
added, the applicable type object has been changed to IfcWindowType. HISTORY New entity in IFC1.0.
"""

In [8]:
ner = NER(ner_api)
ner.split_into_sentences(to_be_split=description)[:3]

['The window is a building element that is predominately used to provide natural light and fresh air.',
 'It includes vertical opening but also horizontal opening such as skylights or light domes.',
 'It includes constructions with swinging, pivoting, sliding, or revolving panels and fixed panels.']

In [9]:
raw_ner_output = ner.process_text(description)
random.sample(raw_ner_output, 5)

['an opening',
 'fixed casement',
 'HISTORY New entity',
 'more panels',
 'The parameters']

In [10]:
# Some basic cleaning for the entire set of extracted objects
regex_filter = cleaning_utils.RegexFilter()
def basic_cleaning(to_be_cleaned):
    # some basic cleaning steps
    _, regex_cleaned  = regex_filter.run_filter(to_be_cleaned) # _ would be the list of terms removed by our regex filters
    basic_cleaned = cleaning_utils.custom_cleaning_rules(regex_cleaned)
    determiners_removed = [cleaning_utils.remove_determiners(t) for t in basic_cleaned]
    cleaned_terms = [t for t in determiners_removed if t]
    cleaned_counter = Counter(cleaned_terms)
    return cleaned_terms, cleaned_counter

In [11]:
cleaned_ner_output, terms_with_counts = basic_cleaning(raw_ner_output)
random.sample(cleaned_ner_output, 5)

['NOTE The entity IfcWindowStandardCase',
 'tilt',
 'panels',
 'pitched roof',
 'IfcWindowLiningProperties']

In [12]:
# Another, smaller example
label = "abrasion"
description = """wearing or grinding away of material by friction; usually caused by sand, gravel, or stones, carried by wind or water"""

In [13]:
raw_ner_output = ner.process_text(description)
cleaned_ner_output, terms_with_counts = basic_cleaning(raw_ner_output)
cleaned_ner_output

['wearing',
 'material',
 'friction',
 'sand',
 'gravel',
 'stones',
 'wind',
 'water']

### First, remove all non-English rows from the bSDD extract
* We'll use an existing pretrained language predictor `papluca/xlm-roberta-base-language-detection`

In [30]:
# Set up the language classifier
device = 0 if torch.cuda.is_available() else -1
model_ckpt = "papluca/xlm-roberta-base-language-detection"
pipe = pipeline("text-classification", model=model_ckpt, device=device)

In [31]:
def split_list(some_list: List, chunk_size: int=8) -> List[List]:
    """
    Helper function to split a list into smaller lists of a given size.

    :param some_list:   List that has to be split into chunks.
    :param chunk_size:  Size of the sublists that will be returned.
    :return list_of_sublists:  A list of sublists, each with a maximum size of `chunk_size`.
    """
    return [some_list[i:i + chunk_size] for i in range(0, len(some_list), chunk_size)]

In [42]:
# input and output csv for the language detection
unprocessed_csv = Path("data", "bsdd_descriptions.csv")
english_csv = Path("data", "bsdd_descriptions_english.csv")

In [49]:
# First, we'll drop non-english lines, based on  
if not english_csv.exists():
    bsdd_df = pd.read_csv(unprocessed_csv)
    
    # combine name and description to determine language
    concatenations = []
    for name, description in zip(bsdd_df.name, bsdd_df.description):
        new_str = str(name) + " " + str(description)
        concatenations.append(new_str.encode("ascii", "ignore").decode())
        
    # reduce processing time a little by considering unique values only (many duplicates in bSDD)
    batch_size = 8 # not sure what is the expected batchsize for the language prediction model
    unique_concatenations = list(set(concatenations))
    languages = []
    subsets_of_unique_concatenations = split_list(unique_concatenations, batch_size)
    print(f"Predicting languages for unique concatenations of name and description, batch size {batch_size}")
    for subset in tqdm(subsets_of_unique_concatenations):
        languages += [predicted['label'] for predicted in pipe(subset)]
    
    language_lookup = {k:v for k, v in zip(unique_concatenations, languages)}
    
    # drop the non-english rows in the dataframe
    for idx, row in bsdd_df.iterrows():
        key = str(row["name"]) + " " + str(row["description"])
        language = language_lookup[key.encode("ascii", "ignore").decode()]
        print(idx, language)
        if language != 'en':
            bsdd_df.drop(idx, inplace=True) 
        
    bsdd_df.to_csv(english_csv)
else:
    bsdd_df = pd.read_csv(english_csv)

### Process all descriptions in our bSDD extract

In [50]:
english_csv = Path("data", "bsdd_descriptions_english.csv")
processed_csv = Path("data", "bsdd_parsed_descriptions.csv")

In [52]:
if not processed_csv.exists():
    bsdd_df = pd.read_csv(english_csv)
    
    # process all of the descriptions with SPaR.txt 
    # we'll process the unique description, since there is a lot of overlap in bSDD
    unique_descriptions = list(set(bsdd_df.description))
    unique_predictions_raw = []
    print(f"Predicting terms found in {len(unique_descriptions)} unique descriptions")
    for description in tqdm(unique_descriptions):
        try:
            unique_predictions_raw.append(ner.process_text(description))
        except:
            unique_predictions_raw.append([])
    
    # We'll do some cleaning of terms, count-based as well
    cleaned_object_lists, subset_counts = zip(*[basic_cleaning(sublist) for sublist in unique_predictions_raw])
    
    # Count-based cleaning (only consider objects that occur at least X times in all descriptions)
    cleaned_counter = sum(subset_counts, Counter())
    cleaned_object_lists = [[t for t in terms if cleaned_counter[t] >= 3] for terms in cleaned_object_lists]
    
    # update the dataframe and save to csv 
    objects_column = [''] * len(bsdd_df.description)
    for description, cleaned_prediction in zip(unique_descriptions, cleaned_object_lists):
        indices = bsdd_df.index[bsdd_df.description==description].tolist()
        for idx in indices:
            objects_column[idx] = ", ".join(cleaned_prediction)
            
    bsdd_df["description_NER"] = objects_column
    bsdd_df.to_csv(processed_csv)
else:
    bsdd_df = pd.read_csv(processed_csv)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3604/3604 [06:58<00:00,  8.62it/s]


In [54]:
# check the DataFrame, should be only/mostly english and have objects assigned that occur in the descriptions
bsdd_df

Unnamed: 0.1,Unnamed: 0,subject,name,uid,description,description_NER
0,0,https://identifier.buildingsmart.org/uri/FTIA/...,Location track,LocationTrack,Location track number or name as an abbreviation,
1,1,https://identifier.buildingsmart.org/uri/FTIA/...,Additional details,AdditionalDetails,E.g. additional information related to install...,"information, installation"
2,2,https://identifier.buildingsmart.org/uri/FTIA/...,Post height,PostHeight,Height of the post in millimeters if sign has ...,"Height, millimeters, sign"
3,3,https://identifier.buildingsmart.org/uri/FTIA/...,Installation direction,InstallationDirection,Installation direction of the sign,sign
4,4,https://identifier.buildingsmart.org/uri/FTIA/...,Route number,RouteNumber,The route number on which the object is located,object
...,...,...,...,...,...,...
12525,27864,https://identifier.buildingsmart.org/uri/NVDB/...,DiameterYtre_9729,DiameterYtre_9729,Angir ytre diameter for trekkekum.,
12526,27939,https://identifier.buildingsmart.org/uri/NVDB/...,Materialtype_10429,Materialtype_10429,Angir type materiale.,
12527,27979,https://identifier.buildingsmart.org/uri/NVDB/...,Materialtype_9388,Materialtype_9388,.,
12528,28080,https://identifier.buildingsmart.org/uri/NVDB/...,Spenning_10049,Spenning_10049,Angir spenningen som leveres ut fra enheten.,


In [55]:
# Quickly grab all unique terms that we found in all of the descriptions
all_objects = []
for string_or_list in bsdd_df.description_NER:
    all_objects += string_or_list.split(", ")
all_objects = list(set(all_objects))
len(all_objects)

954

In [56]:
random.sample(all_objects, 10)

['performance',
 'pedestal',
 'OverallHeight',
 'cross - section size',
 'specification',
 'operating fluid',
 'reference line',
 'IEC 60947 series',
 'flange',
 'continuously']

In [57]:
# drop the rows that don't have any objects in their description
for idx, row in bsdd_df.iterrows():
    if not row["description_NER"]:
        bsdd_df.drop(idx, inplace=True) 

In [61]:
final_csv = Path("data", "bsdd_graph_input.csv")
bsdd_df.to_csv(final_csv)