In [1]:
from typing import Union, List, Any, Optional, Dict

import os
import re
import time
import json
import glob
import pickle
import random
import urllib
import requests

from bs4 import BeautifulSoup
from tqdm import tqdm
from pathlib import Path
from textblob import TextBlob
from collections import Counter

from transformers import BertTokenizer, BertModel

from utils import cleaning_utils

#### Example IFC class and description
* source: https://search-test.bsdd.buildingsmart.org/uri/buildingsmart/ifc-4.3/class/IfcWindow


We'll use: 
* TextBlob (which uses NLTK's Punkt tokenizer under the hood) to split the description into sentences.
* SPaR.txt to predict which objects occur, with some cleaning applied on top of its output.


In [2]:
label = "IfcWindow"

In [3]:
description = """
The window is a building element that is predominately used to provide natural light and fresh air. 
It includes vertical opening but also horizontal opening such as skylights or light domes. 
It includes constructions with swinging, pivoting, sliding, or revolving panels and fixed panels. 
A window consists of a lining and one or several panels. 
A window can:be a "free standing" window, contained in an IfcSpatialElement such as an IfcBuildingStorey. 
fill an opening, typically in a wall. 
The window will then have a FillsVoids attribute which uses the IfcRelFillsElement relationship to relate the IfcWindow with the IfcOpeningElement; 
be part of an element assembly, typically an IfcCurtainWall. 
The window will then have a Decomposes attribute which uses the the IfcRelAggregates relationship to relate the window with the assembly of elements;
There are two main representations for window occurrences:
IfcWindow entities that have a 3D rectangle 'Profile' shape representation defined. 
This profile can then be used to parametrically generate the geometry of a window. 
If not provided, the profile of the IfcOpeningElement can be used if the window fills an opening. 
The parameters are specified on the relating IfcWindowType that references IfcWindowLiningProperties and 
IfcWindowPanelProperties for each panel in the window; 
IfcWindow entities that are not parametrically generated and have only 'Brep', or 'SurfaceModel' geometry.
In addition, an IfcWindow may commonly include a 'FootPrint' representation defining the 2D shape of the window and its swing.
the window width and height the window opening direction (by the positive y-axis of the ObjectPlacement)
The IfcWindowType specifies parameters which are common to all of its occurrences of IfcWindow:
the partitioning type (single panel, double panel, tripel panel, more panels) the operation type 
(swing, tilt and turn, pivot revolve, fixed casement, etc.) 
the window panel hinge side (by using two different styles for right and left opening windows) 
the particular attributes for the lining by the IfcWindowLiningProperties the particular attributes 
for the panels by the  IfcWindowPanelPropertiesREFERENCE Definition according to ISO 6707-1 Construction 
for closing a vertical or near vertical opening in a wall or pitched roof that will admit light and 
may admit fresh air. NOTE The entity IfcWindowStandardCase has been deleted. Use an IfcWindow with 
a 'Profile' representation instead. The IfcWindow should also have an IfcWindowType with 
ParameterTakesPrecedence set to 'TRUE'. IFC4 CHANGE The attributes PredefinedType and OperationType are
added, the applicable type object has been changed to IfcWindowType. HISTORY New entity in IFC1.0.
"""

In [4]:
label = "abrasion"
description = """wearing or grinding away of material by friction; usually caused by sand, gravel, or stones, carried by wind or water"""

In [5]:
# Download SPaR.txt if required: the repository is cloned into ./SPaRtxt
# only when that directory does not exist yet, so re-running this cell is cheap.
from pathlib import Path  # NOTE(review): Path is already imported in the first cell
spartxt_path = Path("SPaRtxt/")
if not spartxt_path.exists():
    # `!` hands the command to a shell (IPython syntax); this line is not valid plain Python.
    !git clone https://github.com/rubenkruiper/SPaR.txt.git SPaRtxt

In [6]:
# # kind of convoluted way to import spar.txt as a module from the parent directory
# import imp
# with open(spartxt_path.joinpath('spar_predictor.py'), 'rb') as fp:
#     spar_predictor = imp.load_module(
#         'spar_predictor', fp, 'SPaRtxt.spar_predictor.py',
#         ('.py', 'rb', imp.PY_SOURCE)
#     )

In [7]:
# import sys 
# sys.path.insert(1, 'SPaRtxt/')

In [8]:
from SPaRtxt import *

In [9]:
from utils import spar_utils

In [10]:
# TRAIN/LOAD
# - trains a model if needed, otherwise load from archive; 
# - best F1 on dev/validation in the paper is 80.96 when trained on a GPU; on a CPU it will be a bit lower (~77.x, I think)
te = spar_utils.TermExtractor(max_num_cpu_threads=1)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
# use existing split into sentences functionality (uses textblob)
sentences = te.split_into_sentences(description)
# first 3 sents
sentences[:3]

['wearing or grinding away of material by friction',
 'usually caused by sand, gravel, or stones, carried by wind or water']

In [12]:
objects = te.process_sentences(sentences)

In [13]:
# # Show some random extracted objects
# random.sample(objects, 2)

In [14]:
# some basic cleaning  
regex_filter = cleaning_utils.RegexFilter()
def run_filters(to_be_cleaned, min_count: int = 1):
    """Clean a collection of extracted terms and count the surviving ones.

    The terms are passed through the module-level ``regex_filter``, then
    ``cleaning_utils.custom_cleaning_rules`` and
    ``cleaning_utils.remove_determiners``; terms that end up empty are dropped.

    Args:
        to_be_cleaned: the extracted terms, handed as-is to
            ``regex_filter.run_filter``.
        min_count: keep only terms that occur at least this many times after
            cleaning. The default of 1 keeps every term.

    Returns:
        A tuple ``(cleaned_terms, cleaned_counter)``: the cleaned terms
        (duplicates retained) and a ``Counter`` of how often each one occurs.
    """
    # run_filter returns a pair; the first element (the terms removed by the
    # regex filters) is not needed here.
    _, regex_cleaned = regex_filter.run_filter(to_be_cleaned)
    basic_cleaned = cleaning_utils.custom_cleaning_rules(regex_cleaned)
    determiners_removed = [cleaning_utils.remove_determiners(t) for t in basic_cleaned]
    cleaned_terms = [t for t in determiners_removed if t]
    term_counts = Counter(cleaned_terms)

    # Frequency threshold, e.g. to compare how often objects occur across all
    # extracted descriptions.
    kept_terms = [t for t in cleaned_terms if term_counts[t] >= min_count]
    kept_counter = Counter({t: c for t, c in term_counts.items() if c >= min_count})
    return kept_terms, kept_counter

In [15]:
_, obj_cntr = run_filters(objects)
obj_cntr.most_common(10)

[('material', 1),
 ('friction', 1),
 ('sand', 1),
 ('gravel', 1),
 ('stones', 1),
 ('wind', 1),
 ('water', 1)]

### Search for object

In [16]:
url_prefix = "https://test.bsdd.buildingsmart.org/api/ClassificationSearchOpen/v1?SearchText="

In [17]:
query = ""

In [18]:
try:
    response = requests.get(url_prefix + urllib.parse.quote(query)).json()
except (requests.RequestException, ValueError) as err:
    # Reached when the request itself fails or the body is not valid JSON.
    # Keep `response` defined so the next cell can still display it.
    response = None
    print(f"bSDD search request failed: {err}")
    

In [19]:
response

{'type': 'https://tools.ietf.org/html/rfc7231#section-6.5.1',
 'title': 'One or more validation errors occurred.',
 'status': 400,
 'traceId': '00-49dcd7edd8de7b5209c56833749c05d5-1904ad7fd9f95d7a-00',
 'errors': {'SearchText': ['The SearchText field is required.']}}

In [20]:
def check_if_string_exists_as_bsdd_label(
    query: str,
    url_prefix: str = "https://test.bsdd.buildingsmart.org/api/ClassificationSearchOpen/v1?SearchText=",
    timeout: float = 30.0,
):
    """Query the bSDD search endpoint for ``query`` and return the decoded JSON.

    Args:
        query: the search text; it is URL-quoted and appended to ``url_prefix``.
        url_prefix: endpoint URL up to and including ``SearchText=``.
        timeout: seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of a successful response, or ``None`` when the
        request fails, times out, returns an HTTP error status (e.g. the 400
        validation error the API gives for an empty search text), or the body
        is not valid JSON.
    """
    # `import urllib` alone does not guarantee that the `parse` submodule is loaded.
    import urllib.parse

    try:
        response = requests.get(url_prefix + urllib.parse.quote(query), timeout=timeout)
        if not response.ok:
            # Error payloads are not search results; callers treat None as "nothing found".
            return None
        return response.json()
    except (requests.RequestException, ValueError):
        # Connection problem, timeout, or a body that could not be decoded as JSON.
        return None
    

In [21]:
def parse_bsdd_api_response(json_response: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Reduce a bSDD search response to the fields used for suggestions.

    Args:
        json_response: decoded JSON of a search request. Its
            ``'classifications'`` entry is expected to be a list of dicts.

    Returns:
        One dict per classification with the keys ``"name"``,
        ``"description"`` and ``"related"`` (taken from
        ``'relatedIfcEntityNames'``); the last two are ``None`` when the
        classification does not provide them. An empty list is returned when
        the payload has no classifications, e.g. for an error response.

    Raises:
        KeyError: if a classification has no ``'name'``.
    """
    search_results = []
    # Error payloads (such as HTTP 400 validation errors) carry no 'classifications' key.
    for result in json_response.get('classifications') or []:
        # To limit the results to IFC 4.3, skip entries whose
        # 'domainNamespaceUri' does not end with "ifc-4.3".
        search_results.append({
            "name": result['name'],
            "description": result.get('description'),
            "related": result.get('relatedIfcEntityNames'),
        })
    return search_results

In [33]:
# label = "IfcWindow"
# objects as extracted from the definition for "IfcWindow"
def suggest(
    label: str,
    obj_cntr: Counter,
    top_k: Optional[int] = 5,
    useless_objs: Optional[List[str]] = None,
    min_matching_objects: int = 2,
):
    """Suggest bSDD entries that may be related to ``label``.

    Every object in ``obj_cntr`` is used as a bSDD search query. A search
    result counts as a match for that object when the object string occurs in
    the result's description. Results matched by enough distinct objects are
    returned as suggestions.

    Args:
        label: the term whose description the objects were extracted from.
        obj_cntr: counter of the extracted objects; iterated from most to
            least common.
        top_k: only the first ``top_k`` search results per object are
            inspected; a falsy value inspects all of them.
        useless_objs: objects that are never used as a query. Defaults to
            ``["entity", "HISTORY"]``.
        min_matching_objects: minimum number of distinct objects that must
            match a search result before it is suggested.

    Returns:
        ``{label: {result_name: [matching objects, ...]}}``.
    """
    if useless_objs is None:
        useless_objs = ["entity", "HISTORY"]

    # search-result name -> distinct objects that also occur in its description
    terms_with_overlap_in_description_objects = {}
    for obj, _count in tqdm(obj_cntr.most_common()):
        # Skip uninformative objects and objects that are a substring of the label.
        if obj in useless_objs or obj in label:
            continue

        # 1) search for bsdd nodes with the object span as the query
        bsdd_response = check_if_string_exists_as_bsdd_label(obj)
        if not bsdd_response:
            continue
        bsdd_results = parse_bsdd_api_response(bsdd_response)

        if top_k:
            # only look at top_k results from bsdd search
            bsdd_results = bsdd_results[:top_k]

        # 2) Check whether the retrieved, potentially related nodes contain
        #    the same object in their description
        for result_dict in bsdd_results:
            bsdd_description = result_dict.get("description")
            if not bsdd_description or obj not in bsdd_description:
                continue

            matches = terms_with_overlap_in_description_objects.setdefault(
                result_dict.get("name"), []
            )
            # Several search results can share a name; count each object once
            # so the threshold below applies to distinct objects.
            if obj not in matches:
                matches.append(obj)

    # 3) Collect suggestions of related terms.
    # Results whose name equals or overlaps with `label` are not filtered out.
    suggestions = {
        potentially_related: matching_objects
        for potentially_related, matching_objects
        in terms_with_overlap_in_description_objects.items()
        if len(matching_objects) >= min_matching_objects
    }
    return {label: suggestions}

In [23]:
def spartxt_ner(long_description: str):
    """Return a counter of the cleaned objects found in ``long_description``.

    The text is split into sentences and processed by the module-level term
    extractor ``te``; the extracted objects are then cleaned and counted by
    ``run_filters``, whose counter is returned.
    """
    # sentence splitting is delegated to the term extractor (uses textblob)
    extracted = te.process_sentences(te.split_into_sentences(long_description))
    _cleaned_terms, counts = run_filters(extracted)
    return counts

In [24]:
label = "abrasion"
description = """
Wearing or grinding away of material by friction; usually caused by sand, gravel, or stones, carried by wind or water.
1) Loss of section or coating of a culvert by the mechanical action of water conveying suspended bed load of sand, gravel, and cobble-size particles at high velocities with appreciable turbulence. 2) Removal of stream bank material due to entrained sediment, ice, or debris rubbing against the bank.
Loss of section or coating of a culvert by the mechanical action of water conveying suspended bed load of sand, gravel, and cobble-size particles at high velocities with appreciable turbulence.
"""

In [25]:
print(f"Term: {label} \nDefinition: {description}")
object_counter = spartxt_ner(description)
print(object_counter.most_common(10))
suggest(label, object_counter)

Term: abrasion 
Definition: 
Wearing or grinding away of material by friction; usually caused by sand, gravel, or stones, carried by wind or water.
1) Loss of section or coating of a culvert by the mechanical action of water conveying suspended bed load of sand, gravel, and cobble-size particles at high velocities with appreciable turbulence. 2) Removal of stream bank material due to entrained sediment, ice, or debris rubbing against the bank.
Loss of section or coating of a culvert by the mechanical action of water conveying suspended bed load of sand, gravel, and cobble-size particles at high velocities with appreciable turbulence.

[('sand', 3), ('gravel', 3), ('water', 2), ('Loss section', 2), ('coating', 2), ('culvert', 2), ('mechanical action', 2), ('suspended bed load', 2), ('cobble size', 2), ('particles', 2)]


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 23/23 [00:09<00:00,  2.36it/s]


{'abrasion': {'IfcCourse': ['sand'],
  'IfcImpactProtectionDevice.BUMPER': ['sand'],
  'IfcTrackElement.VEHICLESTOP': ['sand'],
  'IfcMarineFacility.BARRIERBEACH': ['sand'],
  'IfcWall.PARTITIONING': ['sand'],
  'IfcCoil.WATERCOOLINGCOIL': ['water'],
  'IfcCoil.WATERHEATINGCOIL': ['water'],
  'IfcSurfaceFeature.TREATMENT': ['coating'],
  'IfcInterceptor.CYCLONIC': ['particles'],
  'IfcLamp.HIGHPRESSUREMERCURY': ['high'],
  'IfcLamp.HIGHPRESSURESODIUM': ['high'],
  'IfcMarinePart.HIGHWATERLINE': ['high'],
  'IfcProtectiveDevice.VARISTOR': ['high'],
  'IfcRoadPart.SHOULDER': ['high'],
  'IfcMaterial': ['material'],
  'IfcMaterialDefinition': ['material'],
  'IfcConstructionMaterialResource': ['material'],
  'IfcBeam.LINTEL': ['material'],
  'IfcMarineFacility.JETTY': ['material'],
  'IfcCourse.BALLASTBED': ['stones'],
  'IfcWindow': ['wind'],
  'IfcWindow.LIGHTDOME': ['wind'],
  'IfcWindow.NOTDEFINED': ['wind'],
  'IfcEvaporativeCooler.DIRECTEVAPORATIVEAIRWASHER': ['entrained'],
  'IfcEv

In [30]:
label = "IfcWindow"
description = """
The window is a building element that is predominately used to provide natural light and fresh air.  It includes vertical opening but also horizontal opening such as skylights or light domes.  It includes constructions with swinging, pivoting, sliding, or revolving panels and fixed panels.  A window consists of a lining and one or several panels.  A window can:be a "free standing" window, contained in an IfcSpatialElement such as an IfcBuildingStorey.  fill an opening, typically in a wall.  The window will then have a FillsVoids attribute which uses the IfcRelFillsElement relationship to relate the IfcWindow with the IfcOpeningElement;  be part of an element assembly, typically an IfcCurtainWall.  The window will then have a Decomposes attribute which uses the the IfcRelAggregates relationship to relate the window with the assembly of elements; There are two main representations for window occurrences: IfcWindow entities that have a 3D rectangle 'Profile' shape representation defined.  This profile can then be used to parametrically generate the geometry of a window.  If not provided, the profile of the IfcOpeningElement can be used if the window fills an opening.  The parameters are specified on the relating IfcWindowType that references IfcWindowLiningProperties and  IfcWindowPanelProperties for each panel in the window;  IfcWindow entities that are not parametrically generated and have only 'Brep', or 'SurfaceModel' geometry. In addition, an IfcWindow may commonly include a 'FootPrint' representation defining the 2D shape of the window and its swing. the window width and height the window opening direction (by the positive y-axis of the ObjectPlacement) The IfcWindowType specifies parameters which are common to all of its occurrences of IfcWindow: the partitioning type (single panel, double panel, tripel panel, more panels) the operation type  (swing, tilt and turn, pivot revolve, fixed casement, etc.)  
the window panel hinge side (by using two different styles for right and left opening windows)  the particular attributes for the lining by the IfcWindowLiningProperties the particular attributes  for the panels by the  IfcWindowPanelPropertiesREFERENCE Definition according to ISO 6707-1 Construction  for closing a vertical or near vertical opening in a wall or pitched roof that will admit light and  may admit fresh air. NOTE The entity IfcWindowStandardCase has been deleted. Use an IfcWindow with  a 'Profile' representation instead. The IfcWindow should also have an IfcWindowType with  ParameterTakesPrecedence set to 'TRUE'. IFC4 CHANGE The attributes PredefinedType and OperationType are added, the applicable type object has been changed to IfcWindowType. HISTORY New entity in IFC1.0.
"""

In [34]:
print(f"Term: {label} \nDefinition: {description}")
object_counter = spartxt_ner(description)
suggested_rel_dict = suggest(label, object_counter)

Term: IfcWindow 
Definition: 
The window is a building element that is predominately used to provide natural light and fresh air.  It includes vertical opening but also horizontal opening such as skylights or light domes.  It includes constructions with swinging, pivoting, sliding, or revolving panels and fixed panels.  A window consists of a lining and one or several panels.  A window can:be a "free standing" window, contained in an IfcSpatialElement such as an IfcBuildingStorey.  fill an opening, typically in a wall.  The window will then have a FillsVoids attribute which uses the IfcRelFillsElement relationship to relate the IfcWindow with the IfcOpeningElement;  be part of an element assembly, typically an IfcCurtainWall.  The window will then have a Decomposes attribute which uses the the IfcRelAggregates relationship to relate the window with the assembly of elements; There are two main representations for window occurrences: IfcWindow entities that have a 3D rectangle 'Profile' 

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 87/87 [00:25<00:00,  3.36it/s]


**Alternative heuristics**:
* KNN graph / k nearest neighbours based on embeddings
* consider combining the label with the objects in its definition to build a representation?

In [35]:
import pprint
pprint.pprint(suggested_rel_dict)

{'IfcWindow': {'IfcArbitraryClosedProfileDef': ['profile', 'Profile'],
               'IfcArbitraryProfileDefWithVoids': ['profile', 'Profile'],
               'IfcBeam': ['parametric', 'Brep'],
               'IfcColumn': ['parametric', 'Brep'],
               'IfcCurtainWall': ['representation',
                                  'IfcCurtainWall',
                                  'IfcRelAggregates relationship',
                                  'representations'],
               'IfcDoor': ['IfcSpatialElement',
                           'IfcBuildingStorey',
                           'FillsVoids attribute',
                           'IfcRelFillsElement relationship',
                           'IfcCurtainWall',
                           'Decomposes attribute',
                           'rectangle',
                           'FootPrint',
                           '2D shape',
                           'swing',
                           'ObjectPlacement',
                      

In [36]:
import pandas as pd

In [46]:
to_be_parsed = pd.read_csv("bsdd_descriptions.csv")

In [53]:
to_be_parsed.description

0        Location track number or name as an abbreviation 
1        E.g. additional information related to install...
2        Height of the post in millimeters if sign has ...
3                       Installation direction of the sign
4          The route number on which the object is located
                               ...                        
28163    Angir hvilket år vegobjektet ble etablert på s...
28164             Angir hvilket år utstyret ble produsert.
28165         Angir hvilken type energikilde som benyttes.
28166             Angir hovedbruksområde for styreapparat.
28167                             Angir type styreapparat.
Name: description, Length: 28168, dtype: object

In [59]:
# Extract objects for every description; both lists get exactly one entry per row.
objects_found = []
objects_and_counts = []
for description in tqdm(to_be_parsed.description):
    try:
        object_counter = spartxt_ner(description)
    except Exception:
        # Best effort: a description that cannot be processed yields empty entries.
        # `Exception` (not a bare `except`) so that interrupting the kernel
        # still stops this long-running loop.
        objects_and_counts.append("")
        objects_found.append("")
        continue

    objects = [o for o, _ in object_counter.most_common()]
    objects_and_counts.append(str(object_counter))
    objects_found.append(objects)
        


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 28168/28168 [1:04:21<00:00,  7.29it/s]


In [60]:
to_be_parsed['objects found'] = objects_found
to_be_parsed['objects and counts'] = objects_and_counts

In [61]:
to_be_parsed.to_csv("bsdd_parsed_descriptions.csv")