In [21]:
from typing import Union, List, Any, Optional, Dict

import os
import re
import time
import json
import glob
import pickle
import random
import urllib
import requests

from bs4 import BeautifulSoup
from tqdm import tqdm
from pathlib import Path
from textblob import TextBlob
from collections import Counter

from transformers import BertTokenizer, BertModel

from utils import spar_utils
from utils import cleaning_utils

#### Example IFC class and description
* source: https://search-test.bsdd.buildingsmart.org/uri/buildingsmart/ifc-4.3/class/IfcWindow


We'll use: 
* TextBlob (NLTK's implementation of punkttokenizer under the hood) to split into sentences.
* SPaR.txt to predict which objects occur, some cleaning on top of this. 


In [5]:
label = "IfcWindow"

In [6]:
description = """
The window is a building element that is predominately used to provide natural light and fresh air. It includes vertical opening but also horizontal opening such as skylights or light domes. It includes constructions with swinging, pivoting, sliding, or revolving panels and fixed panels. A window consists of a lining and one or several panels. A window can:be a "free standing" window, contained in an IfcSpatialElement such as an IfcBuildingStorey. fill an opening, typically in a wall. The window will then have a FillsVoids attribute which uses the IfcRelFillsElement relationship to relate the IfcWindow with the IfcOpeningElement; be part of an element assembly, typically an IfcCurtainWall. The window will then have a Decomposes attribute which uses the the IfcRelAggregates relationship to relate the window with the assembly of elements;There are two main representations for window occurrences:IfcWindow entities that have a 3D rectangle 'Profile' shape representation defined. This profile can then be used to parametrically generate the geometry of a window. If not provided, the profile of the IfcOpeningElement can be used if the window fills an opening. The parameters are specified on the relating IfcWindowType that references IfcWindowLiningProperties and IfcWindowPanelProperties for each panel in the window; IfcWindow entities that are not parametrically generated and have only 'Brep', or 'SurfaceModel' geometry.In addition, an IfcWindow may commonly include a 'FootPrint' representation defining the 2D shape of the window and its swing.the window width and height the window opening direction (by the positive y-axis of the ObjectPlacement)The IfcWindowType specifies parameters which are common to all of its occurrences of IfcWindow:the partitioning type (single panel, double panel, tripel panel, more panels) the operation type (swing, tilt and turn, pivot revolve, fixed casement, etc.) the window panel hinge side (by using two different styles for right and left opening windows) the particular attributes for the lining by the IfcWindowLiningProperties the particular attributes for the panels by the  IfcWindowPanelPropertiesREFERENCE Definition according to ISO 6707-1 Construction for closing a vertical or near vertical opening in a wall or pitched roof that will admit light and may admit fresh air. NOTE The entity IfcWindowStandardCase has been deleted. Use an IfcWindow with a 'Profile' representation instead. The IfcWindow should also have an IfcWindowType with ParameterTakesPrecedence set to 'TRUE'. IFC4 CHANGE The attributes PredefinedType and OperationType are added, the applicable type object has been changed to IfcWindowType. HISTORY New entity in IFC1.0.
"""

In [7]:
# download SPaR.txt if required
from pathlib import Path
spartxt_path = Path("SPaR.txt/")
if not spartxt_path.exists():
    !git clone https://github.com/rubenkruiper/SPaR.txt.git

In [8]:
# kind of convoluted way to import spar.txt as a module from the parent directory
import imp
with open(spartxt_path.joinpath('spar_predictor.py'), 'rb') as fp:
    spar_predictor = imp.load_module(
        'spar_predictor', fp, 'SPaR.txt.spar_predictor.py',
        ('.py', 'rb', imp.PY_SOURCE)
    )

In [10]:
# TRAIN/LOAD
# - trains a model if needed, otherwise load from archive; 
# - best F1 on dev/validation in the paper is 80,96 trained on a GPU, CPU will be a bit lower ~77.x I think
te = spar_utils.TermExtractor(max_num_cpu_threads=1)

In [14]:
# use existing split into sentences functionality (uses textblob)
sentences = te.split_into_sentences(description)
# first 3 sents
sentences[:3]

['The window is a building element that is predominately used to provide natural light and fresh air.',
 'It includes vertical opening but also horizontal opening such as skylights or light domes.',
 'It includes constructions with swinging, pivoting, sliding, or revolving panels and fixed panels.']

In [15]:
objects = te.process_sentences(sentences)

In [18]:
# Show some random extracted objects
random.sample(objects, 10)

['window occurrences',
 'The parameters',
 'one',
 'a',
 'the window',
 'an opening',
 'Brep',
 'revolving',
 'The IfcWindow',
 'TRUE']

In [24]:
# some basic cleaning  
regex_filter = cleaning_utils.RegexFilter()
def run_filters(to_be_cleaned):
    # some basic cleaning steps
    _, regex_cleaned  = regex_filter.run_filter(to_be_cleaned) # _ would be the list of terms removed by our regex filters
    basic_cleaned = cleaning_utils.custom_cleaning_rules(regex_cleaned)
    determiners_removed = [cleaning_utils.remove_determiners(t) for t in basic_cleaned]
    cleaned_terms = [t for t in determiners_removed if t]
    cleaned_counter = Counter(cleaned_terms)
    
    # Could compare how often the objects occur in all extracted descriptions
    cleaned_terms = [t for t in cleaned_terms if cleaned_counter[t] >= 1]
    cleaned_counter = Counter({t: c for t, c in cleaned_counter.items() if c >= 1})
    return cleaned_terms, cleaned_counter

In [26]:
_, obj_cntr = run_filters(objects)
obj_cntr.most_common(10)

[('window', 9),
 ('IfcWindow', 5),
 ('panels', 3),
 ('fresh air', 2),
 ('lining', 2),
 ('opening', 2),
 ('IfcOpeningElement', 2),
 ('IfcWindow entities', 2),
 ('profile', 2),
 ('parameters', 2)]

### Search for object

In [28]:
url_prefix = "https://test.bsdd.buildingsmart.org/api/ClassificationSearchOpen/v1?SearchText="

In [53]:
query = "IfcWindow"

In [54]:
try:
    response = requests.get(url_prefix + urllib.parse.quote(query)).json()
except: # todo: catch more elegantly
    print("Search terms doesn't exist")
    

In [76]:
def check_if_string_exists_as_bsdd_label(
    query: str,
    url_prefix: str = "https://test.bsdd.buildingsmart.org/api/ClassificationSearchOpen/v1?SearchText="
):
    try:
        response = requests.get(url_prefix + urllib.parse.quote(query)).json()
        return response
    except:
        # no search results
        return None
    

In [82]:
def parse_bsdd_api_results(json_response: Dict):
    rows = []
    for result in json_response['classifications']:
        domain_namespace = result['domainNamespaceUri']

        # LIMITED TO IFC 4.3 for now 
        if not str(domain_namespace).endswith("ifc-4.3"):
            continue

        domain_name = result['domainName']
        name = result['name']
        reference_code = result['referenceCode'] if 'referenceCode' in result else None
        namespace_uri = result['namespaceUri'] if 'namespaceUri' in result else None
        description = result['description'] if 'description' in result else None
        parent_name = result['parentClassificationName'] if 'parentClassificationName' in result else None
        related_ifc_entity_names = result['relatedIfcEntityNames'] if 'relatedIfcEntityNames' in result else None
        rows.append([name, description, related_ifc_entity_names])
    return rows
        
    

In [86]:
parse_bsdd_api_results(response)

[['IfcWindow',
  'The window is a building element that is predominately used to provide natural light and fresh air. It includes vertical opening but also horizontal opening such as skylights or light domes. It includes constructions with swinging, pivoting, sliding, or revolving panels and fixed panels. A window consists of a lining and one or several panels. A window can:be a "free standing" window, contained in an IfcSpatialElement such as an IfcBuildingStorey.\nfill an opening, typically in a wall. The window will then have a FillsVoids attribute which uses the IfcRelFillsElement relationship to relate the IfcWindow with the IfcOpeningElement;\nbe part of an element assembly, typically an IfcCurtainWall. The window will then have a Decomposes attribute which uses the the IfcRelAggregates relationship to relate the window with the assembly of elements;There are two main representations for window occurrences:IfcWindow entities that have a 3D rectangle \'Profile\' shape representati

In [84]:
# label = "IfcWindow"
suggested_rel_dict = {}
for obj, count in obj_cntr.most_common():
    bsdd_results = check_if_string_exists_as_bsdd_label(obj)
    rows = parse_bsdd_api_results(bsdd_results)
    
    if label not in suggested_rel_dict:
        suggested_rel_dict[label] = {obj}

In [85]:
suggested_rel_dict

{'IfcWindow': {'window'}}