In [37]:
from typing import Union, List, Any, Optional, Dict

import os
import re
import time
import json
import glob
import pickle
import random
import urllib
import requests

from bs4 import BeautifulSoup
from tqdm import tqdm
from pathlib import Path
from textblob import TextBlob
from collections import Counter

from rdflib import URIRef, BNode, Literal, Namespace, Graph
from rdflib.namespace import XSD, RDF, RDFS, SKOS, NamespaceManager

from utils import cleaning_utils

### Our starting point is a .ttl file containing new terms and corresponding definitions
* In this case, generated from a .csv file using ontotext

In [38]:
path_to_input = Path("data", "tpfDefinitions.ttl")

In [39]:
from rdflib import Graph
new_graph = Graph()
new_graph.parse(path_to_input, format='ttl')

<Graph identifier=N34f198e2845e4b1da8b2e137d5332b33 (<class 'rdflib.graph.Graph'>)>

In [46]:
new_nodes = []
NS = Namespace("http://example.com/resource/")
new_terms = []
for s, p, o in new_graph:
    if p == URIRef(NS.Term):
        new_terms.append(o)
#     if p == URIRef(NS.Definition):

# TODO; think about the logic of looping over everything, probably SPARQL query to grab what we want
        

Backwater Ratio
scaling
Streetcar
catchment area
End Distance of Bolts
Duct Stack
At Loading
Deck System
electrolytic corrosion
Crossover
Indirect Loading/Supporting
Straight-In Approach VFR
notch effect
concrete
creep
portable bridge
Carrier
sacrificial thickness
aggradation
Composite Hydrograph
wind bracing
General Bursting Forces
foundation failure
Turning Movement
load and resistance factor design (LRFD)
Altimeter Setting
Flume
Drag Coefficient
Compact Section
Wind Sock
Compression Seal
bending moment
Car Float
Bottleneck
frost heave
Curb Extension
Glued Laminated Deck Panel
Average Velocity
Differential Pricing
Apron
Joint Seal
segmental arch
bridging
Bituminous Concrete
Surface Course
Velocity Head
hydroplaning
Unlinked Trip
Negative Moment
Shallow Draft Waterways
Vulnerability Assessment
grid flooring
Public road
Work
end rotation
Drip Groove
Competitive Range
Stiffness
Zone Fare
spandrel
fascia
portal
Cover
wingwall
galvanize
sacrificial protection
bulb t-girder
post-stressing


In [45]:
predicates

[rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
 rdflib.term.URIRef('http://example.com/resource/Term'),
 rdflib.term.URIRef('http://example.com/resource/Reference'),
 rdflib.term.URIRef('http://example.com/resource/Definition')]

### We'll be adding potential links to nodes in our bSDD graph (representing bSDD)
1. using exact matches in the description, linking to nodes returned by the standard search API
2. using semantic similarity based on node+description, linking to nodes in our bSDD graph
3. using description overlap overlap, relying on the objects occuring in our bSDD graph descriptions

#### 1. Search for some term in bSDD and parse results

In [2]:
# API endpoinit
url_prefix = "https://test.bsdd.buildingsmart.org/api/ClassificationSearchOpen/v1?SearchText="

In [10]:
def check_if_string_exists_as_bsdd_label(
    query: str,
    url_prefix: str = "https://test.bsdd.buildingsmart.org/api/ClassificationSearchOpen/v1?SearchText="
):
    """
    Try to find a result for `query`. Returns None if the API response cannot be converted to json, e.g., if the
    response is empty.
    """
    try:
        response = requests.get(url_prefix + urllib.parse.quote(query)).json()
        return response
    except:
        # no search results
        return None
    

In [11]:
def parse_bsdd_api_response(json_response: Dict[str,str]):    
    search_results = []
    for result in json_response['classifications']:
        domain_namespace = result['domainNamespaceUri']

#         # EXAMPLE, LIMIT RESULTS TO IFC 4.3 namespace
#         if not str(domain_namespace).endswith("ifc-4.3"):
#             continue

        domain_name = result['domainName']
        name = result['name']
        reference_code = result['referenceCode'] if 'referenceCode' in result else None
        namespace_uri = result['namespaceUri'] if 'namespaceUri' in result else None
        description = result['description'] if 'description' in result else None
        parent_name = result['parentClassificationName'] if 'parentClassificationName' in result else None
        related_ifc_entity_names = result['relatedIfcEntityNames'] if 'relatedIfcEntityNames' in result else None
        
        results_of_interest = {"name": name, "description": description, "related": related_ifc_entity_names}
        search_results.append(results_of_interest)
    return search_results

In [22]:
query = "example"
json_result = check_if_string_exists_as_bsdd_label(query)
parsed_result = parse_bsdd_api_response(json_result) if json_result else None
parsed_result

[{'name': 'IfcCourse',
  'description': 'A built element whose length greatly exceeds its thickness and often also its width, usually of a single material laid on site on top of another horizontal or nearly horizontal built element. A course is distinctive from a earthworks element in that a course is a graded granular (which can be bound or unbound) material that is generally processed in some fashion, where as earthworks elements are soil earthen based structure that can be formed by removal and transport of general ground material.\nStructurally a course does not have capacity to carry loads over open span, or to be removed or replaced as a single unit. examples of courses include:\n* Graded aggregate layers\n* Graded sand layers\n* Cement bounded material (CBM)\n* Asphalt layers',
  'related': []},
 {'name': 'IfcTank.EXPANSION',
  'description': 'A closed container used in a closed fluid distribution system to mitigate the effects of thermal expansion or water hammer. The tank is t

In [24]:
query = "exa1354mple"
json_result = check_if_string_exists_as_bsdd_label(query)
parsed_result = parse_bsdd_api_response(json_result) if json_result else None
parsed_result # returns None

### Suggest related terms in bSDD, based on terms that occur in its description
* We'll try to identify terms in the description using SPaR.txt 
  * might want to consider domain-specific filtering based on background corpus

In [25]:
# download SPaR.txt if required
from pathlib import Path
spartxt_path = Path("SPaR.txt/")
if not spartxt_path.exists():
    !git clone https://github.com/rubenkruiper/SPaR.txt.git

In [26]:
try: 
    ### Start our `SPaR_API` container if it exists
    !docker start SPaR_API
except:
    ### Else, set up the `SPaR_API` container
    # build the SPaR.txt image, call it `spar`
    !docker build -t spar SPaR.txt/.
    # Run the image called `spar` in a container that we will call `SPaR_API`, with the API port at localhost:8501
    # NOTE: this will train a SPaR.txt model locally, which takes about 20 minutes on a CPU 
    !docker run --name SPaR_API -p 8501:8501 spar

SPaR_API


In [27]:
ner_api = "http://localhost:8501/predict_objects/"

In [28]:
example = "Thermoplastic materials in ceilings, rooflights and lighting diffusers provide a significant hazard in a fire."
response = requests.post(ner_api,  json={"sentence": example}).json()
response

{'prediction': {'obj': ['Thermoplastic materials',
   'ceilings',
   'rooflights',
   'lighting diffusers',
   'a hazard',
   'a fire']},
 'num_input_tokens': 26,
 'num_output_tokens': 17}

In [21]:
# label = "IfcWindow"
# objects as extracted from the definition for "IfcWindow"
def suggest(label: str, obj_cntr: Counter):
    """
    :return suggested_rel_dict: {label: [suggested_related_label: [found by bsdd search for span]}
    """
    useless_objs = ["entity", "HISTORY"]
    top_k = 5

    suggested_rel_dict = {label: {}}
    terms_with_overlap_in_description_objects = {}
    for obj, count in tqdm(obj_cntr.most_common()):

        if obj in useless_objs or obj in label:
            continue

        # 1) search for bsdd nodes with the object span as the query
        bsdd_response = check_if_string_exists_as_bsdd_label(obj)
        if bsdd_response:
            bsdd_results = parse_bsdd_api_response(bsdd_response)
        else:
            continue

        if top_k:
            # only look at top_k results from bsdd search
            bsdd_results = bsdd_results[:top_k]

        # 2) Compare if the retrieved, potentially related nodes, contain the same object in their description
        
        for result_dict in bsdd_results:
            name = result_dict["name"] if "name" in result_dict else None
            bsdd_description = result_dict["description"] if "description" in result_dict else None
            if not bsdd_description:
                continue 

            if obj in bsdd_description:
                if name not in terms_with_overlap_in_description_objects:
                    terms_with_overlap_in_description_objects[name] = [obj]
                else:
                    terms_with_overlap_in_description_objects[name].append(obj)
                    

    # 3) Collect suggestions of related terms
    for potentially_related, matching_objects in terms_with_overlap_in_description_objects.items():
#         if label == potentially_related or label.startswith(potentially_related) or potentially_related.startswith(label):
#             # we assume that if the original label occurs in the search results for a span, they are too close
# #             print(f"Skipping self: {label} found when searching for {obj}")
#             continue
    
        if len(matching_objects) < 2:
            continue

        if potentially_related not in suggested_rel_dict[label]:
            suggested_rel_dict[label][potentially_related] = matching_objects
        else:
            suggested_rel_dict[label][potentially_related] += matching_objects
    return suggested_rel_dict