In [1]:
from typing import Union, List, Any, Optional, Dict

import os
import re
import time
import torch
import json
import glob
import pickle
import random
import urllib
import requests

import numpy as np
import pandas as pd
import torch.nn as nn

from bs4 import BeautifulSoup
from tqdm import tqdm
from pathlib import Path
from textblob import TextBlob
from collections import Counter
from itertools import combinations
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import pytorch_cos_sim

from rdflib import URIRef, BNode, Literal, Namespace, Graph
from rdflib.namespace import XSD, RDF, RDFS, SKOS, NamespaceManager

from utilities import cleaning_utils
from utilities.spar_utils import NER

### Our starting point is a .ttl file containing new terms and corresponding definitions
* In this case, generated from a .csv file using ontotext

In [2]:
# Turtle file with the new terms and their definitions
path_to_input = Path("data") / "tpfDefinitions.ttl"

In [3]:
# Parse the new terms into their own rdflib graph (`Graph` is already imported in the first cell)
new_graph = Graph()
new_graph.parse(path_to_input, format='ttl')

<Graph identifier=N250580546b6147bd9ef4178f36822e92 (<class 'rdflib.graph.Graph'>)>

In [4]:
# List every distinct predicate that occurs in the new-terms graph
predicates_query = """
SELECT DISTINCT ?p WHERE {
    ?s ?p ?o .
}
"""
predicate_response = new_graph.query(predicates_query)
list(predicate_response)

[(rdflib.term.URIRef('http://example.com/resource/Term'),),
 (rdflib.term.URIRef('http://example.com/resource/Reference'),),
 (rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),),
 (rdflib.term.URIRef('http://example.com/resource/Definition'),)]

In [5]:
# Select, for every new term, its URI (?uid), source reference, label (?term) and definition.
# Only subjects that carry all three ex: predicates are returned.
terms_to_add_query = """
prefix ex: <http://example.com/resource/>

SELECT DISTINCT ?uid ?reference ?term ?definition WHERE{
    ?uid ex:Term ?term ;
         ex:Reference ?reference ;
         ex:Definition ?definition .  
}

"""
# Rows come back in SELECT order: (uid, reference, term, definition)
new_terms_response = new_graph.query(terms_to_add_query)

In [6]:
# Materialise the query result once; later cells index and slice this list
new_rows = list(new_terms_response)
# some examples
new_rows[:3]

[(rdflib.term.URIRef('http://example.com/resource/%22A%22%20Car'),
  rdflib.term.Literal('AASHTO. (2009). Transportation Glossary (4th ed.)'),
  rdflib.term.Literal('"A" Car'),
  rdflib.term.Literal('A motive-powered unit so designed that it may be used as the controlling unit of a multiple-unit train and that has adequate visibility in a forward direction, as well as a cab and equipment, to permit full control and observation of the propulsion power and brake applications for the train.')),
 (rdflib.term.URIRef('http://example.com/resource/%22B%22%20Car'),
  rdflib.term.Literal('AASHTO. (2009). Transportation Glossary (4th ed.)'),
  rdflib.term.Literal('"B" Car'),
  rdflib.term.Literal('A motive-powered unit designed primarily for use in combination with an “A” unit for the purpose of increasing power, but not equipped for use as the leading unit for full observation of the propulsion power and brake applications for the train; it is normally equipped with a single control station to 

### We'll be suggesting potential links to nodes in our bSDD graph (representing bSDD)
1. using exact matches in the description, linking to nodes returned by the standard search API
2. using semantic similarity based on node+description, linking to nodes in our bSDD graph
3. using description overlap, relying on the objects occurring in our bSDD graph descriptions

#### 1. Search for some term in bSDD and parse results
This section contains example code to suggest hyperlinks based on a term's description.

In [7]:
# bSDD open classification-search endpoint; the search text is appended to this prefix
url_prefix = (
    "https://test.bsdd.buildingsmart.org"
    "/api/ClassificationSearchOpen/v1"
    "?SearchText="
)

In [8]:
def check_if_string_exists_as_bsdd_label(
    query: str,
    url_prefix: str = "https://test.bsdd.buildingsmart.org/api/ClassificationSearchOpen/v1?SearchText=",
    timeout: float = 10.0,
) -> Optional[Dict[str, Any]]:
    """
    Search the bSDD API for `query` and return the decoded JSON response.

    :param query: Search text; it is URL-quoted before being appended to `url_prefix`.
    :param url_prefix: Search endpoint ending in the query-string key that the text is appended to.
    :param timeout: Seconds to wait for the API before giving up, so a stalled request cannot hang the notebook.
    :return: The decoded JSON body, or None if the request fails or the response cannot be converted to
        json, e.g., if the response is empty.
    """
    try:
        return requests.get(url_prefix + urllib.parse.quote(query), timeout=timeout).json()
    except (requests.RequestException, ValueError):
        # Connection problem / timeout, or a body that is not JSON (JSON decode errors are ValueErrors).
        # Anything else (e.g. a programming error) is deliberately left to propagate.
        return None
    

In [9]:
def parse_bsdd_api_response(
    json_response: Dict[str, Any],
    domain_suffix: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """
    Reduce a bSDD search response to the fields used for hyperlink suggestions.

    :param json_response: Decoded JSON search response; results are read from its 'classifications' key.
    :param domain_suffix: If given, keep only classifications whose 'domainNamespaceUri' ends with this
        string, e.g. "ifc-4.3" to limit the results to the IFC 4.3 namespace.
    :return: One dict per classification with the keys "name", "description", "related" and "uid".
        Optional fields that are absent from the response are None. An empty list is returned when the
        response holds no classifications.
    :raises KeyError: If a classification has no 'name'.
    """
    search_results = []
    # a response without (or with an empty/null) 'classifications' entry simply yields no results
    for result in json_response.get('classifications') or []:
        if domain_suffix is not None and not str(result.get('domainNamespaceUri')).endswith(domain_suffix):
            continue

        search_results.append({
            "name": result['name'],
            "description": result.get('description'),
            "related": result.get('relatedIfcEntityNames'),
            "uid": result.get('namespaceUri'),
        })
    return search_results

In [10]:
query = "window"
json_result = check_if_string_exists_as_bsdd_label(query)
parsed_result = parse_bsdd_api_response(json_result) if json_result else None

# let's keep it simple and check for exact (case-insensitive) matches only when searching bSDD;
# `parsed_result` is None when the API returned nothing, so fall back to an empty list
[x for x in (parsed_result or []) if x['name'].lower() == query.lower()]

[{'name': 'Window',
  'description': 'space access object for light entry only',
  'related': ['IfcWindow', 'IfcWindowStandardCase'],
  'uid': 'https://identifier.buildingsmart.org/uri/molio/cciconstruction-1.0/class/L-QQA'},
 {'name': 'Window',
  'description': None,
  'related': ['IfcWindow'],
  'uid': 'https://identifier.buildingsmart.org/uri/BBRI/BBRI-0.3/class/Window'},
 {'name': 'Window',
  'description': None,
  'related': ['IfcWindow'],
  'uid': 'https://identifier.buildingsmart.org/uri/bw/BW-0.1/class/Window'},
 {'name': 'Window',
  'description': 'space access object for light entry only',
  'related': ['IfcWindow', 'IfcWindowStandardCase'],
  'uid': 'https://identifier.buildingsmart.org/uri/acca/ACCAtest-0.1/class/L-QQA'}]

In [11]:
# test for a query that doesn't exist in bSDD
query = "exa1354mple"
json_result = check_if_string_exists_as_bsdd_label(query)
if json_result:
    parsed_result = parse_bsdd_api_response(json_result)
else:
    # nothing came back from the API, so there is nothing to parse
    parsed_result = None
parsed_result # returns None

* We'll rely on this code to suggest hyperlinks for terms found in a description 
  * Finding terms in the description will be done with SPaR.txt, 
  * If we simply checked for string presence we might link, e.g., `light` to `roof light` 

In [12]:
def hyperlink_suggestion(query: str):
    """
    Look `query` up in bSDD and keep only the results whose name equals it, ignoring case.

    :return exact_match_uids: List of (label, URI reference) tuples.
    """
    json_result = check_if_string_exists_as_bsdd_label(query)
    if not json_result:
        return []

    candidates = parse_bsdd_api_response(json_result)
    if not candidates:
        return []

    # let's keep it simple and check for exact matches only when searching bSDD
    wanted = query.lower()
    exact_match_uids = []
    for candidate in candidates:
        if candidate['name'].lower() == wanted:
            exact_match_uids.append((candidate['name'], candidate["uid"]))
    return exact_match_uids

In [13]:
# Example: (label, URI) suggestions for a term that has exact matches in bSDD
hyperlink_suggestion("window")

[('Window',
  'https://identifier.buildingsmart.org/uri/molio/cciconstruction-1.0/class/L-QQA'),
 ('Window',
  'https://identifier.buildingsmart.org/uri/BBRI/BBRI-0.3/class/Window'),
 ('Window', 'https://identifier.buildingsmart.org/uri/bw/BW-0.1/class/Window'),
 ('Window',
  'https://identifier.buildingsmart.org/uri/acca/ACCAtest-0.1/class/L-QQA')]

Prepare SPaR.txt again

In [14]:
# download SPaR.txt if required
from pathlib import Path
spartxt_path = Path("SPaR.txt/")
if not spartxt_path.exists():
    !git clone https://github.com/rubenkruiper/SPaR.txt.git

In [18]:
try: 
    ### Start our `SPaR_API` container if it exists
    !docker start SPaR_API
except:
    ### Else, set up the `SPaR_API` container
    # build the SPaR.txt image, call it `spar`
    !docker build -t spar SPaR.txt/.
    # Run the image called `spar` in a container that we will call `SPaR_API`, with the API port at localhost:8501
    # NOTE: this will train a SPaR.txt model locally, which takes about 20 minutes on a CPU 
    !docker run --name SPaR_API -p 8501:8501 spar

SPaR_API


In [19]:
# Some basic cleaning for the entire set of extracted objects
regex_filter = cleaning_utils.RegexFilter()


def basic_cleaning(to_be_cleaned):
    """
    Apply the basic cleaning steps to a list of extracted terms.

    :param to_be_cleaned: Terms to clean, e.g. the objects returned by the NER service.
    :return: Tuple of (cleaned terms with empty results dropped, Counter over those cleaned terms).
    """
    # the first return value would be the list of terms removed by our regex filters; it is not needed
    _removed, kept_terms = regex_filter.run_filter(to_be_cleaned)

    cleaned_terms = []
    for term in cleaning_utils.custom_cleaning_rules(kept_terms):
        without_determiners = cleaning_utils.remove_determiners(term)
        # skip terms that are empty once their determiners are removed
        if without_determiners:
            cleaned_terms.append(without_determiners)

    return cleaned_terms, Counter(cleaned_terms)

In [21]:
# Object-prediction endpoint on localhost:8501, the port published by the `SPaR_API` container
ner_api = "http://localhost:8501/predict_objects/"
# NER client used below to extract object spans from text
ner = NER(ner_api)

In [22]:
# Example sentence to show which object spans the NER service returns
example = "Thermoplastic materials in ceilings, rooflights and lighting diffusers provide a significant hazard in a fire."
ner.process_text(example)

['Thermoplastic materials',
 'ceilings',
 'rooflights',
 'lighting diffusers',
 'a hazard',
 'a fire']

Example parsing of some descriptions for the new terms that are being added to bSDD.

In [26]:
hyperlinks_per_row = []
for uid, reference, term, description in new_rows[-3:]:
    suggested_hyperlinks = []
    # note that the uid, reference, etc. are URIreference and or RDF literals
    ner_objects, _ = basic_cleaning(ner.process_text(description.toPython()))
    # de-duplicate while keeping first-seen order: a bare set would reorder the suggestions between
    # runs, making the output below hard to compare
    unique_ner = list(dict.fromkeys(ner_objects))
    for term_with_potential_hyperlink in unique_ner:
        suggested_hyperlinks.extend(hyperlink_suggestion(term_with_potential_hyperlink))

    hyperlinks_per_row.append(suggested_hyperlinks)
        

In [27]:
# Show each of the last three new terms with its definition and the suggested (label, URI) links
for new_row, hyperlinks in zip(new_rows[-3:], hyperlinks_per_row[-3:]):
    label = new_row[-2].toPython()
    description = new_row[-1].toPython()
    print(f"{label} >> {description}")
    # plain loop: a comprehension here would only build a throwaway list of None values
    for h in hyperlinks:
        print(h)
    print("-----")

Zone of Aeration >> The zone above the water table. Water in the zone of aeration does not flow into a well. 
('Zone', 'https://identifier.buildingsmart.org/uri/dtc/dtdl-1/class/Zone')
('ZONE', 'https://identifier.buildingsmart.org/uri/bimdata/bimdata-1.0/class/BDC69322')
('Water', 'https://identifier.buildingsmart.org/uri/bs-agri/fruitvegs-1.1/mat/water')
('Water', 'https://identifier.buildingsmart.org/uri/v5/fruitvegs-1.0/mat/water')
('Water', 'https://identifier.buildingsmart.org/uri/v5/fruitvegs-v5-5.0/mat/water')
('Water', 'https://identifier.buildingsmart.org/uri/bs-agri/fruitvegs-1.0/mat/water')
-----
Zone of Saturation >> The zone in which the functional permeable rocks are saturated with water under hydrostatic pressure. 
('Water', 'https://identifier.buildingsmart.org/uri/bs-agri/fruitvegs-1.1/mat/water')
('Water', 'https://identifier.buildingsmart.org/uri/v5/fruitvegs-1.0/mat/water')
('Water', 'https://identifier.buildingsmart.org/uri/v5/fruitvegs-v5-5.0/mat/water')
('Water'

#### 2. using semantic similarity based on node+description, linking to nodes in our bSDD graph
This section contains example code to suggest semantic similarity relations between:
1. new terms (expecting description) that are being added
2. existing terms (those that are in English and have a description) in our bSDD .csv file

In [28]:
# our English, processed csv file from before
graph_input_csv = Path("data") / "bsdd_graph_input.csv"
bsdd_df = pd.read_csv(filepath_or_buffer=graph_input_csv)
bsdd_df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,subject,name,uid,description,description_NER
0,1,1,https://identifier.buildingsmart.org/uri/FTIA/...,Additional details,AdditionalDetails,E.g. additional information related to install...,"information, installation"
1,2,2,https://identifier.buildingsmart.org/uri/FTIA/...,Post height,PostHeight,Height of the post in millimeters if sign has ...,"Height, millimeters, sign"
2,3,3,https://identifier.buildingsmart.org/uri/FTIA/...,Installation direction,InstallationDirection,Installation direction of the sign,sign
3,4,4,https://identifier.buildingsmart.org/uri/FTIA/...,Route number,RouteNumber,The route number on which the object is located,object
4,5,5,https://identifier.buildingsmart.org/uri/FTIA/...,Operating centre district,OperatingCentreDistrict,The operating centre district on which the obj...,object
...,...,...,...,...,...,...,...
10304,12510,27220,https://identifier.buildingsmart.org/uri/v5/fr...,Height,height,The height of an apple,height
10305,12512,27225,https://identifier.buildingsmart.org/uri/v5/fr...,Color,color,The color of a tomato,color
10306,12513,27226,https://identifier.buildingsmart.org/uri/v5/fr...,Height,height,The height of a Granny Smith,height
10307,12514,27227,https://identifier.buildingsmart.org/uri/v5/fr...,Color,color,The color of a Granny Smith,color


In [29]:
# still loads of duplicate terms in the csv, so keep each "name description" string only once.
# The strings are sorted because a bare set has no stable order between kernel sessions, while the
# embeddings cached on disk are looked up by position in this list; the order must be reproducible.
# NOTE: delete an embeddings cache that was created from an unsorted list so it is recomputed.
bsdd_concatenations = sorted({
    n.strip() + ' ' + d.strip()
    for n, d in zip(bsdd_df.name, bsdd_df.description)
    # skip non-string cells (e.g. missing values) and empty strings
    if isinstance(n, str) and isinstance(d, str) and n and d
})
# report how many inputs remain
print(f"Number of unique inputs we'll consider: {len(bsdd_concatenations)}")

Number of unique inputs we'll consider: 2923


In [30]:
# upper bound on comparisons: every new term against every unique bSDD input
max_num_combinations = len(bsdd_concatenations) * len(new_rows)
f'Number of sem-sim pairs to compute {max_num_combinations:,}'

'Number of sem-sim pairs to compute 10,408,803'

* Prepare the bSDD embeddings to compute semantic similarity 
* Assumption is that for the new terms, you'd want to embed on the fly and compare

In [31]:
# settings passed to `model.encode` when computing embeddings
batch_size = 32
show_progress_bar = True

# sentence-embedding model used for both the bSDD inputs and the new terms
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# on-disk cache for the bSDD embeddings
embeddings_file_name = Path("data") / "filtered_bsdd_embeddings.pkl"

In [32]:
# Reuse the cached embeddings when they exist and still fit the current inputs
bsdd_embeddings = None
if embeddings_file_name.exists():
    # SECURITY: unpickling can execute arbitrary code - only load cache files created by this notebook
    with open(embeddings_file_name, 'rb') as cache_file:
        bsdd_embeddings = pickle.load(cache_file)
    if len(bsdd_embeddings) != len(bsdd_concatenations):
        # embeddings are looked up by position, so a cache of a different size cannot be used
        bsdd_embeddings = None

if bsdd_embeddings is None:
    # compute the embeddings for all the input strings (name + description)
    bsdd_embeddings = model.encode(bsdd_concatenations,
                              show_progress_bar=show_progress_bar,
                              batch_size=batch_size,
                              convert_to_tensor=True)
    # `with` closes the file handle, so the cache is fully written before it is read again
    with open(embeddings_file_name, 'wb') as cache_file:
        pickle.dump(bsdd_embeddings, cache_file)

In [33]:
# check which container type holds the embeddings
type(bsdd_embeddings)

torch.Tensor

In [34]:
def grab_sem_sim_terms(new_embedding: torch.Tensor,
                       existing_embeddings: torch.Tensor,
                       top_k: int = 5,
                       min_score_spread: float = 0.06) -> Optional[int]:
    """
    Return the index of the existing embedding most similar to `new_embedding`, if it stands out.

    :param new_embedding: Embedding of the new term. A single embedding is expected: the top-k
        results are flattened, so several query rows would be mixed together.
    :param existing_embeddings: Embeddings to compare against, one per existing input.
    :param top_k: Number of nearest neighbours used for the spread heuristic.
    :param min_score_spread: Minimum difference in cosine similarity between the best and the worst
        of the top-k neighbours for the best one to be accepted.
    :return: Index into `existing_embeddings` of the most similar entry, or None when there is no
        candidate or the spread is too small. The index can be 0, so test the result with `is None`.
    """
    cos_score = pytorch_cos_sim(new_embedding, existing_embeddings)

    # never ask topk for more neighbours than there are existing embeddings
    k = min(top_k, cos_score.shape[1])
    if k < 1:
        return None

    cos_value, cos_index = torch.topk(cos_score, k, 1)
    indices_list = cos_index.flatten().tolist()
    value_list = cos_value.flatten().tolist()

    # HEURISTIC: We check the difference between the semantic similarity of the most similar,
    # and the least similar in our top k terms. If this difference is small, then we assume
    # that the representations for these terms+definitions were relatively weak.
    diff = value_list[0] - value_list[-1]
    if diff > min_score_spread:
        # we only consider the most similar node for now
        return indices_list[0]
    return None

In [35]:
# Namespace for the example predicates used in this notebook (e.g. EX.semanticallySimilar)
EX = Namespace("http://ex.ample.org/span/")

In [36]:
# Map every "name description" string back to the bSDD subjects it was built from, using the same
# construction as `bsdd_concatenations`. A matched string is then resolved with an exact lookup
# rather than a startswith/substring scan over the whole frame for every new term, which was slow,
# could match unrelated rows, and fails on non-string cells.
subjects_by_concatenation = {}
for bsdd_name, bsdd_description, bsdd_subject in zip(bsdd_df["name"], bsdd_df["description"], bsdd_df["subject"]):
    if isinstance(bsdd_name, str) and isinstance(bsdd_description, str) and isinstance(bsdd_subject, str):
        concatenation = bsdd_name.strip() + " " + bsdd_description.strip()
        subjects_by_concatenation.setdefault(concatenation, []).append(bsdd_subject)

example_triples = []
# Only computing the first 3 examples for now
for row in tqdm(new_rows[:3]):
    uid, reference, label, description = row
    new_c = label.toPython().strip() + " " + description.toPython().strip()

    new_c_embedding = model.encode(new_c,
                              show_progress_bar=False,
                              batch_size=batch_size,
                              convert_to_tensor=True)

    idx = grab_sem_sim_terms(new_c_embedding, bsdd_embeddings)
    # index 0 is a valid match, so only None means "no suggestion"
    similar_concatenation = "" if idx is None else bsdd_concatenations[idx]

    # one triple per bSDD subject that produced the matched string; subjects become proper URI terms
    new_triples = [(uid, EX.semanticallySimilar, URIRef(similar_subject))
                   for similar_subject in subjects_by_concatenation.get(similar_concatenation, [])]

    example_triples.append(new_triples)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  2.12it/s]


In [37]:
# Show each of the first three new terms with the bSDD subjects suggested as semantically similar
for new_row, similar in zip(new_rows[:3], example_triples[:3]):
    new_label, new_description = new_row[-2].toPython(), new_row[-1].toPython()
    print(f"{new_label} >> {new_description}")
    # each triple is (new term uid, predicate, suggested bSDD subject); only the subject is shown
    for _subj, _pred, suggested_subject in similar:
        print(suggested_subject)
    print("-----")

"A" Car >> A motive-powered unit so designed that it may be used as the controlling unit of a multiple-unit train and that has adequate visibility in a forward direction, as well as a cab and equipment, to permit full control and observation of the propulsion power and brake applications for the train.
https://identifier.buildingsmart.org/uri/buildingsmart/ifc-4.3/prop/CloggingIndicator
https://identifier.buildingsmart.org/uri/buildingsmart/ifc-4.3/class/IfcFilterCOMPRESSEDAIRFILTER/CloggingIndicator
-----
"B" Car >> A motive-powered unit designed primarily for use in combination with an “A” unit for the purpose of increasing power, but not equipped for use as the leading unit for full observation of the propulsion power and brake applications for the train; it is normally equipped with a single control station to permit independent movement of the unit itself. 
https://identifier.buildingsmart.org/uri/buildingsmart/ifc-4.3/prop/SelfClosing
https://identifier.buildingsmart.org/uri/buil

#### 3. using description overlap, relying on the objects occurring in our bSDD graph descriptions
* We'll load the previously computed bsdd graph from the .ttl file


In [38]:
# previously computed bSDD graph, stored as turtle
bsdd_graph_path = Path("data") / "graph_output" / "test_graph.ttl"
bsdd_graph = Graph()
bsdd_graph.parse(bsdd_graph_path, format='ttl')

<Graph identifier=N0d5f927b044b4fac809c33d7df633c40 (<class 'rdflib.graph.Graph'>)>

In [43]:
# Collect every distinct rdfs:label in the bSDD graph as a plain Python value
span_query = """
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT DISTINCT ?span_label WHERE{
    ?s rdfs:label ?span_label .
} 
"""
# each result row holds a single binding, the label
all_spans = [span_label.toPython() for (span_label,) in bsdd_graph.query(span_query)]

In [45]:
# some example spans that we have in the graph
# NOTE(review): no random seed is set in the cells shown, so this sample differs on every run
random.sample(all_spans, 10)

['Count',
 'centre of gravity',
 'stiffener',
 'test dome',
 'circular transition change',
 'conditions',
 'device hardware',
 'agreement',
 'cooled beam',
 'vehicles']

In [None]:
# TODO: check if any of these spans occur verbatim in the description;
# if they do, add a triple between the UID and the span in the graph

In [None]:
# GraphDB SPARQL QUERY to find potentially related classes (based on spans found in their descriptions)
# The query is kept here as a string for reference only; this cell does not execute it.
# It pairs nodes that share associated spans, ignoring spans linked to 20 or more nodes as too generic.
# NOTE(review): the query's last comment says "more than 10 edges" while its HAVING clause tests
# `?total_g > 5` - confirm which threshold is intended.
"""
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
prefix bsdd:<http://bsdd.buildingsmart.org/def#>
prefix skos: <http://www.w3.org/2004/02/skos/core#>
prefix ex: <http://ex.ample.org/span/>

SELECT DISTINCT ?subject ?object ?subj_def ?obj_def (COUNT(DISTINCT ?span_in_description) AS ?shared_def_terms) (SUM(?generic) as ?total_g)
WHERE {
    ?subject_node ex:associatedSpan ?span_in_description ;
                  skos:definition ?subj_def ;
                  skos:prefLabel ?subject .
    
    ?object_node ex:associatedSpan ?span_in_description ;
                 skos:definition ?obj_def;
                 skos:prefLabel ?object .
    
    FILTER (str(?subject) != str(?object))
    # ensure same ordering of subject object so we don't get reverse triples
    FILTER (STR(?object) < STR(?subject))
    {   
        # sub query to check how generic the span_in_description is (number of edges)
        SELECT DISTINCT ?span_in_description (COUNT(?defined_node) AS ?generic) 
        WHERE{
            ?span_in_description  ^ex:associatedSpan ?defined_node .
        } 
        GROUP BY ?span_in_description
        # each span_in_description should be linked to less than X edges, otherwise too generic
        HAVING (?generic < 20)  
    }
}
GROUP BY ?subject ?object ?subj_def ?obj_def
# at least 3 shared terms, that together have more than 10 edges and less than 300 in total
HAVING (?shared_def_terms > 2 && ?total_g > 5 && ?total_g < 300)
"""