In [1]:
from typing import Union, List, Any, Optional, Dict

import os
import re
import time
import json
import glob
import pickle
import random
import urllib
import requests

from bs4 import BeautifulSoup
from tqdm import tqdm
from pathlib import Path
from textblob import TextBlob
from collections import Counter

from transformers import BertTokenizer, BertModel

from utils import cleaning_utils

#### Example IFC class and description
* source: https://search-test.bsdd.buildingsmart.org/uri/buildingsmart/ifc-4.3/class/IfcWindow


We'll use:
* TextBlob (which relies on NLTK's Punkt tokenizer under the hood) to split the description into sentences.
* SPaR.txt to predict which objects occur in each sentence, followed by some cleaning of the extracted spans.


In [2]:
label = "IfcWindow"

In [3]:
description = """
The window is a building element that is predominately used to provide natural light and fresh air. 
It includes vertical opening but also horizontal opening such as skylights or light domes. 
It includes constructions with swinging, pivoting, sliding, or revolving panels and fixed panels. 
A window consists of a lining and one or several panels. 
A window can:be a "free standing" window, contained in an IfcSpatialElement such as an IfcBuildingStorey. 
fill an opening, typically in a wall. 
The window will then have a FillsVoids attribute which uses the IfcRelFillsElement relationship to relate the IfcWindow with the IfcOpeningElement; 
be part of an element assembly, typically an IfcCurtainWall. 
The window will then have a Decomposes attribute which uses the the IfcRelAggregates relationship to relate the window with the assembly of elements;
There are two main representations for window occurrences:
IfcWindow entities that have a 3D rectangle 'Profile' shape representation defined. 
This profile can then be used to parametrically generate the geometry of a window. 
If not provided, the profile of the IfcOpeningElement can be used if the window fills an opening. 
The parameters are specified on the relating IfcWindowType that references IfcWindowLiningProperties and 
IfcWindowPanelProperties for each panel in the window; 
IfcWindow entities that are not parametrically generated and have only 'Brep', or 'SurfaceModel' geometry.
In addition, an IfcWindow may commonly include a 'FootPrint' representation defining the 2D shape of the window and its swing.
the window width and height the window opening direction (by the positive y-axis of the ObjectPlacement)
The IfcWindowType specifies parameters which are common to all of its occurrences of IfcWindow:
the partitioning type (single panel, double panel, tripel panel, more panels) the operation type 
(swing, tilt and turn, pivot revolve, fixed casement, etc.) 
the window panel hinge side (by using two different styles for right and left opening windows) 
the particular attributes for the lining by the IfcWindowLiningProperties the particular attributes 
for the panels by the  IfcWindowPanelPropertiesREFERENCE Definition according to ISO 6707-1 Construction 
for closing a vertical or near vertical opening in a wall or pitched roof that will admit light and 
may admit fresh air. NOTE The entity IfcWindowStandardCase has been deleted. Use an IfcWindow with 
a 'Profile' representation instead. The IfcWindow should also have an IfcWindowType with 
ParameterTakesPrecedence set to 'TRUE'. IFC4 CHANGE The attributes PredefinedType and OperationType are
added, the applicable type object has been changed to IfcWindowType. HISTORY New entity in IFC1.0.
"""

In [4]:
label = "abrasion"
description = """wearing or grinding away of material by friction; usually caused by sand, gravel, or stones, carried by wind or water"""

In [5]:
# Download SPaR.txt if required: the repository is cloned into ./SPaRtxt (relative to
# the current working directory) only when that directory does not exist yet.
from pathlib import Path  # NOTE(review): Path is already imported in the first cell
spartxt_path = Path("SPaRtxt/")
if not spartxt_path.exists():
    # IPython shell escape: needs `git` on the PATH and network access
    !git clone https://github.com/rubenkruiper/SPaR.txt.git SPaRtxt

In [6]:
# # kind of convoluted way to import spar.txt as a module from the parent directory
# import imp
# with open(spartxt_path.joinpath('spar_predictor.py'), 'rb') as fp:
#     spar_predictor = imp.load_module(
#         'spar_predictor', fp, 'SPaRtxt.spar_predictor.py',
#         ('.py', 'rb', imp.PY_SOURCE)
#     )

In [7]:
# import sys 
# sys.path.insert(1, 'SPaRtxt/')

In [8]:
from SPaRtxt import *

In [9]:
from utils import spar_utils

In [10]:
# TRAIN/LOAD
# - trains a model if needed, otherwise loads it from an archive
# - best F1 on dev/validation in the paper is 80.96 when trained on a GPU; a model
#   trained on CPU will probably score a bit lower (~77.x) -- TODO confirm
# `te` is reused by the sentence-splitting and object-extraction cells below.
te = spar_utils.TermExtractor(max_num_cpu_threads=1)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
# use existing split into sentences functionality (uses textblob)
sentences = te.split_into_sentences(description)
# first 3 sents
sentences[:3]

['wearing or grinding away of material by friction',
 'usually caused by sand, gravel, or stones, carried by wind or water']

In [12]:
objects = te.process_sentences(sentences)

In [13]:
# # Show some random extracted objects
# random.sample(objects, 2)

In [14]:
# some basic cleaning  
regex_filter = cleaning_utils.RegexFilter()
def run_filters(to_be_cleaned, min_count: int = 1):
    """Clean a list of extracted object spans and count the surviving terms.

    The spans are passed through the module-level ``regex_filter``, then through
    ``cleaning_utils.custom_cleaning_rules`` and ``cleaning_utils.remove_determiners``;
    terms that end up empty are discarded.

    Args:
        to_be_cleaned: iterable of extracted spans (strings).
        min_count: keep only terms that occur at least this many times after
            cleaning. The default of 1 keeps every non-empty term; raise it when
            counting over many descriptions to drop rare terms.

    Returns:
        A ``(cleaned_terms, cleaned_counter)`` tuple: the cleaned terms in their
        original order (duplicates included) and a ``Counter`` of those terms.
    """
    # `_` would be the list of terms removed by our regex filters
    _, regex_cleaned = regex_filter.run_filter(to_be_cleaned)
    basic_cleaned = cleaning_utils.custom_cleaning_rules(regex_cleaned)
    determiners_removed = (cleaning_utils.remove_determiners(t) for t in basic_cleaned)
    non_empty_terms = [t for t in determiners_removed if t]
    all_counts = Counter(non_empty_terms)

    # Apply the frequency threshold once, consistently to both return values
    cleaned_terms = [t for t in non_empty_terms if all_counts[t] >= min_count]
    cleaned_counter = Counter({t: c for t, c in all_counts.items() if c >= min_count})
    return cleaned_terms, cleaned_counter

In [15]:
_, obj_cntr = run_filters(objects)
obj_cntr.most_common(10)

[('material', 1),
 ('friction', 1),
 ('sand', 1),
 ('gravel', 1),
 ('stones', 1),
 ('wind', 1),
 ('water', 1)]

### Search for object

In [12]:
url_prefix = "https://test.bsdd.buildingsmart.org/api/ClassificationSearchOpen/v1?SearchText="

In [13]:
query = ""

In [14]:
try:
    # timeout: requests waits indefinitely by default
    response = requests.get(url_prefix + urllib.parse.quote(query), timeout=10).json()
except (requests.RequestException, ValueError) as err:
    # RequestException: connection/timeout problems; ValueError: body is not valid JSON.
    # Bind `response` so the next cell does not fail with a NameError.
    response = None
    print(f"bSDD search request failed: {err}")
    

In [15]:
response

{'type': 'https://tools.ietf.org/html/rfc7231#section-6.5.1',
 'title': 'One or more validation errors occurred.',
 'status': 400,
 'traceId': '00-8f31c869282987074e4e52fc456e2c27-1a6421f0e85d5acc-00',
 'errors': {'SearchText': ['The SearchText field is required.']}}

In [16]:
def check_if_string_exists_as_bsdd_label(
    query: str,
    url_prefix: str = "https://test.bsdd.buildingsmart.org/api/ClassificationSearchOpen/v1?SearchText=",
    timeout: float = 10.0,
) -> Optional[Dict[str, Any]]:
    """Search the bSDD classification API for `query`.

    Args:
        query: search text; it is URL-quoted and appended to `url_prefix`.
        url_prefix: search endpoint, ending in the `SearchText=` query parameter.
        timeout: seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of a successful response, or None when the query is
        empty, the request fails, the server answers with an error status (e.g. the
        400 validation error returned for an empty SearchText), or the body is not
        valid JSON.
    """
    if not query or not query.strip():
        # the API rejects an empty SearchText, no need to ask
        return None
    try:
        response = requests.get(url_prefix + urllib.parse.quote(query), timeout=timeout)
        if not response.ok:
            # error payloads are JSON too, but they are not search results
            return None
        return response.json()
    except (requests.RequestException, ValueError):
        # connection problem / timeout, or a body that is not valid JSON
        return None
    

In [17]:
def parse_bsdd_api_response(
    json_response: Optional[Dict[str, Any]],
    domain_namespace_suffix: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """Reduce a bSDD classification search response to the fields used downstream.

    Args:
        json_response: decoded JSON body of the search request. A missing or empty
            'classifications' entry (e.g. an error payload) yields an empty list.
        domain_namespace_suffix: when given, keep only results whose
            'domainNamespaceUri' ends with this suffix (e.g. "ifc-4.3"). The default
            of None keeps results from every domain.

    Returns:
        One dict per kept result with the keys "name", "description" and "related"
        (taken from 'relatedIfcEntityNames'); fields absent from a result are None.
    """
    classifications = (json_response or {}).get('classifications') or []

    search_results = []
    for result in classifications:
        if domain_namespace_suffix and not str(result.get('domainNamespaceUri')).endswith(domain_namespace_suffix):
            continue
        search_results.append({
            "name": result.get('name'),
            "description": result.get('description'),
            "related": result.get('relatedIfcEntityNames'),
        })
    return search_results

In [18]:
# label = "IfcWindow"
# objects as extracted from the definition for "IfcWindow"
def suggest(
    label: str,
    obj_cntr: Counter,
    top_k: Optional[int] = 5,
    useless_objs: Optional[List[str]] = None,
    min_shared_objects: int = 2,
):
    """Suggest bSDD entries that may be related to `label`.

    Every object found in the definition of `label` is used as a bSDD search query.
    A search result becomes a suggestion when its description contains at least
    `min_shared_objects` *distinct* objects from the definition of `label`.

    Args:
        label: the term whose definition the objects were extracted from.
        obj_cntr: counter of the extracted objects; iterated via `most_common()`.
        top_k: only inspect the first `top_k` search results per object; a falsy
            value inspects all of them.
        useless_objs: objects that are never searched for. Defaults to
            ["entity", "HISTORY"].
        min_shared_objects: minimum number of distinct shared objects required
            for a suggestion.

    Returns:
        ``{label: {suggested_name: [shared objects, ...]}}``.
    """
    if useless_objs is None:
        useless_objs = ["entity", "HISTORY"]

    # name of a search result -> objects from the definition found in its description
    terms_with_overlap_in_description_objects = {}
    for obj, _count in tqdm(obj_cntr.most_common()):

        # skip stop-list entries and objects that are a substring of the label itself
        if obj in useless_objs or obj in label:
            continue

        # 1) search for bsdd nodes with the object span as the query
        bsdd_response = check_if_string_exists_as_bsdd_label(obj)
        if not bsdd_response:
            continue
        bsdd_results = parse_bsdd_api_response(bsdd_response)

        if top_k:
            # only look at top_k results from bsdd search
            bsdd_results = bsdd_results[:top_k]

        # 2) Compare if the retrieved, potentially related nodes, contain the same object in their description
        for result_dict in bsdd_results:
            name = result_dict.get("name")
            bsdd_description = result_dict.get("description")
            if not name or not bsdd_description or obj not in bsdd_description:
                continue

            matching_objects = terms_with_overlap_in_description_objects.setdefault(name, [])
            # Several results can share a name (e.g. the same class in different
            # domains); count each object once so that a single shared object
            # cannot satisfy the `min_shared_objects` threshold on its own.
            if obj not in matching_objects:
                matching_objects.append(obj)

    # 3) Collect suggestions of related terms
    suggestions = {
        potentially_related: matching_objects
        for potentially_related, matching_objects in terms_with_overlap_in_description_objects.items()
        if len(matching_objects) >= min_shared_objects
    }
    return {label: suggestions}

In [19]:
def spartxt_ner(long_description: str):
    """Return the cleaned object counts for one description.

    The text is split into sentences by the shared term extractor `te`, the
    sentences are run through its object extraction, and the extracted spans are
    cleaned with `run_filters`; only the counter part of that result is returned.
    """
    extracted_spans = te.process_sentences(te.split_into_sentences(long_description))
    return run_filters(extracted_spans)[1]

In [24]:
label = "abrasion"
description = """
Wearing or grinding away of material by friction; usually caused by sand, gravel, or stones, carried by wind or water.
1) Loss of section or coating of a culvert by the mechanical action of water conveying suspended bed load of sand, gravel, and cobble-size particles at high velocities with appreciable turbulence. 2) Removal of stream bank material due to entrained sediment, ice, or debris rubbing against the bank.
Loss of section or coating of a culvert by the mechanical action of water conveying suspended bed load of sand, gravel, and cobble-size particles at high velocities with appreciable turbulence.
"""

In [25]:
print(f"Term: {label} \nDefinition: {description}")
object_counter = spartxt_ner(description)
print(object_counter.most_common(10))
suggest(label, object_counter)

Term: abrasion 
Definition: 
Wearing or grinding away of material by friction; usually caused by sand, gravel, or stones, carried by wind or water.
1) Loss of section or coating of a culvert by the mechanical action of water conveying suspended bed load of sand, gravel, and cobble-size particles at high velocities with appreciable turbulence. 2) Removal of stream bank material due to entrained sediment, ice, or debris rubbing against the bank.
Loss of section or coating of a culvert by the mechanical action of water conveying suspended bed load of sand, gravel, and cobble-size particles at high velocities with appreciable turbulence.

[('sand', 3), ('gravel', 3), ('water', 2), ('Loss section', 2), ('coating', 2), ('culvert', 2), ('mechanical action', 2), ('suspended bed load', 2), ('cobble size', 2), ('particles', 2)]


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 23/23 [00:07<00:00,  2.96it/s]


{'abrasion': {}}

In [26]:
label = "IfcWindow"
description = """
The window is a building element that is predominately used to provide natural light and fresh air.  It includes vertical opening but also horizontal opening such as skylights or light domes.  It includes constructions with swinging, pivoting, sliding, or revolving panels and fixed panels.  A window consists of a lining and one or several panels.  A window can:be a "free standing" window, contained in an IfcSpatialElement such as an IfcBuildingStorey.  fill an opening, typically in a wall.  The window will then have a FillsVoids attribute which uses the IfcRelFillsElement relationship to relate the IfcWindow with the IfcOpeningElement;  be part of an element assembly, typically an IfcCurtainWall.  The window will then have a Decomposes attribute which uses the the IfcRelAggregates relationship to relate the window with the assembly of elements; There are two main representations for window occurrences: IfcWindow entities that have a 3D rectangle 'Profile' shape representation defined.  This profile can then be used to parametrically generate the geometry of a window.  If not provided, the profile of the IfcOpeningElement can be used if the window fills an opening.  The parameters are specified on the relating IfcWindowType that references IfcWindowLiningProperties and  IfcWindowPanelProperties for each panel in the window;  IfcWindow entities that are not parametrically generated and have only 'Brep', or 'SurfaceModel' geometry. In addition, an IfcWindow may commonly include a 'FootPrint' representation defining the 2D shape of the window and its swing. the window width and height the window opening direction (by the positive y-axis of the ObjectPlacement) The IfcWindowType specifies parameters which are common to all of its occurrences of IfcWindow: the partitioning type (single panel, double panel, tripel panel, more panels) the operation type  (swing, tilt and turn, pivot revolve, fixed casement, etc.)  
the window panel hinge side (by using two different styles for right and left opening windows)  the particular attributes for the lining by the IfcWindowLiningProperties the particular attributes  for the panels by the  IfcWindowPanelPropertiesREFERENCE Definition according to ISO 6707-1 Construction  for closing a vertical or near vertical opening in a wall or pitched roof that will admit light and  may admit fresh air. NOTE The entity IfcWindowStandardCase has been deleted. Use an IfcWindow with  a 'Profile' representation instead. The IfcWindow should also have an IfcWindowType with  ParameterTakesPrecedence set to 'TRUE'. IFC4 CHANGE The attributes PredefinedType and OperationType are added, the applicable type object has been changed to IfcWindowType. HISTORY New entity in IFC1.0.
"""

In [27]:
print(f"Term: {label} \nDefinition: {description}")
object_counter = spartxt_ner(description)
suggested_rel_dict = suggest(label, object_counter)

Term: IfcWindow 
Definition: 
The window is a building element that is predominately used to provide natural light and fresh air.  It includes vertical opening but also horizontal opening such as skylights or light domes.  It includes constructions with swinging, pivoting, sliding, or revolving panels and fixed panels.  A window consists of a lining and one or several panels.  A window can:be a "free standing" window, contained in an IfcSpatialElement such as an IfcBuildingStorey.  fill an opening, typically in a wall.  The window will then have a FillsVoids attribute which uses the IfcRelFillsElement relationship to relate the IfcWindow with the IfcOpeningElement;  be part of an element assembly, typically an IfcCurtainWall.  The window will then have a Decomposes attribute which uses the the IfcRelAggregates relationship to relate the window with the assembly of elements; There are two main representations for window occurrences: IfcWindow entities that have a 3D rectangle 'Profile' 

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 87/87 [00:28<00:00,  3.09it/s]


**Alternative heuristics**:
* KNN graph / k-nearest neighbours based on embeddings
* Consider combining the label with the objects in its definition to build a representation?

In [20]:
import pprint
pprint.pprint(suggested_rel_dict)

NameError: name 'suggested_rel_dict' is not defined

In [3]:
import pandas as pd

In [5]:
# to_be_parsed = pd.read_csv("bsdd_descriptions.csv")
to_be_parsed = pd.read_csv("bsdd_parsed_descriptions.csv")

In [6]:
to_be_parsed.description

0        Location track number or name as an abbreviation 
1        E.g. additional information related to install...
2        Height of the post in millimeters if sign has ...
3                       Installation direction of the sign
4          The route number on which the object is located
                               ...                        
28163    Angir hvilket år vegobjektet ble etablert på s...
28164             Angir hvilket år utstyret ble produsert.
28165         Angir hvilken type energikilde som benyttes.
28166             Angir hovedbruksområde for styreapparat.
28167                             Angir type styreapparat.
Name: description, Length: 28168, dtype: object

In [None]:
# Extract the objects (with their counts) from every description.
# Each entry of `objects_and_counts` is a list of (object, count) tuples; rows whose
# description could not be processed get an empty list and are recorded in `failed_rows`.
objects_and_counts = []
failed_rows = []
for row_number, description in enumerate(tqdm(to_be_parsed.description)):
    try:
        object_counter = spartxt_ner(description)
        objects_and_counts.append(object_counter.most_common())
    except Exception:
        # `except Exception` (not a bare except) so that interrupting this long loop
        # from the keyboard actually stops it
        objects_and_counts.append([])
        failed_rows.append(row_number)
        


  0%|▌                                                                                                                                                                   | 94/28168 [00:06<31:24, 14.90it/s]

In [15]:
# to_be_parsed['objects_found'] = objects_found
to_be_parsed['objects_and_counts'] = objects_and_counts

NameError: name 'objects_and_counts' is not defined

In [None]:
# index=False: this file is read back with a plain pd.read_csv, which would otherwise
# turn the written index into an extra "Unnamed: 0" column on every round trip
to_be_parsed.to_csv("bsdd_parsed_descriptions.csv", index=False)

In [7]:
def parse_objects(string_list):
    """Recover the object spans from a stored list of (span, count) tuples.

    Args:
        string_list: normally the string form of a list of ``(span, count)``
            tuples as read back from the CSV, e.g. ``"[('sand', 3), ('gravel', 3)]"``.
            An actual list/tuple of such pairs is accepted as well.

    Returns:
        The spans in their stored order; an empty list for empty or missing
        cells (e.g. ``"[]"``, ``"{}"`` or NaN).
    """
    import ast  # local import: standard library only, keeps the cell self-contained

    def _first_elements(pairs):
        return [
            pair[0]
            for pair in pairs
            if isinstance(pair, (list, tuple)) and pair and isinstance(pair[0], str)
        ]

    if isinstance(string_list, (list, tuple)):
        return _first_elements(string_list)
    if not isinstance(string_list, str):
        # e.g. NaN for an empty CSV cell
        return []

    try:
        # Proper parsing also handles spans whose repr uses double quotes
        # (any span containing an apostrophe), which plain splitting misses.
        parsed = ast.literal_eval(string_list.strip())
    except (ValueError, SyntaxError):
        parsed = None

    if isinstance(parsed, (list, tuple)):
        return _first_elements(parsed)
    if isinstance(parsed, dict):
        return [key for key in parsed if isinstance(key, str)]

    # Fallback for text that is not a valid literal (e.g. a truncated cell):
    # take whatever sits between "('" and the following "',"
    span_list = []
    for p1 in string_list.split("('")[1:]:
        span_list.append(p1.split("',", 1)[0])
    return span_list

In [12]:
# Quick workaround (out of time): rebuild the flat list of extracted spans
# from the already-parsed CSV column instead of re-running the extraction.
all_objects = [
    span
    for serialized_counts in to_be_parsed.objects_and_counts
    for span in parse_objects(serialized_counts)
]

In [13]:
# all_objects = [o for sublist in to_be_parsed["objects_and_counts"] for o in sublist]

In [14]:
len(all_objects)

113448

In [15]:
all_objects

['Location track number',
 'name',
 'information',
 'installation',
 'Height',
 'post',
 'millimeters',
 'sign',
 'Installation direction',
 'sign',
 'route number',
 'object',
 'operating centre',
 'object',
 'Article number',
 'reference',
 'configured product',
 'standard scheme',
 'article number definition',
 'manufacturer',
 'used',
 'purchasing number',
 'coordinates',
 'object',
 'Accessories',
 'fasten',
 'sign',
 'structure',
 'object',
 'sign',
 'maintenance district',
 'object',
 'signal',
 'sign',
 'Name',
 'sign',
 'RATO',
 'maintenance oversight district',
 'object',
 'name',
 'Type of foundation',
 'sign',
 'Installation height',
 'sign',
 'Owner',
 'object asset',
 'year production',
 'manufactured',
 'item',
 'Date on',
 'element',
 'Installation distance',
 'sign',
 'sign',
 'ascending',
 'descending direction',
 'track',
 'route number',
 'obecjt',
 'time duration',
 'manufacturer',
 'supplier',
 'performance',
 'artefact',
 'sign',
 'Number fasteners',
 'fasten',
 

In [16]:
# Sorted rather than list(set(...)): positions in this list are used as the span node
# ids when the graph is built, and the iteration order of a set of strings can differ
# between kernel sessions, which would make those ids unstable.
unique_objects = sorted(set(all_objects))
len(unique_objects)

17533

In [17]:
random.sample(unique_objects, 10)

['Captured',
 'Numero link E1 interfaccia Ater',
 'je m²',
 'manovra a mano',
 'effettuazione misure ”',
 'PSSC',
 'Kan også være romlefelt f eks',
 'Betonwerkstein nach DIN 18333',
 'angrenzt bzw',
 'EN 12467 Faserzement - Tafeln - Produktspezifikation und']

### We only want English entries, but bSDD does not really help us there: the descriptions contain a mix of many languages

In [35]:
# One "name description" string per row, with every non-ASCII character dropped.
concatenations = [
    " ".join((str(name), str(description))).encode("ascii", "ignore").decode()
    for name, description in zip(to_be_parsed.name, to_be_parsed.description)
]

In [36]:
concatenations[124:135]

['Ratanumero Ratanumero jolla kohde sijaitsee',
 'Omistaja Kohteen omistaja',
 'Nimi Kohteen nimi',
 'Kiinnitys Kohteen, rakenteen tai laitteen kiinnitys',
 'Pohjaimen luokittelu Pohjaimen jykkyys',
 'Materiaali Kohteen, rakenteen tai laitteen materiaali.',
 'Kaari / suora /siirtymkaari Tieto sijaitseeko kisko kaarella, suoralla vai siirtymkaarella',
 'Kiskotusvuosi New property description 2',
 'Uusi / kierrtetty Tieto onko kisko uusi vai kierrtetty',
 'Rataplkkyjako RATO 11:n mukainen plkkyjako',
 'Puuplkyn kyllstys Tieto siit onko puuplkky kyllstetty vai ei']

In [37]:
# Filter out non-English entries

In [39]:
import torch
from transformers import pipeline
device = 0 if torch.cuda.is_available() else -1

model_ckpt = "papluca/xlm-roberta-base-language-detection"
pipe = pipeline("text-classification", model=model_ckpt, device=device)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [40]:
# Predict one language label per concatenated "name description" string by passing
# the whole list to the text-classification pipeline in a single call.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt, so
# `languages` may be undefined for the cells below -- confirm before relying on it.
languages = [predicted['label'] for predicted in pipe(concatenations)]

KeyboardInterrupt: 

In [None]:
languages

In [None]:
# drop non-english rows
# `languages` is positional, so the (unchanged) integer index labels double as list
# positions. Collect the labels first and drop them in one call: dropping row by row
# inside iterrows() rebuilds the frame once per removed row.
non_english_rows = [index for index in to_be_parsed.index if languages[index] != 'en']
to_be_parsed.drop(index=non_english_rows, inplace=True)

In [None]:
to_be_parsed

In [18]:
### Create a graph quickly
from rdflib import URIRef, BNode, Literal, Namespace, Graph
from rdflib.namespace import XSD, RDF, RDFS, SKOS, NamespaceManager

In [19]:
graph_output_fp = Path.cwd().joinpath("data", "graph_output")
graph_output_fp.mkdir(parents=True, exist_ok=True) # create directory if it doesn't exist

In [20]:
BSDD = Namespace("http://bsdd.buildingsmart.org/def#") # https://identifier.buildingsmart.org/uri/
EX = Namespace("http://ex.ample.org/span/")

In [21]:
all_triples = []

In [22]:

# One rdfs:label triple per unique span; the span's node id is its position in `unique_objects`.
all_triples.extend(
    (EX[str(position)], RDFS.label, Literal(span_text, lang='en'))
    for position, span_text in enumerate(unique_objects)
)

In [23]:
uids = to_be_parsed.subject
bsdd_names = to_be_parsed.name
bsdd_descriptions = to_be_parsed.description
objects_counts = to_be_parsed.objects_and_counts

# span -> position in `unique_objects` (first occurrence, like list.index), built once
# so that each association is a dictionary lookup instead of a scan of the whole list
span_to_idx = {}
for span_idx, span in enumerate(unique_objects):
    span_to_idx.setdefault(span, span_idx)

# the per-row value gets its own name so the `objects_counts` column is not overwritten
for uid, name, description, row_objects_counts in zip(uids, bsdd_names, bsdd_descriptions, objects_counts):

    subject_node = URIRef(uid)
    name_triple = (subject_node, RDFS.label, Literal(name, lang='en'))
    description_triple = (subject_node, SKOS.definition, Literal(description, lang='en'))
    all_triples += [name_triple, description_triple]

    if isinstance(row_objects_counts, str):
        # column read back from CSV: still the string form of the (object, count) list
        objects = parse_objects(row_objects_counts)
    else:
        objects = [o for o, c in row_objects_counts]

    for span in objects:
        association_triple = (subject_node, EX.associatedSpan, EX[str(span_to_idx[span])])
        all_triples.append(association_triple)

In [24]:
# Add every collected triple to a fresh graph; triples that cannot be added are
# recorded in `issues` together with the reason instead of being dropped silently.
graph = Graph()
issues = []
for t in all_triples:
    assert len(t) == 3
    if t not in graph:
        try:
            graph.add(t)
        except Exception as err:
            issues.append(f"issue @ {t}: {err!r}")


In [25]:
len(issues)

0

In [26]:
issues

[]

In [27]:
graph.serialize(destination=graph_output_fp.joinpath("test_graph.ttl"))

<Graph identifier=Neea29be65b5c4d6cb92d4ee22513ebfe (<class 'rdflib.graph.Graph'>)>

In [88]:
# done ish; 
# - for each row in the dataframe, predict the language. input could be concatenation of (name + description)
# - get only those rows that have english as the predicted language
# - create a graph with:
# -- all bsdd terms with (uid bsdd:name name; bsdd:description description. )
# -- all object nodes with (listIDX bsdd:name string)
# -- all relations between terms and objects (uid meta:obj_found listIDX)
# - upload to graphDB and prepare sparql query

In [111]:
import torch
from sentence_transformers.util import pytorch_cos_sim
from sklearn import metrics
import torch
import torch.nn as nn
import pickle
import sentence_transformers
from sentence_transformers import SentenceTransformer
import numpy as np
import requests
import json
import pandas as pd
from pprint import pprint

In [112]:
url = 'https://bsdd.ontotext.com/graphql/'

# GraphQL query listing id, name, definition and synonym of the classifications of one domain
CLASSIFICATION_QUERY_TEMPLATE = """query MyQuery {
  classification(
    limit: "%(limit)s"
    where: {domain: {name: {EQ: "%(domain_name)s"}}}
  ) {
    id
    name
    definition
    synonym
  }
}"""


def build_classification_query(domain_name: str, limit: int) -> str:
    """Return the GraphQL query for at most `limit` classifications of `domain_name`.

    `domain_name` is inserted verbatim, so it must not contain double quotes.
    """
    return CLASSIFICATION_QUERY_TEMPLATE % {"limit": limit, "domain_name": domain_name}


def run_graphql_query(graphql_query: str, endpoint: str = url, timeout: float = 120.0):
    """POST `graphql_query` to `endpoint`, print the HTTP status and return the response.

    Raises:
        requests.HTTPError: when the server answers with an error status, so a failed
            request is reported here rather than as a KeyError further down.
    """
    response = requests.post(endpoint, json={'query': graphql_query}, timeout=timeout)
    print(response.status_code)
    response.raise_for_status()
    return response


# small sample of "CCI Construction" classifications: the entries to find matches for
query = build_classification_query("CCI Construction", 20)
r_test = run_graphql_query(query)
json_data_test = json.loads(r_test.text)

# IFC classifications: the candidates to match against
query = build_classification_query("IFC", 10000)
r = run_graphql_query(query)
json_data = json.loads(r.text)

200
200


In [113]:
def _name_and_definition(item):
    """Return "name. definition", or whichever of the two is present ('' if neither)."""
    parts = [part for part in (item.get('name'), item.get('definition')) if part]
    return '. '.join(parts)


def build_text_corpus(items):
    """Build the texts to embed and the matching ids from GraphQL classification items.

    Items without a name and without a definition are skipped in both lists, so
    the two lists always stay aligned and never contain None.

    Returns:
        A ``(texts, ids)`` tuple of equally long lists.
    """
    texts, ids = [], []
    for item in items:
        text = _name_and_definition(item)
        if not text:
            continue
        texts.append(text)
        ids.append(item['id'])
    return texts, ids


result_list_test, id_list_test = build_text_corpus(json_data_test['data']['classification'])
result_list, id_list = build_text_corpus(json_data['data']['classification'])

In [114]:
batch_size = 32
show_progress_bar = True
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [115]:
embeddings = model.encode(result_list, show_progress_bar=show_progress_bar, batch_size=batch_size, convert_to_tensor=True)
embeddings_test = model.encode(result_list_test, show_progress_bar=show_progress_bar, batch_size=batch_size, convert_to_tensor=True)

Batches:   0%|          | 0/50 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
TOP_K = 5                # number of nearest candidates inspected per test entry
MIN_SCORE_MARGIN = 0.06  # required gap between the best and the k-th best cosine score

# ids and embeddings are aligned by position; fail loudly if that ever stops being true
assert len(id_list_test) == len(embeddings_test)
assert len(id_list) == len(embeddings)

top_k = min(TOP_K, len(embeddings))  # topk fails when asked for more entries than exist
if top_k == 0:
    raise ValueError("no reference embeddings to compare against")

triple_list = []
for test_id, test_embedding in zip(id_list_test, embeddings_test):
    cos_score = pytorch_cos_sim(test_embedding, embeddings)
    cos_value, cos_index = torch.topk(cos_score, top_k, dim=1)
    indices_list = cos_index.flatten().tolist()
    value_list = cos_value.flatten().tolist()

    # only keep the best match when it clearly stands out from the runners-up
    diff = value_list[0] - value_list[-1]
    if diff > MIN_SCORE_MARGIN:
        triple_list.append([test_id, 'similar', id_list[indices_list[0]]])
print(triple_list)

In [None]:
# GraphDB SPARQL QUERY
"""
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
prefix bsdd:<http://bsdd.buildingsmart.org/def#>
prefix skos: <http://www.w3.org/2004/02/skos/core#>
prefix ex: <http://ex.ample.org/span/>

SELECT DISTINCT ?subject ?subj_def ?object ?obj_def (COUNT(?definition_node) AS ?shared_def_terms) (SUM(?generic) as ?total_g)
WHERE {
    ?subject_node ex:associatedSpan ?definition_node ;
                  skos:definition ?subj_def ;
                  rdfs:label ?subject .
    {   # sub query to check how generic the definition_node is (number of edges)
        SELECT DISTINCT ?definition_node (COUNT(?defined_node) AS ?generic) 
        WHERE{
            ?definition_node  ^ex:associatedSpan ?defined_node .
        } 
        GROUP BY ?definition_node
        HAVING (?generic < 30)  # each shared term has fewer than 30 edges, i.e. is not too generic
    }
    ?object_node ex:associatedSpan ?definition_node ;
                 skos:definition ?obj_def;
                 rdfs:label ?object .

    FILTER (str(?subject) != str(?object))
    # ensure same ordering of subject object so we don't get reverse triples
    FILTER (STR(?object) < STR(?subject))
}
GROUP BY ?subject ?object ?subj_def ?obj_def
# at least 3 shared terms, that together have more than 10 edges and less than 300 in total
HAVING (?shared_def_terms > 2 && ?total_g > 10 && ?total_g < 300)
"""