In [49]:
import json
import torch
import pickle
import random
import requests
import sentence_transformers

import numpy as np
import pandas as pd
import torch.nn as nn

from tqdm import tqdm
from pprint import pprint
from pathlib import Path
from sklearn import metrics
from itertools import combinations
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import pytorch_cos_sim

* We'll be computing the semantic similarity between the terms that we find in our local version of bSDD terms
  * Our bSDD terms can be found in `bsdd_descriptions.csv`
  * Alternatively, for grabbing only subset of the bSDD terms see the graphQL query at the end of this notebook

### Grab bSDD terms and definitions

In [2]:
bsdd_df = pd.read_csv("bsdd_descriptions.csv")

In [5]:
# Get some insight in what our csv file contains
bsdd_df

Unnamed: 0,subject,name,uid,description
0,https://identifier.buildingsmart.org/uri/FTIA/...,Location track,LocationTrack,Location track number or name as an abbreviation
1,https://identifier.buildingsmart.org/uri/FTIA/...,Additional details,AdditionalDetails,E.g. additional information related to install...
2,https://identifier.buildingsmart.org/uri/FTIA/...,Post height,PostHeight,Height of the post in millimeters if sign has ...
3,https://identifier.buildingsmart.org/uri/FTIA/...,Installation direction,InstallationDirection,Installation direction of the sign
4,https://identifier.buildingsmart.org/uri/FTIA/...,Route number,RouteNumber,The route number on which the object is located
...,...,...,...,...
28163,https://identifier.buildingsmart.org/uri/NVDB/...,Etableringsår_4066,Etableringsår_4066,Angir hvilket år vegobjektet ble etablert på s...
28164,https://identifier.buildingsmart.org/uri/NVDB/...,Produksjonsår_8814,Produksjonsår_8814,Angir hvilket år utstyret ble produsert.
28165,https://identifier.buildingsmart.org/uri/NVDB/...,Energikilde_3750,Energikilde_3750,Angir hvilken type energikilde som benyttes.
28166,https://identifier.buildingsmart.org/uri/NVDB/...,Bruksområde_11003,Bruksområde_11003,Angir hovedbruksområde for styreapparat.


In [26]:
max_num_combinations = sum(1 for unique_name in combinations(list(set(bsdd_df.name + bsdd_df.description)), 2))
'{0:,}'.format(max_num_combinations)

'39,360,628'

* columns: subject (url), name (term), uid, description
* 28K rows, so at most 28K terms that we want to compare against each other
  * max ±31 million similarity computations between the unique node label+description, could consider additional filtering based on namespace language etc

In [37]:
unique_combinations = list(set([n.strip() + ' ' + d.strip() for n, d in zip(bsdd_df.name, bsdd_df.description) if (n and d) and (type(n) == str and type(d) == str)]))
to_be_predicted = [name_and_definition for name_and_definition in unique_combinations]
# show some examples
print(f"Number of unique inputs we'll consider: {len(to_be_predicted)}")
to_be_predicted[:10]

8549


['Style Description of the furniture style.',
 'Sist. estraz.fumi/ventilazione (I)',
 'Side weld name Ident of the side weld',
 'FlangeTopFillet Flange top fillet of the profile.',
 'GS-Zeichen Mit dem Siegel Geprüfte Sicherheit (GS-Zeichen) wird einem verwendungsfertigen Produkt bescheinigt, dass es den Anforderungen des § 21 des Produktsicherheitsgesetzes (ProdSG) entspricht.',
 '_ArtBauholz Aufzaehlung lt. ASB-ING_neu',
 'NpshDatumPlane Horizontal plane through the center of the circle described by the external points of the entrance edges of the impeller blades, in the first stage in the case of multi-stage pumps',
 'Girder Station at start of span Girder Station at start of span',
 'CommunicationStandard Indicates the communication standard supported by the physical wired communication port.',
 'n. campate metalliche Campo calcolato: CONTA n. CAMPATE delle Sedi Tecniche sottostanti con Stato Sistema CREA,  e Stato Utente FATT, ESER, FUES e CONC e valore della caratteristica Materi

### Semantic similarity (distributed)
* Rather than only looking at the node's label, we are going to compare both the label and the description (sometimes definition)
* We'll use the pretrained `sentence-transformers/all-mpnet-base-v2` model

In [41]:
batch_size = 32
show_progress_bar = True
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
embeddings_file_name = Path("data", "embeddings.pkl")

In [44]:
if not embeddings_file_name.exists(): 
    # compute the embeddings for all the input strings (name + description)
    embeddings = model.encode(to_be_predicted, 
                              show_progress_bar=show_progress_bar, 
                              batch_size=batch_size, 
                              convert_to_tensor=True)
    pickle.dump(embeddings, open(embeddings_file_name, 'wb'))
else:
    embeddiings = pickle.load(open(embeddings_file_name, 'rb'))

In [81]:
similar_nodes = []
i = 0
for idx, embedding in tqdm(enumerate(embeddings), total=len(to_be_predicted)):
    # compute cosine similarity to all other embeddings
    cos_score = pytorch_cos_sim(embedding, embeddings)
    # consider the top 5 most similar terms, also grab their indices
    cos_value, cos_index = torch.topk(cos_score, 5, 1)
    indices_list = cos_index.flatten().tolist()
    value_list = cos_value.flatten().tolist()
    indices_list = [i for i in indices_list if to_be_predicted[i] != to_be_predicted[idx]] # avoid self-comparison
    
    # HEURISTIC: We check the difference between the semantic similar of the most similar, 
    # and the least similar in our top k terms. If this difference is small, then we assume
    # that the representations for these terms+definitions were relatively weak.
    diff = value_list[0] - value_list[-1]
    
    if diff > 0.06:
        # we only consider the most similar node for now
        most_similar_node_idx = indices_list[0]
        triple = [to_be_predicted[idx], 'ex:semsim', to_be_predicted[most_similar_node_idx]]
        similar_nodes.append(triple)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8549/8549 [00:56<00:00, 151.16it/s]


In [82]:
len(similar_nodes)
random.sample(similar_nodes, 10)

[['Castellation depth Castellation depth',
  'ex:semsim',
  'Castellation height Castellation height'],
 ['Volume The volume of an apple',
  'ex:semsim',
  'Height The height of an apple'],
 ['SoundEnergyLevel Ten times the logarithm to the base 1 of the ratio of the sound energy, J, to a reference value, J, expressed in decibels.',
  'ex:semsim',
  'SoundEnergyLevel Determined ten times the logarithm to the base 10 of the ratio of the sound energy, J, to a reference value, J0, expressed in decibels'],
 ['_BestandsplanVorhandenHyperlink Datentyp = URL',
  'ex:semsim',
  '_Hyperlink_005 Datentyp = URL'],
 ['Verbrauch Materialverbrauch in [kg/m� und mm Schichtdicke]',
  'ex:semsim',
  'Verbrauch (x mm Zahnung) Materialverbrauch mit einer verwendeten xmm Zahnung in [kg]'],
 ['GS-Zeichen Mit dem Siegel Geprüfte Sicherheit (GS-Zeichen) wird einem verwendungsfertigen Produkt bescheinigt, dass es den Anforderungen des § 21 des Produktsicherheitsgesetzes (ProdSG) entspricht.',
  'ex:semsim',
 

TODO: grab the node's UID to suggest triples to similar terms

### Alternative, grab specific/subset of terms from bSDD

In [68]:
# Example GraphQL query for 20 terms within the namespace of CCI
query = """query MyQuery {
  classification(
    limit: "20"
    where: {domain: {name: {EQ: "CCI Construction"}}}
  ) {
    id
    name
    definition
  }
}"""

In [69]:
url = 'https://bsdd.ontotext.com/graphql/'
response = requests.post(url, json={'query': query})
print(response.status_code)

200


In [70]:
results = response.json()

In [74]:
bsdd_df_test = pd.DataFrame.from_dict(results['data']['classification'])

In [75]:
# note that the current graph ql query grabs some different information than we have in the csv
bsdd_df_test.columns = ['subject', 'name', 'description'] # rename columns to align with csv naming
bsdd_df_test

Unnamed: 0,subject,name,description
0,https://identifier.buildingsmart.org/uri/molio...,Seat,open stationary storing object for persons
1,https://identifier.buildingsmart.org/uri/molio...,Power filter,electricity stabilising object suppressing a d...
2,https://identifier.buildingsmart.org/uri/molio...,Low voltage electric energy guiding object,guiding object of low voltage electric energy
3,https://identifier.buildingsmart.org/uri/molio...,Shutter,local climate stabilising object by restrictin...
4,https://identifier.buildingsmart.org/uri/molio...,Laundry,workspace for laundering clothes and linen
5,https://identifier.buildingsmart.org/uri/molio...,Stove,combustion heating object for heating of space
6,https://identifier.buildingsmart.org/uri/molio...,Joystick,movement interaction device by restricted posi...
7,https://identifier.buildingsmart.org/uri/molio...,Materials laboratory,laboratory for studying materials
8,https://identifier.buildingsmart.org/uri/molio...,Soild material flow space,matter flow space for soild materials
9,https://identifier.buildingsmart.org/uri/molio...,Gate leaf,closure object in a gate
