In [4]:
import glob
import json
import os
import pickle
import random
import re
import time
import urllib
import urllib.parse
from collections import Counter
from pathlib import Path
from typing import Union, List, Any, Optional, Dict

import pandas as pd
import requests
from bs4 import BeautifulSoup
from rdflib import URIRef, BNode, Literal, Namespace, Graph
from rdflib.namespace import XSD, RDF, RDFS, SKOS, NamespaceManager
from textblob import TextBlob
from tqdm import tqdm

# from utilities import cleaning_utils

### Create a quick RDF graph
* Using dummy namespaces and predicates

In [9]:
# Location of the CSV that feeds the graph (relative to the working directory)
graph_input_csv = Path("data") / "bsdd_graph_input.csv"
# Load every row into a DataFrame
bsdd_df = pd.read_csv(graph_input_csv)

In [10]:
# Preview the loaded frame; the rendered output elides the middle rows
bsdd_df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,subject,name,uid,description,description_NER
0,1,1,https://identifier.buildingsmart.org/uri/FTIA/...,Additional details,AdditionalDetails,E.g. additional information related to install...,"information, installation"
1,2,2,https://identifier.buildingsmart.org/uri/FTIA/...,Post height,PostHeight,Height of the post in millimeters if sign has ...,"Height, millimeters, sign"
2,3,3,https://identifier.buildingsmart.org/uri/FTIA/...,Installation direction,InstallationDirection,Installation direction of the sign,sign
3,4,4,https://identifier.buildingsmart.org/uri/FTIA/...,Route number,RouteNumber,The route number on which the object is located,object
4,5,5,https://identifier.buildingsmart.org/uri/FTIA/...,Operating centre district,OperatingCentreDistrict,The operating centre district on which the obj...,object
...,...,...,...,...,...,...,...
10304,12510,27220,https://identifier.buildingsmart.org/uri/v5/fr...,Height,height,The height of an apple,height
10305,12512,27225,https://identifier.buildingsmart.org/uri/v5/fr...,Color,color,The color of a tomato,color
10306,12513,27226,https://identifier.buildingsmart.org/uri/v5/fr...,Height,height,The height of a Granny Smith,height
10307,12514,27227,https://identifier.buildingsmart.org/uri/v5/fr...,Color,color,The color of a Granny Smith,color


In [12]:
# Collect every unique term (span) found across all of the descriptions.
# description_NER stores the spans of one row as a single ", "-separated string.
# * .dropna(): pandas reads empty CSV cells as NaN (a float), which has no .split()
# * sorted(): a plain set has no stable order between kernel runs; sorting keeps
#   everything derived from this list reproducible
unique_objects = sorted(
    {
        span
        for spans_str in bsdd_df["description_NER"].dropna()
        for span in spans_str.split(", ")
    }
)
len(unique_objects)

953

Preparing the graph

In [2]:
# Serialized graphs are written to <current working directory>/data/graph_output
graph_output_fp = Path.cwd() / "data" / "graph_output"
graph_output_fp.mkdir(exist_ok=True, parents=True)  # no error if the folder is already there

In [14]:
# Dummy namespaces for this quick graph.
# NOTE(review): BSDD is not referenced by the cells visible here; the URL in the trailing
# comment is the prefix the subjects in the CSV preview start with.
BSDD = Namespace("http://bsdd.buildingsmart.org/def#") # https://identifier.buildingsmart.org/uri/
# Namespace under which the spans extracted from the descriptions (and the
# associatedSpan predicate) are minted
EX = Namespace("http://ex.ample.org/span/")

In [18]:
# Accumulates (subject, predicate, object) tuples; the following cells append to it
# before everything is loaded into a Graph.
# NOTE: re-run this cell before re-running the cells that append to it, otherwise the
# same triples are appended a second time.
all_triples = []

In [19]:
# Add all the unique objects to the graph, within our example namespace:
# each span becomes EX:<percent-encoded span> carrying the span text as an English rdfs:label
all_triples.extend(
    (EX[urllib.parse.quote(term)], RDFS.label, Literal(term, lang='en'))
    for term in unique_objects
)

In [20]:
# Add the objects from our .csv file and link each one to the spans that occur in its description.
# Columns are selected with brackets: attribute access (e.g. `.name`) silently breaks as soon as a
# column label collides with a DataFrame attribute or method.
uids = bsdd_df["subject"]
bsdd_names = bsdd_df["name"]
bsdd_descriptions = bsdd_df["description"]
description_NER = bsdd_df["description_NER"]

for uid, name, description, objects_str in zip(uids, bsdd_names, bsdd_descriptions, description_NER):
    subject = URIRef(uid)

    # pandas reads empty CSV cells as NaN (a float); skip those instead of emitting a literal for them
    if pd.notna(name):
        all_triples.append((subject, SKOS.prefLabel, Literal(name, lang='en')))
    if pd.notna(description):
        all_triples.append((subject, SKOS.definition, Literal(description, lang='en')))

    # a row without any recognised spans has NaN here, which has no .split()
    if pd.isna(objects_str):
        continue

    # description_NER holds the spans of a row as one ", "-separated string
    for span in objects_str.split(", "):
        span_uid = urllib.parse.quote(span)  # percent-encode so the span can be used inside a URI
        all_triples.append((subject, EX.associatedSpan, EX[span_uid]))

In [21]:
# Add all triples to a new graph, collecting (rather than raising) per-triple failures
graph = Graph()
issues = []
for t in all_triples:
    assert len(t) == 3
    if t not in graph:
        try:
            graph.add(t)
        # Exception instead of a bare `except:` so that KeyboardInterrupt / SystemExit still
        # stop the cell; the error itself is recorded so a failure can be diagnosed afterwards
        except Exception as exc:
            issues.append(f"issue @ {t}: {exc!r}")


In [22]:
# Number of triples that could not be added to the graph
len(issues)

0

In [23]:
# Show the recorded problems (empty when every triple was added)
issues

[]

In [24]:
# Save the graph as Turtle. The format is passed explicitly so the content always matches
# the .ttl extension instead of depending on the installed rdflib version's default.
graph.serialize(destination=graph_output_fp.joinpath("test_graph.ttl"), format="turtle")

<Graph identifier=N280b832b851c4f4599600c2e655ea942 (<class 'rdflib.graph.Graph'>)>

In [25]:
# GraphDB SPARQL QUERY to find potentially related classes (based on spans found in their descriptions).
# The query is kept here as a bare string (it is not executed by this cell). It pairs entries that
# share spans, after the sub query has discarded spans linked to 30 or more entries.
# NOTE(review): the last in-query comment ("at least 3 shared terms ... more than 10 edges and less
# than 300") does not match the HAVING clause right below it (> 5 shared terms, total > 5 and < 200)
# - confirm which thresholds are intended.
"""
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
prefix bsdd:<http://bsdd.buildingsmart.org/def#>
prefix skos: <http://www.w3.org/2004/02/skos/core#>
prefix ex: <http://ex.ample.org/span/>

SELECT DISTINCT ?subject ?object ?subj_def ?obj_def (COUNT(?definition_node) AS ?shared_def_terms) (SUM(?generic) as ?total_g)
WHERE {
    ?subject_node ex:associatedSpan ?definition_node ;
                  skos:definition ?subj_def ;
                  skos:prefLabel ?subject .
    {   
        # sub query to check how generic the definition_node is (number of edges)
        SELECT DISTINCT ?definition_node (COUNT(?definition_node) AS ?generic) 
        WHERE{
            ?definition_node  ^ex:associatedSpan ?defined_node .
        } 
        GROUP BY ?definition_node
        # each span found in definitions should be linked to less than X edges, otherwise its too generic
        HAVING (?generic < 30)  

    }
    ?object_node ex:associatedSpan ?definition_node ;
                 skos:definition ?obj_def;
                 skos:prefLabel ?object .

    FILTER (str(?subject) != str(?object))
    # ensure same ordering of subject object so we don't get reverse triples
    FILTER (STR(?object) < STR(?subject))
}
GROUP BY ?subject ?object ?subj_def ?obj_def
# at least 3 shared terms, that together have more than 10 edges and less than 300 in total
HAVING (?shared_def_terms > 5 && ?total_g > 5 && ?total_g < 200)
"""

"\nprefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>\nprefix bsdd:<http://bsdd.buildingsmart.org/def#>\nprefix skos: <http://www.w3.org/2004/02/skos/core#>\nprefix ex: <http://ex.ample.org/span/>\n\nSELECT DISTINCT ?subject ?object ?subj_def ?obj_def (COUNT(?definition_node) AS ?shared_def_terms) (SUM(?generic) as ?total_g)\nWHERE {\n    ?subject_node ex:associatedSpan ?definition_node ;\n                  skos:definition ?subj_def ;\n                  skos:prefLabel ?subject .\n    {   \n        # sub query to check how generic the definition_node is (number of edges)\n        SELECT DISTINCT ?definition_node (COUNT(?definition_node) AS ?generic) \n        WHERE{\n            ?definition_node  ^ex:associatedSpan ?defined_node .\n        } \n        GROUP BY ?definition_node\n        # each span found in definitions should be linked to less than X edges, otherwise its too generic\n        HAVING (?generic < 30)  \n\n    }\n    ?object_node ex:associatedSpan ?definition_node ;\n     