In [1]:
!pip install rdflib torch_geometric

Collecting rdflib
  Downloading rdflib-7.1.4-py3-none-any.whl.metadata (11 kB)
Collecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Downloading rdflib-7.1.4-py3-none-any.whl (565 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m565.1/565.1 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m50.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdflib, torch_geometric
Successfully installed rdflib-7.1.4 torch_geometric-2.6.1


In [2]:
import os
import torch
import torch.nn.functional as F
from torch import nn
from torch_geometric.data import Data
from torch_geometric.nn import RGCNConv
from torch_geometric.utils import negative_sampling
import rdflib
from rdflib.namespace import Namespace, RDF
from shapely import wkt
import glob
import random
import re
import requests
from sklearn.metrics import precision_score
from rdflib.namespace import NamespaceManager

In [None]:
OWNER = "bxbalaban"
REPO = "ai-lab"
BRANCH = "db0c4603d6b5b78929720437b2bbf93ede6ff6b6"
FOLDER_PATH = "data"

os.makedirs("ttl_files", exist_ok=True)

api_url = f"https://api.github.com/repos/{OWNER}/{REPO}/contents/{FOLDER_PATH}?ref={BRANCH}"

response = requests.get(api_url)
response.raise_for_status()
files = response.json()

ttl_files = [f for f in files if f["name"].endswith(".ttl")]

print(f"Found {len(ttl_files)} TTL files.")

for file_info in ttl_files:
    download_url = file_info["download_url"]
    filename = file_info["name"]
    print(f"Downloading {filename} ...")
    r = requests.get(download_url)
    r.raise_for_status()
    with open(os.path.join("ttl_files", filename), "wb") as f:
        f.write(r.content)

print("All TTL files downloaded to ./ttl_files/")

In [9]:
BOTAI = Namespace("http://www.aiLab.org/botAiLab#")
GEO = Namespace("http://www.opengis.net/ont/geosparql#")
LOCAL = Namespace("http://example.org/building/")

class GeoLinkPredictor(nn.Module):
    def __init__(self, in_channels, hidden_channels=128, num_relations=3):
        super().__init__()
        self.conv1 = RGCNConv(in_channels, hidden_channels, num_relations)
        self.conv2 = RGCNConv(hidden_channels, hidden_channels, num_relations)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_channels * 2, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def encode(self, x, edge_index, edge_type):
        x = self.conv1(x, edge_index, edge_type).relu()
        x = self.conv2(x, edge_index, edge_type)
        return x

    def decode(self, z, edge_index):
        src, dst = edge_index
        z_cat = torch.cat([z[src], z[dst]], dim=1)
        return self.classifier(z_cat).view(-1)

def parse_ttl_to_pyg(ttl_path):
    g = rdflib.Graph()
    g.parse(ttl_path, format="turtle")
    g.bind("botAiLab", BOTAI)
    g.bind("geo", GEO)

    nodes = list(set(g.subjects()))
    node_idx = {n: i for i, n in enumerate(nodes)}
    num_nodes = len(nodes)

    features = torch.zeros((num_nodes, 7))  # x, y, z, w, h, d, rot

    for s in nodes:
        loc = g.value(s, BOTAI.hasLocation)
        if loc:
            try:
                coords = list(map(float, str(loc).split(",")))
                if len(coords) == 3:
                    x, y, z = coords
                    wkt_str = f"POINT Z({x} {y} {z})"
                    g.add((s, GEO.asWKT, rdflib.Literal(wkt_str, datatype=GEO.wktLiteral)))
            except:
                continue

    for s in nodes:
        i = node_idx[s]
        loc = g.value(s, GEO.asWKT)
        if loc:
            pt = wkt.loads(str(loc))
            features[i][:3] = torch.tensor([pt.x, pt.y, pt.z if hasattr(pt, 'z') else 0.0])

        size = g.value(s, BOTAI.hasSize)
        if size:
            try:
                w, h, d = map(float, str(size).split(","))
                features[i][3:6] = torch.tensor([w, h, d])
            except:
                pass

        rot = g.value(s, BOTAI.hasRotation)
        if rot:
            try:
                features[i][6] = float(str(rot))
            except:
                pass

    relation_uris = [
        BOTAI.isBelow,
        BOTAI.isAbove,
        BOTAI.intersectsElement
    ]
    relation_to_id = {rel: i for i, rel in enumerate(relation_uris)}

    edge_list = []
    edge_types = []

    for rel in relation_uris:
        for s, _, o in g.triples((None, rel, None)):
            if s in node_idx and o in node_idx:
                edge_list.append([node_idx[s], node_idx[o]])
                edge_types.append(relation_to_id[rel])

    if not edge_list:
        edge_index = torch.empty((2, 0), dtype=torch.long)
        edge_type = torch.empty((0,), dtype=torch.long)
    else:
        edge_index = torch.tensor(edge_list, dtype=torch.long).t().contiguous()
        edge_type = torch.tensor(edge_types, dtype=torch.long)

    pos_edges = [
        [node_idx[s], node_idx[o]]
        for s, o in g.subject_objects(BOTAI.supports)
        if s in node_idx and o in node_idx
    ]
    if not pos_edges:
        raise ValueError(f"No supports links found in {ttl_path}")
    pos_edge_index = torch.tensor(pos_edges, dtype=torch.long).t().contiguous()

    neg_edge_index = negative_sampling(
        edge_index=pos_edge_index,
        num_nodes=num_nodes,
        num_neg_samples=pos_edge_index.size(1)
    )

    return Data(
        x=features,
        edge_index=edge_index,
        edge_type=edge_type,
        pos_edge_index=pos_edge_index,
        neg_edge_index=neg_edge_index
    )

def train(model, data, optimizer):
    model.train()
    optimizer.zero_grad()
    z = model.encode(data.x, data.edge_index, data.edge_type)
    pos_score = model.decode(z, data.pos_edge_index)
    neg_score = model.decode(z, data.neg_edge_index)

    pos_loss = F.binary_cross_entropy_with_logits(pos_score, torch.ones_like(pos_score))
    neg_loss = F.binary_cross_entropy_with_logits(neg_score, torch.zeros_like(neg_score))
    loss = pos_loss + neg_loss
    loss.backward()
    optimizer.step()

    return loss.item()

def main(data_folder, epochs=100):
    ttl_files = glob.glob(os.path.join(data_folder, "*.ttl"))

    filtered_files = []
    for f in ttl_files:
        filename = os.path.basename(f)
        match = re.search(r'(\d+)\.ttl$', filename)
        if match:
            num = int(match.group(1))
            if 1 <= num <= 100:
                filtered_files.append(f)

    all_graphs = [parse_ttl_to_pyg(f) for f in filtered_files]

    model = GeoLinkPredictor(in_channels=7, num_relations=3)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(1, epochs + 1):
        total_loss = 0
        for graph in all_graphs:
            loss = train(model, graph, optimizer)
            total_loss += loss
        print(f"Epoch {epoch} - Loss: {total_loss:.4f}")

    torch.save(model.state_dict(), "link_predictor_model.pt")
    print("✅ Model saved to link_predictor_model.pt")

if __name__ == "__main__":
    main(data_folder="/content/ttl_files")

Epoch 1 - Loss: 93.3739
Epoch 2 - Loss: 44.2668
Epoch 3 - Loss: 30.8922
Epoch 4 - Loss: 13.9436
Epoch 5 - Loss: 11.3858
Epoch 6 - Loss: 15.4037
Epoch 7 - Loss: 13.0172
Epoch 8 - Loss: 4.7113
Epoch 9 - Loss: 2.4407
Epoch 10 - Loss: 3.0299
Epoch 11 - Loss: 3.0503
Epoch 12 - Loss: 14.1825
Epoch 13 - Loss: 10.5297
Epoch 14 - Loss: 10.4829
Epoch 15 - Loss: 9.7007
Epoch 16 - Loss: 2.1221
Epoch 17 - Loss: 0.4639
Epoch 18 - Loss: 0.2470
Epoch 19 - Loss: 0.3337
Epoch 20 - Loss: 1.1374
Epoch 21 - Loss: 8.3989
Epoch 22 - Loss: 4.6798
Epoch 23 - Loss: 1.9266
Epoch 24 - Loss: 14.3127
Epoch 25 - Loss: 33.3637
Epoch 26 - Loss: 2.6864
Epoch 27 - Loss: 0.9530
Epoch 28 - Loss: 0.4776
Epoch 29 - Loss: 0.0930
Epoch 30 - Loss: 0.0697
Epoch 31 - Loss: 0.0537
Epoch 32 - Loss: 0.0437
Epoch 33 - Loss: 0.0364
Epoch 34 - Loss: 0.0308
Epoch 35 - Loss: 0.0265
Epoch 36 - Loss: 0.0229
Epoch 37 - Loss: 0.0200
Epoch 38 - Loss: 0.0175
Epoch 39 - Loss: 0.0153
Epoch 40 - Loss: 0.0135
Epoch 41 - Loss: 0.0119
Epoch 42 - Lo

In [10]:
def load_graph_for_inference(ttl_file):
    g = rdflib.Graph()
    g.parse(ttl_file, format="turtle")
    g.bind("botAiLab", BOTAI)
    g.bind("geo", GEO)

    nodes = list(set(g.subjects()))
    node_index = {n: i for i, n in enumerate(nodes)}
    index_node = {i: n for n, i in node_index.items()}
    num_nodes = len(nodes)

    features = torch.zeros((num_nodes, 7))  # x, y, z, w, h, d, rot

    for s in nodes:
        loc = g.value(s, BOTAI.hasLocation)
        if loc:
            try:
                coords = list(map(float, str(loc).split(",")))
                if len(coords) == 3:
                    x, y, z = coords
                    wkt_str = f"POINT Z({x} {y} {z})"
                    g.add((s, GEO.asWKT, rdflib.Literal(wkt_str, datatype=GEO.wktLiteral)))
            except:
                continue

    for s in nodes:
        i = node_index[s]
        loc = g.value(s, GEO.asWKT)
        if loc:
            pt = wkt.loads(str(loc))
            features[i][:3] = torch.tensor([pt.x, pt.y, pt.z if hasattr(pt, 'z') else 0.0])

        size = g.value(s, BOTAI.hasSize)
        if size:
            try:
                w, h, d = map(float, str(size).split(","))
                features[i][3:6] = torch.tensor([w, h, d])
            except:
                pass

        rot = g.value(s, BOTAI.hasRotation)
        if rot:
            try:
                features[i][6] = float(str(rot))
            except:
                pass

    relation_uris = [
        BOTAI.adjacentElement,
        BOTAI.isAbove,
        BOTAI.intersectsElement
    ]

    relation_to_id = {rel: i for i, rel in enumerate(relation_uris)}

    edge_list = []
    edge_types = []

    for rel in relation_uris:
        for s, _, o in g.triples((None, rel, None)):
            if s in node_index and o in node_index:
                edge_list.append([node_index[s], node_index[o]])
                edge_types.append(relation_to_id[rel])

    if not edge_list:
        edge_index = torch.empty((2, 0), dtype=torch.long)
        edge_type = torch.empty((0,), dtype=torch.long)
    else:
        edge_index = torch.tensor(edge_list, dtype=torch.long).t().contiguous()
        edge_type = torch.tensor(edge_types, dtype=torch.long)

    supports = set()
    for s, _, o in g.triples((None, BOTAI.supports, None)):
        if s in node_index and o in node_index:
            supports.add((node_index[s], node_index[o]))

    return features, edge_index, supports, node_index, index_node, edge_type

In [11]:
def predict_missing_links(ttl_file, model_path, top_k=10):
    original_graph = rdflib.Graph()
    original_graph.parse(ttl_file, format="turtle")

    test_graph = rdflib.Graph()
    test_graph += original_graph
    botAiLab = Namespace("http://www.aiLab.org/botAiLab#")
    supports_triples = list(test_graph.triples((None, botAiLab.supports, None)))

    for triple in supports_triples:
        test_graph.remove(triple)

    temp_ttl_path = "temp_inference.ttl"
    test_graph.serialize(temp_ttl_path, format="turtle")

    x, edge_index, _ , node_index, index_node, edge_type = load_graph_for_inference(temp_ttl_path)

    model = GeoLinkPredictor(in_channels=7)
    model.load_state_dict(torch.load(model_path, map_location=torch.device("cpu")))
    model.eval()

    with torch.no_grad():
        z = model.encode(x, edge_index, edge_type)

        num_nodes = x.size(0)
        candidate_edges = []
        candidate_labels = []
        for i in range(num_nodes):
            for j in range(num_nodes):
                if i != j:
                    subj = rdflib.URIRef(index_node[i])
                    obj = rdflib.URIRef(index_node[j])
                    candidate_edges.append((i, j))
                    label = (subj, botAiLab.supports, obj) in original_graph
                    candidate_labels.append(label)

        if not candidate_edges:
            print("✅ No candidate missing links to predict.")
            return

        edge_tensor = torch.tensor(candidate_edges, dtype=torch.long).t()
        scores = torch.sigmoid(model.decode(z, edge_tensor))

        top_scores, top_indices = torch.topk(scores, min(top_k, len(scores)))
        filtered_scores = []
        filtered_indices = []
        for i, score in enumerate(top_scores):
            if score >= 0.9:
                filtered_scores.append(score)
                filtered_indices.append(top_indices[i])

        predicted_links = [candidate_edges[idx] for idx in filtered_indices]
        predicted_labels = [candidate_labels[idx] for idx in filtered_indices]


        print(f"📊 Top {top_k} predicted missing `botAiLab:supports` links:")
        for score, (i, j), is_correct in zip(filtered_scores, predicted_links, predicted_labels):
            subj_uri = rdflib.URIRef(index_node[i])
            obj_uri = rdflib.URIRef(index_node[j])
            status = "✅" if is_correct else "❌"
            print(f"{status} {subj_uri} → {obj_uri} | score={score:.4f}")
            original_graph.add((subj_uri, botAiLab.supports, obj_uri))

        folder = os.path.dirname(ttl_file)
        filename = os.path.basename(ttl_file)
        enriched_path = os.path.join(folder, f"enriched_{filename}")

        namespace_manager = NamespaceManager(original_graph)
        namespace_manager.bind("local", LOCAL)
        original_graph.namespace_manager = namespace_manager
        original_graph.serialize(destination=enriched_path, format="turtle")

        accuracy = sum(predicted_labels) / len(predicted_labels)
        print(f"\n Accuracy of top-{top_k} predictions: {accuracy:.2%}")
        return accuracy

In [12]:
def predict_missing_links_in_folder(folder_path, model_path, top_k=50):
    ttl_files = glob.glob(os.path.join(folder_path, "*.ttl"))

    filtered_files = []
    for f in ttl_files:
        filename = os.path.basename(f)
        match = re.search(r'^building(\d+)\.ttl$', filename)
        if match:
            num = int(match.group(1))
            if 1 <= num <= 100:
                filtered_files.append(f)

    if not filtered_files:
        print("🚫 No TTL files found in the folder.")
        return
    all_acc = []
    for ttl_file in filtered_files:
        full_path = os.path.join(folder_path, ttl_file)
        print(f"\n🔍 Processing: {ttl_file}")
        try:
            acc = predict_missing_links(
                ttl_file=full_path,
                model_path=model_path,
                top_k=top_k
            )
            all_acc.append(acc)
        except Exception as e:
            print(f"⚠️ Failed to process {ttl_file}: {e}")

    print(f"Average Acc: {sum(all_acc)/len(all_acc):.4f}")

In [13]:
predict_missing_links_in_folder(
    folder_path="/content/ttl_files",
    model_path="/content/link_predictor_model.pt",
    top_k=15
)


🔍 Processing: /content/ttl_files/building46.ttl
📊 Top 15 predicted missing `botAiLab:supports` links:
✅ http://example.org/building/storey_1_column_1 → http://example.org/building/slab_1 | score=1.0000
✅ http://example.org/building/storey_1_column_3 → http://example.org/building/slab_1 | score=1.0000
✅ http://example.org/building/storey_1_wall_2 → http://example.org/building/slab_1 | score=1.0000
✅ http://example.org/building/storey_1_wall_3 → http://example.org/building/slab_1 | score=1.0000
✅ http://example.org/building/storey_1_column_2 → http://example.org/building/slab_1 | score=1.0000
✅ http://example.org/building/storey_1_wall_1 → http://example.org/building/slab_1 | score=0.9999

 Accuracy of top-15 predictions: 100.00%

🔍 Processing: /content/ttl_files/building62.ttl
📊 Top 15 predicted missing `botAiLab:supports` links:
✅ http://example.org/building/storey_1_wall_1 → http://example.org/building/slab_1 | score=0.9999
✅ http://example.org/building/storey_1_column_2 → http://exa