# Model-theoretic ontology embedding methods

## EL-Embeddings

Import the mOWL library, start the JVM, and import the EL-Embeddings module and the base class for EL embedding models

In [1]:
import mowl
# Start the JVM with the "10g" setting (presumably the JVM memory limit —
# TODO confirm against the mowl documentation).
# NOTE(review): the mowl submodule imports below come after init_jvm;
# presumably they need a running JVM, so keep this order.
mowl.init_jvm("10g")

from mowl.models.elembeddings.module import ELEmModule
from mowl.base_models.elmodel import EmbeddingELModel


  from .autonotebook import tqdm as notebook_tqdm


Define the model and training strategy

In [10]:
import torch
from torch import nn
from tqdm import trange

class ELEmbeddings(EmbeddingELModel):
    """EL-Embeddings model built on ``EmbeddingELModel``.

    Wraps an ``ELEmModule`` (created in :meth:`init_model`) and trains it
    with Adam on the GCI datasets exposed by ``self.training_datasets``.
    """

    def __init__(self,
                 dataset,
                 embed_dim=50,
                 margin=0,
                 reg_norm=1,
                 learning_rate=0.001,
                 epochs=1000,
                 batch_size=4096 * 8,
                 model_filepath=None,
                 device='cpu'
                 ):
        """Store the hyper-parameters and build the underlying module.

        :param dataset: dataset object forwarded to ``EmbeddingELModel``.
        :param embed_dim: embedding dimension passed to ``ELEmModule``.
        :param margin: margin passed to ``ELEmModule``.
        :param reg_norm: stored on the instance; not forwarded to
            ``ELEmModule`` by :meth:`init_model`.
        :param learning_rate: learning rate of the Adam optimizer in :meth:`train`.
        :param epochs: number of training epochs.
        :param batch_size: forwarded to ``EmbeddingELModel``; the training
            loop in this class does not read it.
        :param model_filepath: forwarded to ``EmbeddingELModel``;
            ``self.model_filepath`` is the target of ``torch.save`` in
            :meth:`train` (presumably the base class supplies a default
            when this is ``None`` — TODO confirm).
        :param device: device the module is moved to.
        """
        super().__init__(dataset, batch_size, extended=True, model_filepath=model_filepath)

        self.embed_dim = embed_dim
        self.margin = margin
        self.reg_norm = reg_norm
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.device = device
        self._loaded = False
        self._loaded_eval = False
        # NOTE(review): the base class was initialised with extended=True and
        # the attribute is overwritten with False here — confirm this is intended.
        self.extended = False
        self.init_model()

    def init_model(self):
        """Create the ``ELEmModule`` and move it to ``self.device``."""
        self.model = ELEmModule(
            len(self.class_index_dict),  # number of ontology classes
            len(self.object_property_index_dict),  # number of ontology object properties
            embed_dim=self.embed_dim,
            margin=self.margin
        ).to(self.device)

    def train(self, checkpoint=1):
        """Train the module for ``self.epochs`` epochs.

        Each epoch performs a single optimizer step on the sum, over all
        non-empty GCI datasets, of the mean value returned by the module for
        that dataset. The module's state dict is saved to
        ``self.model_filepath`` after every epoch.

        :param checkpoint: the training loss is printed every ``checkpoint``
            epochs.
        :raises ValueError: if every GCI dataset is empty, so that there is
            no loss to back-propagate.
        """
        optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)

        for epoch in trange(self.epochs):
            self.model.train()

            # Notice how we use the ``training_datasets`` variable directly
            # and every element of it is a pair (GCI name, GCI tensor data).
            # Each non-empty dataset is fed whole (``gci_dataset[:]``).
            gci_losses = [
                torch.mean(self.model(gci_dataset[:], gci_name))
                for gci_name, gci_dataset in self.training_datasets.items()
                if len(gci_dataset) > 0
            ]
            if not gci_losses:
                # Without this guard the loss would stay the int 0 and
                # ``loss.backward()`` would fail with an opaque AttributeError.
                raise ValueError(
                    "Cannot train: all GCI training datasets are empty.")

            # Built-in sum starts from 0 and adds the terms in order, which
            # keeps the accumulation order of a running ``loss +=``.
            loss = sum(gci_losses)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss = loss.detach().item()
            torch.save(self.model.state_dict(), self.model_filepath)
            if (epoch + 1) % checkpoint == 0:
                # ``epoch`` is zero-based, so the first report is "Epoch 9"
                # for checkpoint=10. The spec ``4f`` is a minimum width of 4
                # with the default six decimals.
                print(f'\nEpoch {epoch}: Train loss: {train_loss:4f}')

Create the dataset from the family ontology file

In [11]:
from mowl.datasets import PathDataset

# Build the dataset from the ontology file at this relative path; the file
# must exist relative to the notebook's working directory.
family_dataset = PathDataset('data/family.owl')

Train the model

In [17]:
# embed_dim=2 gives each class a two-component embedding, which is what the
# plotting cell reads as (x, y) coordinates.
# NOTE(review): batch_size is forwarded to the base class, but the train()
# loop defined above feeds each GCI dataset whole (``gci_dataset[:]``), so it
# does not appear to influence training here — confirm.
model = ELEmbeddings(family_dataset,
                     embed_dim=2,
                     margin=0.1,
                     reg_norm=1,
                     learning_rate=0.1,
                     epochs=100,
                     batch_size=2,
                     model_filepath=None,
                     device='cpu')

# Print the training loss every 10 epochs.
model.train(checkpoint=10)

100%|███████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 427.21it/s]


Epoch 9: Train loss: 1.500174

Epoch 19: Train loss: 0.336646

Epoch 29: Train loss: 0.139975

Epoch 39: Train loss: 0.178397

Epoch 49: Train loss: 0.135798

Epoch 59: Train loss: 0.125496

Epoch 69: Train loss: 0.113816

Epoch 79: Train loss: 0.069246

Epoch 89: Train loss: 0.080087

Epoch 99: Train loss: 0.131009





Plot embeddings

In [None]:
import matplotlib.pyplot as plt

# NOTE(review): `embeds`, `rs` and `classes` are not defined in any cell
# visible in this notebook, so this cell raises NameError on a fresh kernel.
# Presumably they are the class centres, class radii and class names taken
# from the trained `model` — define them in a preceding cell and confirm.
# The loop reads embeds[i, 0] and embeds[i, 1] as x/y, rs[i] as the radius
# and classes[i] as a string label.

colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
fig, ax = plt.subplots()
ax.axis('equal')
for i in range(embeds.shape[0]):
    # Skip classes whose name starts with the 'owl:' prefix.
    if classes[i].startswith('owl:'):
        continue
    x, y = embeds[i, 0], embeds[i, 1]
    r = rs[i]
    color = colors[i % len(colors)]
    # add_patch (unlike add_artist) registers the circle with the axes' data
    # limits, so autoscale_view() below can bring every circle into view.
    ax.add_patch(plt.Circle(
        (x, y), r, fill=False, edgecolor=color, label=classes[i]))
    # Place the class name just above the top of its circle.
    ax.annotate(classes[i], xy=(x, y + r + 0.03), fontsize=10, ha="center", color=color)
# Recompute the view limits from the patches added above.
ax.autoscale_view()
ax.grid(True)
# Render the figure instead of echoing the pyplot module as the cell output.
plt.show()
