In [1]:
from pykeen.pipeline import pipeline
from pykeen.triples import TriplesFactory

# Sample RDF triple list (replace this with your real RDF data)
# For example: triples = [(str(s), str(p), str(o)) for s, p, o in graph]
triples = [(s, p, o) for s, p, o in g]  # g is your RDFLib Graph

# Convert triples into PyKEEN-compatible TriplesFactory
tf = TriplesFactory.from_labeled_triples(triples)

# Train a basic TransE model
results = pipeline(
    training=tf,
    model='TransE',
    epochs=100,
    learning_rate=0.01,
    training_batch_size=128,
    random_seed=42,
)

# Extract embeddings
entity_embeddings = results.model.entity_embeddings.weight.detach().numpy()
relation_embeddings = results.model.relation_embeddings.weight.detach().numpy()


KeyboardInterrupt: 

In [2]:
# Evaluate entity similarities
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def find_similar_entities(entity_id, entity_embeddings, top_k=5):
 entity_vector = entity_embeddings[entity_id].reshape(1, -1)
 similarities = cosine_similarity(entity_vector, entity_embeddings)
 most_similar = np.argsort(similarities[0])[-top_k-1:-1][::-1]
 return most_similar

# Link prediction
results.evaluate()

NameError: name 'results' is not defined

In [None]:
from pykeen.triples import TriplesFactory

# Reuse your triples
triples = [(s, p, o) for s, p, o in g]

# Split into train/valid/test
tf = TriplesFactory.from_labeled_triples(triples)
training, validation, testing = tf.split([0.8, 0.1, 0.1])

print(f"Training triples: {training.num_triples}")
print(f"Validation triples: {validation.num_triples}")
print(f"Testing triples: {testing.num_triples}")


In [None]:
from pykeen.pipeline import pipeline

model_results = {}
models = ['TransE', 'DistMult', 'ComplEx']

for model_name in models:
    print(f"Training {model_name}...")
    results = pipeline(
        training=training,
        validation=validation,
        testing=testing,
        model=model_name,
        epochs=100,
        embedding_dim=50,
        training_kwargs=dict(batch_size=32),
        random_seed=42,
    )
    model_results[model_name] = results


In [None]:
def evaluate_model(results, model_name):
    metrics = results.metric_results.to_dict()
    print(f"\nResults for {model_name}:")
    print(f"Mean Rank: {metrics['both']['mean_rank']:.2f}")
    print(f"MRR: {metrics['both']['mean_reciprocal_rank']:.4f}")
    print(f"Hits@1: {metrics['both']['hits_at_1']:.4f}")
    print(f"Hits@3: {metrics['both']['hits_at_3']:.4f}")
    print(f"Hits@10: {metrics['both']['hits_at_10']:.4f}")

for model_name, results in model_results.items():
    evaluate_model(results, model_name)


In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def find_similar_entities(entity_id, model, top_k=5):
    embeddings = model.entity_embeddings.weight.detach().numpy()
    similarities = cosine_similarity([embeddings[entity_id]], embeddings)[0]
    most_similar = np.argsort(similarities)[-top_k-1:-1][::-1]
    entity_labels = model.triples_factory.entity_labeling.label_to_id
    reverse_labels = {v: k for k, v in entity_labels.items()}
    
    print(f"\nMost similar entities to {reverse_labels[entity_id]}:")
    for idx in most_similar:
        print(f"{reverse_labels[idx]}: {similarities[idx]:.4f}")


In [None]:
def predict_tail_entities(model, head_id, relation_id, k=5):
    scores = model.predict_scores(
        heads=torch.tensor([head_id]),
        relations=torch.tensor([relation_id])
    )
    top_scores = torch.topk(scores, k=k, dim=1)

    labels = model.triples_factory.entity_labeling.label_to_id
    reverse_labels = {v: k for k, v in labels.items()}

    return [(reverse_labels[idx.item()], score.item())
            for idx, score in zip(top_scores.indices[0], top_scores.values[0])]


In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

def visualize_embeddings(model):
    embeddings = model.entity_embeddings.weight.detach().numpy()
    tsne = TSNE(n_components=2, random_state=42)
    reduced = tsne.fit_transform(embeddings)

    plt.figure(figsize=(10, 10))
    plt.scatter(reduced[:, 0], reduced[:, 1], alpha=0.6)
    plt.title("t-SNE of Entity Embeddings")
    plt.show()


In [None]:
import pandas as pd

def create_comparison_table(model_results):
    metrics = ['mean_rank', 'mean_reciprocal_rank', 'hits_at_10']
    comparison = {}

    for model_name, results in model_results.items():
        metric_values = results.metric_results.to_dict()['both']
        comparison[model_name] = {m: round(metric_values[m], 4) for m in metrics}

    return pd.DataFrame(comparison).T

create_comparison_table(model_results)


In [6]:
# ✅ Minimal test setup for KG Embedding pipeline

from pykeen.pipeline import pipeline
from pykeen.triples import TriplesFactory
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import pandas as pd

# 🔹 Step 1: Create dummy triples
triples = [
    ('France', 'has_capital', 'Paris'),
    ('France', 'has_president', 'Macron'),
    ('Macron', 'born_in', 'Amiens'),
    
    ('Germany', 'has_capital', 'Berlin'),
    ('Germany', 'has_chancellor', 'Scholz'),
    ('Scholz', 'born_in', 'Osnabrück'),
    
    ('Spain', 'has_capital', 'Madrid'),
    ('Spain', 'has_monarch', 'Felipe_VI'),
    ('Felipe_VI', 'born_in', 'Madrid'),

    ('Italy', 'has_capital', 'Rome'),
    ('Italy', 'has_prime_minister', 'Meloni'),
    ('Meloni', 'born_in', 'Rome'),

    ('UK', 'has_capital', 'London'),
    ('UK', 'has_prime_minister', 'Sunak'),
    ('Sunak', 'born_in', 'Southampton'),

    ('USA', 'has_capital', 'Washington'),
    ('USA', 'has_president', 'Biden'),
    ('Biden', 'born_in', 'Scranton'),

    ('UN', 'headquartered_in', 'New_York'),
    ('EU', 'headquartered_in', 'Brussels'),

    ('Macron', 'member_of', 'EU'),
    ('Scholz', 'member_of', 'EU'),
    ('Sunak', 'member_of', 'UN'),
    ('Biden', 'member_of', 'UN'),

    ('France', 'member_of', 'EU'),
    ('Germany', 'member_of', 'EU'),
    ('UK', 'member_of', 'UN'),
    ('USA', 'member_of', 'UN'),
]



# 🔹 Step 2: Convert to PyKEEN triples
triples_array = np.array(triples, dtype=str)
tf = TriplesFactory.from_labeled_triples(triples_array)

# 🔹 Step 3: Split into train/valid/test
training, validation, testing = tf.split([0.8, 0.1, 0.1], method='deterministic')


# 🔹 Step 4: Train multiple models
model_results = {}
models = ['TransE', 'DistMult', 'ComplEx']

for model_name in models:
    print(f"Training {model_name}...")
    results = pipeline(
    training=training,
    validation=validation,
    testing=testing,
    model=model_name,
    model_kwargs=dict(embedding_dim=50),
    epochs=100,
    training_kwargs=dict(batch_size=32),
    random_seed=42,
)
    model_results[model_name] = results


using automatically assigned random_state=150512018
No cuda devices were available. The model runs on CPU


Training TransE...


Training epochs on cpu: 100%|██████████| 100/100 [00:27<00:00,  3.67epoch/s, loss=0.121, prev_loss=0.143] 
Evaluating on cpu: 100%|██████████| 2.00/2.00 [00:00<00:00, 14.1triple/s]
INFO:pykeen.evaluation.evaluator:Evaluation took 0.16s seconds
INFO:pykeen.pipeline.api:Using device: None


Training DistMult...


Training epochs on cpu: 100%|██████████| 100/100 [00:28<00:00,  3.50epoch/s, loss=0.909, prev_loss=0.919]
Evaluating on cpu: 100%|██████████| 2.00/2.00 [00:00<00:00, 9.35triple/s]
INFO:pykeen.evaluation.evaluator:Evaluation took 0.24s seconds
INFO:pykeen.pipeline.api:Using device: None


Training ComplEx...


Training epochs on cpu: 100%|██████████| 100/100 [00:22<00:00,  4.38epoch/s, loss=0.887, prev_loss=3.55]
Evaluating on cpu: 100%|██████████| 2.00/2.00 [00:00<00:00, 17.6triple/s]
INFO:pykeen.evaluation.evaluator:Evaluation took 0.13s seconds


In [11]:
# Evaluation metrics
def evaluate_model(results, model_name):
    metrics = results.metric_results.to_dict()
    real = metrics.get('realistic', {})
    
    print(f"\nResults for {model_name} (realistic ranking):")
    for k in ['mean_rank', 'mean_reciprocal_rank', 'hits_at_1', 'hits_at_3', 'hits_at_10']:
        if k in real:
            print(f"{k}: {real[k]:.4f}")



# Display all models' metrics
for name, result in model_results.items():
    evaluate_model(result, name)

# Comparison table
def create_comparison_table(model_results):
    metrics_to_extract = ['mean_rank', 'mean_reciprocal_rank', 'hits_at_10']
    comparison = {}

    for name, result in model_results.items():
        real = result.metric_results.to_dict().get('realistic', {})
        comparison[name] = {
            metric: round(real.get(metric, -1), 4) for metric in metrics_to_extract
        }

    return pd.DataFrame(comparison).T


create_comparison_table(model_results)



Results for TransE (realistic ranking):

Results for DistMult (realistic ranking):

Results for ComplEx (realistic ranking):


Unnamed: 0,mean_rank,mean_reciprocal_rank,hits_at_10
TransE,-1,-1,-1
DistMult,-1,-1,-1
ComplEx,-1,-1,-1


In [12]:
print(f"Train: {training.num_triples}")
print(f"Valid: {validation.num_triples}")
print(f"Test: {testing.num_triples}")


Train: 14
Valid: 2
Test: 2


In [None]:
# Entity Similarity
def find_similar_entities(entity_id, model, top_k=5):
    embeddings = model.entity_embeddings.weight.detach().numpy()
    similarities = cosine_similarity([embeddings[entity_id]], embeddings)[0]
    most_similar = np.argsort(similarities)[-top_k-1:-1][::-1]
    labels = model.triples_factory.entity_labeling.label_to_id
    reverse_labels = {v: k for k, v in labels.items()}
    
    print(f"\nMost similar entities to {reverse_labels[entity_id]}:")
    for idx in most_similar:
        print(f"{reverse_labels[idx]}: {similarities[idx]:.4f}")

# t-SNE Embedding Visualization
def visualize_embeddings(model):
    embeddings = model.entity_embeddings.weight.detach().numpy()
    tsne = TSNE(n_components=2, random_state=42)
    reduced = tsne.fit_transform(embeddings)

    plt.figure(figsize=(10, 10))
    plt.scatter(reduced[:, 0], reduced[:, 1], alpha=0.6)
    plt.title("t-SNE of Entity Embeddings")
    plt.show()
