In [12]:
!pip install datasets



In [13]:
import pandas as pd
from datasets import load_dataset

## Loading the dataset
The dataset can be found at https://huggingface.co/datasets/VLyb/FB15k. It contains triples of the form [HEAD], [RELATION], [TAIL], meaning that the head and tail entities are connected by a relation of the given type. Note: the full FB15k benchmark contains 1,345 relation types; the figure of 237 belongs to the filtered FB15k-237 variant, so verify the count for this version with `len(set(dataset['train']['relation']))`.

In [14]:
# Load the FB15k dataset from the Hugging Face Hub; its splits are accessed
# by name further down (e.g. dataset['train']).
dataset = load_dataset("VLyb/FB15k")


## TASK
Perform **link prediction**. Given two entities [HEAD] and [TAIL], we want to check whether there is a relation between them in the dataset. The way this is implemented also lets us find the type of that relation.

In [15]:
# Training split; each row is a dict with 'head', 'relation' and 'tail' keys.
train_triples = dataset['train']

In [16]:
def check_relation(head, tail, triples):
    """Collect the relation labels of every triple linking ``head`` to ``tail``.

    Args:
        head: Value compared against each triple's ``'head'`` field.
        tail: Value compared against each triple's ``'tail'`` field.
        triples: Iterable of mappings with ``'head'``, ``'relation'`` and
            ``'tail'`` keys.

    Returns:
        A list with the ``'relation'`` value of each matching triple, in
        iteration order. The list is empty when nothing matches.
    """
    found = []
    for row in triples:
        # Direction matters: only head -> tail counts, not tail -> head.
        if row['head'] == head and row['tail'] == tail:
            found.append(row['relation'])
    return found

In [17]:
# Entity pair to look up
head = "/m/027rn"  # Replace with your entity ID
tail = "/m/06cx9"   # Replace with your entity ID

# Every relation label that connects head -> tail in the training split
relations = check_relation(head, tail, train_triples)

# Build the report line first, then print it once.
message = (
    f"Relations found between {head} and {tail}: {relations}"
    if relations
    else f"No relations found between {head} and {tail}."
)
print(message)

Relations found between /m/027rn and /m/06cx9: ['/location/country/form_of_government']


## PERFORMING ACTUAL LINK PREDICTION

In [18]:
!pip install pykeen



In [19]:
!pip install pykeen datasets



In [20]:
# Pull the three splits out of the loaded dataset by name.
train_data, valid_data, test_data = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

# Display some sample data
print(train_data[0])

{'head': '/m/027rn', 'relation': '/location/country/form_of_government', 'tail': '/m/06cx9'}


In [21]:
import numpy as np
from datasets import load_dataset
from pykeen.triples import TriplesFactory

# Load dataset from Hugging Face
dataset = load_dataset("VLyb/FB15k")


def _to_labeled_array(split):
    """Return the rows of ``split`` as a string array of (head, relation, tail) labels."""
    return np.array(
        [(row["head"], row["relation"], row["tail"]) for row in split],
        dtype=str,
    )


# Convert each split into a NumPy array (head, relation, tail).
# NOTE: this rebinds `train_triples` from the Hugging Face split to a NumPy
# array, so `check_relation(..., train_triples)` no longer works after this
# cell has run (rows are then indexed by position, not by key).
train_triples = _to_labeled_array(dataset["train"])
valid_triples = _to_labeled_array(dataset["validation"])
test_triples = _to_labeled_array(dataset["test"])

# Convert to PyKEEN's TriplesFactory format.
# The label -> ID mappings are created once, from the training triples.
train_factory = TriplesFactory.from_labeled_triples(train_triples)

# Validation and test MUST reuse the training mappings. Without them each
# factory builds its own mapping, the same integer ID then refers to different
# entities/relations in each split, and evaluation results become meaningless.
valid_factory = TriplesFactory.from_labeled_triples(
    valid_triples,
    entity_to_id=train_factory.entity_to_id,
    relation_to_id=train_factory.relation_to_id,
)
test_factory = TriplesFactory.from_labeled_triples(
    test_triples,
    entity_to_id=train_factory.entity_to_id,
    relation_to_id=train_factory.relation_to_id,
)


In [None]:
from pykeen.pipeline import pipeline

# Gather the run settings in one place so they are easy to inspect and tweak.
pipeline_options = dict(
    model='ComplEx',
    training=train_factory,
    validation=valid_factory,
    testing=test_factory,
    epochs=100,
    random_seed=42,
)

# Train a link prediction model
result = pipeline(**pipeline_options)

# Save the trained model
result.save_to_directory('trained_model')


INFO:pykeen.pipeline.api:Using device: None


Training epochs on cpu:   0%|          | 0/100 [00:00<?, ?epoch/s]

Training batches on cpu:   0%|          | 0/1888 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/1888 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/1888 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/1888 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/1888 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/1888 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/1888 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/1888 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/1888 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/1888 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/1888 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/1888 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/1888 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/1888 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/1888 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/1888 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/1888 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/1888 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/1888 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/1888 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/1888 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/1888 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/1888 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/1888 [00:00<?, ?batch/s]

In [None]:
from pykeen.models import ComplEx
import torch

from pykeen.triples import TriplesFactory

# Load the trained model.
# `save_to_directory` stores the whole model as a pickle named
# `trained_model.pkl`; it is restored with `torch.load`, not through a
# constructor on the model class.
# SECURITY: weights_only=False unpickles arbitrary objects, so only load
# files that you created yourself.
MODEL_DIR = 'trained_model'
model = torch.load(f'{MODEL_DIR}/trained_model.pkl', weights_only=False)

# The label -> ID mappings are not attached to the model; read them from the
# training triples that were saved alongside it.
saved_train_factory = TriplesFactory.from_path_binary(f'{MODEL_DIR}/training_triples')
entity_to_id = saved_train_factory.entity_to_id
relation_to_id = saved_train_factory.relation_to_id

# Define example entities and relation for prediction.
# Labels must match the dataset exactly, including the leading "/"
# (e.g. '/m/027rn', '/location/country/form_of_government').
entity_1 = "/m/0223bl"   # Example entity
relation = "/people/person/place_of_birth"
entity_2 = "/m/09c7w0"

# Fail with a readable message instead of a bare KeyError on an unknown label.
for label, mapping, kind in (
    (entity_1, entity_to_id, "entity"),
    (relation, relation_to_id, "relation"),
    (entity_2, entity_to_id, "entity"),
):
    if label not in mapping:
        raise KeyError(f"Unknown {kind} label: {label!r}")

# Convert to tensor format for model prediction
entity_1_id = torch.tensor([entity_to_id[entity_1]])
relation_id = torch.tensor([relation_to_id[relation]])
entity_2_id = torch.tensor([entity_to_id[entity_2]])

# predict_hrt takes a single LongTensor of shape (batch, 3) whose columns are
# (head, relation, tail) IDs -- not a tuple of three tensors.
hrt_batch = torch.stack([entity_1_id, relation_id, entity_2_id], dim=1).to(model.device)

# Predict the likelihood score for the given triple
score = model.predict_hrt(hrt_batch)
print(f"Link prediction score: {score.item()}")
