In [3]:
!pip install pykeen

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
import tensorflow as tf 
import pandas as pd
import pykeen
import numpy as np

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Load Dataset

In [6]:
dataset = pd.read_csv('/content/drive/MyDrive/Portfolio/Neo4j/dataset.csv')
dataset = dataset.drop(columns=['Unnamed: 0'])
dataset

Unnamed: 0,start_node,relation,end_node
0,Manufacturing_employment,subClassOf,Green_Indicators
1,Frameworks_for_gender,subClassOf,Green_Indicators
2,has_Inclusive_decision_making,domain,Peace_Justice_and_Strong_Institutions
3,Financial_services_access,subClassOf,Green_Indicators
4,has_Science_tech_cooperation,domain,Partnerships_for_The_Goals
...,...,...,...
37898,classic,SimilarTo,typical
37899,classical,SimilarTo,neoclassical
37900,classified,SimilarTo,eyes only
37901,classified,SimilarTo,sensitive


## Spilt data

In [7]:
from pykeen.triples import TriplesFactory

In [8]:
tf = TriplesFactory.from_labeled_triples(
  dataset[["start_node", "relation", "end_node"]].values,
  create_inverse_triples=False,
  entity_to_id=None,
  relation_to_id=None,
  compact_id=False,
  filter_out_candidate_inverse_relations=True,
  metadata=None,
)

In [9]:
#spilt data
training, validation, testing = tf.split([.8, .1, .1])

INFO:pykeen.triples.splitting:done splitting triples to groups of sizes [556, 3746, 3746]


In [10]:
validation

TriplesFactory(num_entities=39054, num_relations=17, num_triples=3746, inverse_triples=False)

In [11]:
testing

TriplesFactory(num_entities=39054, num_relations=17, num_triples=3746, inverse_triples=False)

In [12]:
training

TriplesFactory(num_entities=39054, num_relations=17, num_triples=29964, inverse_triples=False)

## Model KGE

In [13]:
from pykeen.pipeline import pipeline
from pykeen.hpo import hpo_pipeline
from pykeen.models import TransE

from pykeen.evaluation import RankBasedEvaluator
from pykeen.models.predict import get_tail_prediction_df
from pykeen.models import predict
# from pykeen.ablation import ablation_pipeline
from torch.optim import Adam, SGD
from pykeen.training import SLCWATrainingLoop
# from pykeen.evaluation import LCWAEvaluationLoop

### TransE

In [18]:
TransE = pipeline(
    training=training,
    testing=testing,
    validation=validation,
    model='TransE',
    loss='softplus',
    model_kwargs=dict(embedding_dim=20),
    optimizer='Adam',
    optimizer_kwargs=dict(lr=0.01),
    training_kwargs=dict(num_epochs=60, use_tqdm_batch=False),
    evaluation_kwargs=dict(batch_size=128,use_tqdm=False),
    evaluator='RankBasedEvaluator',
    random_seed=1,
    device='gpu',
    # regularizer='LpRegularizer',
    # regularizer_kwargs=dict(p=3),
    # training_loop='slcwa',
    # stopper='early',
    # stopper_kwargs=dict(frequency=5, patience=2, relative_delta=0.002),
    # evaluation_loop='LCWAEvaluationLoop',
)

INFO:pykeen.pipeline.api:Using device: gpu


Training epochs on cuda:0:   0%|          | 0/60 [00:00<?, ?epoch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 0.93s seconds


In [19]:
print('Hit@K : ', TransE.get_metric('hits@k'))
print('Mean Rank : ', TransE.get_metric('mr'))
print('Mean Reciprocal Rank : ', TransE.get_metric('mrr'))
# print('Adjusted Mean Rank : ', TransE.get_me

Hit@K :  0.09329951948745328
Mean Rank :  15844.818359375
Mean Reciprocal Rank :  0.04002795038666244


#### predict model

In [24]:
# tail prediction

df = get_tail_prediction_df(
    model=TransE.model,
    head_label="Manufacturing_employment",
    relation_label="subClassOf",
    triples_factory=TransE.training,
    add_novelties=False,
)

print(df.head(5))

     tail_id                tail_label     score
832      832  Manufacturing_employment -0.676553
785      785          Green_Indicators -1.158750
702      702           Child_mortality -1.258308
735      735          Energy_intensity -1.354527
726      726    Disaster_deaths_injury -1.358621


In [None]:
# Score all triples (memory intensive)
predictions_df = predict.get_all_prediction_df(TransE.model, triples_factory=TransE.training)



In [20]:
# Score top K triples (computationally expensive)
top_k_predictions_df = predict.get_all_prediction_df(TransE.model, k=3, triples_factory=testing)



scoring:   0%|          | 0.00/664k [00:00<?, ?batch/s]

In [22]:
top_k_predictions_df

Unnamed: 0,head_id,head_label,relation_id,relation_label,tail_id,tail_label,score,in_training
0,35805,stromboli,12,label,35805,stromboli,-0.093675,False
1,28457,native language,12,label,28457,native language,-0.093675,False
2,38090,very potent chemicals in use,12,label,38090,very potent chemicals in use,-0.093675,False


In [None]:
# Score a given list of triples
score_df = predict.predict_triples_df(
    model=TransE.model,
    triples=[("Sustainable_Development_Goals", "member", "Zero_Hunger"),
             ("Zero_Hunger", "subClassOf", "Sustainable_Development_Goals"),
             ("Air_pollution_deaths", "label", "Air_pollution_deaths")
             ],
    triples_factory=TransE.training,
)

In [None]:
# # Evaluate your model with not only testing triples,
# # but also filter on validation triples
# results = evaluator.evaluate(
#     model=TransE.model,
#     mapped_triples= testing.mapped_triples,
#     batch_size=1024,
#     # additional_filter_triples=[
#     #     training.mapped_triples,
#     #     validation.mapped_triples,
#     # ],
# )

### RotatE

In [14]:
RotatE = pipeline(
    training=training,
    testing=testing,
    validation=validation,
    model='RotatE',
    loss='softplus',
    model_kwargs=dict(embedding_dim=20),
    optimizer='Adam',
    optimizer_kwargs=dict(lr=0.01),
    training_kwargs=dict(num_epochs=60, use_tqdm_batch=False),
    evaluation_kwargs=dict(batch_size=128,use_tqdm=False),
    evaluator='RankBasedEvaluator',
    random_seed=1,
    device='gpu',
)

print('Hit@K : ', RotatE.get_metric('hits@k'))
print('Mean Rank : ', RotatE.get_metric('mr'))
print('Mean Reciprocal Rank : ', RotatE.get_metric('mrr'))


INFO:pykeen.pipeline.api:Using device: gpu


Training epochs on cuda:0:   0%|          | 0/60 [00:00<?, ?epoch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 1.18s seconds


Hit@K :  0.0597971169247197
Mean Rank :  13252.73046875
Mean Reciprocal Rank :  0.04274922466125325


### ComplEx 

In [148]:
ComplEx = pipeline(
    training=training,
    testing=testing,
    validation=validation,
    model='ComplEx',
    loss='softplus',
    model_kwargs=dict(embedding_dim=20),
    optimizer='Adam',
    optimizer_kwargs=dict(lr=0.01),
    training_kwargs=dict(num_epochs=60, use_tqdm_batch=False),
    evaluation_kwargs=dict(batch_size=128,use_tqdm=False),
    evaluator='RankBasedEvaluator',
    random_seed=1,
    device='gpu',
)

print('Hit@K : ', ComplEx.get_metric('hits@k'))
print('Mean Rank : ', ComplEx.get_metric('mr'))
print('Mean Reciprocal Rank : ', ComplEx.get_metric('mrr'))

INFO:pykeen.pipeline.api:Using device: gpu


Training epochs on cuda:0:   0%|          | 0/60 [00:00<?, ?epoch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 1.08s seconds


Hit@K :  0.0
Mean Rank :  19259.185546875
Mean Reciprocal Rank :  0.00025428717076169874
