<a href="https://colab.research.google.com/github/aSafarpoor/Seminar/blob/main/ilpc2022_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Mount Google Drive into the Colab VM; the project directory entered in the next cell lives there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%cd drive/MyDrive/MSc/codes/ilpc2022/

/content/drive/MyDrive/MSc/codes/ilpc2022


In [4]:
!pip install pykeen
# !pip install click
!pip install wandb 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pykeen
  Downloading pykeen-1.8.1-py3-none-any.whl (630 kB)
[K     |████████████████████████████████| 630 kB 7.4 MB/s 
Collecting scipy>=1.5.0
  Downloading scipy-1.7.3-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (38.1 MB)
[K     |████████████████████████████████| 38.1 MB 382 kB/s 
[?25hCollecting rexmex
  Downloading rexmex-0.1.0.tar.gz (21 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting click-default-group
  Downloading click-default-group-1.2.2.tar.gz (3.3 kB)
Collecting optuna>=2.0.0
  Downloading optuna-2.10.0-py3-none-any.whl (308 kB)
[K     |████████████████████████████████| 308 kB 68.9 MB/s 
[?25hCollecting docdata
  Downloading docdata-0.0.3-py3-none-any.whl (5.8 kB)
Co

In [4]:
import datetime 

In [5]:
"""The challenge's datasets."""
from pathlib import Path

from pykeen.datasets.inductive.base import DisjointInductivePathDataset
from typing_extensions import Literal

# Public names of this cell; only has an effect if the code is imported as a module with `import *`.
__all__ = [
    "InductiveLPDataset",
    "Size",
]

# HERE = Path(__file__).parent.resolve()
# Directory prefix of the challenge data.
# NOTE(review): not referenced by the class defined in this cell - confirm whether it is still needed.
DATA = "data/"

# Dataset directory prefixes (each including the trailing slash).
# NOTE(review): declared but not used as an annotation in the code visible here.
Size = Literal["small/", "large/", "sp/"]


class InductiveLPDataset(DisjointInductivePathDataset):
    """An inductive link prediction dataset for the ILPC 2022 Challenge."""

    def __init__(self, size="small/", **kwargs):
        """Initialize the inductive link prediction dataset.

        :param size: directory containing the four split files ``train.txt``, ``inference.txt``,
            ``inference_validation.txt`` and ``inference_test.txt``, e.g. ``"small/"`` or
            ``"data/small/"``. The trailing slash is optional.
        :param kwargs: keyword arguments to forward to the base dataset class, cf. DisjointInductivePathDataset
        """
        # Join path components with pathlib rather than string concatenation: with a plain `+`,
        # a directory given without trailing slash ("small") would turn into "smalltrain.txt".
        root = Path(size)

        super().__init__(
            transductive_training_path=str(root / "train.txt"),
            inductive_inference_path=str(root / "inference.txt"),
            inductive_validation_path=str(root / "inference_validation.txt"),
            inductive_testing_path=str(root / "inference_test.txt"),
            create_inverse_triples=True,
            eager=True,
            **kwargs
        )


In [21]:
"""Example workflow."""
import logging
from pathlib import Path

# import click
# import more_click
import torch
from pykeen.evaluation import RankBasedEvaluator
from pykeen.losses import NSSALoss
from pykeen.models.inductive import InductiveNodePiece, InductiveNodePieceGNN
from pykeen.trackers import ConsoleResultTracker, WANDBResultTracker
from pykeen.training import SLCWATrainingLoop
from pykeen.typing import TESTING, TRAINING, VALIDATION
from pykeen.utils import resolve_device, set_random_seed
from torch.optim import Adam


# Directory prefix of the challenge data.
# NOTE(review): not referenced by executable code in this cell (it only appears inside the
# inert string further down) - confirm whether it is still needed.
DATA = "data/"

# fix the seed for reproducibility
set_random_seed(42)


# for GNN layer reproducibility
# when running on a GPU, make sure to set up an env variable as advised in the doc:
# https://pytorch.org/docs/stable/generated/torch.use_deterministic_algorithms.html

# torch.use_deterministic_algorithms(True)

# NOTE: the triple-quoted string below is a bare expression statement and has no runtime effect.
# It keeps the option names and help texts of a click-based command line interface, apparently
# the one `main` was adapted from (the `click` imports above are commented out). The help texts
# describe the parameters of `main`.
'''
click.Choice(["small", "large","sp"]),
default="small",
    show_default=True,
    help="The dataset to use",
)

    "--embedding-dim",
    help="The dimension of the entity embeddings",

    "--tokens",
    help="Number of tokens to use in NodePiece",

    "--margin",
    help="The margin value to use for the negative sampling self-adversarial loss.",

    "--num-negatives",
    help="The number of negative samples per positive.",

    "--wandb",
    help="Track results with Weights & Biases (requires `wandb` to be installed).",

    "--save",
     is_flag=True, help=f"Save the model in the {DATA} directory")

    "--gnn", 
    is_flag=True, help="Use the Inductive NodePiece model with GCN layers"

@more_click.log_level_option()
'''


def _configure_logging(log_level):
    """Configure root logging from a loosely specified level.

    :param log_level: a level name in any letter case (e.g. ``"info"``), a numeric level,
        or an empty value. Empty values and unknown names fall back to ``logging.INFO``
        instead of making ``logging.basicConfig`` raise ``ValueError``.
    """
    unknown_name = False
    if isinstance(log_level, int):
        level = log_level
    elif not log_level:
        level = logging.INFO
    else:
        # getLevelName() maps a registered level name to its number;
        # for anything else it returns a "Level ..." string.
        resolved = logging.getLevelName(str(log_level).strip().upper())
        if isinstance(resolved, int):
            level = resolved
        else:
            level = logging.INFO
            unknown_name = True

    logging.basicConfig(level=level)
    if unknown_name:
        logging.warning("Unknown log level %r; falling back to INFO", log_level)


def main(
    dataset="sp/",
    embedding_dim=100,
    tokens=5,
    learning_rate=0.001,
    margin=15.0,
    num_negatives=4,
    batch_size=256,
    epochs=2,
    wandb=True,
    save=True,
    gnn=True,
    log_level="",
):
    """Train an inductive model with NodePiece representations and an optional GNN encoder.

    :param dataset: dataset directory, forwarded as ``size`` to :class:`InductiveLPDataset`
    :param embedding_dim: the dimension of the entity embeddings
    :param tokens: number of tokens to use in NodePiece
    :param learning_rate: learning rate of the Adam optimizer
    :param margin: the margin value to use for the negative sampling self-adversarial loss
    :param num_negatives: the number of negative samples per positive
    :param batch_size: batch size used for training as well as for evaluation
    :param epochs: number of training epochs
    :param wandb: track results with Weights & Biases (requires ``wandb`` to be installed);
        otherwise results are printed to the console
    :param save: save the trained model to ``result/model_<ISO timestamp>``
    :param gnn: use the Inductive NodePiece model with GCN layers
    :param log_level: logging level name or number; empty or unknown values fall back to INFO
    """
    # set appropriate log-level
    _configure_logging(log_level)

    # dataset loading (separate name, so the `dataset` argument is not overwritten)
    ilp_dataset = InductiveLPDataset(size=dataset)
    loss = NSSALoss(margin=margin)

    # we have two baselines: InductiveNodePiece and InductiveNodePieceGNN
    # the GNN version uses a 2-layer CompGCN message passing encoder on the training / inference graphs
    # but feel free to create and attach your own GNN encoder via the gnn_encoder argument
    # and new inductive link prediction models in general
    model_cls = InductiveNodePieceGNN if gnn else InductiveNodePiece
    model = model_cls(
        embedding_dim=embedding_dim,
        triples_factory=ilp_dataset.transductive_training,
        inference_factory=ilp_dataset.inductive_inference,
        num_tokens=tokens,
        aggregation="mlp",
        loss=loss,
    ).to(resolve_device())
    optimizer = Adam(params=model.parameters(), lr=learning_rate)
    logging.info(f"Number of parameters: {sum(p.numel() for p in model.parameters())}")
    logging.info(f"Space occupied: {model.num_parameter_bytes} bytes")

    if wandb:
        tracker = WANDBResultTracker(
            project="inductive_ilp",  # put here your project and entity
            entity="pykeen",
            # config=click.get_current_context().params,
        )
        tracker.start_run()
    else:
        tracker = ConsoleResultTracker()

    # default training regime is negative sampling (SLCWA)
    # you can also use the 1-N regime with the LCWATrainingLoop
    # the LCWA loop does not need negative sampling kwargs, but accepts label_smoothing in the .train() method
    training_loop = SLCWATrainingLoop(
        triples_factory=ilp_dataset.transductive_training,
        model=model,
        optimizer=optimizer,
        result_tracker=tracker,
        negative_sampler_kwargs=dict(
            # affects training speed, the more - the better
            num_negs_per_pos=num_negatives
        ),
        mode=TRAINING,  # must be specified for the inductive setup
    )

    # specifying hits@k values: 1, 3, 5, 10, 100
    hits_ks = (1, 3, 5, 10, 100)
    valid_evaluator = RankBasedEvaluator(
        mode=VALIDATION,
        metrics=["hits_at_k"] * len(hits_ks),
        metrics_kwargs=[dict(k=k) for k in hits_ks],
        add_defaults=True,
    )
    test_evaluator = RankBasedEvaluator(
        mode=TESTING,
        metrics=["hits_at_k"] * len(hits_ks),
        metrics_kwargs=[dict(k=k) for k in hits_ks],
        add_defaults=True,
    )

    # model training and eval on validation starts here
    training_loop.train(
        triples_factory=ilp_dataset.transductive_training,
        num_epochs=epochs,
        batch_size=batch_size,
        callbacks="evaluation",
        callback_kwargs=dict(
            evaluator=valid_evaluator,
            evaluation_triples=ilp_dataset.inductive_validation.mapped_triples,
            prefix="validation",
            frequency=1,
            additional_filter_triples=ilp_dataset.inductive_inference.mapped_triples,
            batch_size=batch_size,
        ),
    )

    # final eval on the test set
    result = test_evaluator.evaluate(
        model=model,
        mapped_triples=ilp_dataset.inductive_testing.mapped_triples,
        additional_filter_triples=[
            # filtering of other positive triples
            ilp_dataset.inductive_inference.mapped_triples,
            ilp_dataset.inductive_validation.mapped_triples,
        ],
        batch_size=batch_size,
    )

    # extracting final metrics
    for metric, metric_label in [
        ("inverse_harmonic_mean_rank", "MRR"),
        *((f"hits_at_{k}", f"Hits@{k}") for k in (100, 10, 3, 1)),
        ("adjusted_arithmetic_mean_rank_index", "AMRI"),
    ]:
        logging.info(f"Test {metric_label:10}: {result.get_metric(name=metric):.5f}")

    # you can also log the final results to wandb if you want
    if wandb:
        tracker.log_metrics(
            metrics=result.to_flat_dict(),
            step=epochs + 1,
            prefix="test",
        )

    # saving the final model
    if save:
        timestamp = datetime.datetime.now().isoformat()
        print(timestamp)
        # make sure the target directory exists, so that a finished training run
        # is not lost to a missing-directory error at the very last step
        output_dir = Path("result")
        output_dir.mkdir(parents=True, exist_ok=True)
        torch.save(model, str(output_dir / ("model_" + timestamp)))

        





In [None]:
# Train the GNN variant on the small dataset for 50 epochs, tracking results on the console.
# "INFO" is a valid `logging` level name ("Trace" is not one of the levels known to `logging`).
main(
    dataset="data/small/",
    embedding_dim=100,
    tokens=5,
    learning_rate=0.001,
    margin=15.0,
    num_negatives=4,
    batch_size=256,
    epochs=50,
    wandb=False,
    save=True,
    gnn=True,
    log_level="INFO",
)



sampling:   0%|          | 0.00/10.2k [00:00<?, ?it/s]



sampling:   0%|          | 0.00/6.65k [00:00<?, ?it/s]

INFO:pykeen.triples.triples_factory:Creating inverse triples.


Training epochs on cuda:0:   0%|          | 0/50 [00:00<?, ?epoch/s]

INFO:pykeen.triples.triples_factory:Creating inverse triples.
INFO:pykeen.training.training_loop:Dropping last (incomplete) batch each epoch (1/614 (0.16%) batches).


Training batches on cuda:0:   0%|          | 0/614 [00:00<?, ?batch/s]

Evaluating on cuda:0:   0%|          | 0.00/2.91k [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 0.57s seconds


Step: 1
Metric: validation.head.optimistic.hits_at_1 = 0.004470426409903714
Metric: validation.tail.optimistic.hits_at_1 = 0.007909215955983494
Metric: validation.both.optimistic.hits_at_1 = 0.006189821182943604
Metric: validation.head.realistic.hits_at_1 = 0.004470426409903714
Metric: validation.tail.realistic.hits_at_1 = 0.007909215955983494
Metric: validation.both.realistic.hits_at_1 = 0.006189821182943604
Metric: validation.head.pessimistic.hits_at_1 = 0.004470426409903714
Metric: validation.tail.pessimistic.hits_at_1 = 0.007909215955983494
Metric: validation.both.pessimistic.hits_at_1 = 0.006189821182943604
Metric: validation.head.optimistic.hits_at_3 = 0.013067400275103164
Metric: validation.tail.optimistic.hits_at_3 = 0.03163686382393398
Metric: validation.both.optimistic.hits_at_3 = 0.02235213204951857
Metric: validation.head.realistic.hits_at_3 = 0.013067400275103164
Metric: validation.tail.realistic.hits_at_3 = 0.03163686382393398
Metric: validation.both.realistic.hits_at_3 =

Training batches on cuda:0:   0%|          | 0/614 [00:00<?, ?batch/s]

Evaluating on cuda:0:   0%|          | 0.00/2.91k [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 0.58s seconds


Step: 2
Metric: validation.head.optimistic.hits_at_1 = 0.006533700137551582
Metric: validation.tail.optimistic.hits_at_1 = 0.019944979367262722
Metric: validation.both.optimistic.hits_at_1 = 0.013239339752407153
Metric: validation.head.realistic.hits_at_1 = 0.006533700137551582
Metric: validation.tail.realistic.hits_at_1 = 0.019944979367262722
Metric: validation.both.realistic.hits_at_1 = 0.013239339752407153
Metric: validation.head.pessimistic.hits_at_1 = 0.006533700137551582
Metric: validation.tail.pessimistic.hits_at_1 = 0.019944979367262722
Metric: validation.both.pessimistic.hits_at_1 = 0.013239339752407153
Metric: validation.head.optimistic.hits_at_3 = 0.013067400275103164
Metric: validation.tail.optimistic.hits_at_3 = 0.045048143053645115
Metric: validation.both.optimistic.hits_at_3 = 0.02905777166437414
Metric: validation.head.realistic.hits_at_3 = 0.013067400275103164
Metric: validation.tail.realistic.hits_at_3 = 0.045048143053645115
Metric: validation.both.realistic.hits_at_3

Training batches on cuda:0:   0%|          | 0/614 [00:00<?, ?batch/s]

Evaluating on cuda:0:   0%|          | 0.00/2.91k [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 0.57s seconds


Step: 3
Metric: validation.head.optimistic.hits_at_1 = 0.003782668500687758
Metric: validation.tail.optimistic.hits_at_1 = 0.021320495185694635
Metric: validation.both.optimistic.hits_at_1 = 0.012551581843191197
Metric: validation.head.realistic.hits_at_1 = 0.003782668500687758
Metric: validation.tail.realistic.hits_at_1 = 0.021320495185694635
Metric: validation.both.realistic.hits_at_1 = 0.012551581843191197
Metric: validation.head.pessimistic.hits_at_1 = 0.003782668500687758
Metric: validation.tail.pessimistic.hits_at_1 = 0.021320495185694635
Metric: validation.both.pessimistic.hits_at_1 = 0.012551581843191197
Metric: validation.head.optimistic.hits_at_3 = 0.014099037138927097
Metric: validation.tail.optimistic.hits_at_3 = 0.0546767537826685
Metric: validation.both.optimistic.hits_at_3 = 0.0343878954607978
Metric: validation.head.realistic.hits_at_3 = 0.014099037138927097
Metric: validation.tail.realistic.hits_at_3 = 0.0546767537826685
Metric: validation.both.realistic.hits_at_3 = 0.

Training batches on cuda:0:   0%|          | 0/614 [00:00<?, ?batch/s]

Evaluating on cuda:0:   0%|          | 0.00/2.91k [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 0.55s seconds


Step: 4
Metric: validation.head.optimistic.hits_at_1 = 0.007909215955983494
Metric: validation.tail.optimistic.hits_at_1 = 0.027166437414030263
Metric: validation.both.optimistic.hits_at_1 = 0.017537826685006877
Metric: validation.head.realistic.hits_at_1 = 0.007909215955983494
Metric: validation.tail.realistic.hits_at_1 = 0.027166437414030263
Metric: validation.both.realistic.hits_at_1 = 0.017537826685006877
Metric: validation.head.pessimistic.hits_at_1 = 0.007909215955983494
Metric: validation.tail.pessimistic.hits_at_1 = 0.027166437414030263
Metric: validation.both.pessimistic.hits_at_1 = 0.017537826685006877
Metric: validation.head.optimistic.hits_at_3 = 0.014442916093535076
Metric: validation.tail.optimistic.hits_at_3 = 0.06980742778541953
Metric: validation.both.optimistic.hits_at_3 = 0.042125171939477304
Metric: validation.head.realistic.hits_at_3 = 0.014442916093535076
Metric: validation.tail.realistic.hits_at_3 = 0.06980742778541953
Metric: validation.both.realistic.hits_at_3 

Training batches on cuda:0:   0%|          | 0/614 [00:00<?, ?batch/s]

Evaluating on cuda:0:   0%|          | 0.00/2.91k [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 0.56s seconds


Step: 5
Metric: validation.head.optimistic.hits_at_1 = 0.006533700137551582
Metric: validation.tail.optimistic.hits_at_1 = 0.030261348005502064
Metric: validation.both.optimistic.hits_at_1 = 0.01839752407152682
Metric: validation.head.realistic.hits_at_1 = 0.006533700137551582
Metric: validation.tail.realistic.hits_at_1 = 0.030261348005502064
Metric: validation.both.realistic.hits_at_1 = 0.01839752407152682
Metric: validation.head.pessimistic.hits_at_1 = 0.006533700137551582
Metric: validation.tail.pessimistic.hits_at_1 = 0.030261348005502064
Metric: validation.both.pessimistic.hits_at_1 = 0.01839752407152682
Metric: validation.head.optimistic.hits_at_3 = 0.014442916093535076
Metric: validation.tail.optimistic.hits_at_3 = 0.09147180192572214
Metric: validation.both.optimistic.hits_at_3 = 0.05295735900962861
Metric: validation.head.realistic.hits_at_3 = 0.014442916093535076
Metric: validation.tail.realistic.hits_at_3 = 0.09147180192572214
Metric: validation.both.realistic.hits_at_3 = 0.

Training batches on cuda:0:   0%|          | 0/614 [00:00<?, ?batch/s]

Evaluating on cuda:0:   0%|          | 0.00/2.91k [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 0.56s seconds


Step: 6
Metric: validation.head.optimistic.hits_at_1 = 0.003094910591471802
Metric: validation.tail.optimistic.hits_at_1 = 0.03610729023383769
Metric: validation.both.optimistic.hits_at_1 = 0.019601100412654747
Metric: validation.head.realistic.hits_at_1 = 0.003094910591471802
Metric: validation.tail.realistic.hits_at_1 = 0.03610729023383769
Metric: validation.both.realistic.hits_at_1 = 0.019601100412654747
Metric: validation.head.pessimistic.hits_at_1 = 0.003094910591471802
Metric: validation.tail.pessimistic.hits_at_1 = 0.03610729023383769
Metric: validation.both.pessimistic.hits_at_1 = 0.019601100412654747
Metric: validation.head.optimistic.hits_at_3 = 0.014786795048143054
Metric: validation.tail.optimistic.hits_at_3 = 0.09594222833562586
Metric: validation.both.optimistic.hits_at_3 = 0.05536451169188446
Metric: validation.head.realistic.hits_at_3 = 0.014786795048143054
Metric: validation.tail.realistic.hits_at_3 = 0.09594222833562586
Metric: validation.both.realistic.hits_at_3 = 0.

Training batches on cuda:0:   0%|          | 0/614 [00:00<?, ?batch/s]

Evaluating on cuda:0:   0%|          | 0.00/2.91k [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 0.55s seconds


Step: 7
Metric: validation.head.optimistic.hits_at_1 = 0.003782668500687758
Metric: validation.tail.optimistic.hits_at_1 = 0.031292984869325996
Metric: validation.both.optimistic.hits_at_1 = 0.017537826685006877
Metric: validation.head.realistic.hits_at_1 = 0.003782668500687758
Metric: validation.tail.realistic.hits_at_1 = 0.031292984869325996
Metric: validation.both.realistic.hits_at_1 = 0.017537826685006877
Metric: validation.head.pessimistic.hits_at_1 = 0.003782668500687758
Metric: validation.tail.pessimistic.hits_at_1 = 0.031292984869325996
Metric: validation.both.pessimistic.hits_at_1 = 0.017537826685006877
Metric: validation.head.optimistic.hits_at_3 = 0.015130674002751032
Metric: validation.tail.optimistic.hits_at_3 = 0.10213204951856947
Metric: validation.both.optimistic.hits_at_3 = 0.05863136176066025
Metric: validation.head.realistic.hits_at_3 = 0.015130674002751032
Metric: validation.tail.realistic.hits_at_3 = 0.10213204951856947
Metric: validation.both.realistic.hits_at_3 =

Training batches on cuda:0:   0%|          | 0/614 [00:00<?, ?batch/s]