# **Embedding Models: A Comparative Study**

In [None]:
# Mount Google Drive at /content/drive; the dataset CSV read later in this
# notebook lives under /content/drive/MyDrive.

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Colab line magic: switch the runtime to the TensorFlow 1.x line
# (the recorded run reports TensorFlow 1.15.2).
%tensorflow_version 1.x

TensorFlow 1.x selected.


In [None]:
# Sanity check: report the TensorFlow version and make sure a GPU is attached.

import tensorflow as tf

print('TensorFlow  version: {}'.format(tf.__version__))

# Ask TensorFlow for the GPU device name; only the first GPU is accepted,
# anything else aborts the notebook.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

TensorFlow  version: 1.15.2
Found GPU at: /device:GPU:0


In [None]:
#installing AmpluGraph and dependencies 
%%capture 
# Install AmpliGraph library
! pip install ampligraph

# Required to visualize embeddings with tensorboard projector, comment out if not required!
! pip install --user tensorboard

# Required to plot text on embedding clusters, comment out if not required!
! pip install --user git+https://github.com/Phlya/adjustText

In [None]:
# All imports used in this tutorial 
%tensorflow_version 1.x
import ampligraph
import numpy as np
import pandas as pd
import tensorflow as tf
from ampligraph.datasets import load_fb15k_237
from ampligraph.evaluation import train_test_split_no_unseen, evaluate_performance, mr_score, mrr_score, hits_at_n_score
from ampligraph.discovery import query_topn, discover_facts, find_clusters
from ampligraph.latent_features import TransE, ComplEx, HolE, DistMult, ConvE, ConvKB
from ampligraph.utils import save_model, restore_model

def display_aggregate_metrics(ranks):
    """Print the aggregate ranking metrics computed from ``ranks``.

    Prints, in order: mean rank, mean reciprocal rank, and Hits@N for
    N = 1, 10 and 100. Each value is computed by the corresponding
    ``ampligraph.evaluation`` scorer and printed on its own line.

    ranks: the ranks to aggregate (e.g. the value returned by
        ``evaluate_performance``).
    Returns None; the metrics are only printed.
    """
    print('Mean Rank:', mr_score(ranks))
    print('Mean Reciprocal Rank:', mrr_score(ranks))
    # Same scorer for every Hits@N line; only the cutoff changes.
    for label, cutoff in (('Hits@1:', 1), ('Hits@10:', 10), ('Hits@100:', 100)):
        print(label, hits_at_n_score(ranks, cutoff))

# Record the installed AmpliGraph version alongside the results.
print('Ampligraph version: {}'.format(ampligraph.__version__))

Ampligraph version: 1.3.2


# **Loading the Data for Knowledge Graph Embedding Creation**

In [None]:
# Load the raw triples CSV from the mounted Drive. header=None tells pandas the
# file has no header row, so the columns are labelled with integers.
df = pd.read_csv(
    "/content/drive/MyDrive/ComparativeStudyKGembedding/Head_Tail_Relationship_big.csv",
    header=None,
)

In [None]:
# Re-order the raw CSV columns into (subject, predicate, object) triples:
# column 1 -> subject, column 3 -> predicate, column 2 -> object.
# NOTE(review): the file name suggests a head/tail/relationship column order,
# which would match this mapping - confirm against the CSV.
dataset = pd.DataFrame({
    'subject': df[1],
    'predicate': df[3],
    'object': df[2],
})

In [None]:
# dataset.shape is (rows, columns): one row per triple, three columns.
print('Total triples in the KG:', dataset.shape)

Total triples in the KG: (863126, 3)


# **Creating training, validation and test set**

In [None]:
from ampligraph.evaluation import train_test_split_no_unseen
# Hold-out sizes, as absolute numbers of triples.
VALID_SIZE = 20000
TEST_SIZE = 40000

# First carve the validation set (VALID_SIZE triples) out of the full dataset;
# `test_train` is everything that is left (future train + test triples).
test_train, X_valid = train_test_split_no_unseen(dataset.values, VALID_SIZE, seed=0)

# Then take the test set (TEST_SIZE = 40,000 triples) from the remaining
# triples; what is left after that is the training set.
X_train, X_test = train_test_split_no_unseen(test_train, TEST_SIZE, seed=0)

# Report the split sizes as (rows, columns) shapes.
print('Total triples:', dataset.shape)
print('Size of train:', X_train.shape)
print('Size of valid:', X_valid.shape)
print('Size of test:', X_test.shape)

Total triples: (863126, 3)
Size of train: (803126, 3)
Size of valid: (20000, 3)
Size of test: (40000, 3)


In [None]:
# Display the held-out test triples (rows of subject, predicate, object).
X_test

array([['GoldenEye', 'has_genre', 'Action'],
       ['Carrington', 'has_genre', 'Romance'],
       ['Dead Presidents', 'has_genre', 'Action'],
       ...,
       ['Sonita', 'has_rate', '3.0'],
       ['TechnoCalyps', 'has_rate', '2.5'],
       ['Rivers of Sand', 'has_rate', '0.0']], dtype=object)

In [None]:
from ampligraph.evaluation import evaluate_performance 

print('Size of X_test:', X_test.shape)

# Stack every known triple (train + valid + test) row-wise; the model cells
# pass this array to evaluate_performance as `filter_triples`.
X_filter = np.concatenate((X_train, X_valid, X_test), axis=0)



Size of X_test: (40000, 3)


In [None]:
# NOTE(review): an identical display_aggregate_metrics is already defined in the
# imports cell near the top of this notebook; this definition shadows it.
def display_aggregate_metrics(ranks):
    """Print mean rank, mean reciprocal rank and Hits@1/10/100 for ``ranks``.

    ranks: the ranks to aggregate (e.g. the value returned by
        ``evaluate_performance``).
    Returns None; each metric is printed on its own line.
    """
    # (label, scorer) pairs; each scorer is only evaluated right before its
    # line is printed, so the output order matches the table order.
    report = (
        ('Mean Rank:', lambda: mr_score(ranks)),
        ('Mean Reciprocal Rank:', lambda: mrr_score(ranks)),
        ('Hits@1:', lambda: hits_at_n_score(ranks, 1)),
        ('Hits@10:', lambda: hits_at_n_score(ranks, 10)),
        ('Hits@100:', lambda: hits_at_n_score(ranks, 100)),
    )
    for label, compute in report:
        print(label, compute())



In [None]:
from ampligraph.latent_features import TransE, ComplEx, HolE, DistMult, ConvE, ConvKB

# **TransE**

In [None]:
# TransE: configure, fit on the training triples, then evaluate on X_test.
# The hyperparameters match the DistMult cell so the runs are comparable.
model = TransE(
    k=150,
    epochs=50,
    eta=1,
    loss='multiclass_nll',
    initializer='xavier',
    initializer_params={'uniform': False},
    regularizer='LP',
    regularizer_params={'lambda': 0.0001, 'p': 3},
    optimizer='adam',
    optimizer_params={'lr': 0.001},
    seed=0,
    batches_count=1,
    verbose=True,
)

model.fit(X_train)

# X_filter (train + valid + test) is supplied as the filter set; both the
# subject and the object side are corrupted ('s,o').
ranks = evaluate_performance(
    X_test,
    model=model,
    filter_triples=X_filter,
    corrupt_side='s,o',
    ranking_strategy='worst',
)
display_aggregate_metrics(ranks)



Average Loss:   0.180492: 100%|██████████| 50/50 [00:10<00:00,  4.96epoch/s]


    protocol. This may be unnecessary and will lead to a 'harder' task. Besides, it will lead to a much slower
    evaluation procedure. We recommended to set the 'corruption_entities' argument to a reasonably sized set
    of entities. The size of corruption_entities depends on your domain-specific task.


    protocol. This may be unnecessary and will lead to a 'harder' task. Besides, it will lead to a much slower
    evaluation procedure. We recommended to set the 'corruption_entities' argument to a reasonably sized set
    of entities. The size of corruption_entities depends on your domain-specific task.
100%|██████████| 40000/40000 [22:09<00:00, 30.09it/s]


Mean Rank: 18797.309675
Mean Reciprocal Rank: 0.041047834845826685
Hits@1: 0.0220625
Hits@10: 0.101725
Hits@100: 0.1505625


In [None]:
# Vocabulary sizes taken from the fitted model's entity / relation index maps.
entity_count = len(model.ent_to_idx)
print('The number of unique entities:', entity_count)
relation_count = len(model.rel_to_idx)
print('The number of unique relations:', relation_count)

The number of unique entities: 283009
The number of unique relations: 7


In [None]:
# Shapes of the fitted model's embedding matrices. The second line reports the
# relation embeddings, so it is labelled accordingly.
print('Size of entity embeddings:', model.ent_emb.shape)
print('Size of relation embeddings:', model.rel_emb.shape)

Size of entity embeddings: (283009, 150)
Size of entity embeddings: (7, 150)


# **DistMult**

In [None]:
# DistMult: configure, fit on the training triples, then evaluate on X_test.
# The hyperparameters match the TransE cell so the runs are comparable.
model = DistMult(
    k=150,
    epochs=50,
    eta=1,
    loss='multiclass_nll',
    initializer='xavier',
    initializer_params={'uniform': False},
    regularizer='LP',
    regularizer_params={'lambda': 0.0001, 'p': 3},
    optimizer='adam',
    optimizer_params={'lr': 0.001},
    seed=0,
    batches_count=1,
    verbose=True,
)

model.fit(X_train)

# X_filter (train + valid + test) is supplied as the filter set; both the
# subject and the object side are corrupted ('s,o').
ranks = evaluate_performance(
    X_test,
    model=model,
    filter_triples=X_filter,
    corrupt_side='s,o',
    ranking_strategy='worst',
)
display_aggregate_metrics(ranks)



Average Loss:   0.675982: 100%|██████████| 50/50 [00:10<00:00,  4.99epoch/s]


    protocol. This may be unnecessary and will lead to a 'harder' task. Besides, it will lead to a much slower
    evaluation procedure. We recommended to set the 'corruption_entities' argument to a reasonably sized set
    of entities. The size of corruption_entities depends on your domain-specific task.


    protocol. This may be unnecessary and will lead to a 'harder' task. Besides, it will lead to a much slower
    evaluation procedure. We recommended to set the 'corruption_entities' argument to a reasonably sized set
    of entities. The size of corruption_entities depends on your domain-specific task.
100%|██████████| 40000/40000 [19:11<00:00, 34.75it/s]


Mean Rank: 17733.2299625
Mean Reciprocal Rank: 0.07967297346571732
Hits@1: 0.0506875
Hits@10: 0.1384875
Hits@100: 0.1978625


In [None]:
# Shapes of the fitted model's embedding matrices. The second line reports the
# relation embeddings, so it is labelled accordingly.
print('Size of entity embeddings:', model.ent_emb.shape)
print('Size of relation embeddings:', model.rel_emb.shape)

# **ComplEx**

In [None]:
# ComplEx: configure, fit on the training triples, then evaluate on X_test.
model = ComplEx(
    k=150,
    epochs=50,
    eta=1,
    loss='multiclass_nll',
    initializer='xavier',
    initializer_params={'uniform': False},
    regularizer='LP',
    regularizer_params={'lambda': 0.0001, 'p': 3},
    optimizer='adam',
    optimizer_params={'lr': 0.001},
    seed=0,
    # NOTE(review): the TransE and DistMult cells use batches_count=1; this is
    # the only hyperparameter that differs between the three runs - confirm it
    # is intentional (e.g. a memory constraint) before comparing metrics.
    batches_count=4,
    verbose=True,
)

model.fit(X_train)

# X_filter (train + valid + test) is supplied as the filter set; both the
# subject and the object side are corrupted ('s,o').
ranks = evaluate_performance(
    X_test,
    model=model,
    filter_triples=X_filter,
    corrupt_side='s,o',
    ranking_strategy='worst',
)
display_aggregate_metrics(ranks)


  0%|          | 0/50 [00:00<?, ?epoch/s][A
Average Loss:   0.693145:   0%|          | 0/50 [00:01<?, ?epoch/s][A
Average Loss:   0.693145:   2%|▏         | 1/50 [00:01<01:26,  1.77s/epoch][A
Average Loss:   0.693095:   2%|▏         | 1/50 [00:02<01:26,  1.77s/epoch][A
Average Loss:   0.693095:   4%|▍         | 2/50 [00:02<01:13,  1.53s/epoch][A
Average Loss:   0.693009:   4%|▍         | 2/50 [00:03<01:13,  1.53s/epoch][A
Average Loss:   0.693009:   6%|▌         | 3/50 [00:03<01:04,  1.37s/epoch][A
Average Loss:   0.692857:   6%|▌         | 3/50 [00:04<01:04,  1.37s/epoch][A
Average Loss:   0.692857:   8%|▊         | 4/50 [00:04<00:57,  1.26s/epoch][A
Average Loss:   0.692611:   8%|▊         | 4/50 [00:05<00:57,  1.26s/epoch][A
Average Loss:   0.692611:  10%|█         | 5/50 [00:05<00:52,  1.18s/epoch][A
Average Loss:   0.692235:  10%|█         | 5/50 [00:06<00:52,  1.18s/epoch][A
Average Loss:   0.692235:  12%|█▏        | 6/50 [00:06<00:49,  1.12s/epoch][A
Average Loss: 

    protocol. This may be unnecessary and will lead to a 'harder' task. Besides, it will lead to a much slower
    evaluation procedure. We recommended to set the 'corruption_entities' argument to a reasonably sized set
    of entities. The size of corruption_entities depends on your domain-specific task.


    protocol. This may be unnecessary and will lead to a 'harder' task. Besides, it will lead to a much slower
    evaluation procedure. We recommended to set the 'corruption_entities' argument to a reasonably sized set
    of entities. The size of corruption_entities depends on your domain-specific task.

  0%|          | 0/40000 [00:00<?, ?it/s][A
  0%|          | 1/40000 [00:00<2:44:08,  4.06it/s][A
  0%|          | 2/40000 [00:00<2:20:02,  4.76it/s][A
  0%|          | 3/40000 [00:00<2:02:28,  5.44it/s][A
  0%|          | 4/40000 [00:00<1:49:32,  6.09it/s][A
  0%|          | 5/40000 [00:00<1:42:05,  6.53it/s][A
  0%|          | 6/40000 [00:00<1:34:02,  7.09it/s][A
  0%|          | 7/40000 [00:00<1:30:34,  7.36it/s][A
  0%|          | 8/40000 [00:01<1:27:20,  7.63it/s][A
  0%|          | 9/40000 [00:01<1:25:06,  7.83it/s][A
  0%|          | 10/40000 [00:01<1:22:43,  8.06it/s][A
  0%|          | 11/40000 [00:01<1:21:46,  8.15it/s][A
  0%|          | 12/40000 [00:01<1:20:53

In [None]:
# Shapes of the fitted model's embedding matrices. The second line reports the
# relation embeddings, so it is labelled accordingly.
print('Size of entity embeddings:', model.ent_emb.shape)
print('Size of relation embeddings:', model.rel_emb.shape)