In [16]:
import numpy as np
from ampligraph.datasets import load_wn18
from ampligraph.latent_features import ScoringBasedEmbeddingModel
from ampligraph.evaluation import mrr_score, hits_at_n_score
from ampligraph.latent_features.loss_functions import get as get_loss
from ampligraph.latent_features.regularizers import get as get_regularizer
import tensorflow as tf
from ampligraph.evaluation import train_test_split_no_unseen 

In [17]:
# load csv medsur.csv
colnames = ["subject", "predicate", "object"]
triples_df = pd.read_csv('medsur.csv', names=colnames, header=None) 

print(triples_df.head())

                                      subject  \
0          http://www.medsur.org/drug/N02AE01   
1          http://www.medsur.org/drug/N02AX02   
2  http://www.medsur.org/side_effect/10013573   
3  http://www.medsur.org/side_effect/10003036   
4  http://www.medsur.org/side_effect/10037211   

                                         predicate  \
0            http://www.medsur.org/isPrescribedFor   
1              http://www.medsur.org/hasSideEffect   
2               http://www.medsur.org/hasFrequency   
3               http://www.medsur.org/hasFrequency   
4  http://www.w3.org/1999/02/22-rdf-syntax-ns#type   

                                        object  
0      http://www.medsur.org/symptom/10080284   
1  http://www.medsur.org/side_effect/10046543   
2                                        5.26   
3                                           9   
4           http://www.medsur.org/SideEffects   


In [18]:
# create np array of triples [[row1], [row2], ...]
triples = triples_df.values
print(triples)

[['http://www.medsur.org/drug/N02AE01'
  'http://www.medsur.org/isPrescribedFor'
  'http://www.medsur.org/symptom/10080284 ']
 ['http://www.medsur.org/drug/N02AX02'
  'http://www.medsur.org/hasSideEffect'
  'http://www.medsur.org/side_effect/10046543 ']
 ['http://www.medsur.org/side_effect/10013573'
  'http://www.medsur.org/hasFrequency' '5.26 ']
 ...
 ['http://www.medsur.org/side_effect/10024419'
  'http://www.medsur.org/hasFrequency' 'postmarketing ']
 ['http://example.org/medsur.owl#Patients'
  'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'
  'http://www.w3.org/2002/07/owl#Class ']
 ['http://www.medsur.org/side_effect/10020843'
  'http://www.medsur.org/hasFrequency' 'very_rare ']]


In [22]:
test_size = int(0.1*len(triples_df))

X_train_valid, X_test = train_test_split_no_unseen(triples, test_size=test_size)
X_train, X_valid = train_test_split_no_unseen(X_train_valid, test_size=test_size)

print('Train set size: ', X_train.shape)
print('Test set size: ', X_valid.shape)
print('Validation set size: ', X_valid.shape)

Train set size:  (4635, 3)
Test set size:  (579, 3)
Validation set size:  (579, 3)


In [26]:
from ampligraph.latent_features.models import ScoringBasedEmbeddingModel as model_embedding
    
# Initialize a ComplEx neural embedding model: the embedding size is k,
# eta specifies the number of corruptions to generate per each positive,
# scoring_type determines the scoring function of the embedding model.
model = ScoringBasedEmbeddingModel(k=150,
                                   eta=10,
                                   scoring_type='ComplEx')

# Optimizer, loss and regularizer definition
optim = tf.keras.optimizers.Adam(learning_rate=1e-3)
loss = get_loss('pairwise', {'margin': 0.5})
regularizer = get_regularizer('LP', {'p': 2, 'lambda': 1e-5})

# Compilation of the model
model.compile(optimizer=optim, loss=loss, entity_relation_regularizer=regularizer)

# For evaluation, we can use a filter which would be used to filter out
# positives statements created by the corruption procedure.
# Here we define the filter set by concatenating all the positives
filter = {'test' : np.concatenate((X_train, X_valid, X_test))}

# Early Stopping callback
checkpoint = tf.keras.callbacks.EarlyStopping(
    monitor='val_{}'.format('hits10'),
    min_delta=0,
    patience=5,
    verbose=1,
    mode='max',
    restore_best_weights=True
)

# Fit the model on training and validation set
model.fit(X_train,
          batch_size=int(X_train.shape[0] / 10),
          epochs=20,                    # Number of training epochs
          validation_freq=20,           # Epochs between successive validation
          validation_burn_in=100,       # Epoch to start validation
          validation_data=X_valid,   # Validation data
          validation_filter=filter,     # Filter positives from validation corruptions
          callbacks=[checkpoint],       # Early stopping callback (more from tf.keras.callbacks are supported)
          verbose=True                  # Enable stdout messages
          )


# Run the evaluation procedure on the test set (with filtering)
# To disable filtering: use_filter=None
# Usually, we corrupt subject and object sides separately and compute ranks
ranks = model.evaluate(X_test,
                       use_filter=filter,
                       corrupt_side='s,o')

# compute and print metrics:
mrr = mrr_score(ranks)
hits_10 = hits_at_n_score(ranks, n=10)
print("MRR: %f, Hits@10: %f" % (mrr, hits_10))
# Output: MRR: 0.884418, Hits@10: 0.935500

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
MRR: 0.372736, Hits@10: 0.518998
