In [62]:
import numpy as np
from ampligraph.datasets import load_wn18
from ampligraph.latent_features import ScoringBasedEmbeddingModel
from ampligraph.evaluation import mrr_score, hits_at_n_score
from ampligraph.latent_features.loss_functions import get as get_loss
from ampligraph.latent_features.regularizers import get as get_regularizer
import tensorflow as tf
from ampligraph.evaluation import train_test_split_no_unseen
import pandas as pd
import rdflib
import re
from scipy.special import expit
import matplotlib as plt
import seaborn as sns

In [64]:
# Read the headerless triple file medsur.csv and label its three columns
colnames = ["subject", "predicate", "object"]
triples_df = (
    pd.read_csv('medsur.csv', header=None, names=colnames)
    # remove trailing whitespace from every object value
    .assign(object=lambda frame: frame['object'].str.rstrip())
)

print(triples_df.head())


                                             subject  \
0           http://www.medsur.org/patient_NLLRB13960   
1          http://www.medsur.org/patient_NLLRB152302   
2           http://www.medsur.org/patient_NLLRB51203   
3            http://www.medsur.org/patient_NLLRB9515   
4  http://www.medsur.org/patient_NLGRUNENTHAL2014...   

                                         predicate  \
0  http://example.org/medsur.rdf#suffersSideEffect   
1  http://example.org/medsur.rdf#suffersSideEffect   
2  http://example.org/medsur.rdf#suffersSideEffect   
3  http://example.org/medsur.rdf#suffersSideEffect   
4  http://example.org/medsur.rdf#suffersSideEffect   

                                       object  
0  http://www.medsur.org/side_effect/10062226  
1  http://www.medsur.org/side_effect/10024264  
2  http://www.medsur.org/side_effect/10046798  
3  http://www.medsur.org/side_effect/10061182  
4  http://www.medsur.org/side_effect/10040979  


In [65]:
# Keep only the first 100000 triples.
# Optional filter (currently disabled): keep only triples with the 'hasOutcome' predicate
# triples_df = triples_df[triples_df['predicate'] == 'http://example.org/medsur.rdf#hasOutcome']
triples_df = triples_df.head(100000)
print(triples_df.head())


                                             subject  \
0           http://www.medsur.org/patient_NLLRB13960   
1          http://www.medsur.org/patient_NLLRB152302   
2           http://www.medsur.org/patient_NLLRB51203   
3            http://www.medsur.org/patient_NLLRB9515   
4  http://www.medsur.org/patient_NLGRUNENTHAL2014...   

                                         predicate  \
0  http://example.org/medsur.rdf#suffersSideEffect   
1  http://example.org/medsur.rdf#suffersSideEffect   
2  http://example.org/medsur.rdf#suffersSideEffect   
3  http://example.org/medsur.rdf#suffersSideEffect   
4  http://example.org/medsur.rdf#suffersSideEffect   

                                       object  
0  http://www.medsur.org/side_effect/10062226  
1  http://www.medsur.org/side_effect/10024264  
2  http://www.medsur.org/side_effect/10046798  
3  http://www.medsur.org/side_effect/10061182  
4  http://www.medsur.org/side_effect/10040979  


In [66]:
# Triples as a 2-D array, one row per triple: [[row1], [row2], ...]
triples = triples_df.to_numpy()
# Distinct values found in the subject (column 0) and object (column 2) positions
entities = np.unique(triples[:, [0, 2]])
entities


array(['0', '03', '0463', ..., 'rare', 'uncommon', 'veryrare'],
      dtype=object)

In [67]:
# Reserve one tenth of the triples for the test split
test_size = int(len(triples_df) * 0.1)

# NOTE(review): presumably train_test_split_no_unseen keeps every test entity
# present in the training split - confirm against the AmpliGraph docs.
# No separate validation split is created here.
splits = train_test_split_no_unseen(triples, test_size=test_size)
X_train, X_test = splits

print('Train set size: ', X_train.shape)
print('Test set size: ', X_test.shape)


Train set size:  (90000, 3)
Test set size:  (10000, 3)


In [68]:
from ampligraph.latent_features.models import ScoringBasedEmbeddingModel as model_embedding

# Build the ComplEx neural embedding model:
#   scoring_type - scoring function of the embedding model
#   k            - embedding size
#   eta          - number of corruptions generated per positive triple
model = ScoringBasedEmbeddingModel(scoring_type='ComplEx', eta=10, k=150)


In [69]:
# Optimizer, loss and regularizer definition
optim = tf.keras.optimizers.Adam(learning_rate=1e-3)
loss = get_loss('pairwise', {'margin': 0.5})
regularizer = get_regularizer('LP', {'p': 2, 'lambda': 1e-5})

# Compilation of the model.
# Pass the optimizer *object* configured above rather than the string 'adam':
# with the string, `optim` was never used, so editing its learning rate had
# no effect on training.
model.compile(loss=loss,
              optimizer=optim,
              entity_relation_regularizer=regularizer,
              entity_relation_initializer='glorot_uniform')


In [70]:
# Train the model on the training split (no validation data is passed to fit)
model.fit(
    X_train,
    batch_size=X_train.shape[0] // 10,  # one tenth of the training set per batch
    epochs=200,                         # number of training epochs
    verbose=True,                       # enable stdout messages
)


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x239037241c0>

In [43]:
# Filtered evaluation on the test split.
# The filter holds every known positive (train + test rows stacked together);
# pass use_filter=None instead to disable filtering.
positives_filter = {'test': np.vstack((X_train, X_test))}

# 's,o': subject and object sides are corrupted separately when computing ranks
ranks = model.evaluate(
    X_test,
    use_filter=positives_filter,
    corrupt_side='s,o',
    verbose=True,
)




In [44]:
# Compute all ranking metrics first, then report them rounded to two decimals
mrr = mrr_score(ranks)
hits_10, hits_3, hits_1 = (hits_at_n_score(ranks, n=cutoff) for cutoff in (10, 3, 1))

print("MRR: %.2f" % (mrr))

print("Hits@10: %.2f" % (hits_10))
print("Hits@3: %.2f" % (hits_3))
print("Hits@1: %.2f" % (hits_1))


MRR: 0.14
Hits@10: 0.19
Hits@3: 0.15
Hits@1: 0.10


In [45]:
from ampligraph.utils import save_model, restore_model

# Persist the trained model next to the notebook
save_model(
    model,
    './best_model.pkl',
)


The path ./best_model.pkl already exists. This save operation will overwrite the model                 at the specified path.


In [46]:
# Discard the in-memory model and reload it from disk to verify the round trip
del model
model = restore_model('./best_model.pkl')
print('The model is fit!' if model.is_fitted
      else 'The model is not fit! Did you skip a step?')


Saved model does not include a db file. Skipping.
The model is fit!


In [84]:
def link_prediction(cluster):
    """Rank candidate ``hasOutcome`` links for patients without a recorded outcome.

    Every triple whose object contains the ``Patients`` class IRI marks its
    subject as a patient. Each patient that has no ``hasOutcome`` triple is
    paired with every outcome value present in the input, and the resulting
    candidate triples are ranked and scored with the notebook-level ``model``
    (filtered with the notebook-level ``positives_filter``). For each outcome
    the top rows are printed, and the mean score / mean probability per
    outcome are drawn as two bar plots.

    Parameters
    ----------
    cluster : pandas.DataFrame or None
        Triples with ``subject``, ``predicate`` and ``object`` columns.
        When ``None``, the notebook-level ``triples_df`` is used.

    Returns
    -------
    pandas.DataFrame
        One row per candidate triple with columns ``statement``, ``rank``,
        ``score`` and ``prob``, sorted by descending score. An empty frame
        with those columns is returned when no candidate triple can be built.
    """
    has_outcome = 'http://example.org/medsur.rdf#hasOutcome'
    patient_class = 'http://example.org/medsur.rdf#Patients'
    ranking_columns = ['statement', 'rank', 'score', 'prob']

    frame = triples_df if cluster is None else cluster

    # All distinct outcome values, and the patients that already have one
    outcome_rows = frame['predicate'] == has_outcome
    outcomes = np.unique(frame.loc[outcome_rows, 'object'].tolist())
    patients_with_outcome = set(frame.loc[outcome_rows, 'subject'])

    # Literal (non-regex) match so the '.' in the IRI is not a wildcard;
    # missing objects count as "not a patient".
    is_patient = (frame['object']
                  .str.contains(patient_class, regex=False, na=False)
                  .astype(bool))
    candidates = [subject for subject in frame.loc[is_patient, 'subject']
                  if subject not in patients_with_outcome]

    # Build all (patient, hasOutcome, outcome) rows at once instead of
    # growing an array with np.append inside nested loops.
    X_unseen = np.array([[patient, has_outcome, outcome]
                         for patient in candidates
                         for outcome in outcomes])
    if X_unseen.size == 0:
        print('No candidate hasOutcome triples to score.')
        return pd.DataFrame(columns=ranking_columns)

    ranks_unseen = model.evaluate(X_unseen,
                                  use_filter=positives_filter,   # known positives to filter out
                                  corrupt_side='s+o',
                                  verbose=True)

    # ravel (not squeeze) keeps a 1-D array even when there is one candidate
    scores = np.ravel(model.predict(X_unseen))
    probs = expit(scores)

    # '_outcome' is a private helper column for exact per-outcome grouping;
    # it is dropped again before returning.
    df_ranking = pd.DataFrame(
        list(zip([' '.join(row) for row in X_unseen],
                 ranks_unseen,
                 scores,
                 probs,
                 X_unseen[:, 2])),
        columns=ranking_columns + ['_outcome'],
    ).sort_values("score", ascending=False)

    # Summarise scores per outcome. Exact equality on the outcome column is
    # used: a substring/regex search over the whole statement would also
    # match outcomes whose IRI merely starts with another outcome's IRI.
    results = {}
    for outcome in outcomes:
        df_outcome = (df_ranking
                      .loc[df_ranking['_outcome'] == outcome, ranking_columns]
                      .reset_index(drop=True))
        print(df_outcome.head(10))

        # Short label: the part of the IRI after the last '/', lower-cased
        label = outcome.split('/')[-1].lower()
        if label == "resolved_with_sequelae":
            label = "rws"
        results[label] = {'score': df_outcome['score'].mean(),
                          'probability': df_outcome['prob'].mean()}

    df_results = pd.DataFrame.from_dict(results, orient='index')

    # Bar plot of the mean score per outcome.
    # yerr is a single value shared by all bars: the standard deviation of
    # the per-outcome means.
    plt.pyplot.figure(figsize=(10, 5))
    plt.pyplot.title('Mean score for each outcome')
    sns.barplot(x=df_results.index,
                y=df_results['score'], yerr=df_results['score'].std())

    # Bar plot of the mean probability per outcome (same error-bar convention)
    plt.pyplot.figure(figsize=(10, 5))
    plt.pyplot.title('Mean probability for each outcome')
    sns.barplot(x=df_results.index,
                y=df_results['probability'], yerr=df_results['probability'].std())

    return df_ranking[ranking_columns]

                                             subject  \
0           http://www.medsur.org/patient_NLLRB13960   
1          http://www.medsur.org/patient_NLLRB152302   
2           http://www.medsur.org/patient_NLLRB51203   
3            http://www.medsur.org/patient_NLLRB9515   
4  http://www.medsur.org/patient_NLGRUNENTHAL2014...   

                                         predicate  \
0  http://example.org/medsur.rdf#suffersSideEffect   
1  http://example.org/medsur.rdf#suffersSideEffect   
2  http://example.org/medsur.rdf#suffersSideEffect   
3  http://example.org/medsur.rdf#suffersSideEffect   
4  http://example.org/medsur.rdf#suffersSideEffect   

                                       object  
0  http://www.medsur.org/side_effect/10062226  
1  http://www.medsur.org/side_effect/10024264  
2  http://www.medsur.org/side_effect/10046798  
3  http://www.medsur.org/side_effect/10061182  
4  http://www.medsur.org/side_effect/10040979  
(100000, 3)
(1710, 3)


In [88]:
# Rebuild the triples frame from the raw array, then rank outcome links on it
triples_df = pd.DataFrame(data=triples, columns=['subject', 'predicate', 'object'])
df_ranking = link_prediction(cluster=triples_df)

In [113]:
# TODO: create groups of patients based on side-effect clusters,
# e.g. group 1 suffers from side effects in cluster 1, group 2 from cluster 2, etc.
triples_df = pd.DataFrame(triples, columns=['subject', 'predicate', 'object'])

given_drug = 'http://example.org/medsur.rdf#isGivenDrug'
patient_class = 'http://example.org/medsur.rdf#Patients'
ranking_columns = ['statement', 'rank', 'score', 'prob']

# Every distinct drug that appears as the object of an isGivenDrug triple
drugs = np.unique(
    triples_df.loc[triples_df['predicate'] == given_drug, 'object'].tolist())

# Subjects of triples whose object contains the Patients class IRI.
# Literal (non-regex) match so the '.' in the IRI is not a wildcard;
# missing objects count as "not a patient".
is_patient = (triples_df['object']
              .str.contains(patient_class, regex=False, na=False)
              .astype(bool))
patients = triples_df.loc[is_patient, 'subject'].tolist()

# Candidate triples (patient, isGivenDrug, drug) for every patient/drug pair,
# built in one go instead of np.append inside nested loops; shape (n, 3)
X_unseen = np.array([[patient, given_drug, drug]
                     for patient in patients
                     for drug in drugs]).reshape(-1, 3)

# Fail early with a clear message: with no drugs the summary frame below
# would have no 'score' column and plotting would raise a KeyError.
if X_unseen.shape[0] == 0:
    raise ValueError('No candidate (patient, isGivenDrug, drug) triples '
                     'could be built from triples_df')

ranks_unseen = model.evaluate(X_unseen,
                              use_filter=positives_filter,   # known positives to filter out
                              corrupt_side='s+o',
                              verbose=True)
scores = model.predict(X_unseen)
probs = expit(scores)

# '_drug' is a private helper column for exact per-drug grouping; it is not
# part of df_ranking. ravel (not squeeze) keeps the score arrays 1-D even
# when there is a single candidate.
ranked = pd.DataFrame(list(zip([' '.join(row) for row in X_unseen],
                               ranks_unseen,
                               np.ravel(scores),
                               np.ravel(probs),
                               X_unseen[:, 2])),
                      columns=ranking_columns + ['_drug']).sort_values("score", ascending=False)
df_ranking = ranked[ranking_columns]

print(df_ranking.head(10))

# Summarise scores per drug. Exact equality on the drug column is used:
# a substring/regex search over the whole statement would also match drugs
# whose IRI merely starts with another drug's IRI.
results = {}

for drug in drugs:
    df_drug = (ranked
               .loc[ranked['_drug'] == drug, ranking_columns]
               .reset_index(drop=True))
    print(df_drug.head(10))

    # Short label: the part of the IRI after the last '/', lower-cased
    label = drug.split('/')[-1].lower()
    results[label] = {'score': df_drug['score'].mean(),
                      'probability': df_drug['prob'].mean()}

# One row per drug with its mean score and mean probability
df_results = pd.DataFrame.from_dict(results, orient='index')

# Bar plot of the mean score per drug.
# yerr is a single value shared by all bars: the standard deviation of the
# per-drug means.
plt.pyplot.figure(figsize=(10, 5))
plt.pyplot.title('Mean score for each drug')
sns.barplot(x=df_results.index,
            y=df_results['score'], yerr=df_results['score'].std())


# Bar plot of the mean probability per drug (same error-bar convention)
plt.pyplot.figure(figsize=(10, 5))
plt.pyplot.title('Mean probability for each drug')
sns.barplot(x=df_results.index,
            y=df_results['probability'], yerr=df_results['probability'].std())

                                              statement   rank     score  \
2729  http://www.medsur.org/patient_NLSZ09PHHO2015NL...   [17]  1.731812   
2225  http://www.medsur.org/patient_NLGRUNENTHAL2020...   [19]  1.651481   
89    http://www.medsur.org/patient_NLTEVA413627ISR ...   [25]  1.594398   
197   http://www.medsur.org/patient_NLAPOTEX2016AP00...   [36]  1.444023   
1037  http://www.medsur.org/patient_NLLRB15267 http:...   [47]  1.392875   
209   http://www.medsur.org/patient_NLGRUNENTHAL2014...   [49]  1.371838   
1877  http://www.medsur.org/patient_NLLRB66415 http:...   [55]  1.338632   
130   http://www.medsur.org/patient_NLGRUNENTHAL2009...  [296]  1.287760   
21    http://www.medsur.org/patient_NLLRB72171 http:...    [4]  1.286279   
2129  http://www.medsur.org/patient_NLLRB30253 http:...   [63]  1.283777   

          prob  
2729  0.849644  
2225  0.839091  
89    0.831234  
197   0.809077  
1037  0.801051  
209   0.797677  
1877  0.792265  
130   0.783768  
21    0.78

NameError: name 'df_drug' is not defined