Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

import rdflib
from rdflib import Graph, Literal, Namespace, RDF, URIRef, OWL
from rdflib.extras.external_graph_libs import rdflib_to_networkx_digraph
from rdflib.namespace import DC, FOAF

from owlready2 import *
from owlready2 import get_ontology

from sklearn.model_selection import train_test_split

import networkx as nx
import networkx.algorithms.community as nx_comm

random.seed(10)



Loading the original data

In [2]:
data = pd.read_excel('opioid_data_merged_min.xlsx')

In [3]:
data

Unnamed: 0,WorldwideUniqueCaseIdentification,sex,Outcome,ATCode,PTCode,Frequency,age_group,weight_group,is_sideeffect
0,NL-002147023-NVSC2020NL143475,female,Recovered with sequelae,N02AX02,10012218,0.1,"(64.0, 90.0]",overweight,True
1,NL-002147023-NVSC2020NL143475,female,Recovered with sequelae,N02AX02,10075611,0.0,"(64.0, 90.0]",overweight,False
2,NL-002147023-NVSC2020NL223553,female,Recovered,N02AX02,10076493,0.0,"(64.0, 90.0]",overweight,False
3,NL-002147023-NVSC2020NL223553,female,Recovered,N02AX02,10009839,0.0,"(64.0, 90.0]",overweight,False
4,NL-002147023-NVSC2020NL223553,female,Recovered,N02AX02,10009839,0.0,"(64.0, 90.0]",overweight,False
...,...,...,...,...,...,...,...,...,...
6192,NL-TEVA-719924ROM,male,Recovered,N02AA05,10037660,0.0,"(64.0, 90.0]",normal,False
6193,NL-TEVA-719924ROM,male,Recovered,N02AA05,10062352,0.0,"(64.0, 90.0]",normal,False
6194,NL-TEVA-719924ROM,male,Recovered,N02AA05,10006002,0.0,"(64.0, 90.0]",normal,False
6195,NL-TEVA-719924ROM,male,Recovered,N02AA05,10046571,0.0,"(64.0, 90.0]",normal,False


In [4]:
data['WorldwideUniqueCaseIdentification'] = data.WorldwideUniqueCaseIdentification.astype(str)
data['age_group'] = data.age_group.astype(str)
data['PTCode'] = data.PTCode.astype(str)

In [6]:
triples = []
for _, row in data.iterrows():
        
    # Weight group info
    weight_group = (row['WorldwideUniqueCaseIdentification'], "hasWeightGroup", row['weight_group'])

    # Age group info
    age_group = (row['WorldwideUniqueCaseIdentification'], "hasAgeGroup", row['age_group'])

    # Gender info
    sex = (row['WorldwideUniqueCaseIdentification'], "hasGender", row['sex'])

    #Outcome
    outcome = (row['WorldwideUniqueCaseIdentification'], "hasOutcome", row['Outcome'])

    #Symptom
    symptom = (row['WorldwideUniqueCaseIdentification'], "hasSymptom", row['PTCode'])

    #Drug
    drug = (row['WorldwideUniqueCaseIdentification'], "isGivenDrug", row['ATCode'])

    #Frequency
    frequency = (row['PTCode'], "hasFrequency", row['Frequency'])    
    
    #Side effect
    if row['is_sideeffect'] == True:
        side_effect = (row['WorldwideUniqueCaseIdentification'], "hasSideEffect", row['PTCode'])
    else:
        side_effect = (row['WorldwideUniqueCaseIdentification'], "hasSideEffect", "None")

    triples.extend((weight_group, age_group, sex, outcome, symptom, drug, frequency, side_effect))

In [7]:
triples_df = pd.DataFrame(triples, columns=["subject", "predicate", "object"])
triples_df = triples_df.drop_duplicates()
triples_df[(triples_df.subject=="NL-TEVA-719924ROM")]

Unnamed: 0,subject,predicate,object
49520,NL-TEVA-719924ROM,hasWeightGroup,normal
49521,NL-TEVA-719924ROM,hasAgeGroup,"(64.0, 90.0]"
49522,NL-TEVA-719924ROM,hasGender,male
49523,NL-TEVA-719924ROM,hasOutcome,Recovered
49524,NL-TEVA-719924ROM,hasSymptom,10046571
49525,NL-TEVA-719924ROM,isGivenDrug,N02AA05
49527,NL-TEVA-719924ROM,hasSideEffect,
49532,NL-TEVA-719924ROM,hasSymptom,10047700
49540,NL-TEVA-719924ROM,hasSymptom,10037660
49548,NL-TEVA-719924ROM,hasSymptom,10062352


In [8]:
from ampligraph.evaluation import train_test_split_no_unseen

X_train, X_test = train_test_split_no_unseen(np.array(triples), test_size=7930)

In [9]:
print('Train set size: ', X_train.shape)
print('Test set size: ', X_test.shape)

Train set size:  (41646, 3)
Test set size:  (7930, 3)


In [13]:
from ampligraph.latent_features.models import ScoringBasedEmbeddingModel
    
# Initialize a ComplEx neural embedding model: the embedding size is k,
# eta specifies the number of corruptions to generate per each positive,
# scoring_type determines the scoring function of the embedding model.
model = ScoringBasedEmbeddingModel(k=150,
                                   eta=10,
                                   scoring_type='ComplEx')

In [14]:
import tensorflow as tf

from ampligraph.latent_features.loss_functions import get as get_loss
from ampligraph.latent_features.regularizers import get as get_regularizer

# Optimizer, loss and regularizer definition
optim = tf.keras.optimizers.Adam(learning_rate=1e-3)
loss = get_loss('pairwise', {'margin': 0.5})
regularizer = get_regularizer('LP', {'p': 2, 'lambda': 1e-5})

# Compilation of the model
model.compile(loss=loss,
              optimizer='adam',
              entity_relation_regularizer=regularizer,
              entity_relation_initializer='glorot_uniform')

In [16]:
# Fit the model on training and validation set
model.fit(X_train,
          batch_size=int(X_train.shape[0] / 10), # use 1/10 of the training set as batch size
          epochs=10,                    # Number of training epochs
          verbose=True                  # Enable stdout messages
          )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x243e2a2a920>

In [17]:
filter = {'test': np.concatenate([X_train, X_test])}

In [18]:
ranks = model.evaluate(X_test,
                       use_filter=filter,
                       corrupt_side='s,o',
                       verbose=True)



In [19]:
from ampligraph.evaluation import mr_score, mrr_score, hits_at_n_score

mr = mr_score(ranks)
mrr = mrr_score(ranks)

print("MRR: %.2f" % (mrr))
print("MR: %.2f" % (mr))

hits_10 = hits_at_n_score(ranks, n=10)
print("Hits@10: %.2f" % (hits_10))
hits_3 = hits_at_n_score(ranks, n=3)
print("Hits@3: %.2f" % (hits_3))
hits_1 = hits_at_n_score(ranks, n=1)
print("Hits@1: %.2f" % (hits_1))

MRR: 0.59
MR: 149.58
Hits@10: 0.65
Hits@3: 0.60
Hits@1: 0.55


In [20]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from adjustText import adjust_text
%matplotlib inline

In [21]:
patients = data.WorldwideUniqueCaseIdentification.unique()
patient_embeddings = dict(zip(patients, model.get_embeddings(patients)))

In [None]:
len(patient)

In [22]:
embeddings_2d = PCA(n_components=2).fit_transform(np.array([i for i in patient_embeddings.values()]))

In [25]:
from sklearn.cluster import KMeans
from ampligraph.discovery import find_clusters

clustering_algorithm = KMeans(n_clusters=6, n_init=50, max_iter=500, random_state=0)
clusters = find_clusters(patients, model, clustering_algorithm, mode='e')

print(len(clusters))
print(len(patients))

1584
1584


In [None]:
plot_df = pd.DataFrame({"patients": patients, 
                        "embedding1": embeddings_2d[:, 0], 
                        "embedding2": embeddings_2d[:, 1],
                        "cluster": "cluster" + pd.Series(clusters).astype(str)})

In [None]:
# Plot 2D embeddings
def plot_clusters(hue):
    plt.figure(figsize=(12, 12))
    plt.title("{} embeddings".format(hue).capitalize())
    ax = sns.scatterplot(data=plot_df,
                         x="embedding1", y="embedding2", hue=hue)
    texts = []
    adjust_text(texts)

In [None]:
plot_clusters("cluster")