Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

import rdflib
from rdflib import Graph, Literal, Namespace, RDF, URIRef, OWL
from rdflib.extras.external_graph_libs import rdflib_to_networkx_digraph
from rdflib.namespace import DC, FOAF

from owlready2 import *
from owlready2 import get_ontology

from sklearn.model_selection import train_test_split

import networkx as nx
import networkx.algorithms.community as nx_comm

from sklearn.decomposition import PCA
import seaborn as sns
from adjustText import adjust_text
%matplotlib inline

# Seed both Python's and NumPy's global random number generators so that any
# later step drawing from either of them is repeatable across kernel restarts.
random.seed(10)
np.random.seed(10)



Loading the data with necessary columns

In [2]:
# Read the spreadsheet that sits next to the notebook into a DataFrame.
data = pd.read_excel(io='opioid_data_merged_min.xlsx')

In [None]:
# Preview only the first rows instead of rendering the whole frame.
data.head()

Some transformations

In [None]:
# Cast the case identifier, age group and PT code columns to strings in one go.
data = data.astype({
    'WorldwideUniqueCaseIdentification': str,
    'age_group': str,
    'PTCode': str,
})

Triple extraction

In [None]:
# Build (subject, predicate, object) triples: eight statements per data row.
triples = []
for _, record in data.iterrows():
    case_id = record['WorldwideUniqueCaseIdentification']
    pt_code = record['PTCode']

    # A flagged row points at its PT code; any other row points at the
    # literal string "None". The explicit comparison with True is kept as is.
    side_effect_target = pt_code if record['is_sideeffect'] == True else "None"

    triples += [
        (case_id, "hasWeightGroup", record['weight_group']),  # weight group
        (case_id, "hasAgeGroup", record['age_group']),        # age group
        (case_id, "hasGender", record['sex']),                # gender
        (case_id, "hasOutcome", record['Outcome']),           # outcome
        (case_id, "hasSymptom", pt_code),                     # symptom
        (case_id, "isGivenDrug", record['ATCode']),           # drug
        (pt_code, "hasFrequency", record['Frequency']),       # symptom frequency
        (case_id, "hasSideEffect", side_effect_target),       # side effect
    ]

This is how they look in a dataframe:

In [None]:
# Tabulate the triples, drop exact repeats, then show the rows of one case.
triples_df = pd.DataFrame(
    triples, columns=["subject", "predicate", "object"]
).drop_duplicates()
triples_df.loc[triples_df["subject"].eq("NL-TEVA-719924ROM")]

Train-test split

In [None]:
from ampligraph.evaluation import train_test_split_no_unseen

# Split the de-duplicated triples. The raw `triples` list holds one copy of
# each patient-level / frequency statement per source row, so splitting it
# directly can put identical triples in both the train and the test set.
# NOTE(review): test_size=7930 was presumably chosen against the raw list;
# re-check it against the number of unique triples.
unique_triples = pd.DataFrame(np.array(triples)).drop_duplicates().to_numpy()
X_train, X_test = train_test_split_no_unseen(unique_triples, test_size=7930)

In [None]:
# Report the shape of each split.
for label, split in (('Train set size: ', X_train), ('Test set size: ', X_test)):
    print(label, split.shape)

We choose ComplEx as our Knowledge Graph Embedding model:

In [None]:
from ampligraph.latent_features.models import ScoringBasedEmbeddingModel
    
# Build the ComplEx embedding model:
#   k            - embedding size
#   eta          - number of corruptions generated per positive triple
#   scoring_type - scoring function used by the embedding model
model = ScoringBasedEmbeddingModel(eta=10, k=150, scoring_type='ComplEx')

Compile model

In [None]:
import tensorflow as tf

from ampligraph.latent_features.loss_functions import get as get_loss
from ampligraph.latent_features.regularizers import get as get_regularizer

# Optimizer, loss and regularizer definition
optim = tf.keras.optimizers.Adam(learning_rate=1e-3)
loss = get_loss('pairwise', {'margin': 0.5})
regularizer = get_regularizer('LP', {'p': 2, 'lambda': 1e-5})

# Compilation of the model. The configured optimizer object is passed (not the
# string 'adam') so that the learning rate set above is the one actually used.
model.compile(loss=loss,
              optimizer=optim,
              entity_relation_regularizer=regularizer,
              entity_relation_initializer='glorot_uniform')

Train model

In [None]:
# Fit the model on the training triples only (no validation data is passed).
# Each batch holds one tenth of the training set; training runs for 200 epochs
# with progress messages enabled.
model.fit(
    X_train,
    epochs=200,
    batch_size=X_train.shape[0] // 10,
    verbose=True,
)

An additional step when evaluating KGEs: Define a filter so that no negative statements generated by the corruption procedure are actually positives.

In [None]:
# All known triples (train stacked on top of test), keyed by 'test'.
# Note: this name shadows Python's builtin `filter`; the evaluation cell reads it.
filter = {'test': np.vstack((X_train, X_test))}

Evaluate model

In [None]:
# Rank every test triple against its subject- and object-side corruptions,
# using the filter defined above.
ranks = model.evaluate(
    X_test,
    corrupt_side='s,o',
    use_filter=filter,
    verbose=True,
)

We used the mr_score (mean rank), mrr_score (mean reciprocal rank) and hits_at_n_score (Hits@N) functions to evaluate the quality of our predictions:

In [None]:
from ampligraph.evaluation import mr_score, mrr_score, hits_at_n_score

# Rank-based summary metrics.
mrr = mrr_score(ranks)
mr = mr_score(ranks)

# Hits@N for the three cut-offs.
hits_10 = hits_at_n_score(ranks, n=10)
hits_3 = hits_at_n_score(ranks, n=3)
hits_1 = hits_at_n_score(ranks, n=1)

# Print in a fixed order: MRR, MR, then Hits@10/3/1.
for template, value in (
    ("MRR: %.2f", mrr),
    ("MR: %.2f", mr),
    ("Hits@10: %.2f", hits_10),
    ("Hits@3: %.2f", hits_3),
    ("Hits@1: %.2f", hits_1),
):
    print(template % value)

In [None]:
# Distinct case identifiers, each mapped to its embedding from the trained model.
patients = data['WorldwideUniqueCaseIdentification'].unique()
patient_embeddings = {
    patient: vector
    for patient, vector in zip(patients, model.get_embeddings(patients))
}

In [None]:
# Project the patient embeddings onto their first two principal components.
embeddings_2d = PCA(n_components=2).fit_transform(
    np.array(list(patient_embeddings.values()))
)

In [None]:
from sklearn.cluster import KMeans
from ampligraph.discovery import find_clusters

# Cluster the patient entities (mode='e') with a fixed-seed 6-cluster KMeans.
clustering_algorithm = KMeans(
    n_clusters=6,
    random_state=0,
    n_init=50,
    max_iter=500,
)
clusters = find_clusters(patients, model, clustering_algorithm, mode='e')

# Print the number of cluster labels, then the number of patients.
for collection in (clusters, patients):
    print(len(collection))

In [None]:
# Cluster -> list of patients dictionary.
# Cluster ids repeat across patients, so using them directly as dict keys in
# dict(zip(...)) would keep only the last patient seen for each cluster.
results = {}
for cluster, patient in zip(clusters, patients):
    results.setdefault(cluster, []).append(patient)

# Write one "cluster:patient" line per patient, grouped by cluster.
with open("clusters.txt", 'w') as f:
    for cluster, members in results.items():
        for patient in members:
            f.write('%s:%s\n' % (cluster, patient))

In [None]:
# One row per patient: its two PCA coordinates and a "cluster<N>" label.
plot_df = pd.DataFrame(dict(
    patients=patients,
    embedding1=embeddings_2d[:, 0],
    embedding2=embeddings_2d[:, 1],
    cluster=pd.Series(clusters).astype(str).radd("cluster"),
))

In [None]:
# Plot 2D embeddings
def plot_clusters(hue):
    """Scatter the 2-D embeddings stored in ``plot_df``, coloured by ``hue``.

    Parameters
    ----------
    hue : str
        Name of the ``plot_df`` column used to colour the points. It is also
        used to build the figure title ("<hue> embeddings", capitalised).

    Returns
    -------
    None
        The figure is drawn on a new 12x12 inch matplotlib figure.
    """
    # Explicit figure/axes instead of relying on pyplot's current-axes state.
    _, ax = plt.subplots(figsize=(12, 12))
    ax.set_title("{} embeddings".format(hue).capitalize())
    sns.scatterplot(data=plot_df,
                    x="embedding1", y="embedding2", hue=hue, ax=ax)
    # No text labels are drawn on the plot, so there is nothing for
    # adjust_text to reposition and no call to it is made here.

In [None]:
# Colour the embedding scatter plot by the KMeans cluster label.
plot_clusters(hue="cluster")