In [None]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns

from scipy.special import expit
import tensorflow as tf

from ampligraph.datasets import load_wn18
from ampligraph.latent_features import ScoringBasedEmbeddingModel
from ampligraph.evaluation import mrr_score, hits_at_n_score
from ampligraph.latent_features.loss_functions import get as get_loss
from ampligraph.latent_features.regularizers import get as get_regularizer
from ampligraph.evaluation import train_test_split_no_unseen
from ampligraph.latent_features.models import ScoringBasedEmbeddingModel as model_embedding
from ampligraph.utils import save_model, restore_model

from pykeen.pipeline import pipeline
from pykeen.training import SLCWATrainingLoop
from pykeen.optimizers import Adam
from pykeen.evaluation import RankBasedEvaluator
from pykeen import predict
from pykeen.predict import predict_triples
from pykeen import triples

import os

In [None]:
def build_model(X_train, X_test, model = 'ComplEx', epochs = 100):
    """Train a knowledge-graph embedding model with the PyKEEN pipeline.

    Parameters
    ----------
    X_train
        Training triples, forwarded as ``pipeline(training=...)``.
    X_test
        Testing triples, forwarded as ``pipeline(testing=...)``.
    model
        Model identifier forwarded as ``pipeline(model=...)``; defaults to
        ``'ComplEx'``.
    epochs
        Value passed as ``num_epochs`` in the training kwargs (default 100).

    Returns
    -------
    The ``model`` attribute of the pipeline result.

    Side effects
    ------------
    Plots the training losses and prints the ``mrr``, ``hits@10``, ``hits@3``
    and ``hits@1`` metrics, in that order. Training is configured to
    checkpoint to ``checkpoint_dir/checkpoint.pt``.
    """
    # NOTE(review): the checkpoint name is fixed; confirm that a stale
    # checkpoint from a previous run (other data/model) cannot be picked up.
    training_options = dict(
        num_epochs=epochs,
        checkpoint_name='checkpoint.pt',
        checkpoint_directory='checkpoint_dir/',
        checkpoint_frequency=10,
    )

    # NOTE(review): PyKEEN documents `dimensions` for use together with
    # `interaction`; verify it has an effect when `model` is given, otherwise
    # the embedding size may need to go through `model_kwargs`.
    result = pipeline(
        random_seed=0,
        model=model,
        dimensions=150,
        training=X_train,
        testing=X_test,
        training_kwargs=training_options,
        optimizer='adam',
        optimizer_kwargs={'lr': 1e-3},
        loss='pairwisehinge',
        regularizer='LP',
        regularizer_kwargs={'p': 3, 'weight': 1e-5},
        negative_sampler='basic',
        negative_sampler_kwargs={'filtered': True},
    )

    # Loss curve first, then the headline rank-based metrics.
    result.plot_losses()
    for metric_name in ('mrr', 'hits@10', 'hits@3', 'hits@1'):
        print(result.get_metric(metric_name))

    return result.model

In [None]:
def link_prediction(model, df, cluster, predicate_link, X_training, X_testing, trip):
    """Score every possible ``predicate_link`` triple for the subjects of a cluster.

    For each distinct subject in ``cluster`` and each distinct object that
    occurs with ``predicate_link`` in ``df``, the triple
    ``(subject, predicate_link, object)`` is scored with
    ``predict.predict_triples``. The score is mapped to (0, 1) with the
    logistic function (``scipy.special.expit``).

    Parameters
    ----------
    model
        Trained model passed to ``predict.predict_triples``.
    df
        DataFrame of (subject, predicate, object) rows; its objects for
        ``predicate_link`` define the candidate targets.
    cluster
        DataFrame whose first column holds the subjects to score.
    predicate_link
        Predicate of the triples to score.
    X_training, X_testing
        Unused; kept so existing callers keep working.
    trip
        Triples factory passed as ``triples_factory`` to the predictor.

    Returns
    -------
    df_ranking : DataFrame
        Columns ``statement`` ("subject predicate object"), ``score`` and
        ``prob``, sorted by ascending score.
    df_results : DataFrame
        One row per candidate target with the mean ``score`` and mean
        ``probability`` over the cluster, plus the shortened ``outcome`` name
        (last URI segment, lower-cased; "resolved_with_sequelae" -> "rws").
        Indexed by that shortened name. Means are NaN when the cluster
        contributes no subjects.
    """
    # Candidate targets: distinct objects seen with this predicate
    # (np.unique also sorts them).
    target_objects = np.unique(
        [obj for _, pred, obj in df.values if pred == predicate_link]
    )

    # Distinct subjects in first-seen order, so the candidate list (and hence
    # tie order in the ranking) is the same on every run.
    cluster_subjects = list(dict.fromkeys(row[0] for row in cluster.values))

    candidates = [
        (subj, predicate_link, targ)
        for subj in cluster_subjects
        for targ in target_objects
    ]

    if candidates:
        X_unseen = np.array(candidates)
        pack = predict.predict_triples(model=model, triples=X_unseen, triples_factory=trip)
        # Scores are paired with the candidates by position, i.e. this relies
        # on the predictor returning rows in input order.
        scores = np.asarray(pack.process().df['score'], dtype=float).ravel()
    else:
        # Nothing to score (empty cluster or no known targets).
        scores = np.empty(0, dtype=float)

    ranked = pd.DataFrame({
        'statement': [' '.join(triple) for triple in candidates],
        'score': scores,
        'prob': expit(scores),
        # Keep the target next to its score so rows can be selected by exact
        # equality; substring/regex matching on the statement would mix up
        # targets where one name is a prefix of another.
        'target': [triple[2] for triple in candidates],
    }).sort_values('score')

    records = []
    short_names = []
    for outcome in target_objects:
        rows = ranked[ranked['target'] == outcome]

        # shorten name
        short = (outcome.split('/')[-1]).lower()
        if short == "resolved_with_sequelae":
            short = "rws"

        records.append({
            'score': rows['score'].mean(),
            'probability': rows['prob'].mean(),
            'outcome': short,
        })
        short_names.append(short)

    df_results = pd.DataFrame(
        records,
        columns=['score', 'probability', 'outcome'],
        index=short_names,
    )
    df_ranking = ranked[['statement', 'score', 'prob']]

    return df_ranking, df_results

In [None]:
# Read the raw (subject, predicate, object) triples and clean them in one pass.
colnames = ["subject", "predicate", "object"]
triples_df = (
    pd.read_csv('medsur.csv', names=colnames, header=None)
    .dropna()
    .drop_duplicates()
    # discard rows whose object ends in the literal text 'nan '
    .loc[lambda frame: ~frame['object'].str.endswith('nan ')]
    # strip trailing whitespace from every column
    .assign(
        object=lambda frame: frame['object'].str.rstrip(),
        subject=lambda frame: frame['subject'].str.rstrip(),
        predicate=lambda frame: frame['predicate'].str.rstrip(),
    )
    # stripping can make rows identical, so de-duplicate once more
    .drop_duplicates()
    .dropna()
)

# Plain array of labelled triples, the input format used for PyKEEN below.
t = triples_df.values

In [None]:
# NOTE(review): `test_size` is 10% of the triples but is not used by the split
# below, which holds out 5%. Kept in case other cells read it.
test_size = int(0.1*len(t))

# Build the triples factory from the labelled triples and split it 95% / 5%.
# A fixed random_state makes the split reproducible on Restart & Run All
# (the pipeline itself is already seeded in build_model).
trip = triples.TriplesFactory.from_labeled_triples(t)
X_training, X_testing = trip.split([0.95, 0.05], random_state=0)

print('Train set size: ', X_training.triples.shape)
print('Test set size: ', X_testing.triples.shape)

# Train the embedding model on the split.
model = build_model(X_training, X_testing)

In [None]:
# Group the triples by the age bracket that appears in their object URI.
age_bracket_uris = {
    'gen_z': 'http://www.medsur.org/age/18_24',
    'millenials': 'http://www.medsur.org/age/25_44',
    'gen_x': 'http://www.medsur.org/age/45_64',
    'boomers': 'http://www.medsur.org/age/65_above',
}
age_frames = {
    label: triples_df[triples_df['object'].str.contains(uri)]
    for label, uri in age_bracket_uris.items()
}

df_boomers = age_frames['boomers']
df_gen_x = age_frames['gen_x']
df_millenials = age_frames['millenials']
df_gen_z = age_frames['gen_z']

# `names` and `clusters` are parallel lists, youngest bracket first.
names = list(age_bracket_uris)
clusters = [age_frames[label] for label in names]

In [None]:
# Make sure the output directory exists (no-op when it is already there).
os.makedirs('results', exist_ok=True)

# only select the triples that are relevant for predicting the outcome
predicate_link = 'http://example.org/medsur.rdf#hasOutcome'
df_predicate = pd.DataFrame(t, columns=['subject', 'predicate', 'object'])
df_predicate = df_predicate.loc[df_predicate['predicate'] == predicate_link]

# Score the outcome link for every age cluster and store the results on disk.
for i, cluster in enumerate(clusters):

    # Subjects of this cluster, read straight from the column so that no loop
    # variable rebinds the builtin `object` (or `predicate`) at notebook scope.
    cluster_subjects = cluster['subject'].tolist()

    # outcome triples of the subjects that belong to this cluster
    df_cluster = df_predicate[df_predicate['subject'].isin(cluster_subjects)]

    df_ranking, df_results = link_prediction(model, df_predicate, df_cluster, predicate_link, X_training, X_testing, trip)

    # save ranking and per-outcome means to csv
    df_ranking.to_csv('results/ranking_age_' + names[i] + '.csv', index=False)
    df_results.to_csv('results/results_age_' + names[i] + '.csv', index=False)

df_prob = pd.DataFrame()
df_score = pd.DataFrame()

# Collect the per-cluster means into one column per age group.
for name in names:
    df_results = pd.read_csv('results/results_age_' + name + '.csv')
    df_prob[name] = df_results['probability']
    df_score[name] = df_results['score']

# Rows are labelled with the outcome names of the last file read; every file
# is produced from the same df_predicate, so the outcome rows coincide.
df_prob.index = df_results['outcome']
df_score.index = df_results['outcome']

# only keep the last path segment of the outcome
df_prob.index = [x.split('/')[-1] for x in df_prob.index]
df_score.index = [x.split('/')[-1] for x in df_score.index]

# save the aggregated tables to csv
df_prob.to_csv('results/results_prob_outcome_age.csv', index=True)
df_score.to_csv('results/results_score_outcome_age.csv', index=True)

In [None]:
# Group the triples by the weight class that appears in their object URI.
weight_class_uris = {
    'obese': 'http://www.medsur.org/weight/obese',
    'overweight': 'http://www.medsur.org/weight/overweight',
    'normal': 'http://www.medsur.org/weight/normal',
    'underweight': 'http://www.medsur.org/weight/underweight',
}
df_obese, df_overweight, df_normal, df_underweight = (
    triples_df[triples_df['object'].str.contains(uri)]
    for uri in weight_class_uris.values()
)

# `names` and `clusters` are parallel lists in the same order.
names = list(weight_class_uris)
clusters = [df_obese, df_overweight, df_normal, df_underweight]

In [None]:
# Make sure the output directory exists (no-op when it is already there).
os.makedirs('results', exist_ok=True)

# only select the triples that are relevant for predicting the outcome
predicate_link = 'http://example.org/medsur.rdf#hasOutcome'
df_predicate = pd.DataFrame(t, columns=['subject', 'predicate', 'object'])
df_predicate = df_predicate.loc[df_predicate['predicate'] == predicate_link]

# Score the outcome link for every weight cluster and store the results on disk.
for i, cluster in enumerate(clusters):

    # Subjects of this cluster, read straight from the column so that no loop
    # variable rebinds the builtin `object` (or `predicate`) at notebook scope.
    cluster_subjects = cluster['subject'].tolist()

    # outcome triples of the subjects that belong to this cluster
    df_cluster = df_predicate[df_predicate['subject'].isin(cluster_subjects)]

    df_ranking, df_results = link_prediction(model, df_predicate, df_cluster, predicate_link, X_training, X_testing, trip)

    # save ranking and per-outcome means to csv
    df_ranking.to_csv('results/ranking_weight_' + names[i] + '.csv', index=False)
    df_results.to_csv('results/results_weight_' + names[i] + '.csv', index=False)

df_prob = pd.DataFrame()
df_score = pd.DataFrame()

# Collect the per-cluster means into one column per weight class.
for name in names:
    df_results = pd.read_csv('results/results_weight_' + name + '.csv')
    df_prob[name] = df_results['probability']
    df_score[name] = df_results['score']

# Rows are labelled with the outcome names of the last file read; every file
# is produced from the same df_predicate, so the outcome rows coincide.
df_prob.index = df_results['outcome']
df_score.index = df_results['outcome']

# only keep the last path segment of the outcome
df_prob.index = [x.split('/')[-1] for x in df_prob.index]
df_score.index = [x.split('/')[-1] for x in df_score.index]

# save the aggregated tables to csv
df_prob.to_csv('results/results_prob_outcome_weight.csv', index=True)
df_score.to_csv('results/results_score_outcome_weight.csv', index=True)

In [None]:
# Distinct drugs: every object that occurs with the isGivenDrug predicate.
# A comprehension keeps its loop variables local, so the builtin `object`
# and the notebook-level `predicate` are not rebound here.
drug_predicate = 'http://example.org/medsur.rdf#isGivenDrug'
drugs = np.unique([obj for _, pred, obj in t if pred == drug_predicate])

clusters = []
names = []

# create one cluster of triples per drug
for drug in drugs:
    # regex=False: the drug URI is matched literally, so characters such as
    # '.', '+' or '(' in a drug name are not interpreted as a pattern.
    # NOTE(review): this is still a substring match, so a drug whose URI is a
    # prefix of another drug's URI also selects that drug's rows; confirm
    # whether exact equality is intended.
    cluster = triples_df[triples_df['object'].str.contains(drug, regex=False)]
    clusters.append(cluster)

    # short, lower-cased label taken from the last URI segment
    name = (drug.split('/')[-1]).lower()
    names.append(name)

In [None]:
# Restrict the triples to the hasSOC (symptom) predicate.
predicate = 'http://example.org/medsur.rdf#hasSOC'
df_predicate = pd.DataFrame(t, columns=['subject', 'predicate', 'object'])
df_predicate = df_predicate.loc[df_predicate['predicate'] == predicate]

# Score the symptom link for each drug cluster and write the results to disk.
for i, cluster in enumerate(clusters):
    in_cluster = df_predicate['subject'].isin(cluster['subject'])
    df_cluster = df_predicate.loc[in_cluster]
    df_ranking, df_results = link_prediction(model, df_predicate, df_cluster, predicate, X_training, X_testing, trip)

    # one ranking file and one summary file per drug
    df_ranking.to_csv(f'results/ranking_symptom_{names[i]}.csv', index=False)
    df_results.to_csv(f'results/results_symptom_{names[i]}.csv', index=False)

# Gather the per-drug summaries: one column per drug, one row per symptom.
df_prob = pd.DataFrame()
df_score = pd.DataFrame()

for name in names:
    df_results = pd.read_csv(f'results/results_symptom_{name}.csv')
    df_prob[name], df_score[name] = df_results['probability'], df_results['score']

# Row labels come from the 'outcome' column of the last summary read.
df_prob.index = df_results['outcome']
df_score.index = df_results['outcome']

# Write the aggregated probability and score tables.
df_prob.to_csv('results/results_prob_symptom_drug.csv', index=True)
df_score.to_csv('results/results_score_symptom_drug.csv', index=True)

In [None]:
# Restrict the triples to the suffersSideEffect predicate.
predicate = 'http://example.org/medsur.rdf#suffersSideEffect'
df_predicate = pd.DataFrame(t, columns=['subject', 'predicate', 'object'])
df_predicate = df_predicate.loc[df_predicate['predicate'] == predicate]

# Score the side-effect link for each drug cluster and write the results to disk.
for i, cluster in enumerate(clusters):
    in_cluster = df_predicate['subject'].isin(cluster['subject'])
    df_cluster = df_predicate.loc[in_cluster]
    df_ranking, df_results = link_prediction(model, df_predicate, df_cluster, predicate, X_training, X_testing, trip)

    # one ranking file and one summary file per drug
    df_ranking.to_csv(f'results/ranking_side_effect_{names[i]}.csv', index=False)
    df_results.to_csv(f'results/results_side_effect_{names[i]}.csv', index=False)

# Gather the per-drug summaries: one column per drug, one row per side effect.
df_prob = pd.DataFrame()
df_score = pd.DataFrame()

for name in names:
    df_results = pd.read_csv(f'results/results_side_effect_{name}.csv')
    df_prob[name], df_score[name] = df_results['probability'], df_results['score']

# Row labels come from the 'outcome' column of the last summary read.
df_prob.index = df_results['outcome']
df_score.index = df_results['outcome']

# Keep only the last path segment of each row label.
df_prob.index = [label.split('/')[-1] for label in df_prob.index]
df_score.index = [label.split('/')[-1] for label in df_score.index]

# Write the aggregated probability and score tables.
df_prob.to_csv('results/results_prob_side_effect_drug.csv', index=True)
df_score.to_csv('results/results_score_side_effect_drug.csv', index=True)