Imports

In [None]:
import csv
import random
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from rdflib import Literal, RDF, URIRef
from rdflib.extras.external_graph_libs import rdflib_to_networkx_digraph
import rdflib.namespace

from owlready2 import *
from owlready2 import get_ontology

import networkx as nx
import networkx.algorithms.community as nx_comm

Loading the original data

In [None]:
# Main dataset; every later cell in this notebook transforms this frame.
data = pd.read_excel('opioids_data_original.xlsx')
# SIDER export; a subset of its columns is left-merged into `data` further down.
side_effects = pd.read_excel("sider_output.xlsx")
# Tab-separated file read without a header row.
# NOTE(review): `frequencies` is not referenced again in this notebook.
frequencies = pd.read_csv('meddra_freq.tsv', sep='\t', header=None)

Distribution of different outcome labels:

In [None]:
# Bar chart of how often each raw outcome label occurs.
data['Outcome'].value_counts().plot.bar()

In [None]:
# Relative frequency of each outcome label, expressed in percent.
data['Outcome'].value_counts(normalize=True) * 100

Changing the 'Outcome' column values

In [None]:
# Short English labels for the verbose outcome descriptions.
outcome_labels = {
    'Outcome niet ingevuld': 'Unknown',
    'Recovered/resolved': 'Recovered',
    'Not recovered/not resolved/ongoing': 'Ongoing',
    'Recovered/resolved with sequelae': 'Sequelae',
    'Recovering/resolving': 'Recovering',
}
data['Outcome'] = data['Outcome'].replace(outcome_labels)

# Same bar chart as before, now with the shortened labels.
data['Outcome'].value_counts().plot.bar();

In [None]:
# Relative frequency of each (relabelled) outcome, expressed in percent.
data['Outcome'].value_counts(normalize=True) * 100

**MISSING VALUES**

In [None]:
# Number of missing values per column.
data.isna().sum()

**DROP COLUMNS**

In [None]:
# Columns that are not used by any later step of this notebook.
unused_columns = [
    'Primary Source Description',
    'Status',
    'Category',
    'OutcomeCodeSystemVersion',
    'OutcomeText',
    'CultureID',
    'date_received',
    'summary',
    'narrative',
    'IsCurrent',
    'IsDefaultSOC',
]
data = data.drop(unused_columns, axis=1)

**REMOVE OUTLIERS**

- Removal of instances with bodyweight == 0 
- Removal of instances with height == 0

In [None]:
# Keep only rows whose body weight and height are both different from 0
# (rows with a missing value in either column are kept, as before).
nonzero_measurements = (data['BodyWeight'] != 0) & (data['Height'] != 0)
data = data[nonzero_measurements]

**CLEANING**

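Combination preparations are mapped onto the ATC code of their single-substance parent drug:

- tramadol with paracetamol: N02AJ13 --> N02AX02
- oxycodone with naloxone: N02AA55 --> N02AA05
- morphine, combination preparations: N02AA51 --> N02AA01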

In [None]:
# Map combination preparations onto their single-substance parent drug.
atc_text_map = {
    'TRAMADOL MET PARACETAMOL': 'TRAMADOL',
    'OXYCODON MET NALOXON': 'OXYCODON',
    'MORFINE, COMBINATIEPREPARATEN': 'MORFINE',
}
# The oxycodone/naloxone code (N02AA55) was missing from the original mapping
# although its text was already being rewritten to 'OXYCODON'; without it the
# code/text pair can never match during the merge on ATCode + ATCText.
atc_code_map = {
    'N02AJ13': 'N02AX02',
    'N02AA55': 'N02AA05',
    'N02AA51': 'N02AA01',
}

# Assign the result back instead of using inplace=True on a column selection:
# the in-place form is chained assignment and is a no-op under copy-on-write.
data['ATCText'] = data['ATCText'].replace(atc_text_map).str.lower()
data['ATCode'] = data['ATCode'].replace(atc_code_map)

**MERGING**

In [None]:
# Left-join the SIDER information onto the reports: a report keeps its row
# even when the drug/reaction combination is not listed in SIDER.
merge_keys = ['ATCode', 'ATCText', 'PTCode']
sider_columns = merge_keys + ['Side effect', 'Frequency']
data = pd.merge(data, side_effects[sider_columns], how='left', on=merge_keys)

The 'reaction_impact' column presumably measures the impact the medicine has had on the patient. As shown before, there is no reaction_impact for fatalities. 
We can impute the missing values by taking the average of a patient with similar features. 

To make this easier, we'll first create a column to bin the ages. We'll also create a BMI column and a weight group column.

In [None]:
age_bin_edges = [18, 24, 44, 64, 90]
bmi_bin_edges = [0, 18.5, 25, 30, 50]
bmi_bin_labels = ['underweight', 'normal', 'overweight', 'obese']

# Age bands, stored as plain strings rather than Interval objects.
# NOTE(review): pd.cut intervals are right-closed by default, so an age of
# exactly 18 and ages above 90 fall outside every bin and end up as 'nan'.
data['age_group'] = pd.cut(x=data['age_year'], bins=age_bin_edges).astype(str)

# BMI = weight / height^2, with Height divided by 100 first
# (presumably cm -> m; confirm the unit of the Height column).
height_scaled = data['Height'] / 100
data['BMI'] = data['BodyWeight'] / height_scaled**2
data['weight_group'] = pd.cut(x=data['BMI'], bins=bmi_bin_edges, labels=bmi_bin_labels)

# Remove spaces from the case identifier and store the PT code as text.
case_ids = data['WorldwideUniqueCaseIdentification']
data['WorldwideUniqueCaseIdentification'] = case_ids.str.replace(" ", "")
data['PTCode'] = data['PTCode'].astype(str)

In [None]:
# Scale the frequency by 100, lift exact zeros to a small positive value and
# only then fill missing entries with 0.0, so that 0.0 afterwards means
# "no frequency available" rather than "frequency of zero".
data['Frequency'] = (
    data['Frequency']
    .mul(100)
    .replace(0.0, 0.0001)
    .fillna(0.0)
)

# True when the merge found a matching 'Side effect' entry for the row.
data['is_sideeffect'] = ~data['Side effect'].isna()

In [None]:
# reset_index returns a new frame; the original call discarded that result,
# so the index was never actually reset. Assign it back.
data = data.reset_index(drop=True)
data.columns

In [None]:
# Every distinct space-separated token that occurs in GenericDrugName.
gdn = list(data['GenericDrugName'].str.split(' ', expand=True).stack().unique())

# Dosage-form keywords to look for in the drug name. This list used to be
# called `type`, which shadowed the builtin for the rest of the notebook.
DOSAGE_FORMS = ['CAPSULE', 'NEUSSPRAY', 'TABLET', 'PLEISTER', 'INJVLST', 'ZETPIL', 'DRANK', 'SPRAY', 'ZUIGTABLET', 'BRUISTABLET', 'INJ/INFOPL', 'INFVLST', 'DRUPPELS', 'SMELTTABLET', 'INJECTIE/INFUUS', 'DISPERTABLET', 'TAB', 'INJECTIEPOEDER']


def has_numbers(inputString):
    """Return True if `inputString` contains at least one digit.

    The argument is converted with str() first, so a non-string token does
    not raise a TypeError.
    """
    return bool(re.search(r'\d', str(inputString)))


# Tokens containing a digit are treated as dosage information.
dosage = [token for token in gdn if has_numbers(token)]

# Sets give constant-time membership tests inside the per-row lambdas.
_dosage_form_lookup = set(DOSAGE_FORMS)
_dosage_lookup = set(dosage)

# Matching tokens are concatenated without a separator, as before.
data['Type'] = data['GenericDrugName'].apply(
    lambda x: ''.join([k for k in str(x).split() if k in _dosage_form_lookup])
)
data['Dosage'] = data['GenericDrugName'].apply(
    lambda x: ''.join([k for k in str(x).split() if k in _dosage_lookup])
)

In [None]:
# Independent working copy of the SIDER table already loaded as `side_effects`
# from "sider_output.xlsx"; copying avoids parsing the same Excel file twice.
data_sideeffects = side_effects.copy()

In [None]:
# Display the frame after merging and feature extraction.
data

In [None]:
# Number of missing values per column, now including the derived columns.
data.isna().sum()

In [None]:
# Regex fragments, one per character that is removed from every text cell.
# Raw strings keep the same values as before without relying on invalid
# string escapes such as "\+".
symbols1 = [r"-", r"_", r"\+", r"\?", r"%", r"\*", r"\.", r"\,", r"\:", r"\;", r"\!", r"\@", r"\#", r"\$", r"\^", r"\&", r"\(", r"\)", r"\{", r"\}", r"\[", r"\]", r"\|", r"\/", r"\~", r"\`", r"\=", r"\<", r"\>", r" "]
# Spellings that mark a missing value (duplicate entries removed).
symbols2 = ["NAN", "NaN", "None", "NaT", "NAT", "nat", "n/a", "N/A", "n.a.", "N.A.", " "]

# One alternation removes all characters in a single pass per frame instead of
# one full-frame pass per character; the result is the same.
strip_pattern = "|".join(symbols1)

# The tokens are regex-escaped (an unescaped "n.a." also matches e.g. "near")
# and anchored, so only a cell that consists entirely of a null token matches.
null_pattern = r"^\s*(?:" + "|".join(re.escape(token) for token in symbols2) + r")\s*$"


def clean_text_cells(frame):
    """Return a copy of `frame` with null tokens set to NaN and symbols removed.

    With regex=True and a non-string replacement, pandas replaces the WHOLE
    cell when the pattern matches, so a matching cell becomes NaN. Null tokens
    are checked before the symbols are stripped (otherwise "n/a" and "n.a."
    have already lost their punctuation and can never match) and once more
    afterwards (to catch e.g. "NaN." once the trailing dot is gone).
    Only string cells are affected; numeric columns pass through unchanged.
    """
    return (
        frame
        .replace(null_pattern, np.nan, regex=True)
        .replace(strip_pattern, "", regex=True)
        .replace(null_pattern, np.nan, regex=True)
    )


data = clean_text_cells(data)
data_sideeffects = clean_text_cells(data_sideeffects)

data_sideeffects

In [None]:
# Renumber the rows 0..n-1 and discard the old index values.
data = data.reset_index(drop=True)
data

In [None]:
# Number of missing values per column after the symbol clean-up.
data.isna().sum()

In [None]:
# Write the cleaned, merged frame to disk (the row index is written as well,
# since to_excel is called without index=False).
data.to_excel("opioid_datamerged.xlsx") 

In [None]:
# Ontology schema: each class below becomes an OWL class and each property an
# OWL property in `onto`. owlready2 derives the entity IRIs from this base IRI.
onto = get_ontology("http://example.org/medsur.owl")


# --- Patient and the groups a patient can belong to -------------------------

class Patients(Thing):
    namespace = onto


class AgeGroup(Thing):
    namespace = onto


class hasAgeGroup(ObjectProperty):
    domain = [Patients]
    range = [AgeGroup]
    namespace = onto


class WeightGroup(Thing):
    namespace = onto


class hasWeightGroup(ObjectProperty):
    domain = [Patients]
    range = [WeightGroup]
    namespace = onto


class Symptoms(Thing):
    namespace = onto


class hasSymptom(ObjectProperty):
    domain = [Patients]
    range = [Symptoms]
    namespace = onto


class Outcome(Thing):
    namespace = onto


class hasOutcome(ObjectProperty):
    domain = [Patients]
    range = [Outcome]
    namespace = onto


class Gender(Thing):
    namespace = onto


class hasGender(ObjectProperty):
    domain = [Patients]
    range = [Gender]
    namespace = onto


# --- Drugs -------------------------------------------------------------------

class Drug(Thing):
    namespace = onto


class IsGivenDrug(ObjectProperty):
    domain = [Patients]
    range = [Drug]
    namespace = onto


# NOTE(review): IsOfDosis and IsOfType declare no range and are not populated
# anywhere in this notebook.
class IsOfDosis(ObjectProperty):
    domain = [Drug]
    namespace = onto


class IsOfType(ObjectProperty):
    domain = [Drug]
    namespace = onto


# --- Side effects ------------------------------------------------------------

class SideEffects(Thing):
    namespace = onto


class hasSideEffect(ObjectProperty):
    domain = [Drug]
    range = [SideEffects]
    namespace = onto


# The frequency is attached to a side effect as a literal value, so this is a
# data property; an object property may only point at individuals.
class hasFrequency(DataProperty):
    domain = [SideEffects]
    namespace = onto


class suffersSideEffects(ObjectProperty):
    domain = [Patients]
    range = [SideEffects]
    namespace = onto

In [None]:
# Serialise the owlready2 ontology to RDF/XML and load it into an rdflib graph.
onto.save(file="medsur.rdf", format="rdfxml")
g = rdflib.Graph()
g.parse("medsur.rdf", format="xml")

# Check that there is at least one triple in the graph. The previous loop
# tested whether each triple yielded by `g` is in `g`, which can never fail.
if len(g) == 0:
    raise RuntimeError("medsur.rdf was parsed but the graph contains no triples")

# Print the number of "triples" in the Graph
print(f"Graph g has {len(g)} statements.")

In [None]:
# Dump every (subject, predicate, object) statement currently in the graph,
# one statement per line with the three terms separated by spaces.
for triple in g:
    print(*triple)

In [None]:
# add RDF triples to the ontology

# Must be the namespace of the ontology entities: owlready2 appends '#' to the
# IRI given to get_ontology("http://example.org/medsur.owl"). The previous
# value ("...medsur.rdf#") pointed at classes and properties that do not exist.
EX = rdflib.Namespace("http://example.org/medsur.owl#")

# (lower bound in years, URI suffix), checked from oldest to youngest.
AGE_BANDS = ((65, "65_above"), (45, "45_64"), (25, "25_44"), (18, "18_24"))


def _has_value(value):
    """Return True unless `value` is NaN/None, an empty string or the text 'nan'.

    `x != np.nan` is always True, so it cannot be used to detect missing
    values. The textual 'nan' is covered because PTCode was cast with
    astype(str), which turns a missing code into the string 'nan'.
    """
    if pd.isna(value):
        return False
    return str(value).strip().lower() not in ("", "nan")


def _age_group_uri(age):
    """Return the age-band URI for `age`, or None if under 18 or not a number."""
    try:
        years = float(age)
    except (TypeError, ValueError):
        return None
    for lower_bound, label in AGE_BANDS:
        if years >= lower_bound:
            return URIRef(f"http://www.medsur.org/age/{label}")
    return None


# Group the SIDER rows by drug once instead of filtering the whole table again
# for every patient row.
side_effects_by_drug = {
    atc_code: rows for atc_code, rows in data_sideeffects.groupby("ATCode")
}

for _, row in data.iterrows():

    patient = URIRef(f"http://www.medsur.org/patient_{row['WorldwideUniqueCaseIdentification']}")
    g.add((patient, RDF.type, EX.Patients))

    # Skip rows whose BMI fell outside the bins; otherwise a ".../weight/nan"
    # node would be created.
    if _has_value(row["weight_group"]):
        weight_group = URIRef(f"http://www.medsur.org/weight/{row['weight_group']}")
        g.add((weight_group, RDF.type, EX.WeightGroup))
        g.add((patient, EX.hasWeightGroup, weight_group))

    # Computed fresh for every row, so a patient without a valid age no longer
    # inherits the age group of the previous row.
    agegroup = _age_group_uri(row["age_year"])
    if agegroup is not None:
        g.add((agegroup, RDF.type, EX.AgeGroup))
        g.add((patient, EX.hasAgeGroup, agegroup))

    if row["sex"] in ("male", "female"):
        gender = URIRef(f"http://www.medsur.org/gender/{row['sex']}")
        g.add((gender, RDF.type, EX.Gender))
        g.add((patient, EX.hasGender, gender))

    # Both conditions must hold; the previous `or` made the test always True.
    if _has_value(row["Outcome"]) and row["Outcome"] != "Unknown":
        outcome = URIRef(f"http://www.medsur.org/outcome/{row['Outcome']}")
        g.add((outcome, RDF.type, EX.Outcome))
        g.add((patient, EX.hasOutcome, outcome))

    if _has_value(row["PTCode"]):
        symptom = URIRef(f"http://www.medsur.org/symptom/{row['PTCode']}")
        g.add((symptom, RDF.type, EX.Symptoms))
        g.add((patient, EX.hasSymptom, symptom))

    # Without a drug code there is no drug node to attach side effects to.
    if not _has_value(row["ATCode"]):
        continue

    drug = URIRef(f"http://www.medsur.org/drug/{row['ATCode']}")
    g.add((drug, RDF.type, EX.Drug))
    # Property names match the ontology declarations (IsGivenDrug,
    # suffersSideEffects); the previous spellings referred to undeclared ones.
    g.add((patient, EX.IsGivenDrug, drug))

    known_effects = side_effects_by_drug.get(row["ATCode"])
    if known_effects is None:
        continue

    patient_has_side_effect = bool(row["is_sideeffect"])

    for _, effect in known_effects.iterrows():

        if not _has_value(effect["PTCode"]):
            continue

        side_effect = URIRef(f"http://www.medsur.org/side_effect/{effect['PTCode']}")
        g.add((side_effect, RDF.type, EX.SideEffects))
        g.add((drug, EX.hasSideEffect, side_effect))

        # NOTE(review): as before, a patient flagged with is_sideeffect is
        # linked to EVERY known side effect of the drug, not only to the one
        # matching the reported PTCode. Confirm that this is intended.
        if patient_has_side_effect:
            g.add((patient, EX.suffersSideEffects, side_effect))

        if _has_value(effect["Frequency"]):
            g.add((side_effect, EX.hasFrequency, Literal(effect["Frequency"])))

In [None]:
# Print a sample of the triples in the graph. After the patient data has been
# added the graph is large, and printing every statement floods the notebook.
MAX_PRINTED_TRIPLES = 50

for position, (s, p, o) in enumerate(g):
    if position >= MAX_PRINTED_TRIPLES:
        print(f"... {len(g) - MAX_PRINTED_TRIPLES} more triples not shown")
        break
    print(s, p, o)

In [None]:
# Convert the RDF graph into a directed networkx graph and report basic
# structural statistics.
nx_graph = rdflib_to_networkx_digraph(g)

print(f"Number of Nodes: {nx_graph.number_of_nodes()}")
print(f"Number of Edges: {nx_graph.number_of_edges()}")
print(f"Density of Graph: {nx.density(nx_graph)}")
print(f"Clustering coefficient: {nx.average_clustering(nx_graph)}")
print("Degree centrality:", nx.degree_centrality(nx_graph))

In [None]:
# Degree distribution: row i of `histdegree` holds the number of nodes whose
# degree is i.
histdegree = pd.DataFrame(nx.degree_histogram(nx_graph))
degree = dict(nx_graph.degree())

# Summary values shown in the figure title.
mean_degree = np.mean(list(degree.values()))
centrality_values = list(nx.degree_centrality(nx_graph).values())
mean_degree_centrality = np.mean(centrality_values)

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(histdegree.index.values, histdegree[0])

plt.title(f"Mean Degree: {mean_degree}\n Mean Degree Centrality: {mean_degree_centrality}")
plt.show()

In [None]:
# save the triples in a csv file
# csv.writer quotes any term that contains a comma, quote or newline, so each
# row always parses back into exactly three fields. The hand-built f-string
# did no quoting and left a trailing space in the object column.
with open('medsur.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f, lineterminator='\n')
    for s, p, o in g:
        writer.writerow([s, p, o])