In [1]:
import pandas as pd
import random
from rdflib import Graph
from rdflib import URIRef
from rdflib.namespace import RDF

In [2]:
path = ''
#path = 'dataset/small_dataset/'
g1 = Graph()
g1.parse(path + "G1.ttl", format="ttl")

g2 = Graph()
g2.parse(path + "G2.ttl", format="ttl")

<Graph identifier=N70ccfa04ddad4aa083a5566eb06ddd81 (<class 'rdflib.graph.Graph'>)>

In [3]:
from rdflib.plugins.sparql.processor import SPARQLResult

def sparql_results_to_df(results: SPARQLResult) -> pd.DataFrame:
    """
    Export results from an rdflib SPARQL query into a `pandas.DataFrame`,
    using Python types. See https://github.com/RDFLib/rdflib/issues/1179.
    """
    return pd.DataFrame(
        data=([None if x is None else x.toPython() for x in row] for row in results),
        columns=[str(x) for x in results.vars],
    )

In [4]:
query = """
select ?s ?p ?o 
    where {
        ?s ?p ?o
    }
"""

qres = g1.query(query)
graph_1 = sparql_results_to_df(qres)
graph_1

Unnamed: 0,s,p,o
0,http://example/DrugDrugInteraction#treatment15...,http://www.w3.org/1999/02/22-rdf-syntax-ns#ddi...,http://example/#metabolism_increase
1,http://example/Treatment_Drug#treatment4_DB00316,http://example/#hasHighToxicity,http://example/#higher_toxicity
2,http://example/Treatment/treatment41,http://example/#related_to,http://example/DrugDrugInteraction#treatment41...
3,http://example/Treatment_Drug#treatment576_DB0...,http://example/#related_to,http://example/Treatment/treatment576
4,http://example/Treatment_Drug#treatment75_DB00641,http://example/#related_to,http://example/Drug/DB00641
...,...,...,...
17024,http://example/DrugDrugInteraction#treatment54...,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://example/#DDI
17025,http://example/Treatment_Drug#treatment22_DB01229,http://example/#hasHighToxicity,http://example/#higher_toxicity
17026,http://example/Treatment_Drug#treatment344_DB0...,http://example/#hasLowerEffect,http://example/#lower_effectiveness
17027,http://example/Treatment_Drug#treatment92_DB00541,http://example/#related_to,http://example/Treatment/treatment92


In [5]:
def get_treatment_class(graph, p, o):
    query = """PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX ex: <http://example/#> 
    PREFIX treatment_drug: <http://example/Treatment_Drug#>

    select distinct ?treatment
    where {
        ?treatment rdf:type ex:Treatment .
        ?treatment """ + p + """ """ + o + """
        }
        """
    qres = graph.query(query)
    df_cls = sparql_results_to_df(qres)
    df_cls['treatment'] = '<' + df_cls['treatment'].astype(str) + '>'
    return df_cls

In [6]:
g1_effectiveness = get_treatment_class(g1, 'ex:hasClassificationEffect', 'ex:decrease_effectiveness')
g1_toxicity = get_treatment_class(g1, 'ex:hasClassificationToxicity', 'ex:increase_toxicity')
display(g1_effectiveness.shape, g1_effectiveness.head(2), g1_toxicity.shape, g1_toxicity.head(2))

(1500, 1)

Unnamed: 0,treatment
0,<http://example/Treatment/treatment4564>
1,<http://example/Treatment/treatment5864>


(1500, 1)

Unnamed: 0,treatment
0,<http://example/Treatment/treatment3570>
1,<http://example/Treatment/treatment3032>


In [7]:
g1_category = pd.concat([g1_toxicity, g1_effectiveness])
g1_category.reset_index(inplace=True)
g1_category.drop(columns=['index'], inplace=True)
display(g1_category.shape, g1_category.head(2))

(3000, 1)

Unnamed: 0,treatment
0,<http://example/Treatment/treatment3570>
1,<http://example/Treatment/treatment3032>


In [8]:
g2_effectiveness = get_treatment_class(g2, 'ex:hasClassificationEffect', 'ex:decrease_effectiveness')
g2_toxicity = get_treatment_class(g2, 'ex:hasClassificationToxicity', 'ex:increase_toxicity')
display(g2_effectiveness.shape, g2_effectiveness.head(2), g2_toxicity.shape, g2_toxicity.head(2))

(3500, 1)

Unnamed: 0,treatment
0,<http://example/Treatment/treatment8083>
1,<http://example/Treatment/treatment8176>


(3500, 1)

Unnamed: 0,treatment
0,<http://example/Treatment/treatment4293>
1,<http://example/Treatment/treatment6531>


In [9]:
g2_category = pd.concat([g2_toxicity, g2_effectiveness])
g2_category.reset_index(inplace=True)
g2_category.drop(columns=['index'], inplace=True)
display(g2_category.shape, g2_category.head(2))

(7000, 1)

Unnamed: 0,treatment
0,<http://example/Treatment/treatment4293>
1,<http://example/Treatment/treatment6531>


In [10]:
new_class_to_add = g2_category.shape[0] - g1_category.shape[0]
new_class_to_add

4000

In [11]:
query = """PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX ex: <http://example/#> 
PREFIX treatment_drug: <http://example/Treatment_Drug#>
    
select distinct ?treatment
where {
    ?treatment rdf:type ex:Treatment .
    }
    """

qres = g1.query(query)
#df_toxicity = pd.DataFrame(qres.bindings)
treatment_list = sparql_results_to_df(qres)
treatment_list['treatment'] = '<' + treatment_list['treatment'].astype(str) + '>'
display(treatment_list.head(2), treatment_list.shape)

Unnamed: 0,treatment
0,<http://example/Treatment/treatment6683>
1,<http://example/Treatment/treatment3177>


(10000, 1)

In [12]:
safe_treatment = (treatment_list.merge(g2_category, on=['treatment'], how='left', indicator=True)
 .query('_merge == "left_only"').drop('_merge', 1))
safe_treatment

Unnamed: 0,treatment
2,<http://example/Treatment/treatment1608>
23,<http://example/Treatment/treatment2969>
25,<http://example/Treatment/treatment1424>
33,<http://example/Treatment/treatment2426>
37,<http://example/Treatment/treatment2959>
...,...
9987,<http://example/Treatment/treatment1624>
9989,<http://example/Treatment/treatment2697>
9993,<http://example/Treatment/treatment744>
9994,<http://example/Treatment/treatment1862>


In [13]:
def get_drugs_in_treatment(graph, treatment):
    query = """PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX ex: <http://example/#> 
    PREFIX treatment_drug: <http://example/Treatment_Drug#>

    select distinct ?drug
    where {
        ?drug rdf:type ex:Drug .
        ?treatment_drug rdf:type ex:Treatment_Drug .
        ?treatment_drug ex:related_to """ + treatment + """ .
        ?treatment_drug ex:related_to ?drug .
        }
        """
    qres = graph.query(query)
    df_cls = sparql_results_to_df(qres)
    df_cls['drug'] = '<' + df_cls['drug'].astype(str) + '>'
    return df_cls

def get_treatment_drug(graph, treatment):
    query = """PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX ex: <http://example/#> 
    PREFIX treatment_drug: <http://example/Treatment_Drug#>

    select distinct ?treatment_drug
    where {
        ?treatment_drug rdf:type ex:Treatment_Drug .
        ?treatment_drug ex:related_to """ + treatment + """ .
        }
        """
    qres = graph.query(query)
    df_cls = sparql_results_to_df(qres)
    #df_cls['treatment_drug'] = df_cls['treatment_drug'].str.replace('http://example/Treatment_Drug#','treatment_drug:')
    return df_cls

def select_d_d_treat_node(graph):
    query = """PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX ex: <http://example/#>
    PREFIX treatment_drug: <http://example/Treatment_Drug#>

    select distinct ?ddi_t
    where {
        ?ddi_t rdf:type ex:DDI
        }
        """
    qres = graph.query(query)
    df_cls = sparql_results_to_df(qres)
    #df_cls['ddi_t'] = df_cls.ddi_t.str.replace('http://example/DrugDrugInteraction#', 'ddi:')
    df_cls.replace('http://example/DrugDrugInteraction#', 'ddi:', regex=True, inplace=True)
    return df_cls

In [14]:
drug_drug_treatm_list = select_d_d_treat_node(g1)
drug_drug_treatm_list

Unnamed: 0,ddi_t
0,ddi:treatment8761C0603520C1678805
1,ddi:treatment8189C0035608C0913506
2,ddi:treatment4870C0032821C0039607
3,ddi:treatment643C4691336C0262964
4,ddi:treatment6162C0068483C0009010
...,...
150238,ddi:treatment5014C0015052C0030883
150239,ddi:treatment9453C1122087C0030883
150240,ddi:treatment9715C0056686C2736451
150241,ddi:treatment5204C0012228C0251504


In [15]:
drug_dict = dict()
for index, row in treatment_list.iterrows():
    key = row['treatment']
    df_cls = get_drugs_in_treatment(g1, key)
    drug_dict[key] = list(df_cls.drug)

In [16]:
drug_dict

{'<http://example/Treatment/treatment6683>': ['<http://example/Drug/C0282085>',
  '<http://example/Drug/C0071772>',
  '<http://example/Drug/C0076275>',
  '<http://example/Drug/C0065656>',
  '<http://example/Drug/C0012383>',
  '<http://example/Drug/C0000477>',
  '<http://example/Drug/C0066673>',
  '<http://example/Drug/C1172734>',
  '<http://example/Drug/C0772501>',
  '<http://example/Drug/C1176316>'],
 '<http://example/Treatment/treatment3177>': ['<http://example/Drug/C0002598>',
  '<http://example/Drug/C1958126>',
  '<http://example/Drug/C4519101>',
  '<http://example/Drug/C0013085>',
  '<http://example/Drug/C0009010>',
  '<http://example/Drug/C0072247>',
  '<http://example/Drug/C0041009>',
  '<http://example/Drug/C0092777>',
  '<http://example/Drug/C0070217>',
  '<http://example/Drug/C0060201>'],
 '<http://example/Treatment/treatment1608>': ['<http://example/Drug/C0164662>',
  '<http://example/Drug/C0010590>',
  '<http://example/Drug/C2987430>',
  '<http://example/Drug/C0031849>',
  

In [17]:
treat_drug_dict = dict()
for index, row in treatment_list.iterrows():
    key = row['treatment']
    df_cls = get_treatment_drug(g1, key)
    
    treat_drug_dict[key] = list(df_cls.treatment_drug)

In [18]:
treat_drug_dict

{'<http://example/Treatment/treatment6683>': ['http://example/Treatment_Drug#treatment6683_C0282085',
  'http://example/Treatment_Drug#treatment6683_C0071772',
  'http://example/Treatment_Drug#treatment6683_C0076275',
  'http://example/Treatment_Drug#treatment6683_C0065656',
  'http://example/Treatment_Drug#treatment6683_C0012383',
  'http://example/Treatment_Drug#treatment6683_C0000477',
  'http://example/Treatment_Drug#treatment6683_C0066673',
  'http://example/Treatment_Drug#treatment6683_C1172734',
  'http://example/Treatment_Drug#treatment6683_C0772501',
  'http://example/Treatment_Drug#treatment6683_C1176316'],
 '<http://example/Treatment/treatment3177>': ['http://example/Treatment_Drug#treatment3177_C0002598',
  'http://example/Treatment_Drug#treatment3177_C1958126',
  'http://example/Treatment_Drug#treatment3177_C4519101',
  'http://example/Treatment_Drug#treatment3177_C0013085',
  'http://example/Treatment_Drug#treatment3177_C0009010',
  'http://example/Treatment_Drug#treatmen

In [19]:
num_new_links = len(g2) - len(g1)
num_new_links

677250

In [20]:
def get_drugs_in_treatment(graph, treatment):
    query = """PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX ex: <http://example/#> 
    PREFIX treatment_drug: <http://example/Treatment_Drug#>

    select distinct ?drug
    where {
        #?treatment rdf:type ex:Treatment .
        ?drug rdf:type ex:Drug .
        ?treatment_drug rdf:type ex:Treatment_Drug .
        ?treatment_drug ex:related_to """ + treatment + """ .
        ?treatment_drug ex:related_to ?drug .
        }
        """
    qres = graph.query(query)
    df_cls = sparql_results_to_df(qres)
    df_cls['drug'] = '<' + df_cls['drug'].astype(str) + '>'
    return df_cls


def isin_graph(graph, s, p, o):
    s = URIRef(s)
    p = URIRef(p)
    o = URIRef(o)

    if (s, p, o) in graph:
        return True
    return False

def random_pred_obj():
    random_predicate = random.choice(['Effect', 'Toxicity'])
    p = 'http://example/#hasHighToxicity'
    o = 'http://example/#higher_toxicity'
    if random_predicate == 'Effect':
        p = 'http://example/#hasLowerEffect'
        o = 'http://example/#lower_effectiveness'
    return p, o

def random_DDI(drugs, random_treat):
    d1 = random.choice(drugs)
    d2 = random.choice(drugs)
    id1 = random_treat.replace('<http://example/Treatment/', '').replace('>', '')
    id2 = d1.replace('<http://example/Drug/', '').replace('>', '')
    id3 = d2.replace('<http://example/Drug/', '').replace('>', '')
    s = 'http://example/DrugDrugInteraction#'+id1+id2+id3
    p = RDF.type
    o = 'http://example/#DDI'
    return s, p, o, d1, d2

def random_class():
    random_predicate = random.choice(['Effect', 'Toxicity'])
    p = 'ex:hasClassificationToxicity'
    o = 'ex:increase_toxicity'
    if random_predicate == 'Effect':
        p = 'ex:hasClassificationEffect'
        o = 'ex:decrease_effectiveness'
    return p, o

def create_random_classification(safe_treatment):
    random_treat = random.choice(safe_treatment.treatment.values)
    p, o = random_class()
    return random_treat + "\t" + p + "\t" + o + """ .\n"""


def select_treatment_drug_node(treatment_list, treat_drug_dict):
    random_treat = random.choice(treatment_list.treatment)
    treatment_drug = treat_drug_dict[random_treat]
    return random.choice(treatment_drug)

def select_drug_node(treatment_list, drug_dict):
    random_treat = random.choice(treatment_list.treatment)
    drug_list = drug_dict[random_treat]
    return random.choice(drug_list)


def save_ttl_file(graph_ttl, name):
    with open(name, 'a') as file:
        file.write(graph_ttl)

In [72]:
case_1 = 10 * 2 # num_drug in treatment * 2 (tox or effect)
case_2 = 10 * 9 / 2 #add DDI in a treatment (num_drug in treatment)
case_3 = treatment_list.shape[0] -1 #* (treatment_list.shape[0] - 1) /2  # (add relation bettween treatments)
case_4 = 10 * 9 / 2 #add relation between drugs in a treatment
case_5 = 1  # (add a class for a treatment)
total = int(case_1 + case_2 + case_3 + case_4 + case_5)

In [21]:
# ===** case_3 and case_4 do not exist in the original graph **===
case_1 = 25   # add relation between two treatment_drug nodes in the graph.
case_2 = 25   # add relation between two drugs in the graph.
case_3 = 25  # add relation bettween treatments. (treatment_list.shape[0] /2)
case_4 = 25   # add relation bettween two DrugDrugTreatment node in the graph
case_5 = 2   # add a class for a treatment. (toxicity or effectivenness)
total = int(case_1 + case_2 + case_3 + case_4 + case_5)

In [22]:
#g3 = pd.DataFrame(columns=['subject', 'predicate', 'object'])
graph_ttl = """"""
list_random_classification = []
n_triples = 0

treatment_to_select_eff = g1_effectiveness.treatment
treatment_to_select_tox = g1_toxicity.treatment
    
while True:
    
    r = random.choice(list(range(1, total)))
    
    if r <= case_1:
        treatment_drug_a = select_treatment_drug_node(treatment_list, treat_drug_dict)
        treatment_drug_b = select_treatment_drug_node(treatment_list, treat_drug_dict)
        
        s = treatment_drug_a.replace('http://example/Treatment_Drug#','treatment_drug:')
        o = treatment_drug_a.replace('http://example/Treatment_Drug#','treatment_drug:')
        graph_ttl = graph_ttl + s + "\tex:related_to\t" + o + """ .\n"""
        graph_ttl = graph_ttl + o + "\tex:related_to\t" + s + """ .\n"""
        n_triples+=2
        
    elif r <= case_1 + case_2:
        d1 = select_drug_node(treatment_list, drug_dict)
        d2 = select_drug_node(treatment_list, drug_dict)
        
        graph_ttl = graph_ttl + d1 + "\tex:related_to\t" + d2 + """ .\n"""
        graph_ttl = graph_ttl + d2 + "\tex:related_to\t" + d1 + """ .\n"""
        n_triples+=2
    
    elif r < case_1 + case_2 + case_3:
        random_treat = random.choice(treatment_list.treatment)
        random_treat_2 = random.choice(treatment_list.treatment)
        graph_ttl = graph_ttl + random_treat + "\tex:related_to\t" + random_treat_2 + """ .\n"""
        graph_ttl = graph_ttl + random_treat_2 + "\tex:related_to\t" + random_treat + """ .\n"""
        n_triples+=2
    
    elif r < case_1 + case_2 + case_3 + case_4:
        ddt_a = random.choice(drug_drug_treatm_list.ddi_t)
        ddt_b = random.choice(drug_drug_treatm_list.ddi_t)
        graph_ttl = graph_ttl + ddt_a + "\tex:related_to\t" + ddt_b + """ .\n"""
        graph_ttl = graph_ttl + ddt_b + "\tex:related_to\t" + ddt_a + """ .\n"""
        n_triples+=2
    
    elif len(list_random_classification) < new_class_to_add:
        c = create_random_classification(safe_treatment)
        while c in list_random_classification:
            c = create_random_classification(safe_treatment)
        list_random_classification.append(c)
        n_triples+=1
          
    if n_triples > (num_new_links):
        break

In [23]:
triple_text = ''.join([t for t in list_random_classification])
triple_text

'<http://example/Treatment/treatment991>\tex:hasClassificationEffect\tex:decrease_effectiveness .\n<http://example/Treatment/treatment256>\tex:hasClassificationToxicity\tex:increase_toxicity .\n<http://example/Treatment/treatment941>\tex:hasClassificationToxicity\tex:increase_toxicity .\n<http://example/Treatment/treatment620>\tex:hasClassificationToxicity\tex:increase_toxicity .\n<http://example/Treatment/treatment2674>\tex:hasClassificationEffect\tex:decrease_effectiveness .\n<http://example/Treatment/treatment1047>\tex:hasClassificationEffect\tex:decrease_effectiveness .\n<http://example/Treatment/treatment2568>\tex:hasClassificationToxicity\tex:increase_toxicity .\n<http://example/Treatment/treatment2698>\tex:hasClassificationToxicity\tex:increase_toxicity .\n<http://example/Treatment/treatment2527>\tex:hasClassificationEffect\tex:decrease_effectiveness .\n<http://example/Treatment/treatment1877>\tex:hasClassificationEffect\tex:decrease_effectiveness .\n<http://example/Treatment/tr

In [24]:
graph_ttl = graph_ttl + triple_text

In [25]:
save_ttl_file(graph_ttl, 'g3_random.ttl')

In [75]:
print(graph_ttl)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

