In [1]:
import pandas as pd
import random
from rdflib import Graph
from rdflib import URIRef
from rdflib.namespace import RDF

In [2]:
path = ''
#path = 'dataset/small_dataset/'
g1 = Graph()
g1.parse(path + "G1.ttl", format="ttl")

g2 = Graph()
g2.parse(path + "G2.ttl", format="ttl")

<Graph identifier=Ne5c5ee2e50bd423580f796109aa9448a (<class 'rdflib.graph.Graph'>)>

In [3]:
from rdflib.plugins.sparql.processor import SPARQLResult

def sparql_results_to_df(results: SPARQLResult) -> pd.DataFrame:
    """
    Export results from an rdflib SPARQL query into a `pandas.DataFrame`,
    using Python types. See https://github.com/RDFLib/rdflib/issues/1179.
    """
    return pd.DataFrame(
        data=([None if x is None else x.toPython() for x in row] for row in results),
        columns=[str(x) for x in results.vars],
    )

In [4]:
num_new_links = len(g2) - len(g1)
num_new_links

1600

In [5]:
def replace_prefix(df, col):
    df[col] = df[col].str.replace('http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'rdf:')
    df[col] = df[col].str.replace('http://example/#', 'ex:')
    df[col] = df[col].str.replace('http://example/Treatment_Drug#', 'treatment_drug:')
    df[col] = df[col].str.replace('http://example/DrugDrugInteraction#', 'ddi:')
    return df

query = """select distinct ?p
where {
    ?s ?p ?o.
    }
    """
qres = g1.query(query)
property_list = sparql_results_to_df(qres)

query = """select distinct ?e
                where {
                {?e ?p ?o}
                UNION
                {?o ?p ?e}
                }
"""
qres = g1.query(query)
entity_list = sparql_results_to_df(qres)

entity_list = replace_prefix(entity_list, 'e')
property_list = replace_prefix(property_list, 'p')
display(property_list, property_list.shape)
display(entity_list, entity_list.shape)

  df[col] = df[col].str.replace('http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'rdf:')


Unnamed: 0,p
0,rdf:type
1,ex:precipitant_drug
2,ex:hasClassificationEffect
3,ex:object_drug
4,ex:related_to
5,ex:hasLowerEffect
6,ex:hasHighToxicity


(7, 1)

Unnamed: 0,e
0,ddi:treatment50DB00338DB00316
1,ddi:treatment70DB00335DB00641
2,http://example/Treatment/treatment197
3,ddi:treatment169DB00338DB00642
4,treatment_drug:treatment250_DB00642
...,...
4391,ex:Treatment
4392,ex:effective
4393,ex:Drug
4394,ex:HigherToxicity


(4396, 1)

In [6]:
def get_treatment_class(treatment_class, g):
    query = """
    prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    prefix ex: <http://example/#>
    select distinct ?s
    where {
    ?s rdf:type ex:Treatment .
    ?s ex:hasClassificationEffect ex:""" + treatment_class + """ .
    }
    """
    qres = g.query(query)
    treatment = sparql_results_to_df(qres)
    treatment['s'] = '<' + treatment['s'].astype(str) + '>'
    return treatment


def link_to_class(g1):
    treatment_eff = get_treatment_class('effective', g1)
    treatment_dec_eff = get_treatment_class('decrease_effectiveness', g1)
    treatment_eff = treatment_eff.loc[:treatment_eff.shape[0]/3]
    treatment_dec_eff = treatment_dec_eff.loc[:treatment_dec_eff.shape[0]/3]
    
    treatment_eff['p'] = 'ex:hasClassificationEffect'
    treatment_eff['o'] = 'ex:decrease_effectiveness .'
    treatment_dec_eff['p'] = 'ex:hasClassificationEffect'
    treatment_dec_eff['o'] = 'ex:effective .'
    
    df_category = pd.concat([treatment_eff, treatment_dec_eff])
    #df_category['s'] = '<' + df_category['s'].astype(str) + '>'
    return df_category

#link_to_class(g1)

In [6]:
def generate_triple(entity_list, property_list):
    random_subj = random.choice(entity_list)
    random_obj = random.choice(entity_list)
    random_prop = random.choice(property_list)
    if 'http://example/' in random_subj:
        random_subj = '<'+random_subj+'>'
    if 'http://example/' in random_obj:
        random_obj = '<'+random_obj+'>'
    
    
    
    random_obj = random_obj + ' .'
    return [random_subj, random_prop, random_obj]


def create_random_triples(entity_list, property_list, num_new_links, random_triples):
    entity_list = entity_list.e.values
    property_list = property_list.p.values
    while random_triples.shape[0]<num_new_links:
        random_triples.loc[len(random_triples.index)] = generate_triple(entity_list, property_list)
        random_triples.drop_duplicates(keep='first', inplace=True, ignore_index=True)
    return random_triples

In [7]:
# random_triples = link_to_class(g1)
random_triples = pd.DataFrame(columns=['s', 'p', 'o'])
random_triples = create_random_triples(entity_list, property_list, num_new_links, random_triples)
display(random_triples.shape, random_triples)

(1600, 3)

Unnamed: 0,s,p,o
0,ddi:treatment52DB00515DB00338,ex:hasClassificationEffect,treatment_drug:treatment422_DB00584 .
1,treatment_drug:treatment373_DB00316,ex:hasLowerEffect,treatment_drug:treatment301_DB09035 .
2,ddi:treatment267DB00958DB01060,ex:precipitant_drug,treatment_drug:treatment154_DB00997 .
3,treatment_drug:treatment297_DB00338,ex:object_drug,treatment_drug:treatment139_DB01248 .
4,treatment_drug:treatment457_DB00958,ex:precipitant_drug,treatment_drug:treatment322_DB00335 .
...,...,...,...
1595,treatment_drug:treatment155_DB00338,rdf:type,treatment_drug:treatment210_DB00338 .
1596,<http://example/Treatment/treatment571>,ex:object_drug,ddi:treatment415DB00338DB09063 .
1597,<http://example/Treatment/treatment500>,ex:object_drug,treatment_drug:treatment214_DB09037 .
1598,<http://example/Treatment/treatment519>,rdf:type,ddi:treatment360DB00641DB01229 .


In [8]:
random_triples.to_csv('G3.ttl', sep='\t', header=None, index=None)

In [6]:
def get_drugs(g):
    query = """
    prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    prefix ex: <http://example/#>
    select distinct ?o
    where {
    ?o rdf:type ex:Drug .
    }
    """
    qres = g.query(query)
    drugs = sparql_results_to_df(qres)
    drugs['o'] = '<' + drugs['o'].astype(str) + '>'
    return drugs.sample(n = 15)

treatment_eff = get_treatment_class('effective', g1)
treatment_dec_eff = get_treatment_class('decrease_effectiveness', g1)
treatment_dec_eff = treatment_dec_eff.sample(n = 56)
drug_list = get_drugs(g1)
drug_list['p'] = 'ex:related_to'
drug_list

Unnamed: 0,o,p
3,<http://example/Drug/DB00437>,ex:related_to
19,<http://example/Drug/DB09214>,ex:related_to
29,<http://example/Drug/DB00441>,ex:related_to
46,<http://example/Drug/DB09330>,ex:related_to
26,<http://example/Drug/DB00515>,ex:related_to
28,<http://example/Drug/DB01001>,ex:related_to
17,<http://example/Drug/DB00193>,ex:related_to
70,<http://example/Drug/DB11595>,ex:related_to
38,<http://example/Drug/DB00112>,ex:related_to
18,<http://example/Drug/DB01137>,ex:related_to


In [7]:
drug_list['key'] = 1
treatment_eff['key'] = 1
treatment_dec_eff['key'] = 1
# to obtain the cross join we will merge on 
# the key and drop it.
result_eff = pd.merge(drug_list, treatment_eff, on ='key').drop("key", 1)
result_dec_eff = pd.merge(drug_list, treatment_dec_eff, on ='key').drop("key", 1)

df_category = pd.concat([result_eff, result_dec_eff])
df_category['s'] = df_category['s'].astype(str) + ' .'
df_category = df_category.sample(1600)
df_category

  result_eff = pd.merge(drug_list, treatment_eff, on ='key').drop("key", 1)
  result_dec_eff = pd.merge(drug_list, treatment_dec_eff, on ='key').drop("key", 1)


Unnamed: 0,o,p,s
277,<http://example/Drug/DB00515>,ex:related_to,<http://example/Treatment/treatment68> .
817,<http://example/Drug/DB00853>,ex:related_to,<http://example/Treatment/treatment474> .
441,<http://example/Drug/DB11595>,ex:related_to,<http://example/Treatment/treatment149> .
45,<http://example/Drug/DB00437>,ex:related_to,<http://example/Treatment/treatment569> .
575,<http://example/Drug/DB11746>,ex:related_to,<http://example/Treatment/treatment331> .
...,...,...,...
254,<http://example/Drug/DB00515>,ex:related_to,<http://example/Treatment/treatment554> .
470,<http://example/Drug/DB00112>,ex:related_to,<http://example/Treatment/treatment7> .
435,<http://example/Drug/DB11595>,ex:related_to,<http://example/Treatment/treatment194> .
689,<http://example/Drug/DB00641>,ex:related_to,<http://example/Treatment/treatment358> .


In [8]:
df_category.to_csv('G3.ttl', sep='\t', header=None, index=None)

In [7]:
treatment_eff = get_treatment_class('effective', g1)
treatment_dec_eff = get_treatment_class('decrease_effectiveness', g1)
treatment_dec_eff = treatment_dec_eff.sample(n = 336).reset_index()

In [8]:
def get_relations(query, g, c1, c2):
    qres = g.query(query)
    relations = sparql_results_to_df(qres)
    relations = replace_prefix(relations, c1)
    relations = replace_prefix(relations, c2)
    return relations


In [9]:
new_triple = pd.DataFrame()

for i in range(0, 336, 56):
    for j in range(treatment_eff.shape[0]):
        query = """
            select distinct ?p ?o
            where {
            """ + treatment_eff.s[j] + """ ?p ?o .
            FILTER (?p NOT IN (<http://example/#hasClassificationEffect>, <http://www.w3.org/1999/02/22-rdf-syntax-ns#type>))
            }        
            """
        p_o = get_relations(query, g1, 'p', 'o')
        p_o['s'] = treatment_dec_eff.s[i+j]
        p_o = p_o[['s', 'p', 'o']]

        query = """
        select distinct ?s ?p
        where {
        ?s ?p """ + treatment_eff.s[j] + """ .
        }
        """
        s_p = get_relations(query, g1, 's', 'p')
        s_p['o'] = treatment_dec_eff.s[i+j]
        new_triple = pd.concat([new_triple, p_o, s_p]) #.reset_index()
        #display(new_triple)
        if new_triple.shape[0]>=num_new_links:
            break

new_triple['o'] = new_triple['o'].astype(str) + ' .'
display(new_triple)

  df[col] = df[col].str.replace('http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'rdf:')


Unnamed: 0,s,p,o
0,<http://example/Treatment/treatment288>,ex:related_to,ddi:treatment524DB00335DB00421 .
1,<http://example/Treatment/treatment288>,ex:related_to,ddi:treatment524DB00335DB00641 .
2,<http://example/Treatment/treatment288>,ex:related_to,ddi:treatment524DB00338DB00335 .
3,<http://example/Treatment/treatment288>,ex:related_to,ddi:treatment524DB00421DB00338 .
4,<http://example/Treatment/treatment288>,ex:related_to,ddi:treatment524DB00338DB00437 .
...,...,...,...
1,treatment_drug:treatment569_DB09330,ex:related_to,<http://example/Treatment/treatment47> .
0,treatment_drug:treatment570_DB00361,ex:related_to,<http://example/Treatment/treatment24> .
1,treatment_drug:treatment570_DB00958,ex:related_to,<http://example/Treatment/treatment24> .
0,treatment_drug:treatment571_DB06186,ex:related_to,<http://example/Treatment/treatment33> .


In [10]:
new_triple.to_csv('G3.ttl', sep='\t', header=None, index=None)