In [38]:
import gseapy
gseapy.__version__

'1.1.3'

In [39]:
!pip install rdflib
!pip install rdflib-hdt
#!biobricks install wikipathways
#!biobricks install eutoxrisk-temposeq

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [40]:
import pandas as pd
#import biobricks as bb
from rdflib import Graph, Namespace
from rdflib.plugins.stores import sparqlstore
from rdflib_hdt import HDTStore
from rdflib.namespace import RDF, XSD, DCTERMS
import rdflib
from tqdm import tqdm

In [41]:
# Connect studies, compounds and temposeq dataset id
df_overview = pd.read_csv("../download/overview.csv")
df_overview.columns

Index(['Study', 'Treatment compound', 'Treatment concentration',
       'Treatment timepoint', 'Control compound', 'Control concentration',
       'Control timepoint', 'Filename', 'Case study', 'Index in compound list',
       'Comments', 'Parent compound', 'Harris's Preferred Name (ChEMBL/ECHA)',
       'SMILES', 'InChI key', 'CAS number', 'Starting substance',
       'Barbara's prefered name', 'Alternative name',
       'Starting substance SMILES', 'Starting substance CAS number',
       'Supplier', 'Cat no', 'Lot no', 'Form', 'Storage pure compound',
       'Solvent', 'Stock concentration', 'Stock concentration unit',
       'Stock aliquot storage', 'Purity', 'Molar weight', 'Molecular formula',
       'MDLNUM', 'Code', 'Dataset id'],
      dtype='object')

In [42]:
inchi_to_compound_dict = df_overview[['Treatment compound', 'InChI key']].set_index("InChI key")['Treatment compound'].to_dict()
inchi_to_compound_dict_without_underscores = df_overview[['Treatment compound', 'InChI key']].set_index("InChI key")['Treatment compound'].str.replace('_', ' ').to_dict()

In [43]:
# Connect genes and pathways
df_pathways = pd.read_csv("../download/pathways.csv")

In [44]:
# compound to gene relationship
compound_to_gene_list = []


for _, row in tqdm(df_overview.iterrows()):
    dataset_id = row['Dataset id']
    d = row.to_dict()

    try:
        data_frame = pd.read_csv("../download/temposeq/{0}.csv".format(dataset_id))
        
        LOG_FOLD_CHANGE_THRESHOLD = 2
        ADJUSTED_P_VALUE_THRESHOLD = 0.05

        is_positive_significant_fold_change = (data_frame['logFC'] > LOG_FOLD_CHANGE_THRESHOLD)
        is_negative_significant_fold_change = (data_frame['logFC'] < -LOG_FOLD_CHANGE_THRESHOLD)
    
        is_significant_p_value = data_frame['padj'] < ADJUSTED_P_VALUE_THRESHOLD
    
        d['positive deregulated genes'] = data_frame[is_positive_significant_fold_change & is_significant_p_value]['SYMBOL'].tolist()
        d['negative deregulated genes'] = data_frame[is_negative_significant_fold_change & is_significant_p_value]['SYMBOL'].tolist()

        compound_to_gene_list.append(d)
    except:
        pass

0it [00:00, ?it/s]

2336it [00:25, 90.46it/s] 


In [51]:
g = rdflib.Graph()
context = []

# namespaces
nm = g.namespace_manager

sio, prefix = rdflib.Namespace("http://semanticscience.org/resource/SIO/"), "sio"
nm.bind(prefix, sio)

cheminf, prefix = rdflib.Namespace("http://semanticscience.org/resource/CHEMINF/"), "cheminf"
nm.bind(prefix, cheminf)

wp, prefix = rdflib.Namespace("http://vocabularies.wikipathways.org/wpTypes#"), "wp"
nm.bind(prefix, wp)

bp, prefix = rdflib.Namespace("http://www.biopax.org/release/biopax-level3.owl#"), "bp"
nm.bind(prefix, bp)

bao, prefix = rdflib.Namespace("http://www.bioassayontology.org/bao#"), "bao"
nm.bind(prefix, bao)

metago, prefix = rdflib.Namespace("http://model.geneontology.org/"), "metago"
nm.bind(prefix, metago)

# Graph is built from triplets of (subject, predicate, object)

# Compounds
for inchikey in df_overview['InChI key'].unique():
    s = rdflib.URIRef("inchikey:{0}".format(inchikey))
    
    # inchikey object has value of #inchikey
    # (InChIKey, sio:SIO_000300 (has value), InChIKeyValue)
    p = sio.SIO_000300
    o = rdflib.Literal(inchikey, datatype=XSD.string)
    g.add((s, p, o))
    # inchikey object is type of inchikey
    # InChIKey, rdf:type cheminf:CHEMINF_000399)
    p = RDF.type
    o = cheminf.CHEMINF_000399
    g.add((s, p, o))

    # (InChIKey, sio:SIO_000011 (is attribute of), Compound)
    p = sio.SIO_000011
    o = rdflib.URIRef("compound:{0}".format(inchi_to_compound_dict[inchikey]))
    g.add((s, p, o))
    #context.append("Compound {0} has InChIKey {1}.".format(inchi_to_compound_dict_without_underscores[inchikey], inchikey))


# Genes
for gene_symbol in df_pathways.Genes.unique():
    # gene is type of gene
    s = rdflib.URIRef("gene:{0}".format(gene_symbol))
    p = RDF.type
    o = wp.GeneProduct #sio.SIO_010035 <-- what should we use? Pubchem uses SIO. WP uses wp.
    g.add((s, p, o))

    # gene has symbol
    p = bao.BAO_0002870
    o = rdflib.Literal(gene_symbol, datatype=XSD.string)
    g.add((s, p, o))


# Pathways
for pathway_title in df_pathways.Term.unique():
    pathway_name = pathway_title.replace(" ", "-")

    # pathway is type of pathway
    s = rdflib.URIRef("pathway:{0}".format(pathway_name))
    p = RDF.type
    o = wp.Pathway #bp.Pathway
    g.add((s, p, o))

    # pathway has title 
    p = DCTERMS.title
    o = rdflib.Literal(pathway_title, datatype=XSD.string)
    g.add((s, p, o))


# Compound-gene relationship
for row in compound_to_gene_list:
    # one row is one compound in a particular study
    pos_deregulated_genes = list(set(row['positive deregulated genes']))
    neg_deregulated_genes = list(set(row['negative deregulated genes']))

    inchikey = row['InChI key']
    s =  rdflib.URIRef("compound:{0}".format(inchi_to_compound_dict[inchikey]))
    
    for gene_symbol in pos_deregulated_genes:
        p = metago.GO_0048521
        o = rdflib.URIRef("gene:{0}".format(gene_symbol))
        g.add((s, p, o))
        context.append("Compound {0} upregulates gene {1}.".format(inchi_to_compound_dict_without_underscores[inchikey], gene_symbol))

    for gene_symbol in neg_deregulated_genes:
        p = metago.GO_0048522
        o = rdflib.URIRef("gene:{0}".format(gene_symbol))
        g.add((s, p, o))
        context.append("Compound {0} downregulates gene {1}.".format(inchi_to_compound_dict_without_underscores[inchikey], gene_symbol))

# Gene-Pathway relationship
for _, row in df_pathways.iterrows():
    gene_symbol = row['Genes']
    pathway_title = row['Term']
    pathway_name = pathway_title.replace(" ", "-")

    # Gene is part of pathway
    # (wp:GeneProduct, DCTERMS.isPartOf, wp:Pathway)
    s = rdflib.URIRef("gene:{0}".format(gene_symbol))
    p = DCTERMS.isPartOf
    o = rdflib.URIRef("pathway:{0}".format(pathway_name))
    g.add((s, p, o))
    
    #context.append("Gene {0} is part of {1} pathway.".format(gene_symbol, pathway_title))
    

print("Graph has {0} nodes.".format(len(g.all_nodes())))
print("Graph has {0} statements.".format(len(g)))
context_str = " ".join(context)

Graph has 9641 nodes.
Graph has 46527 statements.


In [52]:
print(len(g))
print(len(context_str))
print(len(context))
print(len(context_str) / len(g))

46527
1577500
29244
33.90504438283148


In [59]:
context_str[:45500]

'Compound Butanone oxime upregulates gene ATAD2. Compound Butanone oxime upregulates gene LGALS1. Compound Butanone oxime upregulates gene S100A10. Compound Butanone oxime upregulates gene TP53. Compound Butanone oxime upregulates gene DDR1. Compound Butanone oxime upregulates gene PTPN1. Compound Butanone oxime upregulates gene CDCA8. Compound Butanone oxime upregulates gene S100P. Compound Butanone oxime upregulates gene CEACAM6. Compound Butanone oxime upregulates gene IFIT3. Compound Butanone oxime upregulates gene MCM10. Compound Butanone oxime upregulates gene EGR3. Compound Butanone oxime upregulates gene MSH2. Compound Butanone oxime upregulates gene SP110. Compound Butanone oxime upregulates gene CDC25A. Compound Butanone oxime upregulates gene CDCA7. Compound Butanone oxime upregulates gene CCNE2. Compound Butanone oxime upregulates gene HJURP. Compound Butanone oxime upregulates gene H2BC14. Compound Butanone oxime upregulates gene PROS1. Compound Butanone oxime upregulates 

In [47]:
q = """
PREFIX sio: <http://semanticscience.org/resource/SIO/>
PREFIX cheminf: <http://semanticscience.org/resource/CHEMINF/>

SELECT ?inchikey ?compound
WHERE {
    ?p rdf:type cheminf:CHEMINF_000399 .
    ?p sio:SIO_000300 ?inchikey .
    ?p sio:SIO_000011 ?compound .
    
}
"""


results = g.query(q)
print(len(results))

for r in results:
    print(str(r))

71
(rdflib.term.Literal('WHIVNJATOVLWBW-SNAWJCMRSA-N', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string')), rdflib.term.URIRef('compound:Butanone_oxime'))
(rdflib.term.Literal('YMLFYGFCXGNERH-UHFFFAOYSA-K', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string')), rdflib.term.URIRef('compound:Butyl_tin_trichloride'))
(rdflib.term.Literal('RJGHQTVXGKYATR-UHFFFAOYSA-L', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string')), rdflib.term.URIRef('compound:Dibutyl_tin_dichloride'))
(rdflib.term.Literal('PDQAZBWRQCGBEV-UHFFFAOYSA-N', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string')), rdflib.term.URIRef('compound:Imidazolidinethione'))
(rdflib.term.Literal('YHMYGUUIMTVXNW-UHFFFAOYSA-N', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string')), rdflib.term.URIRef('compound:Mercapto_benzimidazole'))
(rdflib.term.Literal('XLSZMDLNRCVEIJ-UHFFFAOYSA-N', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSc