In [1]:
import pandas
import gripql
import itertools
import scipy.stats as stats

# Open a client for the GRIP server and select the graph that every
# query in this notebook runs against.
GRIP_URL = "http://grip.compbio.ohsu.edu"
GRAPH_NAME = "bmeg_rc1_2"

conn = gripql.Connection(GRIP_URL)
O = conn.graph(GRAPH_NAME)

In [2]:
# Start at the "Program:CCLE" vertex and follow incoming InProgram,
# InProject, SampleFor and AliquotFor edges in turn; keep the gid of
# every vertex the traversal ends on.
aliquot_query = (
    O.query()
    .V("Program:CCLE")
    .in_("InProgram")
    .in_("InProject")
    .in_("SampleFor")
    .in_("AliquotFor")
)
all_aliquots = [hit.gid for hit in aliquot_query]

[INFO]	2019-02-20 16:57:22,605	504 results received in 0 seconds


In [3]:
# Gene symbols to analyse; each one is looked up by its `symbol`
# property on "Gene" vertices.
GENES = [
    "CDKN2A",
    "PTEN",
    "TP53",
    "SMAD4",
]

In [4]:
# Map each symbol in GENES to the gid of the "Gene" vertex carrying that
# symbol. One query is issued per symbol, in GENES order. If a query
# returns several vertices the last one wins; a symbol with no match
# simply gets no entry.
gene_ids = {
    symbol: hit.gid
    for symbol in GENES
    for hit in O.query().V().hasLabel("Gene").has(gripql.eq("symbol", symbol))
}

[INFO]	2019-02-20 16:57:22,750	1 results received in 0 seconds
[INFO]	2019-02-20 16:57:22,883	1 results received in 0 seconds
[INFO]	2019-02-20 16:57:23,014	1 results received in 0 seconds
[INFO]	2019-02-20 16:57:23,145	1 results received in 0 seconds


In [5]:
# Display the symbol -> gene gid mapping built above so the lookups can
# be checked by eye before they are used in later queries.
gene_ids

{'CDKN2A': 'ENSG00000147889',
 'PTEN': 'ENSG00000171862',
 'TP53': 'ENSG00000141510',
 'SMAD4': 'ENSG00000141646'}

In [7]:
# Split the CCLE aliquots, separately for each gene of interest, into
#   mut_samples[gene_id]  -> set of aliquot gids with at least one
#                            AlleleCall edge annotated with that gene
#   norm_samples[gene_id] -> list of the remaining aliquot gids
mut_samples = {}
norm_samples = {}

# From every aliquot, follow incoming CallsetFor edges and then outgoing
# AlleleCall edges, keeping only edges whose `ensembl_gene` is one of the
# genes of interest. Each result is rendered as a (sample, gene) pair.
q = O.query().V(all_aliquots).as_("sample").in_("CallsetFor").outE("AlleleCall")
q = q.has(gripql.within("ensembl_gene", list(gene_ids.values()))).as_("variant")
q = q.render({"sample" : "$sample._gid", "gene" : "$variant._data.ensembl_gene"})

for res in q:
    # Add in place; rebuilding the set with `|` copied it once per row.
    mut_samples.setdefault(res.gene, set()).add(res.sample)

# Get CCLE samples without mutation.
# The full aliquot set is loop-invariant, so build it once.
aliquot_set = set(all_aliquots)
for i in gene_ids.values():
    # A gene with no matching calls has no key in mut_samples; treat it
    # as an empty positive set here instead of raising KeyError halfway
    # through the report. It is intentionally NOT inserted into
    # mut_samples, so a later `mut_samples[gene]` lookup still fails
    # loudly rather than querying with an empty id list.
    mutated = mut_samples.get(i, set())
    norm_samples[i] = list(aliquot_set.difference(mutated))

    print( "%s Positive Set: %d" % (i, len(mutated)) )
    print( "%s Negative Set: %d" % (i, len(norm_samples[i])) )


[INFO]	2019-02-20 16:57:29,279	673 results received in 6 seconds


ENSG00000147889 Positive Set: 69
ENSG00000147889 Negative Set: 435
ENSG00000171862 Positive Set: 76
ENSG00000171862 Negative Set: 428
ENSG00000141510 Positive Set: 357
ENSG00000141510 Negative Set: 147
ENSG00000141646 Positive Set: 39
ENSG00000141646 Negative Set: 465


In [9]:
# Collect drug-response values for the mutated ("positive") samples of
# each gene:
#   pos_response[gene_id][compound_gid] -> list of `act_area` values
# Only response vertices whose `source` is "CTRP" are used.
pos_response = {}
for g in gene_ids.values():
    pos_response[g] = {}
    samples = list(mut_samples.get(g, ()))
    if not samples:
        # No mutated samples recorded for this gene: leave its entry
        # empty and skip the query. NOTE(review): an empty id list
        # passed to V() presumably does not mean "no vertices" -- confirm
        # against the GRIP docs; skipping is correct either way.
        continue
    q = O.query().V(samples).in_("ResponseIn").has(gripql.eq("source", "CTRP")).as_("a").out("ResponseTo").as_("b").select(["a", "b"])
    for row in q:
        # "a" is the response vertex, "b" the vertex reached through
        # its ResponseTo edge (used as the compound key).
        v = row['a']['data']['act_area']
        compound = row['b']['gid']
        pos_response[g].setdefault(compound, []).append(v)
   

[INFO]	2019-02-20 16:57:44,006	30,074 results received in 14 seconds
[INFO]	2019-02-20 16:57:59,881	32,137 results received in 15 seconds
[INFO]	2019-02-20 16:59:07,251	146,042 results received in 67 seconds
[INFO]	2019-02-20 16:59:15,939	17,933 results received in 8 seconds


In [10]:
# Collect drug-response values for the non-mutated ("negative") samples
# of each gene:
#   neg_response[gene_id][compound_gid] -> list of `act_area` values
# Only response vertices whose `source` is "CTRP" are used.
neg_response = {}
for g in gene_ids.values():
    neg_response[g] = {}
    samples = list(norm_samples[g])
    if not samples:
        # Every aliquot carries a mutation in this gene, so there is no
        # negative set. Skip the query instead of calling V() with an
        # empty id list. NOTE(review): V([]) presumably selects more
        # than "nothing" -- confirm against the GRIP docs; skipping is
        # correct either way.
        continue
    q = O.query().V(samples).in_("ResponseIn").has(gripql.eq("source", "CTRP")).as_("a").out("ResponseTo").as_("b").select(["a", "b"])
    for row in q:
        # "a" is the response vertex, "b" the vertex reached through
        # its ResponseTo edge (used as the compound key).
        v = row['a']['data']['act_area']
        compound = row['b']['gid']
        neg_response[g].setdefault(compound, []).append(v)
   

[INFO]	2019-02-20 17:00:35,445	169,815 results received in 79 seconds
[INFO]	2019-02-20 17:01:55,601	167,752 results received in 80 seconds
[INFO]	2019-02-20 17:02:20,688	53,847 results received in 25 seconds
[INFO]	2019-02-20 17:03:44,473	181,956 results received in 83 seconds


In [11]:
# For every (drug, gene) pair that has response values in both the
# mutated and the non-mutated group, compare the two groups and append
# one result row to `out`:
#   t-statistic / t-pvalue : two-sample t-test with equal_var=False
#   a-statistic / a-pvalue : one-way ANOVA over the same two groups
drugs = set(itertools.chain.from_iterable( i.keys() for i in pos_response.values() ))
out = []
# Iterate in sorted order: a set of strings has a different iteration
# order in every kernel, which made the row index of the result table
# change from run to run.
for drug in sorted(drugs):
    for g in gene_ids.values():
        if drug in pos_response[g] and drug in neg_response[g]:
            mut_values = pos_response[g][drug]
            norm_values = neg_response[g][drug]
            # Require more than 5 values (i.e. at least 6) per group
            # before running the tests.
            if len(mut_values) > 5 and len(norm_values) > 5:
                t_res = stats.ttest_ind(mut_values, norm_values, equal_var=False)
                a_res = stats.f_oneway(mut_values, norm_values)
                out.append({
                    "drug" : drug,
                    "mutation" : g,
                    "t-statistic" : t_res.statistic,
                    "t-pvalue" : t_res.pvalue,
                    "a-statistic" : a_res.statistic,
                    "a-pvalue" : a_res.pvalue,
                })

In [12]:
# Tabulate the per-(drug, gene) test results and show them ordered by
# ascending ANOVA p-value.
result_columns = ["drug", "mutation", "t-statistic", "t-pvalue", "a-statistic", "a-pvalue"]
results = pandas.DataFrame(out, columns=result_columns)
results.sort_values("a-pvalue")

Unnamed: 0,drug,mutation,t-statistic,t-pvalue,a-statistic,a-pvalue
574,Compound:CID10127622,ENSG00000141510,12.882003,1.245427e-36,184.807516,2.327147e-41
769,Compound:CID11433190,ENSG00000141510,9.700591,1.811754e-17,152.563635,3.455239e-30
573,Compound:CID10127622,ENSG00000171862,10.166484,2.334855e-23,89.182738,5.377786e-21
62,Compound:CID11717001,ENSG00000141510,5.298232,4.338974e-07,43.765050,1.113371e-10
468,Compound:CHEMBL401930,ENSG00000171862,6.515571,1.631341e-09,28.139904,1.837929e-07
1817,Compound:CID24978538,ENSG00000147889,5.279545,1.610209e-07,27.066078,2.054418e-07
98,Compound:CID11626560,ENSG00000141510,5.032685,7.502575e-07,27.348135,2.143673e-07
1061,Compound:CID44462760,ENSG00000141510,3.849080,2.647069e-04,27.806434,3.247028e-07
1843,Compound:CID31703,ENSG00000141510,4.941254,1.150275e-06,26.332060,3.554195e-07
969,Compound:CID6505803,ENSG00000141510,4.334796,1.730668e-05,21.030566,4.963309e-06
