In [1]:
import pandas
import gripql
import itertools
import scipy.stats as stats

# Connect to the BMEG API using the local credential file, then select the
# graph that every later cell queries through the handle `O`.
bmeg_api_url = "https://bmeg.io/api"
bmeg_credential_path = "/tmp/bmeg_credentials.json"
conn = gripql.Connection(bmeg_api_url, credential_file=bmeg_credential_path)
O = conn.graph("bmeg_rc1_2")

Find all of the aliquots in the [CTRP](https://portals.broadinstitute.org/ctrp/) experiment

In [2]:
# Start at the CTRP program vertex and follow the incoming InProgram,
# InProject, SampleFor and AliquotFor edges, keeping each resulting gid once.
aliquot_query = (
    O.query()
    .V("Program:CTRP")
    .in_("InProgram")
    .in_("InProject")
    .in_("SampleFor")
    .in_("AliquotFor")
    .distinct("_gid")
)
all_aliquots = [aliquot.gid for aliquot in aliquot_query]

[INFO]	2019-03-11 16:27:26,999	841 results received in 0 seconds


For the genes of interest, look up the Ensembl gene IDs from their HUGO symbols.

In [3]:
# HUGO symbols of the genes whose mutation status is compared against drug response.
GENES = [
    "CDKN2A",
    "PTEN",
    "TP53",
    "SMAD4",
]

In [4]:
# Map each symbol in GENES to the gid of the Gene vertex carrying that symbol.
# If several vertices match one symbol, the last one returned is kept.
gene_ids = {}
for symbol in GENES:
    symbol_query = O.query().V().hasLabel("Gene").has(gripql.eq("symbol", symbol))
    for gene_vertex in symbol_query:
        gene_ids[symbol] = gene_vertex.gid

[INFO]	2019-03-11 16:27:27,225	1 results received in 0 seconds
[INFO]	2019-03-11 16:27:27,399	1 results received in 0 seconds
[INFO]	2019-03-11 16:27:27,576	1 results received in 0 seconds
[INFO]	2019-03-11 16:27:27,752	1 results received in 0 seconds


In [5]:
# Display the symbol -> gene gid mapping built in the previous cell.
gene_ids

{'CDKN2A': 'ENSG00000147889',
 'PTEN': 'ENSG00000171862',
 'TP53': 'ENSG00000141510',
 'SMAD4': 'ENSG00000141646'}

For each of the genes, find the set of samples that have a mutation in that gene

In [6]:
# mut_samples:  gene id -> set of aliquot gids with at least one allele call in that gene
# norm_samples: gene id -> list of aliquot gids with no such call
mut_samples = {}
norm_samples = {}

# From every aliquot, walk to its callsets and their AlleleCall edges, keep the
# edges whose 'ensembl_gene' is one of the genes of interest, and emit one
# (aliquot gid, gene id) pair per matching edge.
q = O.query().V(all_aliquots).as_("sample").in_("CallsetFor").outE("AlleleCall")
q = q.has(gripql.within("ensembl_gene", list(gene_ids.values()))).as_("variant")
q = q.render({"sample" : "$sample._gid", "gene" : "$variant._data.ensembl_gene"})

for res in q:
    # Add in place instead of building a brand-new set for every result row.
    mut_samples.setdefault(res.gene, set()).add(res.sample)

# Aliquots without a mutation call in each gene.
aliquot_set = set(all_aliquots)  # built once rather than once per gene
for i in gene_ids.values():
    # A gene with no matching allele call has no key in mut_samples; treat it
    # as an empty positive set rather than raising KeyError.
    positive = mut_samples.get(i, set())
    norm_samples[i] = list(aliquot_set.difference(positive))

    print( "%s Positive Set: %d" % (i, len(positive)) )
    print( "%s Negative Set: %d" % (i, len(norm_samples[i])) )


[INFO]	2019-03-11 16:27:48,969	1,198 results received in 21 seconds


ENSG00000147889 Positive Set: 99
ENSG00000147889 Negative Set: 742
ENSG00000171862 Positive Set: 120
ENSG00000171862 Negative Set: 721
ENSG00000141510 Positive Set: 597
ENSG00000141510 Negative Set: 244
ENSG00000141646 Positive Set: 69
ENSG00000141646 Negative Set: 772


In [7]:
# Collect CTRP response values for the mutated aliquots of each gene.
# pos_response[gene id][compound gid] -> list of 'act_area' values.
pos_response = {}
for g in gene_ids.values():
    pos_response[g] = {}
    # A gene with no mutated aliquot may have no entry in mut_samples at all.
    mutated_aliquots = list(mut_samples.get(g, ()))
    if not mutated_aliquots:
        # Nothing to look up. Skipping also avoids calling V() with an empty id
        # list, which presumably behaves like the unfiltered V() used for the
        # gene lookup and would scan every vertex -- TODO confirm.
        continue
    q = (
        O.query()
        .V(mutated_aliquots)
        .in_("ResponseIn")
        .has(gripql.eq("source", "CTRP"))
        .as_("a")
        .out("ResponseTo")
        .as_("b")
        .select(["a", "b"])
    )
    for row in q:
        # 'a' is the response vertex, 'b' the vertex reached through ResponseTo.
        pos_response[g].setdefault(row['b']['gid'], []).append(row['a']['data']['act_area'])
   

[INFO]	2019-03-11 16:28:07,266	45,065 results received in 18 seconds
[INFO]	2019-03-11 16:28:28,027	52,559 results received in 20 seconds
[INFO]	2019-03-11 16:30:08,716	261,093 results received in 100 seconds
[INFO]	2019-03-11 16:30:19,911	30,334 results received in 11 seconds


In [8]:
# Collect CTRP response values for the non-mutated aliquots of each gene.
# neg_response[gene id][compound gid] -> list of 'act_area' values.
neg_response = {}
for g in gene_ids.values():
    neg_response[g] = {}
    unmutated_aliquots = list(norm_samples[g])
    if not unmutated_aliquots:
        # Every aliquot carries a mutation in this gene, so there is no
        # negative set. Skipping also avoids calling V() with an empty id list,
        # which presumably behaves like the unfiltered V() used for the gene
        # lookup and would scan every vertex -- TODO confirm.
        continue
    q = (
        O.query()
        .V(unmutated_aliquots)
        .in_("ResponseIn")
        .has(gripql.eq("source", "CTRP"))
        .as_("a")
        .out("ResponseTo")
        .as_("b")
        .select(["a", "b"])
    )
    for row in q:
        # 'a' is the response vertex, 'b' the vertex reached through ResponseTo.
        neg_response[g].setdefault(row['b']['gid'], []).append(row['a']['data']['act_area'])
   

[INFO]	2019-03-11 16:32:24,361	321,190 results received in 124 seconds
[INFO]	2019-03-11 16:34:41,644	313,696 results received in 137 seconds
[INFO]	2019-03-11 16:35:22,911	105,162 results received in 41 seconds
[INFO]	2019-03-11 16:37:35,496	335,921 results received in 132 seconds


In [9]:
# For every (drug, gene) pair with response values in both the mutated and the
# non-mutated group, compare the two groups and record the test results.
drugs = {compound for per_gene in pos_response.values() for compound in per_gene}
out = []
for drug in drugs:
    for g in gene_ids.values():
        mut_values = pos_response[g].get(drug)
        norm_values = neg_response[g].get(drug)
        # The drug must have been measured in both groups for this gene.
        if mut_values is None or norm_values is None:
            continue
        # Require more than five measurements on each side.
        if len(mut_values) <= 5 or len(norm_values) <= 5:
            continue
        # Welch's t-test (unequal variances) followed by a one-way ANOVA.
        welch = stats.ttest_ind(mut_values, norm_values, equal_var=False)
        anova = stats.f_oneway(mut_values, norm_values)
        out.append({
            "drug" : drug,
            "mutation" : g,
            "t-statistic" : welch.statistic,
            "t-pvalue" : welch.pvalue,
            "a-statistic" : anova.statistic,
            "a-pvalue" : anova.pvalue,
        })

In [12]:
# Tabulate the per-(drug, gene) test results and show the 30 rows with the
# smallest ANOVA p-value.
result_columns = ["drug", "mutation", "t-statistic", "t-pvalue", "a-statistic", "a-pvalue"]
results = pandas.DataFrame(out, columns=result_columns)
results.sort_values("a-pvalue").head(30)

Unnamed: 0,drug,mutation,t-statistic,t-pvalue,a-statistic,a-pvalue
1038,Compound:CID11433190,ENSG00000141510,13.047139,5.317522e-31,259.77879,1.1842469999999998e-50
1845,Compound:CID10127622,ENSG00000171862,14.762042,7.981893e-47,179.304787,1.641445e-40
1846,Compound:CID10127622,ENSG00000141510,10.688355,2.4651679999999998e-26,127.40596,2.358116e-29
1338,Compound:CID24978538,ENSG00000141510,7.740424,1.240948e-14,63.907146,1.483366e-15
1510,Compound:CID11609586,ENSG00000141510,6.241976,8.088568e-10,52.314159,7.472634e-13
693,Compound:CHEMBL401930,ENSG00000171862,8.302645,2.440095e-14,49.757986,3.862614e-12
1336,Compound:CID24978538,ENSG00000147889,6.527219,9.533273e-11,40.010125,2.65987e-10
558,Compound:CID31703,ENSG00000141510,5.785938,1.071401e-08,39.482605,4.256624e-10
1339,Compound:CID24978538,ENSG00000141646,6.429776,2.146093e-10,34.934173,3.547833e-09
549,Compound:CID9825149,ENSG00000171862,5.932992,1.95591e-08,33.090768,1.266485e-08
