In [2]:
import pandas
import gripql
import itertools
import scipy.stats as stats

# Open a gripql client against the BMEG API, authenticating with the
# credential file stored under /tmp.
conn = gripql.Connection(
    "https://bmeg.io/api",
    credential_file="/tmp/bmeg_credentials.json",
)
# Handle to the "bmeg_rc1_2" graph; the queries below all start from it.
O = conn.graph("bmeg_rc1_2")

Find all of the samples in the [CCLE](https://portals.broadinstitute.org/ccle/) experiment

In [34]:
# Start at the CCLE program vertex, follow incoming InProgram and InProject
# edges, keep only vertices labeled "Sample", and de-duplicate by gid.
sample_query = (
    O.query()
    .V("Program:CCLE")
    .in_("InProgram")
    .in_("InProject")
    .hasLabel("Sample")
    .distinct("_gid")
)
# Gids of every sample returned by the traversal.
all_samples = [sample.gid for sample in sample_query]

[INFO]	2019-05-28 13:15:31,490	1,617 results received in 2 seconds


For the genes of interest, look up the Ensembl gene IDs from their HUGO symbols

In [10]:
# Gene symbols of interest; each one is looked up by its "symbol" field below.
GENES = [
    "CDKN2A",
    "PTEN",
    "TP53",
    "SMAD4",
]

In [11]:
# Map each symbol in GENES to the gid of the Gene vertex whose "symbol" field
# equals it. One query is issued per symbol; if a query returns several
# vertices the last one wins, and a symbol with no match gets no entry.
gene_ids = {
    symbol: gene.gid
    for symbol in GENES
    for gene in O.query().V().hasLabel("Gene").has(gripql.eq("symbol", symbol))
}

[INFO]	2019-05-28 13:04:38,613	1 results received in 0 seconds
[INFO]	2019-05-28 13:04:38,812	1 results received in 0 seconds
[INFO]	2019-05-28 13:04:38,944	1 results received in 0 seconds
[INFO]	2019-05-28 13:04:39,071	1 results received in 0 seconds


For each of the genes, find the set of samples that have a mutation in that gene

In [48]:
# Gene id -> set of sample gids that have at least one AlleleCall in that gene.
# Every gene of interest is seeded with an empty set so that a gene with no
# calls yields an empty positive set instead of raising KeyError further down.
# NOTE(review): an empty positive set is now a possible value; code that passes
# these sets to V() should skip empty ones.
mut_samples = {gene_id: set() for gene_id in gene_ids.values()}
# Gene id -> list of sample gids with no AlleleCall in that gene.
norm_samples = {}

# From each sample, follow incoming AliquotFor and CallsetFor edges, then take
# the outgoing AlleleCall edges whose ensembl_gene is one of the genes of
# interest. Each result row is rendered as a (sample gid, gene id) pair.
q = O.query().V(all_samples).as_("sample").in_("AliquotFor").in_("CallsetFor").outE("AlleleCall")
q = q.has(gripql.within("ensembl_gene", list(gene_ids.values()))).as_("variant")
q = q.render({"sample" : "$sample._gid", "gene" : "$variant._data.ensembl_gene"})

for res in q:
    # Add in place rather than building a new union set for every row.
    mut_samples.setdefault(res.gene, set()).add(res.sample)

# get CCLE samples without mutation
all_sample_set = set(all_samples)  # built once and reused for every gene
for i in gene_ids.values():
    norm_samples[i] = list(all_sample_set.difference(mut_samples[i]))

    print( "%s Positive Set: %d" % (i, len(mut_samples[i])) )
    print( "%s Negative Set: %d" % (i, len(norm_samples[i])) )


[INFO]	2019-05-28 13:35:09,960	1,847 results received in 13 seconds


ENSG00000147889 Positive Set: 154
ENSG00000147889 Negative Set: 1463
ENSG00000171862 Positive Set: 186
ENSG00000171862 Negative Set: 1431
ENSG00000141510 Positive Set: 1005
ENSG00000141510 Negative Set: 612
ENSG00000141646 Positive Set: 102
ENSG00000141646 Negative Set: 1515


In [50]:
# Gene id -> {compound gid -> list of act_area values}, collected from the
# samples that carry a mutation in that gene.
pos_response = {}
for g in gene_ids.values():
    pos_response[g] = {}
    sample_ids = list(mut_samples[g])
    if not sample_ids:
        # Nothing to collect for an empty sample set. Skipping also avoids
        # calling V() with no ids.
        # NOTE(review): GRIP documents V() without ids as "all vertices",
        # which would not restrict the traversal to this gene's samples - confirm.
        continue
    # Sample -> aliquots (incoming AliquotFor) -> responses (incoming
    # ResponseIn) whose source is "CCLE" -> the vertex each one is a ResponseTo.
    q = (
        O.query()
        .V(sample_ids)
        .in_("AliquotFor")
        .in_("ResponseIn")
        .has(gripql.eq("source", "CCLE"))
        .as_("a")
        .out("ResponseTo")
        .as_("b")
        .select(["a", "b"])
    )
    for row in q:
        v = row['a']['data']['act_area']
        compound = row['b']['gid']
        pos_response[g].setdefault(compound, []).append(v)
   

[INFO]	2019-05-28 13:35:31,374	1,609 results received in 1 seconds
[INFO]	2019-05-28 13:35:32,717	1,760 results received in 1 seconds
[INFO]	2019-05-28 13:35:38,827	8,242 results received in 6 seconds
[INFO]	2019-05-28 13:35:39,514	888 results received in 0 seconds


In [51]:
# Gene id -> {compound gid -> list of act_area values}, collected from the
# samples that do not carry a mutation in that gene.
neg_response = {}
for g in gene_ids.values():
    neg_response[g] = {}
    sample_ids = list(norm_samples[g])
    if not sample_ids:
        # Nothing to collect for an empty sample set. Skipping also avoids
        # calling V() with no ids.
        # NOTE(review): GRIP documents V() without ids as "all vertices",
        # which would not restrict the traversal to this gene's samples - confirm.
        continue
    # Sample -> aliquots (incoming AliquotFor) -> responses (incoming
    # ResponseIn) whose source is "CCLE" -> the vertex each one is a ResponseTo.
    q = (
        O.query()
        .V(sample_ids)
        .in_("AliquotFor")
        .in_("ResponseIn")
        .has(gripql.eq("source", "CCLE"))
        .as_("a")
        .out("ResponseTo")
        .as_("b")
        .select(["a", "b"])
    )
    for row in q:
        v = row['a']['data']['act_area']
        compound = row['b']['gid']
        neg_response[g].setdefault(compound, []).append(v)
   

[INFO]	2019-05-28 13:35:47,019	10,061 results received in 7 seconds
[INFO]	2019-05-28 13:35:54,303	9,910 results received in 7 seconds
[INFO]	2019-05-28 13:35:56,928	3,428 results received in 2 seconds
[INFO]	2019-05-28 13:36:05,018	10,782 results received in 8 seconds


In [52]:
# A drug/gene pair is tested only when both the mutated and the non-mutated
# group have more than this many response values.
MIN_GROUP_SIZE = 5

# Every compound that has at least one response in some mutated-sample group.
drugs = set(itertools.chain.from_iterable( i.keys() for i in pos_response.values() ))

# One record per (drug, gene) pair that passes the group-size threshold.
out = []
# Iterate in sorted order: the iteration order of a set of strings can change
# between kernel sessions, which would reshuffle the rows of `out` and the
# index labels of any DataFrame built from it.
for drug in sorted(drugs):
    for g in gene_ids.values():
        mut_values = pos_response[g].get(drug)
        norm_values = neg_response[g].get(drug)
        # Both groups must have responses for this drug.
        if mut_values is None or norm_values is None:
            continue
        if len(mut_values) <= MIN_GROUP_SIZE or len(norm_values) <= MIN_GROUP_SIZE:
            continue
        # equal_var=False selects Welch's t-test (no equal-variance assumption).
        welch = stats.ttest_ind(mut_values, norm_values, equal_var=False)
        # One-way ANOVA across the same two groups.
        anova = stats.f_oneway(mut_values, norm_values)
        out.append({
            "drug" : drug,
            "mutation" : g,
            "t-statistic" : welch.statistic,
            "t-pvalue" : welch.pvalue,
            "a-statistic" : anova.statistic,
            "a-pvalue" : anova.pvalue,
        })

In [53]:
# Column order for the result table: identifiers first, then the t-test and
# ANOVA statistics.
result_columns = ["drug", "mutation", "t-statistic", "t-pvalue", "a-statistic", "a-pvalue"]
# Show the 30 drug/gene pairs with the smallest ANOVA p-value.
(
    pandas.DataFrame(out, columns=result_columns)
    .sort_values("a-pvalue")
    .head(30)
)

Unnamed: 0,drug,mutation,t-statistic,t-pvalue,a-statistic,a-pvalue
6,Compound:CID9826528,ENSG00000141510,-4.599548,7e-06,25.468694,6.297216e-07
74,Compound:CID10127622,ENSG00000141510,-4.326337,2.3e-05,23.75368,1.471637e-06
86,Compound:CID24180719,ENSG00000141510,-3.173771,0.001772,16.161735,6.724098e-05
94,Compound:CID11433190,ENSG00000141510,-3.555544,0.000458,15.012723,0.000120943
38,Compound:CID6505803,ENSG00000141510,-3.6265,0.000348,14.151932,0.0001885532
10,Compound:CID11656518,ENSG00000141510,-3.360247,0.000895,11.53476,0.000742592
81,Compound:CID11476171,ENSG00000171862,-4.037831,9.2e-05,10.930977,0.001013539
5,Compound:CID9826528,ENSG00000171862,-3.397564,0.000933,9.154932,0.002607714
73,Compound:CID10127622,ENSG00000171862,-3.166863,0.001973,8.028787,0.004789648
64,Compound:CID5479543,ENSG00000147889,-3.358856,0.001049,6.491708,0.01114273
