In [61]:
import pandas
import aql
import itertools
import scipy.stats as stats

# Connect to the graph server and bind the graph named "bmeg"; the queries in
# the following cells are all issued through the handle `O`.
ARACHNE_URL = "http://arachne.compbio.ohsu.edu"
conn = aql.Connection(ARACHNE_URL)
O = conn.graph("bmeg")

In [4]:
# Select every vertex labelled "Biosample" whose source is "ccle" and render
# each hit as a record exposing the vertex _gid under the key "id".
q = (
    O.query()
    .V()
    .where(aql.eq("_label", "Biosample"))
    .where(aql.and_(aql.eq("source", "ccle")))
    .render({"id":"_gid"})
)

# Materialise the gids of all matching samples.
all_samples = [row.id for row in q]

In [5]:
# Gene symbols whose mutation status is compared against drug response below.
GENES = [
    "CDKN2A",
    "PTEN",
    "TP53",
    "SMAD4",
]

In [12]:
# Resolve each symbol in GENES to the gid of its "Gene" vertex in the graph.
gene_ids = {}
for g in GENES:
    for i in O.query().V().where(aql.eq("_label", "Gene")).where(aql.eq("symbol", g)):
        # If several Gene vertices share a symbol, the last one returned wins.
        gene_ids[g] = i.gid

# Fail fast when a symbol could not be resolved: later cells index their
# per-gene dictionaries by every entry of GENES, so a silently missing gene
# would otherwise surface there as an unexplained KeyError.
missing_genes = [g for g in GENES if g not in gene_ids]
if missing_genes:
    raise LookupError("No Gene vertex found for symbol(s): %s" % ", ".join(missing_genes))

In [13]:
# Display the resolved symbol -> gid mapping as this cell's output.
gene_ids

{'CDKN2A': u'gene:ENSG00000147889',
 'PTEN': u'gene:ENSG00000171862',
 'SMAD4': u'gene:ENSG00000141646',
 'TP53': u'gene:ENSG00000141510'}

In [31]:
# For every resolved gene, split the CCLE samples into a positive set (samples
# reached from the gene through variant calls) and a negative set (all other
# CCLE samples), then report the size of each.
mut_samples = {}
norm_samples = {}

# Loop-invariant: build the set of all CCLE sample gids once, not per gene.
all_sample_set = set(all_samples)

for g, i in gene_ids.items():
    # get CCLE samples with mutation: from the gene vertex follow "variantIn"
    # edges inward, then "variantCall" and "callSetOf" edges outward, keeping
    # only vertices whose _gid is one of the CCLE samples.
    mutated = (
        O.query()
        .V(i)
        .in_("variantIn")
        .out("variantCall")
        .out("callSetOf")
        .where(aql.in_("_gid", all_samples))
        .render({"gid":"_gid"})
    )
    # A sample can be reached through several variants; de-duplicate via a set.
    mut_samples[g] = list(set(k['gid'] for k in mutated))

    # get CCLE samples without mutation
    norm_samples[g] = list(all_sample_set.difference(mut_samples[g]))

    # Single-argument print(...) produces the same text on Python 2 and 3,
    # whereas the print statement is a SyntaxError on Python 3.
    print("%s Positive Set: %d" % (g, len(mut_samples[g])))
    print("%s Negative Set: %d" % (g, len(norm_samples[g])))


SMAD4 Positive Set: 77
SMAD4 Negative Set: 969
PTEN Positive Set: 139
PTEN Negative Set: 907
CDKN2A Positive Set: 114
CDKN2A Negative Set: 932
TP53 Positive Set: 704
TP53 Negative Set: 342


In [58]:
# Collect drug-response AUC values for the mutated samples of each gene:
#   pos_response[gene symbol][compound gid] -> list of AUC values
pos_response = {}
for g in GENES:
    by_compound = pos_response[g] = {}
    # Pair each response record ("a") with the compound it responds to ("b").
    hits = O.query().V(mut_samples[g]).in_("responseFor").mark("a").out("responseTo").mark("b").select(["a", "b"])
    for row in hits:
        for v in row['a']['data']['summary']:
            # Only the AUC summary entries are of interest.
            if v['type'] != "AUC":
                continue
            compound = row['b']['gid']
            by_compound.setdefault(compound, []).append(v["value"])
   

In [59]:
# Collect drug-response AUC values for the non-mutated samples of each gene:
#   neg_response[gene symbol][compound gid] -> list of AUC values
neg_response = {}
for g in GENES:
    by_compound = neg_response[g] = {}
    # Pair each response record ("a") with the compound it responds to ("b").
    hits = O.query().V(norm_samples[g]).in_("responseFor").mark("a").out("responseTo").mark("b").select(["a", "b"])
    for row in hits:
        for v in row['a']['data']['summary']:
            # Only the AUC summary entries are of interest.
            if v['type'] != "AUC":
                continue
            compound = row['b']['gid']
            by_compound.setdefault(compound, []).append(v["value"])
   

In [62]:
# Compare AUC values of mutated vs. non-mutated samples for every (drug, gene)
# pair that has response data in both groups.

# Every compound seen in at least one gene's positive set.
drugs = {drug for per_drug in pos_response.values() for drug in per_drug}

out = []
for drug, g in itertools.product(drugs, GENES):
    # Need responses on both sides of the comparison.
    if drug not in pos_response[g] or drug not in neg_response[g]:
        continue

    mut_values = pos_response[g][drug]
    norm_values = neg_response[g][drug]
    # Skip pairs where either group has five or fewer observations.
    if len(mut_values) <= 5 or len(norm_values) <= 5:
        continue

    # Two-sample t-test without the equal-variance assumption, then a
    # one-way ANOVA on the same two groups.
    welch = stats.ttest_ind(mut_values, norm_values, equal_var=False)
    anova = stats.f_oneway(mut_values, norm_values)

    row = {
        "drug" : drug,
        "mutation" : g,
        "t-statistic" : welch.statistic,
        "t-pvalue" : welch.pvalue,
        "a-statistic" : anova.statistic,
        "a-pvalue" : anova.pvalue,
    }
    out.append(row)

In [64]:
# Tabulate the test results, smallest ANOVA p-value first.
(
    pandas.DataFrame(
        out,
        columns=["drug", "mutation", "t-statistic", "t-pvalue", "a-statistic", "a-pvalue"],
    )
    .sort_values("a-pvalue")
)

Unnamed: 0,drug,mutation,t-statistic,t-pvalue,a-statistic,a-pvalue
330,compound:UNKNOWN:nutlin-3,TP53,12.748725,8.944658e-30,2.486176e+02,1.119052e-48
793,compound:UNKNOWN:pluripotin,PTEN,7.972738,3.368025e-14,5.889811e+01,2.911128e-14
1150,compound:UNKNOWN:serdemetan,TP53,6.286505,6.274604e-10,5.287950e+01,5.720887e-13
305,compound:UNKNOWN:BMS-536924,PTEN,8.249686,4.432999e-14,4.947380e+01,4.497125e-12
1462,compound:CID31703,TP53,5.732241,1.465723e-08,3.886098e+01,5.838422e-10
257,compound:CID9825149,PTEN,6.102360,9.312908e-09,3.502188e+01,4.941441e-09
938,compound:UNKNOWN:SCH-529074,TP53,4.781053,2.205242e-06,3.046533e+01,4.011900e-08
1189,compound:UNKNOWN:linsitinib,PTEN,7.412015,3.155715e-12,2.960986e+01,7.104802e-08
313,compound:CID16038120,PTEN,6.214502,4.862971e-09,2.881643e+01,1.057912e-07
1405,compound:UNKNOWN:bosutinib,PTEN,5.483273,1.776162e-07,2.714513e+01,2.433327e-07
