Testing enrichment for the gene sets defined in the rare-variant paper Purcell et al., Nature. 2014 Feb 13; 506(7487): 185–190 (doi:10.1038/nature12975).

In [1]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.stats as stats
from bsmcalls import SNPnexus
from bsmcalls import operations
from matplotlib import pyplot as plt
import attila_utils
%matplotlib inline

In [3]:
# Annotated call set from the 2020-09-07 annotation run (going by the path).
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
annotated_calls_path = '/home/attila/projects/bsm/results/2020-09-07-annotations/annotated-calls.p'
data = SNPnexus.load_data(annotated_calls_path)

In [48]:
# Gene-set membership table: one row per gene symbol, one column per gene set,
# with 'Y' marking membership and '.' marking non-membership.
# The file is tab-separated despite its .csv extension.
fpath = '/home/attila/projects/bsm/resources/rare-variants/41586_2014_BFnature12975_MOESM71_ESM.csv'
genesets_raw = pd.read_csv(fpath, sep='\t', index_col='Gene symbol').dropna()

# Fail loudly if the table holds anything other than the two expected flags;
# otherwise such cells would silently be treated as non-members below.
assert genesets_raw.isin(['.', 'Y']).all().all(), "unexpected membership flag (expected only '.' or 'Y')"

# Vectorised conversion to plain boolean columns. This replaces a row-wise
# `.apply(..., axis=1)` over categorical rows, which only works when every
# column carries exactly the same categories and costs one Python call per gene.
# 'composite' is True for every gene, i.e. the union of all listed genes.
genesets = genesets_raw.eq('Y').assign(composite=True)
genesets

Unnamed: 0_level_0,PSD,FMRP-target,SCZ de novo,Calcium channel,composite
Gene symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AAK1,False,True,False,False,True
AARS,True,False,False,False,True
AATK,False,True,False,False,True
ABCA2,False,True,False,False,True
ABCA3,False,True,False,False,True
...,...,...,...,...,...
ZNF721,False,False,True,False,True
ZNF77,False,False,True,False,True
ZNFX1,False,True,False,False,True
ZP4,False,False,True,False,True


In [49]:
# Gene-set name -> set of gene symbols whose flag is True in that column.
genesetsd = {setname: set(genesets.index[is_member]) for setname, is_member in genesets.items()}

# Feature name -> queries to run against that feature: a list of annotation
# labels for the variant type, and the named gene sets for the overlapped gene.
querydict = {
    'near_gens_Annotation': [
        'coding nonsyn',
        'coding syn',
        'missense',
        'stop-gain',
        'intronic (splice_site)',
    ],
    'near_gens_Overlapped Gene': genesetsd,
}

# NOTE(review): do_sum=False / do_sort=False presumably return the unsummed,
# unsorted per-call boolean matches — confirm against operations.multiquery.
results = operations.multiquery(querydict, data, do_sum=False, do_sort=False)

In [50]:
# Split the query results by feature (top column level): the two coding
# variant types on one side, the gene-set memberships on the other.
results_vtype = results.xs('near_gens_Annotation', axis=1).loc[:, ['coding nonsyn', 'coding syn']]
results_constr = results.xs('near_gens_Overlapped Gene', axis=1)

# For each variant type, AND its match column with every gene-set column,
# giving one frame per variant type with one column per gene set.
combi_frames = [
    pd.DataFrame({setname: hits & members for setname, members in results_constr.items()})
    for _, hits in results_vtype.items()
]

# Place the per-variant-type frames side by side and label the columns as
# (variant type, gene set); the product order matches the concatenation order.
results_combi = pd.concat(combi_frames, axis=1)
results_combi.columns = pd.MultiIndex.from_product([results_vtype.columns, results_constr.columns])

In [51]:
# Summarise the single-feature queries together with the
# (variant type AND gene set) combinations in one table.
results_all = pd.concat([results, results_combi], axis=1)
summary = operations.summarize_query_results(results_all, data)
summary

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)
  terms = (f_obs.astype(np.float64) - f_exp)**2 / f_exp


Unnamed: 0_level_0,Unnamed: 1_level_0,Control,SCZ,ASD,All,chisq stat,chisq p
Feature,Query,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
near_gens_Annotation,coding nonsyn,23,43,23,89,4.48034,0.10644
near_gens_Annotation,coding syn,16,20,15,51,1.177849,0.554924
near_gens_Annotation,missense,0,0,0,0,,
near_gens_Annotation,stop-gain,1,0,0,1,2.887477,0.236044
near_gens_Annotation,intronic (splice_site),0,1,0,1,1.582797,0.45321
near_gens_Overlapped Gene,PSD,42,70,71,183,1.097207,0.577756
near_gens_Overlapped Gene,FMRP-target,88,139,114,341,0.803238,0.669236
near_gens_Overlapped Gene,SCZ de novo,43,69,69,181,0.623119,0.732304
near_gens_Overlapped Gene,Calcium channel,2,9,2,13,5.15427,0.075991
near_gens_Overlapped Gene,composite,140,237,192,569,2.069647,0.355289


In [2]:
# Debugging aid: prints the running kernel's connection details (ports, ip, key)
# so that another Jupyter front end can attach to this kernel.
# NOTE(review): the printed JSON includes the kernel's session key — clear this
# cell's output (or drop the cell) before sharing the notebook.
%connect_info

{
  "shell_port": 55089,
  "iopub_port": 44629,
  "stdin_port": 57407,
  "control_port": 54101,
  "hb_port": 45817,
  "ip": "127.0.0.1",
  "key": "1b847f2d-6c79c0c9c9f33bc6e1171f04",
  "transport": "tcp",
  "signature_scheme": "hmac-sha256",
  "kernel_name": ""
}

Paste the above JSON into a file, and connect with:
    $> jupyter <app> --existing <file>
or, if you are local, you can connect with just:
    $> jupyter <app> --existing kernel-63c244bb-5234-4905-a909-ad56793c2fbc.json
or even just:
    $> jupyter <app> --existing
if this is the most recent Jupyter kernel you have started.
