Various GWAS genesets are tested for enrichment in SCZ somatic variants.

In [41]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.stats as stats
from bsmcalls import SNPnexus
from bsmcalls import operations
from bsmcalls import resources
from matplotlib import pyplot as plt
import attila_utils
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Load the annotated calls with the project's SNPnexus loader
callspath = '/home/attila/projects/bsm/results/2020-09-07-annotations/annotated-calls.p'
data = SNPnexus.load_data(callspath)

## Autism spectrum disorder (ASD)
Read the autism GWAS loci from the GWAS Catalog into a DataFrame.

In [3]:
# The autism spectrum disorder associations are stored as a tab-separated file
gwaspath = '/home/attila/projects/bsm/resources/GWAS/autism-spectrum-disorder.tsv'
asdloci = pd.read_csv(gwaspath, delimiter='\t')

We will select the study "Identification of common genetic risk variants for autism spectrum disorder" (PMID [30804558](https://www.ncbi.nlm.nih.gov/pubmed/30804558)).

In [4]:
# Keep only the associations that belong to the selected study (PMID 30804558)
from_selected_study = asdloci['PUBMEDID'].eq(30804558)
asdloci = asdloci[from_selected_study]
# Display just the position and gene columns of the remaining loci
selcols = ['CHR_ID', 'CHR_POS', 'REPORTED GENE(S)', 'MAPPED_GENE']
asdloci.loc[:, selcols]

Unnamed: 0,CHR_ID,CHR_POS,REPORTED GENE(S),MAPPED_GENE
94,1,96096246,"LOC102723661, PTBP2",LINC02790 - RNU1-130P
95,1,96513405,Intergenic,EEF1A1P11 - RN7SL831P
96,1,98627228,Intergenic,LINC01776 - AC095031.1
97,1,104249635,Intergenic,AC092506.1 - AL591888.1
98,1,193533479,Intergenic,AL136456.1
...,...,...,...,...
151,20,21267478,"KIZ, XRN2, NKX2-2, NKX2-4",ZNF877P - AL117332.1
152,5,104676602,,AC099520.1
153,14,103551616,,RNU7-160P - AL139300.2
154,1,72263459,,NEGR1


In [5]:
# Collect the author-reported genes: each non-missing cell holds a
# ', '-separated list, so split every cell and flatten the pieces into one set.
# explode() flattens in a single pass (summing the lists concatenates them
# pairwise) and still yields an iterable when no cell is left after dropna().
reported = set(asdloci['REPORTED GENE(S)'].dropna().str.split(', ').explode())
# 'Intergenic' is a placeholder, not a gene name.  discard() is used instead of
# remove() so the cell does not raise KeyError when the placeholder is absent.
reported.discard('Intergenic')
reported

{'C8orf74',
 'CADPS',
 'KIZ',
 'KMT2E',
 'LOC102723661',
 'MACROD2',
 'MROH5',
 'NKX2-2',
 'NKX2-4',
 'PINX1',
 'PTBP2',
 'SOX7',
 'SRPK2',
 'XRN2'}

In [6]:
# MAPPED_GENE entries are delimited either by ', ' or by ' - '.  Splitting on
# both delimiters in one pass also separates an entry that contains both of
# them; splitting comma entries on ', ' only would leave 'A - B' as one name.
# Hyphens inside gene names (e.g. 'LINC02210-CRHR1') carry no surrounding
# spaces, so they are not treated as delimiters.
mappeds = asdloci['MAPPED_GENE'].dropna()
# The multi-character pattern is interpreted by pandas as a regular expression
mapped = set(mappeds.str.split(r', | - ').explode())
# Final ASD gene set: catalog-mapped genes together with the author-reported ones
asd_gwas_genes = mapped.union(reported)
print(len(asd_gwas_genes), 'genes')
print(asd_gwas_genes)

79 genes
{'LINC01776', 'EEF1A1P11', 'AL589740.1', 'PAUPAR', 'MAPT', 'ESRRB', 'AC021192.2', 'LINC02790', 'AL117332.1', 'SERPINA2', 'PTBP2', 'XRN2', 'EGR2', 'RNU1-130P', 'BEND4', 'AL139300.2', 'AC095031.1', 'AC120036.1', 'RSU1', 'CADPS', 'GALNT10', 'C8orf74', 'GALNT1', 'AC013287.1', 'NRGN', 'U95743.1', 'OLFM4', 'AC090987.1', 'RN7SL831P', 'NEGR1', 'MROH5', 'AL133270.1', 'KCNN2', 'NKX2-4', 'AL391117.1', 'AC092506.1', 'RSRC1', 'RNU7-160P', 'SRPK2', 'AL591888.1', 'AC099520.1', 'ITIH4', 'ZNF568', 'AL022724.3', 'AC026320.3', 'AC005070.3', 'NKX2-2', 'ZNF877P', 'LINC02058', 'AL032822.1', 'ASAP1', 'PINX1', 'AL136456.1', 'GUCY1A2', 'AC025839.1', 'PCDH9', 'LINC00461', 'KMT2E', 'SGO1-AS1', 'KANSL1', 'ADTRP', 'AL035078.4', 'RBFOX1', 'SOX7', 'MACROD2', 'AC022387.1', 'FGF12', 'SERPINA1', 'KIZ', 'GABBR1', 'ITIH3', 'LOC102723661', 'LINC02210-CRHR1', 'AL139093.1', 'RP1L1', 'PKP4', 'U91319.1', 'NEDD4L', 'SORCS3'}


Let us read in the set of SCZ GWAS genes!

In [7]:
# Read the CLOZUK table, skipping the first 7 lines of the file, and let the
# project helper build the gene set from its 'Gene(s) tagged' column
clozukpath = '/home/attila/projects/bsm/resources/CLOZUK/supp-table-4.csv'
clozuk_table = pd.read_csv(clozukpath, skiprows=7)
scz_gwas_genes = operations.get_geneset(df=clozuk_table, col='Gene(s) tagged')

In [8]:
# Named gene sets that are queried against the 'near_gens_Overlapped Gene' feature
genesets = {
    'SCZ GWAS genes': scz_gwas_genes,
    'ASD GWAS genes': asd_gwas_genes,
    'ASD reported GWAS genes': reported,
}
# Annotation values queried for the 'near_gens_Annotation' feature
coding_annotations = ['coding nonsyn', 'coding syn', 'missense', 'stop-gain', 'intronic (splice_site)']
# One entry per feature: a list of values, None, or a dict of named gene sets
querydict = {
    'near_gens_Annotation': coding_annotations,
    'ensembl_Predicted Function': ['coding'],
    'sift_Prediction': ['Deleterious', 'Deleterious - Low Confidence'],
    'polyphen_Prediction': ['Probably Damaging', 'Possibly Damaging'],
    'tfbs_TFBS Name': None,
    'phast_Score': None,
    'gerp_Element RS Score': None,
    'cpg_CpG Island': None,
    'near_gens_Overlapped Gene': genesets,
}
# Run all queries on the calls, then tabulate them with the chi-square columns
results = operations.multiquery(querydict, data, do_sort=False, do_sum=False)
summary = operations.summarize_query_results(results, data, margin=False, chisq=True)
summary

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)
  terms = (f_obs.astype(np.float64) - f_exp)**2 / f_exp


Unnamed: 0_level_0,Unnamed: 1_level_0,Control,SCZ,ASD,chisq stat,chisq p
Feature,Query,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
near_gens_Annotation,coding nonsyn,23,43,23,4.48034,0.10644
near_gens_Annotation,coding syn,16,20,15,1.177849,0.554924
near_gens_Annotation,missense,0,0,0,,
near_gens_Annotation,stop-gain,1,0,0,2.887477,0.236044
near_gens_Annotation,intronic (splice_site),0,1,0,1.582797,0.45321
ensembl_Predicted Function,coding,39,64,40,3.794387,0.149989
sift_Prediction,Deleterious,8,9,6,1.315056,0.518131
sift_Prediction,Deleterious - Low Confidence,1,1,2,0.429823,0.806613
polyphen_Prediction,Probably Damaging,3,5,2,1.080625,0.582566
polyphen_Prediction,Possibly Damaging,2,2,0,2.470275,0.290795


## Psychiatric disorders

The [Analysis of shared heritability in common disorders of the brain](https://pubmed.ncbi.nlm.nih.gov/29930110/) showed the following psychiatric disorders to correlate significantly with schizophrenia:
* ADHD; PMID: [30478444](https://pubmed.ncbi.nlm.nih.gov/30478444/); Demontis et al 2019 Nat Genet
* Anorexia nervosa; PMID: [31308545](https://pubmed.ncbi.nlm.nih.gov/31308545/); Watson et al 2019 Nat Genet
* ASD; PMID: [30804558](https://pubmed.ncbi.nlm.nih.gov/30804558/); Grove et al 2019 Nat Genet
* Bipolar disorder; PMID: [31043756](https://pubmed.ncbi.nlm.nih.gov/31043756/); Stahl et al 2019 Nat Genet
* MDD; PMID: [30718901](https://pubmed.ncbi.nlm.nih.gov/30718901/); Howard et al 2019 Nat Neurosci
* OCD; PMID: [31891238](https://pubmed.ncbi.nlm.nih.gov/31891238/); Smit et al 2020 Am J Med Genet B Neuropsychiatr Genet
* Schizophrenia; PMID: [29483656](https://pubmed.ncbi.nlm.nih.gov/29483656/); Pardiñas et al 2018 Nat Genet
* Tourette syndrome; PMID: [30818990](https://pubmed.ncbi.nlm.nih.gov/30818990/); Yu et al 2019 Am J Psychiatry

In [88]:
# Table of the selected GWAS, indexed by trait.  The file is tab-separated
# even though its name ends in .csv.
fpath = '/home/attila/projects/bsm/notebook/2021-03-09-gwas-genes/selected-gwas.csv'
selgwas = pd.read_csv(fpath, delimiter='\t', index_col='Trait')
selgwas

Unnamed: 0_level_0,Trait type,PMID,Citation
Trait,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ADHD,psychiatric,30478444,Demontis et al 2019 Nat Genet
Anorexia nervosa,psychiatric,31308545,Watson et al 2019 Nat Genet
ASD,psychiatric,30804558,Grove et al 2019 Nat Genet
Bipolar disorder,psychiatric,31043756,Stahl et al 2019 Nat Genet
MDD,psychiatric,30718901,Howard et al 2019 Nat Neurosci
Shizophrenia,psychiatric,29483656,Pardiñas et al 2018 Nat Genet
Tourette syndrome,psychiatric,30818990,Yu et al 2019 Am J Psychiatry


In [29]:
# Read the full GWAS Catalog association table (tab-separated).
# NOTE(review): the recorded output of this cell ends with the tail of a
# warning, presumably pandas' DtypeWarning about mixed-type columns -- confirm.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which is the remedy that warning recommends.
fpath = '/home/attila/projects/bsm/resources/GWAS/gwas_catalog_v1.0.2-associations_e100_r2021-02-25.tsv'
gwas = pd.read_csv(fpath, sep='\t', low_memory=False)
# Keep only the associations published by the studies listed in selgwas
gwas = gwas.loc[gwas['PUBMEDID'].isin(selgwas['PMID']), :]

  interactivity=interactivity, compiler=compiler, result=result)


In [91]:
# Derive gene sets from the filtered catalog associations and the table of
# selected studies; D is used below as the 'near_gens_Overlapped Gene' query.
# NOTE(review): presumably a dict mapping trait name to gene set -- confirm
# against resources.gwascat_multi_genesets.
D = resources.gwascat_multi_genesets(gwas, selgwas)

In [93]:
# Annotation values queried for the 'near_gens_Annotation' feature
annotation_values = ['coding nonsyn', 'coding syn', 'missense', 'stop-gain', 'intronic (splice_site)']
# Same feature queries as for the ASD analysis, except that the overlapped-gene
# query now uses the gene sets in D
querydict = {
    'near_gens_Annotation': annotation_values,
    'ensembl_Predicted Function': ['coding'],
    'sift_Prediction': ['Deleterious', 'Deleterious - Low Confidence'],
    'polyphen_Prediction': ['Probably Damaging', 'Possibly Damaging'],
    'tfbs_TFBS Name': None,
    'phast_Score': None,
    'gerp_Element RS Score': None,
    'cpg_CpG Island': None,
    'near_gens_Overlapped Gene': D,
}
# Run all queries on the calls, then tabulate them with the chi-square columns
results = operations.multiquery(querydict, data, do_sort=False, do_sum=False)
summary = operations.summarize_query_results(results, data, margin=False, chisq=True)
summary

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)
  terms = (f_obs.astype(np.float64) - f_exp)**2 / f_exp


Unnamed: 0_level_0,Unnamed: 1_level_0,Control,SCZ,ASD,chisq stat,chisq p
Feature,Query,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
near_gens_Annotation,coding nonsyn,23,43,23,4.48034,0.10644
near_gens_Annotation,coding syn,16,20,15,1.177849,0.554924
near_gens_Annotation,missense,0,0,0,,
near_gens_Annotation,stop-gain,1,0,0,2.887477,0.236044
near_gens_Annotation,intronic (splice_site),0,1,0,1.582797,0.45321
ensembl_Predicted Function,coding,39,64,40,3.794387,0.149989
sift_Prediction,Deleterious,8,9,6,1.315056,0.518131
sift_Prediction,Deleterious - Low Confidence,1,1,2,0.429823,0.806613
polyphen_Prediction,Probably Damaging,3,5,2,1.080625,0.582566
polyphen_Prediction,Possibly Damaging,2,2,0,2.470275,0.290795


In [58]:
# Debugging aid: prints the connection details of the running kernel.
# NOTE(review): the recorded output includes the kernel's signing key; consider
# removing this cell and clearing its output before sharing the notebook.
%connect_info

{
  "shell_port": 41725,
  "iopub_port": 36167,
  "stdin_port": 60409,
  "control_port": 40337,
  "hb_port": 52123,
  "ip": "127.0.0.1",
  "key": "96401c45-fa6385b36e8d4ec92ba12e38",
  "transport": "tcp",
  "signature_scheme": "hmac-sha256",
  "kernel_name": ""
}

Paste the above JSON into a file, and connect with:
    $> jupyter <app> --existing <file>
or, if you are local, you can connect with just:
    $> jupyter <app> --existing kernel-3022da89-61f5-46b7-b6e2-9f710614c66e.json
or even just:
    $> jupyter <app> --existing
if this is the most recent Jupyter kernel you have started.
