Calculating genic recurrence and assessing its association with schizophrenia.

In [1]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import attila_utils
from bsmcalls import readVCF, individuals, preprocessing
import attila_utils
%matplotlib inline

Import annotated, expanded calls

In [2]:
# Load the annotated, expanded call set; the five columns below become the
# row MultiIndex of `calls`.
csvpath = '/home/attila/projects/bsm/results/2020-11-13-functional-variants/expanded-annotated-calls.csv'
call_index_cols = ['Individual ID', 'Tissue', 'CHROM', 'POS', 'Mutation']
calls = pd.read_csv(csvpath, index_col=call_index_cols)

Number of overlapped genes

In [3]:
# Keep only calls that overlap a gene (the literal string 'None' marks calls
# without an overlapped gene).
coding = calls.loc[calls['near_gens_Overlapped Gene'] != 'None']

# Number of *distinct* overlapped genes per diagnosis (aggfunc='nunique');
# the 'All' margin row holds the distinct count over both groups.
ncoding = coding.pivot_table(
    values='near_gens_Overlapped Gene',
    index='Dx',
    aggfunc='nunique',
    margins=True,
).rename(columns={'near_gens_Overlapped Gene': 'num calls'})

# Divide the 'num calls' column (a Series) rather than the whole frame:
# assigning a DataFrame to a single column only works while `ncoding` has
# exactly one column and fails once 'frac calls' already exists.
# Note: the per-Dx fractions need not sum to 1, since a gene overlapped in
# both groups is counted in each group but only once in 'All'.
ncoding['frac calls'] = ncoding['num calls'] / ncoding.loc['All', 'num calls']
ncoding.style.bar(vmin=0)

Unnamed: 0_level_0,num calls,frac calls
Dx,Unnamed: 1_level_1,Unnamed: 2_level_1
Control,471,0.292547
SCZ,1228,0.762733
All,1610,1.0


In [4]:
# Alphabetically sorted list of the distinct overlapped genes among coding calls.
genes = sorted(set(coding['near_gens_Overlapped Gene']))

In [5]:
# For each individual, collect the set of genes overlapped by its calls.
genes_per_indiv = coding.groupby('Individual ID')['near_gens_Overlapped Gene']
s = genes_per_indiv.agg(set)
s

Individual ID
CMC_MSSM_027    {RP11-135J2.4, CAPN8, SLC35F6:CENPA, SLIT3, AN...
CMC_MSSM_055    {LDB3, AFF1, REXO1, GALNT5, NREP:NREP-AS1, GRI...
CMC_MSSM_056    {RP11-260O18.1, ADRBK2, C9, SNED1:AC005237.4, ...
CMC_MSSM_069        {EPHA6, GALNT16, LIPA, CASK, ZNF385D, CNKSR2}
CMC_MSSM_097    {MLIP:MLIP-AS1, ULK4, LYRM7, TNFAIP3, SLC1A2, ...
                                      ...                        
CMC_PITT_098    {MYLK4, RP11-108K3.1, ZNF804B, C9orf89, QSER1,...
CMC_PITT_101    {ZFHX4, MIPOL1, NEGR1, RALGAPB, SLC35E4, HECW1...
CMC_PITT_113    {RNFT2, SEMA3D, ERBB4, QPCT, MARK4, KCNJ6, TUS...
CMC_PITT_117    {AC133680.1, KCNQ5:KCNQ5-IT1, CACNA1E, C9orf96...
CMC_PITT_118    {ZBTB20, RP11-399H11.2, MDGA2, RP11-179A16.1, ...
Name: near_gens_Overlapped Gene, Length: 86, dtype: object

In [6]:
# Invert the individual -> gene-set mapping: for every gene, list the
# individuals (in the order of `s`) whose gene set contains it.
recurrence_dict = {}
for gene in genes:
    carriers = [indiv for indiv, gene_set in s.items() if gene in gene_set]
    recurrence_dict[gene] = carriers
recurrence_ser = pd.Series(recurrence_dict)
recurrence_ser

5S_rRNA                            [CMC_MSSM_391]
AAGAB                              [CMC_MSSM_304]
ABCA12                             [CMC_MSSM_372]
ABCA9:ABCA9-AS1                    [CMC_MSSM_363]
ABCG1                              [CMC_MSSM_352]
                                 ...             
ZNF804A                            [CMC_MSSM_273]
ZNF804B              [CMC_MSSM_130, CMC_PITT_098]
ZYG11A                             [CMC_MSSM_321]
hsa-mir-490                        [CMC_MSSM_097]
hsa-mir-490:CHRM2    [CMC_MSSM_055, CMC_MSSM_099]
Length: 1610, dtype: object

In [7]:
# Look up each individual's diagnosis once instead of slicing `calls` with
# .xs() for every (gene, individual, diagnosis) combination. As before, the
# diagnosis of an individual is the 'Dx' value of its first row in `calls`.
indiv_ids = calls.index.get_level_values('Individual ID')
is_first_row = ~indiv_ids.duplicated()
dx_by_indiv = dict(zip(indiv_ids[is_first_row], calls['Dx'].to_numpy()[is_first_row]))

# For every gene: [number of Control individuals, number of SCZ individuals].
dx_levels = ['Control', 'SCZ']
ss = recurrence_ser.apply(
    lambda indivs: [sum(dx_by_indiv[ind] == dx for ind in indivs) for dx in dx_levels]
)

# Build the two count columns directly from the list pairs rather than
# expanding them row by row with apply(pd.Series).
recurrence = pd.DataFrame(
    ss.tolist(),
    index=ss.index,
    columns=['n Control indiv', 'n SCZ indiv'],
)
recurrence['Individual IDs'] = recurrence_ser

In [8]:
# Display the per-gene table: Control / SCZ individual counts and the IDs.
recurrence

Unnamed: 0,n Control indiv,n SCZ indiv,Individual IDs
5S_rRNA,1,0,[CMC_MSSM_391]
AAGAB,0,1,[CMC_MSSM_304]
ABCA12,0,1,[CMC_MSSM_372]
ABCA9:ABCA9-AS1,0,1,[CMC_MSSM_363]
ABCG1,0,1,[CMC_MSSM_352]
...,...,...,...
ZNF804A,0,1,[CMC_MSSM_273]
ZNF804B,2,0,"[CMC_MSSM_130, CMC_PITT_098]"
ZYG11A,0,1,[CMC_MSSM_321]
hsa-mir-490,0,1,[CMC_MSSM_097]


In [9]:
import seaborn as sns

# Count genes for every (n Control indiv, n SCZ indiv) pair and spread the
# SCZ counts across columns; pairs that never occur become 0.
pair_counts = recurrence.value_counts(['n Control indiv', 'n SCZ indiv'])
joint_recur = (
    pair_counts
    .unstack()
    .fillna(0)
    .astype('int64')
    .rename_axis(index='m Control indiv')
)

# Green colour map, also used by the marginal table in the next cell.
cm = sns.light_palette("green", as_cmap=True)
joint_styler = joint_recur.style.background_gradient(cmap=cm, vmin=0, vmax=100)
joint_styler.set_caption('# overlapping genes in (m, n) individuals')

n SCZ indiv,0,1,2,3,4
m Control indiv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,1035,93,9,2
1,367,65,13,0,0
2,15,8,1,1,0
3,0,1,0,0,0


In [10]:
# Marginal distributions: how many genes are overlapped in n individuals,
# tabulated separately for the Control and the SCZ count columns.
per_dx_counts = []
for count_col in ['n Control indiv', 'n SCZ indiv']:
    per_dx_counts.append(recurrence.value_counts(count_col))

marginal_recur = (
    pd.DataFrame(per_dx_counts, index=['Control', 'SCZ'])
    .T
    .fillna(0)
    .astype('int64')
    .rename_axis(index='n indiv')
)
marginal_styler = marginal_recur.style.background_gradient(cmap=cm, vmin=0, vmax=500)
marginal_styler.set_caption('# overlapping genes in n individuals')

Unnamed: 0_level_0,Control,SCZ
n indiv,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1139,382
1,445,1109
2,25,107
3,1,10
4,0,2
