In [69]:
import collections

import numpy as np
import pandas as pd
from scipy.stats import binom, hypergeom
from statsmodels.sandbox.stats.multicomp import multipletests

from tqdm.auto import tqdm

#from bioinf_common.tools import multipletests_nan

from tad_helper_functions import get_tad_lengths, EmptyTAD, TADTooSmall, OverlappingTADs
from typing import Any, Optional, List, Sequence
import statsmodels.api as sm

In [2]:
def multipletests_nan(pval_list: List, method: str = 'fdr_bh') -> List:
    """Multiple testing correction on lists with NaNs."""
    pval_list = np.asarray(pval_list)

    nan_idx, = np.where(np.isnan(pval_list))
    pval_list_nonan = pval_list[~np.isnan(pval_list)]

    if len(pval_list_nonan) == 0:
        assert np.isnan(pval_list).all()  # contains only np.nan
        pval_corr = pval_list
    else:
        _, pval_corr, _, _ = multipletests(pval_list_nonan, method=method)
        for i in nan_idx:
            pval_corr = np.insert(pval_corr, i, np.nan)

    return pval_corr

# Parameters

In [37]:
db_fname = '../../results/databases/per_source/snpdb.Rao2014-IMR90-MboI-allreps-filtered-10kb.5.csv'
tads_fname = '../../results/tads/data/tads.Rao2014-IMR90-MboI-allreps-filtered-10kb.5.csv'
info_fname = '../../results/hic_files/info.csv'

source = 'Rao2014-IMR90-MboI-allreps-filtered-10kb'
filter_type = 'exonic'
tad_borders = {'50in': [50000, 0]}
allow_snp_multiplicity_in_enrichment = False
enrichment_distribution = 'hypergeom'
enrichment_null_model = 'base_sample'

#fname_out = snakemake.output.fname

# Load data

In [38]:
df = pd.read_csv(db_fname)
display(df.head())

disease_cancer_map = df.set_index('diseaseId').to_dict()['is_cancer']

Unnamed: 0,diseaseId,snpId,snp_source,diseaseIdType,odds_ratio,associated_genes,diseaseLabel,is_cancer,chromosome_hg19,chromosome_hg38,...,filter_nofilter_hg38,filter_exonic_hg19,filter_exonic_hg38,filter_intronic_hg19,filter_intronic_hg38,filter_intergenic_hg19,filter_intergenic_hg38,filter_nonexonic_hg19,filter_nonexonic_hg38,50in
0,EFO_0004278,rs190759,gwas_catalog,EFO,1.11,,sudden cardiac arrest,False,6,6,...,True,False,False,False,False,True,True,True,True,tad
1,EFO_0004278,rs1823172,gwas_catalog,EFO,1.17,,sudden cardiac arrest,False,12,12,...,True,False,False,False,False,True,True,True,True,border
2,EFO_0004278,rs950776,gwas_catalog,EFO,1.09,ENSG00000117971,sudden cardiac arrest,False,15,15,...,True,False,False,True,True,False,False,True,True,tad
3,EFO_0004278,rs2251393,gwas_catalog,EFO,1.13,"ENSG00000173838,ENSG00000265702",sudden cardiac arrest,False,17,17,...,True,True,True,False,False,False,False,False,False,border
4,EFO_0004278,rs944260,gwas_catalog,EFO,1.1,"ENSG00000179242,ENSG00000280641",sudden cardiac arrest,False,20,20,...,True,False,False,True,True,False,False,True,True,tad


In [39]:
df_info = pd.read_csv(info_fname, index_col=1)
genome_assembly = df_info.loc[source, 'genome_assembly']
genome_assembly

'hg19'

# Get TAD stats

In [40]:
genome_length = {
    'hg19': 2_991_688_216,  # https://www.ncbi.nlm.nih.gov/grc/human/data?asm=GRCh37.p13
    'hg38': 3_092_480_053,  # https://www.ncbi.nlm.nih.gov/grc/human/data?asm=GRCh38.p11
}[genome_assembly]

df_tads = pd.read_csv(tads_fname)
df_tads.head()

Unnamed: 0,chrname,tad_start,tad_stop
0,chr1,700000,910000
1,chr1,910000,1180000
2,chr1,1180000,1240000
3,chr1,1240000,1310000
4,chr1,1310000,1850000


In [41]:
df_tads['prev_tad_stop'] = df_tads.tad_stop.shift(1)
df_tads['next_tad_start'] = df_tads.tad_start.shift(-1)
df_tads['prev_tad_chr'] = df_tads.chrname.shift(1)
df_tads['next_tad_chr'] = df_tads.chrname.shift(-1)

In [42]:
tad_statistics = collections.defaultdict(dict)

for border_name, border_range in tqdm(tad_borders.items()):
    tad_len = 0
    border_len = 0
    chrom_lens = collections.defaultdict(list)
    for row in df_tads.itertuples():
        try:
            b1_range, tad_range, b2_range = get_tad_lengths(row, border_range)
        except (EmptyTAD, TADTooSmall, OverlappingTADs):
            continue

        tad_len += tad_range.stop - tad_range.start
        border_len += (b1_range.stop - b1_range.start) + (
            b2_range.stop - b2_range.start
        )

        chrom_lens[row.chrname].append(row.tad_stop)

    outside_len = genome_length - tad_len - border_len

    tad_statistics[border_name]['chrom'] = genome_length
    tad_statistics[border_name]['tad'] = tad_len
    tad_statistics[border_name]['border'] = border_len
    tad_statistics[border_name]['outside'] = outside_len

tad_statistics = dict(tad_statistics)
tad_statistics

  0%|          | 0/1 [00:00<?, ?it/s]

{'50in': {'chrom': 2991688216,
  'tad': 1905530000,
  'border': 860390000,
  'outside': 225768216}}

# Compute enrichments

In [48]:
filter_column = f'filter_{filter_type}_{genome_assembly}'
filter_column

'filter_exonic_hg19'

In [49]:
df[df[filter_column]]

Unnamed: 0,diseaseId,snpId,snp_source,diseaseIdType,odds_ratio,associated_genes,diseaseLabel,is_cancer,chromosome_hg19,chromosome_hg38,...,filter_nofilter_hg38,filter_exonic_hg19,filter_exonic_hg38,filter_intronic_hg19,filter_intronic_hg38,filter_intergenic_hg19,filter_intergenic_hg38,filter_nonexonic_hg19,filter_nonexonic_hg38,50in
3,EFO_0004278,rs2251393,gwas_catalog,EFO,1.130000,"ENSG00000173838,ENSG00000265702",sudden cardiac arrest,False,17,17,...,True,True,True,False,False,False,False,False,False,border
7,EFO_0004278,rs7157599,gwas_catalog,EFO,1.130000,ENSG00000168350,sudden cardiac arrest,False,14,14,...,True,True,True,False,False,False,False,False,False,tad
14,EFO_0004278,rs16872085,gwas_catalog,EFO,1.830000,ENSG00000169946,sudden cardiac arrest,False,8,8,...,True,True,False,False,True,False,False,False,True,tad
27,EFO_0004278,rs17291650,gwas_catalog,EFO,1.430000,ENSG00000123268,sudden cardiac arrest,False,12,12,...,True,True,True,False,False,False,False,False,False,border
145,EFO_0000341,rs13180,gwas_catalog,EFO,1.300000,ENSG00000136381,chronic obstructive pulmonary disease,False,15,15,...,True,True,True,False,False,False,False,False,False,border
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84826,EFO_0009382,rs9028,gwas_catalog,EFO,5.048128,ENSG00000136514,metabolically healthy obesity,False,3,3,...,True,True,True,False,False,False,False,False,False,border
84851,EFO_0009761,rs2242442,gwas_catalog,EFO,1.291753,"ENSG00000135111,ENSG00000257817",periprosthetic osteolysis,False,12,12,...,True,True,True,False,False,False,False,False,False,border
84878,EFO_0000684,rs41283642,gwas_catalog,EFO,,ENSG00000106799,respiratory system disease,False,9,9,...,True,True,True,False,False,False,False,False,False,border
84906,EFO_0000684,rs10975277,gwas_catalog,EFO,,"ENSG00000107036,ENSG00000099219",respiratory system disease,False,9,9,...,True,True,True,False,False,False,False,False,False,border


In [50]:
df_filter_sub = df[df[filter_column]]
print(df.shape, df_filter_sub.shape)

(84920, 27) (7200, 27)


In [46]:
df_filter_sub.head()

Unnamed: 0,diseaseId,snpId,snp_source,diseaseIdType,odds_ratio,associated_genes,diseaseLabel,is_cancer,chromosome_hg19,chromosome_hg38,...,filter_nofilter_hg38,filter_exonic_hg19,filter_exonic_hg38,filter_intronic_hg19,filter_intronic_hg38,filter_intergenic_hg19,filter_intergenic_hg38,filter_nonexonic_hg19,filter_nonexonic_hg38,50in
3,EFO_0004278,rs2251393,gwas_catalog,EFO,1.13,"ENSG00000173838,ENSG00000265702",sudden cardiac arrest,False,17,17,...,True,True,True,False,False,False,False,False,False,border
7,EFO_0004278,rs7157599,gwas_catalog,EFO,1.13,ENSG00000168350,sudden cardiac arrest,False,14,14,...,True,True,True,False,False,False,False,False,False,tad
14,EFO_0004278,rs16872085,gwas_catalog,EFO,1.83,ENSG00000169946,sudden cardiac arrest,False,8,8,...,True,True,False,False,True,False,False,False,True,tad
27,EFO_0004278,rs17291650,gwas_catalog,EFO,1.43,ENSG00000123268,sudden cardiac arrest,False,12,12,...,True,True,True,False,False,False,False,False,False,border
145,EFO_0000341,rs13180,gwas_catalog,EFO,1.3,ENSG00000136381,chronic obstructive pulmonary disease,False,15,15,...,True,True,True,False,False,False,False,False,False,border


In [68]:
enr_result = []
group = df_filter_sub[df_filter_sub['diseaseId'] == 'EFO_0001645']
for border_name in tad_borders.keys():
        print(border_name)
        # get TAD-related statistics
        if allow_snp_multiplicity_in_enrichment:
            tads = group[border_name].tolist()

            snp_counts = {
                'total': df_filter_sub['snpId'].shape[0],
                'tad': df_filter_sub.loc[
                    df_filter_sub[border_name] == 'tad', 'snpId'
                ].shape[0],
                'border': df_filter_sub.loc[
                    df_filter_sub[border_name] == 'border', 'snpId'
                ].shape[0],
                'outside': df_filter_sub.loc[
                    df_filter_sub[border_name] == 'outside', 'snpId'
                ].shape[0],
            }
        else:
            tads = group[['snpId', border_name]].drop_duplicates(subset='snpId')[
                border_name
            ]

            snp_counts = {
                'total': df_filter_sub['snpId'].drop_duplicates().shape[0],
                'tad': df_filter_sub.loc[df_filter_sub[border_name] == 'tad', 'snpId']
                .drop_duplicates()
                .shape[0],
                'border': df_filter_sub.loc[
                    df_filter_sub[border_name] == 'border', 'snpId'
                ]
                .drop_duplicates()
                .shape[0],
                'outside': df_filter_sub.loc[
                    df_filter_sub[border_name] == 'outside', 'snpId'
                ]
                .drop_duplicates()
                .shape[0],
            }

        N = len(tads)
        counts = collections.Counter(tads)

        # compute enrichment
        if counts['border'] == 0:
            cdf_tad = np.nan
            cdf_border = np.nan
            cdf_outside = np.nan
        else:
            if enrichment_null_model == 'base_sample':
                # get overall lengths
                cur_cl = tad_statistics[border_name]['chrom']
                cur_tl = tad_statistics[border_name]['tad']
                cur_bl = tad_statistics[border_name]['border']
                cur_nl = tad_statistics[border_name]['outside']
                #tads = group[['snpId', border_name]][border_name]
                
                print(cur_bl, cur_cl, cur_nl, cur_tl, '\n')
                print('N: ', N, '\n')
                print('counts outside:', counts['outside'], '\n')
                print('counts border:', counts['border'], '\n')
                print('counts tad:', counts['tad'], '\n')
                print('snp counts: ', snp_counts, '\n')
                print('tads: ', tads, '\n')

                if enrichment_distribution == 'binom':
                    cdf_tad = binom.cdf(counts['tad'], N, cur_tl / cur_cl)
                    cdf_border = binom.cdf(counts['border'], N, cur_bl / cur_cl)
                    cdf_outside = binom.cdf(counts['outside'], N, cur_nl / cur_cl)
                elif enrichment_distribution == 'hypergeom':
                    cdf_tad = hypergeom.cdf(counts['tad'], cur_cl, N, cur_tl)
                    cdf_border = hypergeom.cdf(counts['border'], cur_cl, N, cur_bl)
                    cdf_outside = hypergeom.cdf(counts['outside'], cur_cl, N, cur_nl)
                    
                    print(cdf_tad, cdf_border, cdf_outside, '\n')
            elif enrichment_null_model == 'snp_sample':
                if enrichment_distribution == 'binom':
                    cdf_tad = binom.cdf(
                        counts['tad'], N, snp_counts['tad'] / snp_counts['total']
                    )
                    cdf_border = binom.cdf(
                        counts['border'], N, snp_counts['border'] / snp_counts['total']
                    )
                    cdf_outside = binom.cdf(
                        counts['outside'],
                        N,
                        snp_counts['outside'] / snp_counts['total'],
                    )
                elif enrichment_distribution == 'hypergeom':
                    cdf_tad = hypergeom.cdf(
                        counts['tad'], snp_counts['total'], N, snp_counts['tad']
                    )
                    cdf_border = hypergeom.cdf(
                        counts['border'], snp_counts['total'], N, snp_counts['border']
                    )
                    cdf_outside = hypergeom.cdf(
                        counts['outside'], snp_counts['total'], N, snp_counts['outside']
                    )

        enr_result.append(
            {
                'diseaseId': disease,
                '#snp': N,
                '#border_snp': counts['border'],
                'pval_tad': 1 - cdf_tad,
                'pval_border': 1 - cdf_border,
                'pval_outside': 1 - cdf_outside,
                'TAD_type': border_name,
            }
        )
df_enr = pd.DataFrame(enr_result)
    

50in
860390000 2991688216 225768216 1905530000 

N:  94 

counts outside: 7 

counts border: 53 

counts tad: 34 

snp counts:  {'total': 2998, 'tad': 1353, 'border': 1551, 'outside': 93} 

tads:  2344     border
4475     border
8904     border
9145     border
10017       tad
          ...  
42689       tad
42705    border
42709    border
42724       tad
42739    border
Name: 50in, Length: 94, dtype: object 

5.5732578059455974e-08 0.999999993797705 nan 



In [61]:
enr_result = []
for disease, group in tqdm(
    df_filter_sub.groupby('diseaseId'), total=df_filter_sub['diseaseId'].nunique()
):
    for border_name in tad_borders.keys():
        print(border_name)
        # get TAD-related statistics
        if allow_snp_multiplicity_in_enrichment:
            tads = group[border_name].tolist()

            snp_counts = {
                'total': df_filter_sub['snpId'].shape[0],
                'tad': df_filter_sub.loc[
                    df_filter_sub[border_name] == 'tad', 'snpId'
                ].shape[0],
                'border': df_filter_sub.loc[
                    df_filter_sub[border_name] == 'border', 'snpId'
                ].shape[0],
                'outside': df_filter_sub.loc[
                    df_filter_sub[border_name] == 'outside', 'snpId'
                ].shape[0],
            }
        else:
            tads = group[['snpId', border_name]].drop_duplicates(subset='snpId')[
                border_name
            ]

            snp_counts = {
                'total': df_filter_sub['snpId'].drop_duplicates().shape[0],
                'tad': df_filter_sub.loc[df_filter_sub[border_name] == 'tad', 'snpId']
                .drop_duplicates()
                .shape[0],
                'border': df_filter_sub.loc[
                    df_filter_sub[border_name] == 'border', 'snpId'
                ]
                .drop_duplicates()
                .shape[0],
                'outside': df_filter_sub.loc[
                    df_filter_sub[border_name] == 'outside', 'snpId'
                ]
                .drop_duplicates()
                .shape[0],
            }

        N = len(tads)
        counts = collections.Counter(tads)

        # compute enrichment
        if counts['border'] == 0:
            cdf_tad = np.nan
            cdf_border = np.nan
            cdf_outside = np.nan
        else:
            if enrichment_null_model == 'base_sample':
                # get overall lengths
                cur_cl = tad_statistics[border_name]['chrom']
                cur_tl = tad_statistics[border_name]['tad']
                cur_bl = tad_statistics[border_name]['border']
                cur_nl = tad_statistics[border_name]['outside']
                #tads = group[['snpId', border_name]][border_name]
                
                print(cur_bl, cur_cl, cur_nl, cur_tl, '\n')
                print('N: ', N, '\n')
                print('counts outside:', counts['outside'], '\n')
                print('snp counts: ', snp_counts, '\n')
                print('tads: ', tads, '\n')

                if enrichment_distribution == 'binom':
                    cdf_tad = binom.cdf(counts['tad'], N, cur_tl / cur_cl)
                    cdf_border = binom.cdf(counts['border'], N, cur_bl / cur_cl)
                    cdf_outside = binom.cdf(counts['outside'], N, cur_nl / cur_cl)
                elif enrichment_distribution == 'hypergeom':
                    cdf_tad = hypergeom.cdf(counts['tad'], cur_cl, N, cur_tl)
                    cdf_border = hypergeom.cdf(counts['border'], cur_cl, N, cur_bl)
                    cdf_outside = hypergeom.cdf(counts['outside'], cur_cl, N, cur_nl)
                    
                    print(cdf_tad, cdf_border, cdf_outside, '\n')
            elif enrichment_null_model == 'snp_sample':
                if enrichment_distribution == 'binom':
                    cdf_tad = binom.cdf(
                        counts['tad'], N, snp_counts['tad'] / snp_counts['total']
                    )
                    cdf_border = binom.cdf(
                        counts['border'], N, snp_counts['border'] / snp_counts['total']
                    )
                    cdf_outside = binom.cdf(
                        counts['outside'],
                        N,
                        snp_counts['outside'] / snp_counts['total'],
                    )
                elif enrichment_distribution == 'hypergeom':
                    cdf_tad = hypergeom.cdf(
                        counts['tad'], snp_counts['total'], N, snp_counts['tad']
                    )
                    cdf_border = hypergeom.cdf(
                        counts['border'], snp_counts['total'], N, snp_counts['border']
                    )
                    cdf_outside = hypergeom.cdf(
                        counts['outside'], snp_counts['total'], N, snp_counts['outside']
                    )

        enr_result.append(
            {
                'diseaseId': disease,
                '#snp': N,
                '#border_snp': counts['border'],
                'pval_tad': 1 - cdf_tad,
                'pval_border': 1 - cdf_border,
                'pval_outside': 1 - cdf_outside,
                'TAD_type': border_name,
            }
        )
df_enr = pd.DataFrame(enr_result)

  0%|          | 0/543 [00:00<?, ?it/s]

50in
50in
860390000 2991688216 225768216 1905530000 

N:  12 

counts outside: 0 

snp counts:  {'total': 2998, 'tad': 1353, 'border': 1551, 'outside': 93} 

tads:  25764    border
25775       tad
25846    border
25851       tad
25860       tad
25861       tad
25865       tad
25880    border
25910    border
25923       tad
25933       tad
25958       tad
Name: 50in, dtype: object 

0.6877819608145959 0.7557115934728031 nan 

50in
860390000 2991688216 225768216 1905530000 

N:  6 

counts outside: 0 

snp counts:  {'total': 2998, 'tad': 1353, 'border': 1551, 'outside': 93} 

tads:  24104       tad
45401       tad
49883    border
49917    border
49933       tad
49934       tad
Name: 50in, dtype: object 

0.7048649136919247 0.7669367754853769 nan 

50in
860390000 2991688216 225768216 1905530000 

N:  33 

counts outside: 0 

snp counts:  {'total': 2998, 'tad': 1353, 'border': 1551, 'outside': 93} 

tads:  11517    border
11551    border
11559    border
11570       tad
11579       tad
1158

In [74]:
from scipy.stats import hypergeom

# Example with smaller numbers to see if the CDF calculation works
cdf_outside = hypergeom.cdf(1, 1000000000, 50, 2000000)
print(cdf_outside)


0.9954028136720371


In [63]:
cdf_outside = hypergeom.cdf(1, 2991688216, 55, 225768216)
cdf_outside

nan

In [21]:
tad_borders

{'50in': [50000, 0]}

In [24]:
cdf_outside = hypergeom.cdf(counts['outside'], cur_cl, N, cur_nl)

In [70]:
cdf_outside = sm.stats.hypergeom.cdf(counts['outside'], cur_cl, N, cur_nl)
cdf_outside

AttributeError: module 'statsmodels.stats.api' has no attribute 'hypergeom'

In [29]:
cdf_border = hypergeom.cdf(counts['border'], cur_cl, N, cur_bl)
cdf_border

1.0

In [31]:
cdf_tad = hypergeom.cdf(counts['tad'], cur_cl, N, cur_tl)
cdf_tad

0.3630586169573728

In [22]:
cur_nl

225768216

In [33]:
cur_cl

2991688216

In [19]:
snp_counts

{'total': 19020, 'tad': 12655, 'border': 6187, 'outside': 177}

In [20]:
counts

Counter({'border': 1})

In [14]:
df_enr.head()

Unnamed: 0,diseaseId,#snp,#border_snp,pval_tad,pval_border,pval_outside,TAD_type
0,EFO_0000094,7,3,0.502408,0.110523,,50in
1,EFO_0000095,48,26,0.991328,6.2e-05,,50in
2,EFO_0000096,8,3,0.397055,0.172196,,50in
3,EFO_0000174,5,1,0.104833,0.446107,,50in
4,EFO_0000178,5,3,0.744214,0.026335,,50in


# Multiple-testing correction

In [15]:
df_enr['is_cancer'] = df_enr['diseaseId'].apply(lambda x: disease_cancer_map[x])
df_enr.head()

Unnamed: 0,diseaseId,#snp,#border_snp,pval_tad,pval_border,pval_outside,TAD_type,is_cancer
0,EFO_0000094,7,3,0.502408,0.110523,,50in,False
1,EFO_0000095,48,26,0.991328,6.2e-05,,50in,False
2,EFO_0000096,8,3,0.397055,0.172196,,50in,False
3,EFO_0000174,5,1,0.104833,0.446107,,50in,True
4,EFO_0000178,5,3,0.744214,0.026335,,50in,True


In [16]:
df_enr_tmp = df_enr.copy()
df_enr_tmp['is_cancer'] = df_enr_tmp['diseaseId'].apply(lambda x: disease_cancer_map[x])
df_enr_corr = df_enr_tmp.groupby(['TAD_type', 'is_cancer'])[
    ['pval_border', 'pval_tad', 'pval_outside']
].transform(multipletests_nan)

df_enr['pval_border__notcorrected'] = df_enr['pval_border']
df_enr['pval_outside__notcorrected'] = df_enr['pval_outside']
df_enr['pval_tad__notcorrected'] = df_enr['pval_tad']

df_enr['pval_border'] = df_enr_corr['pval_border']
df_enr['pval_outside'] = df_enr_corr['pval_outside']
df_enr['pval_tad'] = df_enr_corr['pval_tad']

In [17]:
df_enr.head()

Unnamed: 0,diseaseId,#snp,#border_snp,pval_tad,pval_border,pval_outside,TAD_type,is_cancer,pval_border__notcorrected,pval_outside__notcorrected,pval_tad__notcorrected
0,EFO_0000094,7,3,0.748295,0.282839,,50in,False,0.110523,,0.502408
1,EFO_0000095,48,26,0.999989,0.000648,,50in,False,6.2e-05,,0.991328
2,EFO_0000096,8,3,0.643397,0.375247,,50in,False,0.172196,,0.397055
3,EFO_0000174,5,1,0.483844,0.557634,,50in,True,0.446107,,0.104833
4,EFO_0000178,5,3,0.905222,0.098757,,50in,True,0.026335,,0.744214


# Save result

In [18]:
df_enr.to_csv(fname_out, index=False)
df_enr.head()

NameError: name 'fname_out' is not defined