In [11]:
import os 
import glob
import pandas as pd 
# Project root: the relative glob patterns used below are resolved against it.
PROJECT_DIR = '/mnt/BioHome/jreyna/jreyna/projects/dchallenge/'
os.chdir(PROJECT_DIR)

## Load Coloc SNP data

In [12]:
# Locate the per-run coloc summary tables. The three wildcard directories are
# read below as <gwas_source>/<eqtl_source>/<ge_source>.
coloc_pattern = 'results/main/GRCh37/coloc/Results/eQTL_Catalogue/*/*/*/FINAL_Summary_Coloc_Gene_SNP_Pairs.bed'
globs = glob.glob(coloc_pattern)

coloc_data = []
for fn in globs:

    # Path components 6-8 are the directories matched by the three wildcards.
    file_info = fn.split('/')
    gwas_source = file_info[6]
    eqtl_source = file_info[7]
    ge_source = file_info[8]

    # Skip zero-byte files, which pd.read_table cannot parse. Checking the
    # size avoids reading every file twice and leaving an unclosed file
    # handle behind (the previous open(fn).readlines() did both).
    if os.path.getsize(fn) == 0:
        continue

    # Tag every row with the sources parsed from the file path.
    tdf = pd.read_table(fn).assign(
        gwas_source=gwas_source,
        eqtl_source=eqtl_source,
        ge_source=ge_source,
    )
    coloc_data.append(tdf)

# Fail with an actionable message rather than pandas' generic
# "No objects to concatenate" when nothing could be loaded.
if len(coloc_data) == 0:
    raise ValueError('No non-empty coloc tables matched: {}'.format(coloc_pattern))

coloc_data = pd.concat(coloc_data)

In [13]:
# Count the non-null `pos` entries per (GWAS, eQTL study, expression source).
coloc_summary = (
    coloc_data
    .groupby(['gwas_source', 'eqtl_source', 'ge_source'])['pos']
    .count()
    .to_frame(name='Number of SNPs in Coloc')
)
coloc_summary

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Number of SNPs in Coloc
gwas_source,eqtl_source,ge_source,Unnamed: 3_level_1
T1D_25751624,BLUEPRINT,T-cell,1
T1D_25751624,BLUEPRINT,monocyte,1
T1D_25751624,BLUEPRINT,neutrophil,1
T1D_25751624,GENCORD,LCL,2
T1D_25751624,GENCORD,T-cell,2
T1D_25751624,GTEx,LCL,3
T1D_25751624,GTEx,blood,1
T1D_25751624,Lepik_2017,blood,1
T1D_25751624,Quach_2016,monocyte_IAV,2
T1D_25751624,Quach_2016,monocyte_LPS,1


## Load LD data

In [14]:
# One LD table per <gwas_source>/<eqtl_source>/<ge_source> directory triple.
ld_pattern = 'results/main/GRCh37/coloc/eQTL_Catalogue/*/*/*/ldpairs/coloc_ld_snps.txt'
globs = glob.glob(ld_pattern)

In [15]:
# Load every LD table found by the preceding glob, tagging each row with the
# sources parsed from its path. Runs without any LD SNPs are collected in
# `ld_summary_zeros` as [gwas_source, eqtl_source, ge_source, 0] so they can
# still be reported with a count of zero.
ld_data = []
ld_summary_zeros = []
genome_ref = 'GRCh37'
for fn in globs:

    # Path components 5-7 are the directories matched by the three wildcards.
    file_info = fn.split('/')
    gwas_source = file_info[5]
    eqtl_source = file_info[6]
    ge_source = file_info[7]

    # Zero-byte file: nothing to parse. Checking the size avoids reading the
    # file twice and leaving an unclosed file handle behind.
    if os.path.getsize(fn) == 0:
        ld_summary_zeros.append([gwas_source, eqtl_source, ge_source, 0])
        continue

    tdf = pd.read_table(fn)

    # Header-only file: it parses, but holds no SNP rows, so it is also a
    # zero-count run (otherwise it would silently vanish from the summary).
    if tdf.empty:
        ld_summary_zeros.append([gwas_source, eqtl_source, ge_source, 0])
        continue

    tdf = (
        tdf
        .assign(
            gwas_source=gwas_source,
            genome_ref=genome_ref,
            eqtl_source=eqtl_source,
            ge_source=ge_source,
        )
        # Keep a single row per (rsID, ld_rsID) pair within each file.
        .drop_duplicates(subset=['rsID', 'ld_rsID'])
    )
    ld_data.append(tdf)

In [16]:
# Stack the per-file LD tables row-wise; each table keeps its own row labels.
ld_df = pd.concat(ld_data, axis=0, ignore_index=False)

In [17]:
# Count the non-null `pos` entries per (GWAS, eQTL study, expression source).
ld_summary = (
    ld_df
    .groupby(['gwas_source', 'eqtl_source', 'ge_source'])['pos']
    .count()
    .to_frame(name='Number of SNPs in LD')
)

In [18]:
# Add entries which have zero SNPs in LD.
#
# `ld_summary_zeros` is left untouched (it stays a list of
# [gwas_source, eqtl_source, ge_source, 0] rows) so this cell can be re-run,
# and the zero-count frame uses the same index level names as `ld_summary`
# so the concatenated index keeps its names for the index-based merge below.
if len(ld_summary_zeros) > 0:
    index_names = ['gwas_source', 'eqtl_source', 'ge_source']
    count_name = 'Number of SNPs in LD'

    zero_counts = (
        pd.DataFrame(ld_summary_zeros, columns=index_names + [count_name])
        .set_index(index_names)
    )

    # Only append combinations not already present, so re-running the cell
    # does not duplicate the zero rows.
    zero_counts = zero_counts.loc[~zero_counts.index.isin(ld_summary.index)]

    if not zero_counts.empty:
        ld_summary = pd.concat([ld_summary, zero_counts])

## Merge

In [19]:
# Display setting: number of rows pandas shows in a truncated table view.
pd.set_option('display.min_rows', 100)

In [20]:
# Outer-join the two count tables on their shared index, so a combination
# present in only one of them is kept (its missing count shows up as NaN).
summary = coloc_summary.merge(
    ld_summary,
    how='outer',
    left_index=True,
    right_index=True,
)
summary.sort_values(by='Number of SNPs in LD', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Number of SNPs in Coloc,Number of SNPs in LD
gwas_source,eqtl_source,ge_source,Unnamed: 3_level_1,Unnamed: 4_level_1
T1D_25751624,Schmiedel_2018,Tfh_memory,4,335.0
T1D_25751624,Quach_2016,monocyte_R848,2,289.0
T1D_25751624,Quach_2016,monocyte_LPS,1,252.0
T1D_34012112_Gaulton,Schmiedel_2018,Treg_naive,5,216.0
T1D_34012112_Gaulton,BLUEPRINT,T-cell,2,133.0
T1D_25751624,Schmiedel_2018,NK-cell_naive,3,129.0
T1D_34012112_Gaulton,Schmiedel_2018,CD4_T-cell_anti-CD3-CD28,3,120.0
T1D_25751624,Schmiedel_2018,Th1-17_memory,2,104.0
T1D_25751624,Schmiedel_2018,CD8_T-cell_anti-CD3-CD28,4,102.0
T1D_34012112_Gaulton,Schmiedel_2018,Th2_memory,4,96.0
