In [115]:
import os 
import pandas as pd 
import glob
import json
os.chdir("/mnt/BioHome/jreyna/jreyna/projects/dchallenge/")
outdir = 'results/main/loop_analysis/Coloc_Approach/'

In [116]:
# loading and concat all the data
data = []
for fn in glob.glob('results/main/loop_analysis/Coloc_Approach/T1D_34012112_Gaulton/*/*/*/master.tsv'):
    
    dice_cline = fn.split('/')[5]
    study = fn.split('/')[6]
    eqtl_cline = fn.split('/')[7]
    
    df = pd.read_table(fn, header=0)
    df['dice_cline'] = dice_cline
    df['eqtl_cline'] = eqtl_cline
    df['study'] = study
    data.append(df)
    
data = pd.concat(data)
data.rename(columns={'5kb_gname': 'fivekb_gname'}, inplace=True)

In [117]:
new_order = [
 'sid',
 'rs_id',
 'gene_name',
 'dice_cline',
 'eqtl_cline',
 'study',
 'gene_id',
 'chrom',
 'snp_pos',
 'gene_start',
 'gene_end',
 'is_eqtl_pair',
 'is_coloc_pair',
 'is_closest_gene',
 'has_fithichip_loop',
 'nvar',
 'shape1',
 'shape2',
 'dist',
 'npval',
 'slope',
 'ppval',
 'bpval',
 'qval',
 'pp_H0_Coloc_Summary',
 'pp_H1_Coloc_Summary',
 'pp_H2_Coloc_Summary',
 'pp_H3_Coloc_Summary',
 'pp_H4_Coloc_Summary',
 'ref',
 'alt',
 'AC',
 'AF',
 'AN',
 'gwas_slope',
 'gwas_slope_se',
 'gwas_pval_nominal',
 'SampleSize']

In [118]:
data = data.loc[:, new_order]
data.drop_duplicates(subset=['sid', 'gene_name', 'dice_cline', 'eqtl_cline', 'study'], inplace=True)

In [119]:
master_fn = os.path.join(outdir, 'super_master.snp_gene_loop.analysis.tsv')
data.to_csv(master_fn, sep='\t', index=False, na_rep='nan')

## eQTL Summary

I want to know how many eQTL's comparisons there are total

In [120]:
eqtl_only = data[(data.is_eqtl_pair == 1)]
print(eqtl_only.shape[0])

97632


I want to know how many eQTL's have a loop

In [121]:
eqtl_loops_only = data[(data.is_eqtl_pair == 1) & (data.has_fithichip_loop == 1)]
print(eqtl_loops_only.shape[0])

50


## Colocalization Summary

I want to know how many colocalization comparisons there are total

In [122]:
coloc_only = data[(data.is_coloc_pair == 1)]
print(coloc_only.shape[0])

251


I want to know how many colocalizations have a loop

In [123]:
coloc_loops_only = data[(data.is_coloc_pair == 1) & (data.has_fithichip_loop == 1)]
print(coloc_loops_only.shape[0])

50


## Breakdown of colocalized loops (SNP perspective)

I want to know the breakdown in terms of gene

In [124]:
coloc_loops_only.value_counts(subset=['sid', 'gene_name'])

sid          gene_name  
21:43855067  UBASH3A        14
6:90976768   BACH2           7
16:11433103  RMI2            3
11:64107735  AP003774.1      3
12:9833628   RP11-75L1.1     3
11:64107477  AP003774.1      2
20:1610551   SIRPG           2
12:56401085  RPS26           2
16:11439303  RMI2            2
12:9147569   M6PR            1
15:79229199  CTSH            1
15:79231478  CTSH            1
16:11439679  RMI2            1
16:28599411  SULT1A2         1
16:28631530  SULT1A2         1
1:114426001  AP4B1           1
1:114447565  PTPN22          1
1:192537400  RGS1            1
21:43823736  TMPRSS3         1
21:43827765  TMPRSS3         1
11:64102948  AP003774.1      1
dtype: int64

I want to know the breakdown in terms of cell line

In [125]:
coloc_loops_only.value_counts(subset=['gene_name', 'sid', 'dice_cline'])

gene_name    sid          dice_cline
UBASH3A      21:43855067  TREGMEM       2
                          TH2           2
                          TH17          2
                          TH1           2
                          TFH           2
                          THSTAR        2
BACH2        6:90976768   TH17          1
M6PR         12:9147569   TREGNAIVE     1
CTSH         15:79231478  CM            1
             15:79229199  NCM           1
BACH2        6:90976768   TREGNAIVE     1
                          TREGMEM       1
                          THSTAR        1
                          TH2           1
                          TFH           1
                          TH1           1
RGS1         1:192537400  NCM           1
AP4B1        1:114426001  NCM           1
AP003774.1   11:64107735  TREGMEM       1
                          TH2           1
                          NB            1
             11:64107477  THSTAR        1
                          NCM          

## Breakdown of colocalized loops (Gene perspective)

In [126]:
coloc_loops_only.groupby(['gene_name']).rs_id.unique()

gene_name
AP003774.1        [rs663743, rs479777, rs574087]
AP4B1                               [rs11102694]
BACH2                               [rs72928038]
CTSH                    [rs12592898, rs12148472]
M6PR                                 [rs1805721]
PTPN22                               [rs1217397]
RGS1                                 [rs1323297]
RMI2           [rs7187741, rs12149160, rs918738]
RP11-75L1.1                          [rs3764021]
RPS26                               [rs10876864]
SIRPG                                [rs2281808]
SULT1A2                 [rs62031607, rs55792032]
TMPRSS3                   [rs9978717, rs9784215]
UBASH3A                              [rs1893592]
Name: rs_id, dtype: object

In [127]:
coloc_loops_only.groupby(['gene_name']).rs_id.nunique().sort_values()

gene_name
AP4B1          1
BACH2          1
M6PR           1
PTPN22         1
RGS1           1
RP11-75L1.1    1
RPS26          1
SIRPG          1
UBASH3A        1
CTSH           2
SULT1A2        2
TMPRSS3        2
AP003774.1     3
RMI2           3
Name: rs_id, dtype: int64

In [128]:
coloc_loops_only.groupby(['gene_name']).rs_id.nunique().sort_values().shape

(14,)

In [155]:
coloc_loops_only

Unnamed: 0,sid,rs_id,gene_name,dice_cline,eqtl_cline,study,gene_id,chrom,snp_pos,gene_start,...,pp_H4_Coloc_Summary,ref,alt,AC,AF,AN,gwas_slope,gwas_slope_se,gwas_pval_nominal,SampleSize
2787,16:11433103,rs7187741,RMI2,TREGMEM,T-cell,BLUEPRINT_eQTL,ENSG00000175643.7,chr16,11433103,11343476,...,0.999228,G,C,2828.0,0.564696,5008.0,-0.099476,0.014105,1.75e-12,520580.0
4910,21:43855067,rs1893592,UBASH3A,TREGMEM,T-cell,BLUEPRINT_eQTL,ENSG00000160185.9,chr21,43855067,43824008,...,0.999988,A,C,1058.0,0.211262,5008.0,-0.116047,0.015577,9.34e-14,520580.0
6610,6:90976768,rs72928038,BACH2,TREGMEM,T-cell,BLUEPRINT_eQTL,ENSG00000112182.10,chr6,90976768,90636248,...,0.999001,G,A,358.0,0.071486,5008.0,0.199873,0.018481,2.93e-27,520580.0
351,11:64107735,rs663743,AP003774.1,TREGMEM,TREG_MEMORY,DICE_eQTL,ENSG00000236935.1,chr11,64107735,64092522,...,0.959176,G,A,993.0,0.198283,5008.0,-0.099964,0.015092,3.5e-11,520580.0
620,12:56401085,rs10876864,RPS26,TREGMEM,TREG_MEMORY,DICE_eQTL,ENSG00000197728.5,chr12,56401085,56435637,...,0.987121,G,A,2796.0,0.558307,5008.0,-0.226848,0.014116,4.16e-58,520580.0
1538,21:43855067,rs1893592,UBASH3A,TREGMEM,TREG_MEMORY,DICE_eQTL,ENSG00000160185.9,chr21,43855067,43824008,...,0.999988,A,C,1058.0,0.211262,5008.0,-0.116047,0.015577,9.34e-14,520580.0
504,12:9147569,rs1805721,M6PR,TREGNAIVE,TREG_NAIVE,DICE_eQTL,ENSG00000003056.3,chr12,9147569,9092959,...,0.999422,G,A,1666.0,0.332668,5008.0,-0.09059,0.01428,2.24e-10,520580.0
622,12:56401085,rs10876864,RPS26,TREGNAIVE,TREG_NAIVE,DICE_eQTL,ENSG00000197728.5,chr12,56401085,56435637,...,0.999922,G,A,2796.0,0.558307,5008.0,-0.226848,0.014116,4.16e-58,520580.0
1549,21:43827765,rs9978717,TMPRSS3,TREGNAIVE,TREG_NAIVE,DICE_eQTL,ENSG00000160183.9,chr21,43827765,43791999,...,0.973007,A,G,2357.0,0.470647,5008.0,0.087971,0.014006,3.37e-10,520580.0
1574,12:9833628,rs3764021,RP11-75L1.1,TREGNAIVE,T-cell,BLUEPRINT_eQTL,ENSG00000256582.1,chr12,9833628,9856673,...,0.984448,C,T,2327.0,0.464657,5008.0,-0.086774,0.013958,5.08e-10,520580.0


In [170]:
summary = coloc_loops_only.loc[~coloc_loops_only.duplicated(subset=['study', 'eqtl_cline', 'sid', 'gene_id']), :]
summary = summary.groupby(['study', 'eqtl_cline']).apply(len)
summary = summary.reset_index()
summary.study = summary.study.str.replace('_eQTL', '')
summary = summary.sort_values(['study', 0, 'eqtl_cline'], ascending=[True, False, True])

In [175]:
summary

Unnamed: 0,study,eqtl_cline,0
1,BLUEPRINT,T-cell,5
0,BLUEPRINT,Monocyte,2
15,DICE,TREG_MEMORY,3
16,DICE,TREG_NAIVE,3
2,DICE,B_NAIVE,2
5,DICE,CD8_NAIVE,2
9,DICE,NONCLASSICAL_MONOCYTES,2
10,DICE,TFH,2
11,DICE,TH1,2
12,DICE,TH1-17,2


In [174]:
summary[0].describe()

count    17.0
mean      2.0
std       1.0
min       1.0
25%       1.0
50%       2.0
75%       2.0
max       5.0
Name: 0, dtype: float64