In [1]:
import os 
import dask.dataframe as dd
import pandas as pd 
pd.get_option("display.max_columns", None)
pd.options.display.max_columns = None
import numpy as np
import glob
import json
from IPython.display import HTML
os.chdir("/mnt/BioHome/jreyna/jreyna/projects/dchallenge/")
outdir = 'results/main/sgls/combined/'
os.makedirs(outdir, exist_ok=True)

In [2]:
major_cols = [
 'gwas_source',
 'ge_source',
 'loop_source',
 'sid',
 'rsid',
 'geneid',
 'gene_name',
 'chrom',
 'snp_pos',
 'tss_start',
 'tss_end',
 'is_eqtl_pair',
 'is_coloc_pair',
 'is_closest_gene',
 'has_fithichip_loop',
 'eqtl_pval',
 'eqtl_fdr',
 'dist',
 'ppH4',
 'gene_start',
 'gene_end',
 'ref',
 'alt',
 'AC',
 'AF',
 'AN',
 'gwas_pval',
 'gene_strand',
 'eqtl_source']

## Loading GENCODE information

In [3]:
gene_info = pd.read_table('results/refs/gencode/v30/gencode.v30.annotation.bed', header=None)
gene_dict = gene_info.iloc[:, [5,6]]#{sr[5]: sr[6] for i, sr in gene_info.iterrows()}
gene_dict.drop_duplicates(inplace=True)
gene_dict.set_index(5, inplace=True)
gene_dict = gene_dict.squeeze()
gene_dict = gene_dict.to_dict()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gene_dict.drop_duplicates(inplace=True)


## Checking the samplesheet

In [4]:
samplesheet = pd.read_table('config/sgl.samplesheet.tsv.v2')
template = 'results/main/sgls/T1D_34012112_Gaulton/{}/{}'
output_exists = []
for i, sr in samplesheet.iterrows():
    test = os.path.exists(template.format(sr.eqtl_db, sr.eqtl_origin, sr.loop_origin))
    output_exists.append(test)
samplesheet['analyzed'] = output_exists

In [5]:
samplesheet.sort_values(['analyzed', 'eqtl_db', 'eqtl_origin', 'loop_origin'], \
                       ascending=[False, True, True, True], inplace=True)

samplesheet.reset_index(drop=True, inplace=True)
samplesheet.columns = ['GWAS Source', 'eQTL Source', 'GE Source', 'Loop Source', 'Analyzed?']

In [6]:
samplesheet

Unnamed: 0,GWAS Source,eQTL Source,GE Source,Loop Source,Analyzed?
0,T1D_34012112_Gaulton,Quach_2016,monocyte_IAV,monocyte_naive,True
1,T1D_34012112_Gaulton,Quach_2016,monocyte_Pam3CSK4,monocyte_naive,True
2,T1D_34012112_Gaulton,Quach_2016,monocyte_R848,monocyte_naive,True
3,T1D_34012112_Gaulton,Schmiedel_2018,B-cell_naive,B-cell_naive,True
4,T1D_34012112_Gaulton,Schmiedel_2018,CD4_T-cell_anti-CD3-CD28,CD4_T-cell_naive,True
5,T1D_34012112_Gaulton,Schmiedel_2018,CD4_T-cell_naive,CD4_T-cell_naive,True
6,T1D_34012112_Gaulton,Schmiedel_2018,CD8_T-cell_anti-CD3-CD28,CD8_T-cell_naive,True
7,T1D_34012112_Gaulton,Schmiedel_2018,CD8_T-cell_naive,CD8_T-cell_naive,True
8,T1D_34012112_Gaulton,Schmiedel_2018,NK-cell_naive,NK-cell_naive,True
9,T1D_34012112_Gaulton,Quach_2016,monocyte_LPS,monocyte_naive,False


## Combining the other datasets

In [7]:
# loading and concat all the data
data = []
for fn in glob.glob('results/main/sgls/T1D_34012112_Gaulton/*/*/*/master.tsv'):
        
    gwas_source = fn.split('/')[3]
    eqtl_source = fn.split('/')[4]
    ge_source = fn.split('/')[5]
    loop_source = fn.split('/')[6]    
    
    df = pd.read_table(fn, header=0)
    df['gwas_source'] = gwas_source
    df['eqtl_source'] = eqtl_source
    df['ge_source'] = ge_source
    df['loop_source'] = loop_source
    
    if df.shape[0] > 0: 
        data.append(df)
        
data = pd.concat(data)

In [8]:
data.head()

Unnamed: 0,sid,rsid,geneid,gene_name,chrom,snp_pos,tss_start,tss_end,is_eqtl_pair,is_coloc_pair,is_closest_gene,has_fithichip_loop,has_colocSNP_anchor,eqtl_pval,eqtl_beta,eqtl_fdr,dist,ppH0,ppH1,ppH2,ppH3,ppH4,gene_start,gene_end,ref,alt,AC,AF,AN,gwas_slope,gwas_slope_se,gwas_pval,sample_size,gene_start.1,gene_end.1,gene_strand,gwas_source,eqtl_source,ge_source,loop_source
0,14:98019683,,ENSG00000258393,AL049833.1,chr14,98019683,97116354,97116355,0,0,0,0,0,,,,,,,,,,97116355,97121501,,,,,,,,,,97116355,97121501,+,T1D_34012112_Gaulton,Schmiedel_2018,CD8_T-cell_naive,CD8_T-cell_naive
1,14:98019683,,ENSG00000259026,AL049833.2,chr14,98019683,97119246,97119247,0,0,0,0,0,,,,,,,,,,97110416,97119247,,,,,,,,,,97110416,97119247,-,T1D_34012112_Gaulton,Schmiedel_2018,CD8_T-cell_naive,CD8_T-cell_naive
2,14:98019683,,ENSG00000259110,LINC02304,chr14,98019683,97154856,97154857,0,0,0,0,0,,,,,,,,,,97154857,97158736,,,,,,,,,,97154857,97158736,+,T1D_34012112_Gaulton,Schmiedel_2018,CD8_T-cell_naive,CD8_T-cell_naive
3,14:98019683,,ENSG00000285584,AL158800.1,chr14,98019683,97180524,97180525,0,0,0,0,0,,,,,,,,,,97180525,97217778,,,,,,,,,,97180525,97217778,+,T1D_34012112_Gaulton,Schmiedel_2018,CD8_T-cell_naive,CD8_T-cell_naive
4,14:98019683,,ENSG00000246084,LINC02325,chr14,98019683,97458815,97458816,0,0,0,1,0,0.475823,0.030109,0.963438,560867.0,,,,,,97458816,97581601,T,C,,,,,,,,97458816,97581601,+,T1D_34012112_Gaulton,Schmiedel_2018,CD8_T-cell_naive,CD8_T-cell_naive


### Checking the uniq set of genes

In [9]:
unique_genes = data.geneid.unique()
unique_gnames = [gene_dict[x] for x in unique_genes]
unique_genes_df = pd.DataFrame([unique_genes, unique_gnames]).T
unique_genes_df.columns = ['geneid', 'genename']
unique_genes_df

Unnamed: 0,geneid,genename
0,ENSG00000258393,AL049833.1
1,ENSG00000259026,AL049833.2
2,ENSG00000259110,LINC02304
3,ENSG00000285584,AL158800.1
4,ENSG00000246084,LINC02325
...,...,...
434,ENSG00000212312,RNA5SP109
435,ENSG00000237750,AC011900.1
436,ENSG00000184611,KCNH7
437,ENSG00000230282,RPL7P61


### Analyzing the Number of eQTLs, loops, colocaled SNP-Gene (per DataSet)

In [10]:
cells_with_loops = data[data.has_fithichip_loop == 1]
uniq_cells = cells_with_loops[['eqtl_source', 'ge_source', 'loop_source']].values.tolist()
uniq_cells = set([tuple(x) for x in uniq_cells])
uniq_cells = list(uniq_cells)
uniq_cells = pd.DataFrame(uniq_cells)
uniq_cells.columns = ['eqtl_source', 'ge_source', 'loop_source']

In [11]:
uniq_cells.sort_values('ge_source')

Unnamed: 0,eqtl_source,ge_source,loop_source
3,Schmiedel_2018,B-cell_naive,B-cell_naive
0,Schmiedel_2018,CD8_T-cell_naive,CD8_T-cell_naive
1,Quach_2016,monocyte_IAV,monocyte_naive
2,Quach_2016,monocyte_Pam3CSK4,monocyte_naive
4,Quach_2016,monocyte_R848,monocyte_naive


In [12]:
# group by the dataset combos
eqtl_ge_grps = data.groupby(['eqtl_source', 'ge_source', 'loop_source'])

# calculate the number of sg pairs 
def count_uniq(x):
    v = x[['sid', 'geneid']].values.tolist()
    v = [tuple(x) for x in v]
    v = set(v)
    return(len(v))
eqtl_ge_sg_pairs = eqtl_ge_grps.apply(count_uniq).to_frame()
eqtl_ge_sg_pairs.columns = ['num_sg_pairs']

# calculate the number of sg pairs with loops
eqtl_ge_loops = eqtl_ge_grps.has_fithichip_loop.sum().to_frame()

# calculate the number of sg pairs with significant eQTL signal
eqtl_ge_eqtl = eqtl_ge_grps.is_eqtl_pair.sum().to_frame()

# calculate the number of sg pairs with significant coloc 
eqtl_ge_coloc = eqtl_ge_grps.is_coloc_pair.sum().to_frame()

In [13]:
eqtl_ge_master = eqtl_ge_sg_pairs.merge(eqtl_ge_coloc, left_index=True, right_index=True)
eqtl_ge_master = eqtl_ge_master.merge(eqtl_ge_eqtl, left_index=True, right_index=True)
eqtl_ge_master = eqtl_ge_master.merge(eqtl_ge_loops, left_index=True, right_index=True)

eqtl_ge_master.columns = ['Number of\\nSNP-Gene Pairs',
                          'Number of\\nColoc Pairs',
                          'Number of\\neQTL Pairs',
                          'Number of\\nPairs with a Loop']
eqtl_ge_master.index.names = ['eQTL Source', 'GE Source', 'Loop Source']

In [14]:
HTML(eqtl_ge_master.to_html().replace('\\n', '<br>'))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Number of SNP-Gene Pairs,Number of Coloc Pairs,Number of eQTL Pairs,Number of Pairs with a Loop
eQTL Source,GE Source,Loop Source,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Quach_2016,monocyte_IAV,monocyte_naive,67,2,0,4
Quach_2016,monocyte_Pam3CSK4,monocyte_naive,41,1,0,2
Quach_2016,monocyte_R848,monocyte_naive,92,3,1,2
Schmiedel_2018,B-cell_naive,B-cell_naive,38,1,0,12
Schmiedel_2018,CD4_T-cell_anti-CD3-CD28,CD4_T-cell_naive,217,3,0,0
Schmiedel_2018,CD4_T-cell_naive,CD4_T-cell_naive,50,1,0,0
Schmiedel_2018,CD8_T-cell_anti-CD3-CD28,CD8_T-cell_naive,51,2,1,0
Schmiedel_2018,CD8_T-cell_naive,CD8_T-cell_naive,19,1,0,1
Schmiedel_2018,NK-cell_naive,NK-cell_naive,48,1,0,0


#### Extract all pairs with a significant eQTL

In [15]:
sg_with_eqtl = data.loc[(data.is_eqtl_pair == 1), major_cols]

In [16]:
print('There are {} SNP-Gene pairs with an eQTL.'.format(len(sg_with_eqtl)))

There are 2 SNP-Gene pairs with an eQTL.


In [17]:
sg_with_eqtl

Unnamed: 0,gwas_source,ge_source,loop_source,sid,rsid,geneid,gene_name,chrom,snp_pos,tss_start,tss_end,is_eqtl_pair,is_coloc_pair,is_closest_gene,has_fithichip_loop,eqtl_pval,eqtl_fdr,dist,ppH4,gene_start,gene_end,ref,alt,AC,AF,AN,gwas_pval,gene_strand,eqtl_source
6,T1D_34012112_Gaulton,CD8_T-cell_anti-CD3-CD28,CD8_T-cell_naive,18:69869260,rs12969657,ENSG00000206052,DOK6,chr18,69869260,69400887,69400888,1,1,0,0,8.844e-09,,468372.0,0.825578,69400888,69849087,C,T,2060.0,0.411342,5008.0,8.87e-13,+,Schmiedel_2018
6,T1D_34012112_Gaulton,monocyte_R848,monocyte_naive,18:69870115,rs17207042,ENSG00000206052,DOK6,chr18,69870115,69400887,69400888,1,1,0,0,7.02623e-07,0.000302,469227.0,0.904612,69400888,69849087,T,C,2041.0,0.407548,5008.0,6.04e-13,+,Quach_2016


#### Extract all pairs with a colocalization

In [18]:
sg_with_coloc = data.loc[(data.is_coloc_pair == 1), major_cols]

In [19]:
print('There are {} SNP-Gene pairs with a coloc.'.format(len(sg_with_coloc)))

There are 15 SNP-Gene pairs with a coloc.


In [20]:
sg_with_coloc

Unnamed: 0,gwas_source,ge_source,loop_source,sid,rsid,geneid,gene_name,chrom,snp_pos,tss_start,tss_end,is_eqtl_pair,is_coloc_pair,is_closest_gene,has_fithichip_loop,eqtl_pval,eqtl_fdr,dist,ppH4,gene_start,gene_end,ref,alt,AC,AF,AN,gwas_pval,gene_strand,eqtl_source
10,T1D_34012112_Gaulton,CD8_T-cell_naive,CD8_T-cell_naive,14:98019683,rs922406,ENSG00000259097,AL163932.1,chr14,98019683,98205142,98205143,0,1,0,0,0.0697046,0.786455,185460.0,0.832921,98068240,98205143,T,C,2485.0,0.496206,5008.0,1.11e-09,-,Schmiedel_2018
6,T1D_34012112_Gaulton,CD8_T-cell_anti-CD3-CD28,CD8_T-cell_naive,18:69869260,rs12969657,ENSG00000206052,DOK6,chr18,69869260,69400887,69400888,1,1,0,0,8.844e-09,,468372.0,0.825578,69400888,69849087,C,T,2060.0,0.411342,5008.0,8.87e-13,+,Schmiedel_2018
44,T1D_34012112_Gaulton,CD8_T-cell_anti-CD3-CD28,CD8_T-cell_naive,8:119070702,rs13259300,ENSG00000136982,DSCC1,chr8,119070702,119855893,119855894,0,1,0,0,0.00407592,,785192.0,0.884369,119833976,119855894,A,C,3146.0,0.628195,5008.0,3.28e-10,-,Schmiedel_2018
42,T1D_34012112_Gaulton,NK-cell_naive,NK-cell_naive,10:88275897,rs12416116,ENSG00000286116,AL157394.2,chr10,88275897,88994248,88994249,0,1,0,0,0.00246838,0.292485,718352.0,0.866106,88990045,88994249,C,A,1484.0,0.296326,5008.0,7.03e-22,-,Schmiedel_2018
159,T1D_34012112_Gaulton,CD4_T-cell_anti-CD3-CD28,CD4_T-cell_naive,14:100839708,rs941576,ENSG00000258404,LINC02320,chr14,100839708,101731107,101731108,0,1,0,0,0.00260523,0.27657,891400.0,0.831116,101634454,101731108,A,G,1895.0,0.378395,5008.0,4.91e-16,-,Schmiedel_2018
172,T1D_34012112_Gaulton,CD4_T-cell_anti-CD3-CD28,CD4_T-cell_naive,18:69855122,rs36024512,ENSG00000206052,DOK6,chr18,69855122,69400887,69400888,0,1,0,0,,,,0.85065,69400888,69849087,,,2329.0,0.465056,5008.0,8.29e-14,+,Schmiedel_2018
215,T1D_34012112_Gaulton,CD4_T-cell_anti-CD3-CD28,CD4_T-cell_naive,8:119070732,rs13261635,ENSG00000254343,AC091563.1,chr8,119070732,120052179,120052180,0,1,0,0,0.294779,0.934395,981448.0,0.816566,120052180,120056201,T,C,3669.0,0.732628,5008.0,9.46e-10,+,Schmiedel_2018
6,T1D_34012112_Gaulton,CD4_T-cell_naive,CD4_T-cell_naive,1:113285485,rs773560,ENSG00000273483,AL354760.1,chr1,113285485,112518440,112518441,0,1,0,0,0.146252,0.868665,767044.0,0.978083,112517799,112518441,G,A,1854.0,0.370208,5008.0,4.2300000000000004e-27,-,Schmiedel_2018
4,T1D_34012112_Gaulton,B-cell_naive,B-cell_naive,14:68793794,rs17106304,ENSG00000258837,AL133370.1,chr14,68793794,68130195,68130196,0,1,0,0,0.0388787,0.692297,663598.0,0.924525,68125004,68130196,C,G,3426.0,0.684105,5008.0,6.83e-15,-,Schmiedel_2018
9,T1D_34012112_Gaulton,monocyte_Pam3CSK4,monocyte_naive,9:4296430,rs10814917,ENSG00000107249,GLIS3,chr9,4296430,4348391,4348392,0,1,0,0,0.0276442,0.568634,51962.0,0.776345,3824127,4348392,A,G,1700.0,0.339457,5008.0,1.18e-17,-,Quach_2016


#### Extract all pairs with a FitHiChIP Loop

In [21]:
sg_with_loops = data.loc[(data.has_fithichip_loop == 1), major_cols]

In [22]:
print('There are {} SNP-Gene pairs with a loop.'.format(len(sg_with_loops)))

There are 21 SNP-Gene pairs with a loop.


In [23]:
sg_with_loops

Unnamed: 0,gwas_source,ge_source,loop_source,sid,rsid,geneid,gene_name,chrom,snp_pos,tss_start,tss_end,is_eqtl_pair,is_coloc_pair,is_closest_gene,has_fithichip_loop,eqtl_pval,eqtl_fdr,dist,ppH4,gene_start,gene_end,ref,alt,AC,AF,AN,gwas_pval,gene_strand,eqtl_source
4,T1D_34012112_Gaulton,CD8_T-cell_naive,CD8_T-cell_naive,14:98019683,,ENSG00000246084,LINC02325,chr14,98019683,97458815,97458816,0,0,0,1,0.475823,0.963438,560867.0,,97458816,97581601,T,C,,,,,+,Schmiedel_2018
0,T1D_34012112_Gaulton,B-cell_naive,B-cell_naive,14:68793794,,ENSG00000258759,AL049779.2,chr14,68793794,67799003,67799004,0,0,0,1,,,,,67799004,67799609,,,,,,,+,Schmiedel_2018
1,T1D_34012112_Gaulton,B-cell_naive,B-cell_naive,14:68793794,,ENSG00000072121,ZFYVE26,chr14,68793794,67816589,67816590,0,0,0,1,0.829822,0.991668,977204.0,,67727374,67816590,C,G,,,,,-,Schmiedel_2018
2,T1D_34012112_Gaulton,B-cell_naive,B-cell_naive,14:68793794,,ENSG00000182185,RAD51B,chr14,68793794,67819778,67819779,0,0,0,1,0.343669,0.937732,974015.0,,67819779,68730218,C,G,,,,,+,Schmiedel_2018
3,T1D_34012112_Gaulton,B-cell_naive,B-cell_naive,14:68793794,,ENSG00000274666,AL133370.2,chr14,68793794,68114143,68114144,0,0,0,1,,,,,68113706,68114144,,,,,,,-,Schmiedel_2018
5,T1D_34012112_Gaulton,B-cell_naive,B-cell_naive,14:68793794,,ENSG00000244677,RN7SL706P,chr14,68793794,68149907,68149908,0,0,0,1,,,,,68149617,68149908,,,,,,,-,Schmiedel_2018
6,T1D_34012112_Gaulton,B-cell_naive,B-cell_naive,14:68793794,,ENSG00000243546,RN7SL108P,chr14,68793794,68236538,68236539,0,0,0,1,,,,,68236243,68236539,,,,,,,-,Schmiedel_2018
7,T1D_34012112_Gaulton,B-cell_naive,B-cell_naive,14:68793794,,ENSG00000240210,AL122013.1,chr14,68793794,68339527,68339528,0,0,0,1,,,,,68338728,68339528,,,,,,,-,Schmiedel_2018
8,T1D_34012112_Gaulton,B-cell_naive,B-cell_naive,14:68793794,,ENSG00000258477,PPIAP6,chr14,68793794,68422195,68422196,0,0,0,1,,,,,68421698,68422196,,,,,,,-,Schmiedel_2018
9,T1D_34012112_Gaulton,B-cell_naive,B-cell_naive,14:68793794,,ENSG00000259038,AL121820.2,chr14,68793794,68628444,68628445,0,0,0,1,0.796612,0.989653,165349.0,,68627166,68628445,C,G,,,,,-,Schmiedel_2018


## Finalizing the super master table

In [24]:
#data = data.loc[:, new_order]
data.drop_duplicates(subset=['sid', 'geneid', 'eqtl_source', 'ge_source', 'loop_source'], inplace=True)
master_fn = os.path.join(outdir, 'super_master.snp_gene_loop.analysis.tsv')
data[major_cols].to_csv(master_fn, sep='\t', index=False, na_rep='nan')

In [25]:
master_fn = os.path.join(outdir, 'super_master.snp_gene_loop.analysis.xlsx')
xdata = data.sort_values(['rsid', 'gwas_source','eqtl_source', 'ge_source', 'loop_source', 'geneid'])
xdata = xdata[major_cols].set_index(['rsid', 'gwas_source','eqtl_source', 'ge_source', 'loop_source', 'geneid'])
xdata.to_excel(master_fn, na_rep='nan')