# Make the master table

In [27]:
import os 
import sys
import pybedtools as pbt
import pandas as pd
import numpy as np
import subprocess as sp
import json
# NOTE(review): every relative path used later is resolved against this
# hard-coded project directory, so the notebook only runs where it exists.
os.chdir('/mnt/BioHome/jreyna/jreyna/projects/dchallenge/')
# tell pybedtools which bedtools installation to call
pbt.set_bedtools_path('/mnt/BioApps/bedtools/bin/')
# locations of the bgzip/tabix executables (not referenced in the visible cells)
bgzip = '/mnt/BioApps/tabix/tabix-0.2.6/bgzip'
tabix = '/mnt/BioApps/tabix/tabix-0.2.6/tabix'

# names for the six coordinate columns of a paired-region table (region A, region B)
bedpe_6cols = ['chrA', 'startA', 'endA', 'chrB', 'startB', 'endB']
# the same six coordinates followed by name, score and two strand columns
bedpe_10cols = ['chrA', 'startA', 'endA', 'chrB', 'startB', 'endB', 'name', 'score', 'strand1', 'strand2']

In [28]:
## default values for the command line
# Slot 0 holds the placeholder 0; slots 1-7 hold the input and output paths
# that the next cell unpacks in this order.
sys.argv = [
    0,
    ('results/main/2021_Nikhil_eQTL/Results/Colocalization/T1D_34012112_Gaulton/'
     'DICE_eQTL_CD4_NAIVE/FINAL_Summary_Coloc_Gene_SNP_Pairs.bed'),
    'results/refs/ensembl/gencode.v19.annotation.bed',
    'results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/CD4N/FitHiChIP_L/FitHiChIP.interactions_FitHiC_Q0.01.bed',
    'results/refs/spp/SPP_D-Challenge_networks.xlsx',
    'results/refs/hg19/hg19.chrom.sizes',
    'results/main/2021_Nikhil_eQTL/Data/eqtl_sqtl_summ_stats/DICE_eQTL/CD4_NAIVE.txt.gz',
    'results/main/loop_analysis/washU/',
]

In [29]:
# parsing the commandline arguments
# positions 1-7 of sys.argv, in this order: colocalization table, gene
# annotation, loop calls, SPP workbook, chromosome sizes, eQTL table, output dir
(coloc_fn, genes_fn, loop_fn, spp_fn,
 gs_fn, eqtl_fn, outdir) = sys.argv[1:8]

# make sure the output directory exists (no error if it is already there)
os.makedirs(outdir, exist_ok=True)

## Load the colocalization data

In [30]:
# load the colocalization data
coloc = pd.read_table(coloc_fn)

# Keep the rows whose H4 posterior exceeds 0.75. The explicit .copy() makes
# this an independent frame, so adding columns below never writes into (or
# warns about) a slice of `coloc`.
coloc_sig_full = coloc.loc[coloc['pp_H4_Coloc_Summary'] > 0.75].copy()

# SNP id in "<chromosome without 'chr'>:<position>" form
coloc_sig_full['sid'] = (coloc_sig_full['chr'].str.replace('chr', '')
                         + ':' + coloc_sig_full['pos'].astype(str))

# BED-style view of the same rows: 'pos' becomes 'end' and 'start' is end - 1.
# rename() returns a new frame, so coloc_sig_full keeps its 'pos' column.
coloc_sig_df = coloc_sig_full.rename(columns={'pos': 'end'})
coloc_sig_df['start'] = coloc_sig_df['end'] - 1

coloc_sig_df = coloc_sig_df[['chr', 'start', 'end', 'rs_id', 'variant_id', 'sid', 'geneName']]
# keep only the first row seen for each rs_id
coloc_sig_df = coloc_sig_df.drop_duplicates(subset='rs_id')
# BedTool with the columns chr, start, end, sid
coloc_sig_pbt = pbt.BedTool.from_dataframe(coloc_sig_df[['chr', 'start', 'end', 'sid']])

In [126]:
# display the significant colocalized SNPs (one row per rs_id)
coloc_sig_df

Unnamed: 0,chr,start,end,rs_id,variant_id,sid,geneName
0,chr6,26357897,26357898,rs556123236,rs142257165:26357898:C:CT,6:26357898,BTN3A2
2,chr11,64107476,64107477,rs479777,rs479777,11:64107477,AP003774.1
3,chr12,112521447,112521448,rs4767364,exm-rs4767364,12:112521448,TMEM116
4,chr12,56401084,56401085,rs10876864,rs10876864,12:56401085,RPS26
5,chr16,11426330,11426331,rs12597893,rs12597893:11426331:A:G,16:11426331,RMI2
7,chr16,28534037,28534038,rs62034321,rs62034321:28534038:C:T,16:28534038,SULT1A2
8,chr18,67525856,67525857,rs3018275,rs3018275:67525857:C:T,18:67525857,DOK6
9,chr21,43855066,43855067,rs1893592,rs1893592,21:43855067,UBASH3A


In [180]:
# distinct gene names among the significant colocalized SNPs
coloc_genes = coloc_sig_df['geneName'].unique()

In [181]:
# display the gene names collected from coloc_sig_df
coloc_genes

array(['BTN3A2', 'AP003774.1', 'TMEM116', 'RPS26', 'RMI2', 'SULT1A2',
       'DOK6', 'UBASH3A'], dtype=object)

In [184]:
# NOTE(review): `fivekb_genes` is only assigned further down, in the
# "Find all genes +/- 500kb of the eQTL SNPs" section; on a fresh kernel run
# top-to-bottom this cell raises NameError. Move it below that section.
fivekb_genes.gname.unique()

array(['RP11-21A7A.3', 'TRPT1', 'RP11-783K16.14', 'DNAJC4', 'VEGFB',
       'FKBP2', 'RP11-783K16.5', 'RP11-783K16.13', 'ENSG00000207024.1',
       'TEX40', 'TRMT112', 'PRDX5', 'MIR1237', 'AP003774.6', 'AP003774.4',
       'AP005273.1', 'AP006288.1', 'SLC22A12', 'AP001462.6', 'MEN1',
       'AP001187.11', 'AP001187.1', 'RP11-665N17.4', 'MIR192', 'MIR194-2',
       'GPHA2', 'BATF2', 'AP000436.4', 'SAC3D1', 'RN7SL114P', 'ZFPL1',
       'AP003068.6', 'FLRT1', 'FERMT3', 'NUDT22', 'PPP1R14B', 'PLCB3',
       'BAD', 'GPR137', 'KCNK4', 'RP11-783K16.10', 'ESRRA', 'CCDC88B',
       'RPS6KA4', 'AP003774.5', 'SLC22A11', 'AP001092.4', 'RASGRP2',
       'PYGM', 'SF1', 'RP11-869B15.1', 'MAP4K2', 'CDC42BPG', 'EHD1',
       'ATG2A', 'PPP2R5B', 'C11orf85', 'ARL2', 'RP11-399J13.3', 'SNX15',
       'NAALADL1', 'CDCA5', 'VPS51', 'MACROD1', 'RP11-21A7A.2',
       'AP003774.1', 'NRXN2', 'STIP1', 'RP11-697H9.5', 'C11orf95',
       'RP11-466C23.4', 'RN7SL596P', 'RP11-466C23.5', 'RNU6-1306P',
       'RCOR2', '

## Load the gene data

In [31]:
# load the gencode coords
cols = ['chrom', 'start', 'end', 'strand', 'type', 'gene_id', 'gname']
gencode = pd.read_table(genes_fn, header=None, names=cols)

# Keep the rows typed 'gene', first occurrence per gene_id. The trailing
# .copy() detaches the result from `gencode`, so the dtype change below is a
# plain column assignment rather than a write into a filtered slice.
genes_df = (gencode.loc[gencode['type'].isin(['gene'])]
            .drop_duplicates(subset='gene_id')
            .copy())
genes_df['chrom'] = genes_df['chrom'].astype(str)

# column order for the BedTool: chrom, start, end, gname, gene_id
genes_df = genes_df[['chrom', 'start', 'end', 'gname', 'gene_id']]
genes_pbt = pbt.BedTool.from_dataframe(genes_df).sort()

## Load the eQTL's

In [32]:
# get eQTL's
eqtls = pd.read_table(eqtl_fn)
eqtls.columns = ['eqtl_gname', 'nvar', 'shape1', 'shape2', 'dummy',
                 'sid', 'dist', 'npval', 'slope', 'ppval', 'bpval', 'qval']

# Each sid is split on ':' into a chromosome label and a position; the
# position is the 1-based end and start is end - 1.
# NOTE(review): a sid listed for several genes yields repeated intervals
# here -- confirm that is intended.
sid_parts = eqtls['sid'].str.split(':', expand=True)
eqtl_pbt = pd.DataFrame({
    'chr': 'chr' + sid_parts[0],
    'start': sid_parts[1].astype(int) - 1,
    'end': sid_parts[1].astype(int),
    'sid': eqtls['sid'],
})
eqtl_pbt = pbt.BedTool.from_dataframe(eqtl_pbt)

## Find all genes +/- 500kb of the eQTL SNPs

In [42]:
# filter for eqtl that are near the coloc snps
# Each coloc SNP is widened by 500kb on both sides (bounded by the chromosome
# sizes in gs_fn). u=True writes every eQTL SNP once if it overlaps any
# window; with wa=True alone a SNP inside several windows would be emitted
# once per window, and re-running the cell would multiply those rows again.
eqtl_pbt = eqtl_pbt.intersect(coloc_sig_pbt.slop(b=500000, g=gs_fn), u=True)

In [47]:
# get a list of gene names within +- 500kb of the SNPs
# (the `fivekb_` prefix is historical; the window used here is 500kb)
fivekb_genes = eqtl_pbt.slop(b=500000, g=gs_fn)
fivekb_genes = fivekb_genes.intersect(genes_pbt, wa=True, wb=True)
# reorder to: SNP window (3 cols), gene coords (3 cols), sid, gene name, gene id
fivekb_genes = fivekb_genes.to_dataframe().iloc[:, [0,1,2,4,5,6,3,7,8]]
fivekb_genes.columns = bedpe_6cols + ['sid', 'gname', 'gid']

# Put the SNP coordinates back. slop() clips the widened interval at the
# chromosome boundaries, so undoing it with +/- 500000 gives the wrong position
# for SNPs within 500kb of a chromosome end. The sid ("<chrom>:<pos>") still
# carries the original position, so rebuild the 1-bp interval from it.
snp_pos = fivekb_genes['sid'].str.split(':').str[1].astype(int)
fivekb_genes['startA'] = snp_pos - 1
fivekb_genes['endA'] = snp_pos

## Find the closest gene

In [48]:
# bedtools closest expects both inputs sorted by chromosome and start;
# genes_pbt is already sorted, so the eQTL SNPs are sorted here the same way.
# d=True appends the distance to the reported gene as the last column.
closest_gene = eqtl_pbt.sort().closest(genes_pbt, d=True)
# reorder to: SNP coords, gene coords, sid, gene name, gene id, distance
closest_gene = closest_gene.to_dataframe().iloc[:, [0,1,2,4,5,6,3,7,8,9]]
closest_gene.columns = bedpe_6cols + ['sid', 'gname', 'gid', 'dist']
# index by (sid, gname) so later cells can test pair membership directly
closest_gene = closest_gene.set_index(['sid', 'gname'])

## Get the loops

In [49]:
# load the loop data
loops = pd.read_table(loop_fn)
tmp_loops = loops[['chr1', 's1', 'e1', 'chr2', 's2', 'e2']]
tmp_loops.rename(columns={'p': 'score'}, inplace=True)
tmp_loops.loc[:, 'name'] = '.'
tmp_loops.loc[:, 'score'] = loops['p']
tmp_loops.loc[:, 'strand1'] = '.'
tmp_loops.loc[:, 'strand2'] = '.'
loops = pbt.BedTool.from_dataframe(tmp_loops)
print('FitHiChIP found {} significant loops.'.format(tmp_loops.shape[0]))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.or

FitHiChIP found 577026 significant loops.


In [50]:
# #### Find SNP-gene pairs (500kb window) that are connected by a loop
# A constant 'dummy' column pads the nine-column SNP-gene table to ten columns
# (presumably so bedtools reads it as a full BEDPE record -- TODO confirm).
fivekb_gloops = pbt.BedTool.from_dataframe(fivekb_genes.assign(dummy='drop'))

# type='both': both ends of a SNP-gene pair must overlap the ends of a loop;
# 'is' is a Python keyword, so that flag is passed through a dict
fivekb_gloops = (
    fivekb_gloops
    .pair_to_pair(loops, type='both', **{'is': True})
    .to_dataframe(disable_auto_names=True, header=None)
)

In [51]:
# preview the SNP-gene pairs joined to their overlapping loops
fivekb_gloops.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,chr11,63934301,63934302,chr11,63742079,63744015,11:63934302,COX8A,ENSG00000176340.3,drop,chr11,63740000,63745000,chr11,63930000,63935000,.,7.219366e-08,.,.
1,chr11,63934301,63934302,chr11,63991271,63993726,11:63934302,TRPT1,ENSG00000149743.9,drop,chr11,63930000,63935000,chr11,63990000,63995000,.,2.701329e-07,.,.
2,chr11,63934301,63934302,chr11,64002010,64006259,11:63934302,VEGFB,ENSG00000173511.5,drop,chr11,63930000,63935000,chr11,64005000,64010000,.,2.007063e-07,.,.
3,chr11,63934301,63934302,chr11,64008475,64011604,11:63934302,FKBP2,ENSG00000173486.8,drop,chr11,63930000,63935000,chr11,64005000,64010000,.,2.007063e-07,.,.
4,chr11,63934301,63934302,chr11,64067863,64072242,11:63934302,TEX40,ENSG00000219435.3,drop,chr11,63930000,63935000,chr11,64070000,64075000,.,9.857282e-08,.,.


In [52]:
# columns 6 and 8 of the pair_to_pair output hold the SNP id and the gene id
fivekb_gloops_set = fivekb_gloops.iloc[:, [6, 8]]
# distinct (sid, gene id) tuples, for fast membership tests later on
fivekb_gloops_uniq = {tuple(pair) for pair in fivekb_gloops_set.values.tolist()}

## Construct master table

In [238]:
# begin making the master
# start from a copy of the SNP-gene table, with sid rebuilt as
# "<chrA without 'chr'>:<endA>"
master = fivekb_genes.assign(
    sid=lambda pairs: pairs['chrA'].str.replace('chr', '') + ':' + pairs['endA'].astype(str)
)

In [239]:
# add eqtl metadata
# The outer join keeps every eQTL row, including those with no matching
# SNP-gene pair in master.
master = master.merge(
    eqtls,
    how='outer',
    left_on=['sid', 'gname'],
    right_on=['sid', 'eqtl_gname'],
)
# 1 where the row carries eQTL statistics (ppval present), else 0
master['is_eqtl_pair'] = master['ppval'].notna().astype(int)
# rows that came only from the eQTL table take their gene name from it
master['gname'] = master['gname'].fillna(master['eqtl_gname'])

In [240]:
# (rows, columns) of master after the eQTL merge
master.shape

(3553, 21)

In [241]:
# check for the closest gene
# closest_gene is indexed by (sid, gname); collect those pairs in a set and
# flag each master row by membership. The flags are built in row order, so the
# result does not depend on master's index labels matching row positions
# (the former loop wrote into a list using the index label as the position).
closest_pairs = set(closest_gene.index)
master['is_closest_gene'] = [
    int(pair in closest_pairs)
    for pair in zip(master['sid'], master['gname'])
]

  return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)


In [242]:
# add colocalization data for SNP and is_coloc_snp columns
# Select the columns to carry into master and copy them, so that tmp_coloc is
# an independent frame (the earlier full copy was immediately overwritten by
# an uncopied slice, which is what triggered the SettingWithCopy warning in
# the following rename).
tmp_coloc = coloc_sig_full[['sid',
 'pp_H0_Coloc_Summary',
 'pp_H1_Coloc_Summary',
 'pp_H2_Coloc_Summary',
 'pp_H3_Coloc_Summary',
 'pp_H4_Coloc_Summary',
 'rs_id',
 'geneName',
 'ref',
 'alt',
 'AC',
 'AF',
 'AN',
 'slope_gwas',
 'slope_se_gwas',
 'pval_nominal',
 'SampleSize']].copy()

In [243]:
# Rename the GWAS columns with a gwas_ prefix and geneName -> gname (the key
# used for merging into master). Reassigning the result instead of renaming
# in place avoids modifying a slice of coloc_sig_full and the warning it raises.
tmp_coloc = tmp_coloc.rename(columns={'slope_gwas': 'gwas_slope',
                                      'slope_se_gwas': 'gwas_slope_se',
                                      'pval_nominal': 'gwas_pval_nominal',
                                      'geneName': 'gname'})

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [244]:
# attach the colocalization columns; the outer join also keeps coloc pairs
# that have no (sid, gname) match in master
master = pd.merge(master, tmp_coloc, how='outer', on=['sid', 'gname'])

In [245]:
# (rows, columns) of master after the colocalization merge
master.shape

(3554, 37)

In [246]:
# # add a column to check the coloc gene status
# coloc_gids = master.loc[master.gname == master.eqtl_gname, 'gid']
# master['is_coloc_gene_id'] = master.gid.isin(coloc_gids).astype(int)

In [247]:
# add a column to check the coloc pair status
# 1 where the row has an H4 posterior (i.e. it matched a coloc pair), else 0
master['is_coloc_pair'] = master['pp_H4_Coloc_Summary'].notna().astype(int)

In [248]:
# check for the loop gene
# A row is flagged when its (sid, gene id) pair is among the SNP-gene pairs
# that overlapped a loop. The flags are built in row order, so the result does
# not rely on master's index labels being 0..n-1 (the former loop indexed a
# list with the label returned by iterrows).
master['has_fithichip_loop'] = [
    int(pair in fivekb_gloops_uniq)
    for pair in zip(master['sid'], master['gid'])
]

In [249]:
# keep only the reported columns, in their final order
master = master[[
    # identifiers and coordinates of the SNP-gene pair
    'rs_id', 'gname', 'gid', 'chrA', 'endA', 'startB', 'endB',
    # 0/1 flags ('is_coloc_gene_id' is currently not produced)
    'is_eqtl_pair', 'is_coloc_pair', 'is_closest_gene', 'has_fithichip_loop',
    # SNP id and the eQTL table columns
    'sid', 'nvar', 'shape1', 'shape2', 'npval', 'slope', 'ppval', 'bpval', 'qval',
    # colocalization posteriors
    'pp_H0_Coloc_Summary', 'pp_H1_Coloc_Summary', 'pp_H2_Coloc_Summary',
    'pp_H3_Coloc_Summary', 'pp_H4_Coloc_Summary',
    # remaining columns carried over from the colocalization table
    'ref', 'alt', 'AC', 'AF', 'AN',
    'gwas_slope', 'gwas_slope_se', 'gwas_pval_nominal', 'SampleSize',
]]

In [250]:
# give the coordinate and gene columns their final names, then order the rows
# by chromosome, SNP position, gene start and rs_id
master = (
    master
    .rename(columns={
        'chrA': 'chrom',
        'endA': 'snp_pos',
        'startB': 'gene_start',
        'endB': 'gene_end',
        'gname': 'gene_name',
        'gid': 'gene_id',
    })
    .sort_values(['chrom', 'snp_pos', 'gene_start', 'rs_id'])
)

In [255]:
# pairs that are eQTL pairs and have a FitHiChIP loop but are not coloc pairs
master[(master.is_eqtl_pair == 1) & (master.has_fithichip_loop == 1) & (master.is_coloc_pair == 0)]

Unnamed: 0,rs_id,gene_name,gene_id,chrom,snp_pos,gene_start,gene_end,is_eqtl_pair,is_coloc_pair,is_closest_gene,...,pp_H4_Coloc_Summary,ref,alt,AC,AF,AN,gwas_slope,gwas_slope_se,gwas_pval_nominal,SampleSize
88,,TRMT112,ENSG00000173113.2,chr11,63934302.0,64083932.0,64085556.0,1,0,0,...,,,,,,,,,,
244,,MEN1,ENSG00000133895.10,chr11,64549514.0,64570982.0,64578766.0,1,0,0,...,,,,,,,,,,
647,,TUFM,ENSG00000178952.4,chr16,28837515.0,28853732.0,28857729.0,1,0,0,...,,,,,,,,,,
754,,TMPRSS3,ENSG00000160183.9,chr21,43824106.0,43791999.0,43816955.0,1,0,0,...,,,,,,,,,,
823,,PDE9A,ENSG00000160191.13,chr21,44157187.0,44073746.0,44195619.0,1,0,1,...,,,,,,,,,,


In [33]:
# write out the master data
# tab-separated copy: header row included, DataFrame index omitted
fn = os.path.join(outdir, 'master.tsv')
master.to_csv(fn, sep='\t', index=False)

# Excel copy: rows ordered by rs_id, which also becomes the row index;
# missing values are written as the text 'nan'
excel_master = master.sort_values('rs_id').set_index('rs_id')
fn = os.path.join(outdir, 'master.xlsx')
excel_master.to_excel(fn, na_rep='nan')