In [1]:
import os 
import pandas as pd
import subprocess
import glob
import pybedtools as pbt 
from IPython.display import HTML

pbt.set_bedtools_path('/mnt/BioHome/jreyna/software/anaconda3/envs/hic_tls/bin/')
os.chdir('/mnt/BioHome/jreyna/jreyna/projects/dchallenge/')

gsizes = 'results/refs/hg19/hg19.chrom.sizes'
res = 10000

# make the directory to save our data
outdir = 'results/main/gwas_pieqtls/2021_chiou_et_al/2021_chandra_et_al/'
os.makedirs(outdir, exist_ok=True)
bedpe_cols = ['chrA', 'startA', 'endA', 'chrB', 'startB', 'endB']

## Load Fine Mapped GWAS

In [2]:
gwas = 'results/main/finemapping/T1D_34012112_Gaulton/GRCh37/offset_1000000/Summary/sss/FINAL_top_snp_credible_set.txt'
gwas_df = pd.read_table(gwas)

In [3]:
gwas_bed = gwas_df.iloc[:, [4,5,5]]
gwas_bed.columns = ['chrom', 'start', 'end']
gwas_bed['start'] = gwas_bed['start'] - 1
gwas_pbt = pbt.BedTool.from_dataframe(gwas_bed)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gwas_bed['start'] = gwas_bed['start'] - 1


## Intersect Fine Mapped GWAS and loops

In [4]:
def parse_seB(x): 
    s,e = x.split(':')[1].split('-')
    e = e.split(',')[0]
    return((s,e))

In [5]:
loops = 'results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/'
loops += '*/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz'
loops = glob.glob(loops)

In [6]:
loop_data = []
for loop in loops:
    print(loop)
    
    cline = loop.split('/')[5]
    df = pd.read_table(loop, header=None)    
    df.columns = ['chrom', 'startA', 'endA', 'seB', 'e1', 'e2']
    
    df['chrom'] = df['chrom'].str.replace('chr', '')
    
    df['startB'], df['endB'] = zip(*df['seB'].apply(parse_seB))
    df['startB'] = df['startB'].astype(int)
    
    df['startA'] = df['startA'] + 1 
    df['endA'] = df['startA'] + res

    df['startB'] = df['startB'] + 1 
    df['endB'] = df['startB'] + res
    
    # re-organize the data into bedpe-like
    df = df.iloc[:, [0,1,2,0,6,7,3,4,5]]

    # add cell type
    df['cline'] = cline

    loop_data.append(df) 

results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/TREGMEM/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz
results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/TH2/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz
results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/NCM/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz
results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/CD4N/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz
results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/TREGNAIVE/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz
results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/TH1/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz
results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/CD8N/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz
results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/THSTAR/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz
results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/NB/Fit

In [7]:
loop_df = pd.concat(loop_data)
loop_bed = loop_df.iloc[:, [0,1,2,3,4,5,-1]]
loop_pbt = pbt.BedTool.from_dataframe(loop_bed)

In [8]:
intersect_pbt = loop_pbt.pair_to_bed(gwas_pbt, type='either')
#intersect_pbt = loop_pbt.pair_to_bed(gwas_pbt.slop(b=100000, g=gsizes), type='either')
gwas_hichip = intersect_pbt.to_dataframe()

In [9]:
gwas_hichip.iloc[0]

chrom               10
start         89622500
end           89632500
name                10
score         90312500
strand        90322500
thickStart     TREGMEM
thickEnd            10
itemRgb       90319219
blockCount    90319220
Name: 0, dtype: object

In [10]:
gwas_hichip = gwas_hichip.iloc[:, [7,8,9,0,1,2,3,4,5,6]]
loop_cols = ['{}_loop'.format(x) for x in bedpe_cols]
gwas_hichip.columns = ['chr_snp', 'start_snp', 'end_snp'] + loop_cols + ['cline_loop']
gwas_hichip = gwas_hichip.merge(gwas_df, left_on=['chr_snp', 'end_snp'], right_on=['chromosome', 'position'])

In [11]:
gwas_hichip.drop('start_snp', axis=1, inplace=True)
gwas_hichip.rename(columns={'end_snp': 'position_snp'}, inplace=True)
gwas_hichip['sid'] = 'chr' +  gwas_hichip['chr_snp'].astype(str) + ':' + gwas_hichip['position_snp'].astype(str)

# add loop ids
def make_lid(sr, cols):
    lid = sr[cols].tolist()
    lid = [str(x) for x in lid]
    lid = ':'.join(lid)
    return(lid)

lid_cols = [2,3,4,5,6,7]
lids = []
for sr in gwas_hichip.values: 
    new_lid = make_lid(sr, lid_cols)
    lids.append(new_lid)
gwas_hichip['loop_id'] = lids

In [12]:
gwas_hichip.head()

Unnamed: 0,chr_snp,position_snp,chrA_loop,startA_loop,endA_loop,chrB_loop,startB_loop,endB_loop,cline_loop,regionID,...,z,prob,log10bf,mean,sd,mean_incl,sd_incl,pval,sid,loop_id
0,10,90319220,10,89622500,89632500,10,90312500,90322500,TREGMEM,16,...,1.47094,1.0,13.5355,0.187599,3e-06,0.187599,3e-06,0.070654,chr10:90319220,10:89622500:89632500:10:90312500:90322500
1,10,90319220,10,89622500,89632500,10,90317500,90327500,TH2,16,...,1.47094,1.0,13.5355,0.187599,3e-06,0.187599,3e-06,0.070654,chr10:90319220,10:89622500:89632500:10:90317500:90327500
2,10,90319220,10,89622500,89632500,10,90312500,90322500,CD4N,16,...,1.47094,1.0,13.5355,0.187599,3e-06,0.187599,3e-06,0.070654,chr10:90319220,10:89622500:89632500:10:90312500:90322500
3,10,90319220,10,89622500,89632500,10,90317500,90327500,CD4N,16,...,1.47094,1.0,13.5355,0.187599,3e-06,0.187599,3e-06,0.070654,chr10:90319220,10:89622500:89632500:10:90317500:90327500
4,10,90319220,10,89622500,89632500,10,90317500,90327500,TREGNAIVE,16,...,1.47094,1.0,13.5355,0.187599,3e-06,0.187599,3e-06,0.070654,chr10:90319220,10:89622500:89632500:10:90317500:90327500


## Summarize the SNPs, Loops and Intersection

In [13]:
total_gwas = gwas_df.shape[0]
# # summarize the total number of GWAS loops per cell # DOESN't MAKE SENSE
# cell_summary['total_gwas'] = gwas_hichip.groupby('cline_loop').nunique('sid')['chr_snp']
# cell_summary['total_gwas'] = cell_summary['total_gwas'].to_frame()
# cell_summary['total_gwas'].columns = ['Total GWAS SNPs']
# cell_summary['total_gwas']

In [14]:
cell_summary = {}

### Summarize the Number of Loops per Cell (pre-intersection)

In [15]:
cell_summary['total_loops'] = loop_df.groupby('cline').count()['startA'].to_frame()
cell_summary['total_loops'].columns = ['total_hichip']
cell_summary['total_loops']

Unnamed: 0_level_0,total_hichip
cline,Unnamed: 1_level_1
CD4N,114421
CD8N,84599
CM,84298
NB,128288
NCM,103342
NK,129890
TFH,46172
TH1,63241
TH17,76270
TH2,58115


### Summarize the Number of SNP-Loop (SL) Pairs per Cell

In [16]:
cell_summary['sl_pairs'] = gwas_hichip['cline_loop'].value_counts().to_frame()
cell_summary['sl_pairs'].columns = ['sl_pairs']
cell_summary['sl_pairs']

Unnamed: 0,sl_pairs
NB,316
NK,237
CD8N,161
TH1,154
TH17,152
CD4N,145
TH2,141
TREGMEM,138
TREGNAIVE,112
NCM,95


### Summarize the Number of Unique GWAS SNPs which Overlap a HiChIP Loop Cell

In [17]:
cell_summary['uniq_gwas'] = gwas_hichip.groupby('cline_loop')['sid'].nunique().to_frame()
cell_summary['uniq_gwas'].columns = ['uniq_gwas_in_slpairs']
cell_summary['uniq_gwas']

Unnamed: 0_level_0,uniq_gwas_in_slpairs
cline_loop,Unnamed: 1_level_1
CD4N,25
CD8N,28
CM,30
NB,58
NCM,28
NK,38
TFH,16
TH1,32
TH17,30
TH2,33


### Summarize the Number of Loops with GWAS Overlaps (per cell)

In [18]:
loop_cols = ['chrA_loop', 'startA_loop', 'endA_loop', 'chrB_loop', 'startB_loop', 'endB_loop']
cell_summary['uniq_loops'] = gwas_hichip.groupby('cline_loop')['loop_id'].nunique().to_frame()
cell_summary['uniq_loops'].columns = ['uniq_loops_in_slpairs']
cell_summary['uniq_loops']

Unnamed: 0_level_0,uniq_loops_in_slpairs
cline_loop,Unnamed: 1_level_1
CD4N,110
CD8N,99
CM,46
NB,192
NCM,64
NK,168
TFH,58
TH1,99
TH17,108
TH2,80


In [19]:
concat_list = [cell_summary['total_loops'], cell_summary['sl_pairs'],
               cell_summary['uniq_gwas'], cell_summary['uniq_loops']]
summary = pd.concat(concat_list, axis=1)
summary['pct_uniq_gwas_in_slpairs'] = summary['uniq_gwas_in_slpairs'] / total_gwas * 100
summary['pct_uniq_loops_in_slpairs'] = summary['uniq_loops_in_slpairs'] / summary['total_hichip'] * 100

In [20]:
summary

Unnamed: 0,total_hichip,sl_pairs,uniq_gwas_in_slpairs,uniq_loops_in_slpairs,pct_uniq_gwas_in_slpairs,pct_uniq_loops_in_slpairs
CD4N,114421,145,25,110,25.252525,0.096136
CD8N,84599,161,28,99,28.282828,0.117023
CM,84298,69,30,46,30.30303,0.054568
NB,128288,316,58,192,58.585859,0.149663
NCM,103342,95,28,64,28.282828,0.06193
NK,129890,237,38,168,38.383838,0.12934
TFH,46172,77,16,58,16.161616,0.125617
TH1,63241,154,32,99,32.323232,0.156544
TH17,76270,152,30,108,30.30303,0.141602
TH2,58115,141,33,80,33.333333,0.137658


In [21]:
final_summary = summary.copy()

In [22]:
final_colnames = ['Total\\nHiChIP Loops', 
                  'Number of\\nGWAS-Loop Pairs',
                  'Number of\\nUnique GWAS SNPs in GL Pairs', 
                  'Number of\\nUnique loops in GL Pairs',
                  'Percentage of\\nUnique GWAS SNPs in GL Pairs', 
                  'Percentage of\\nUnique loops in GL Pairs']
final_colnames = ['Total HiChIP Loops', 
                  'Number of GWAS-Loop Pairs',
                  'Number of Unique GWAS SNPs in GL Pairs', 
                  'Number of Unique loops in GL Pairs',
                  'Percentage of Unique GWAS SNPs in GL Pairs', 
                  'Percentage of Unique loops in GL Pairs']
final_summary.columns = final_colnames

In [23]:
display(HTML(final_summary.to_html().replace("\\n","<br>")))

Unnamed: 0,Total HiChIP Loops,Number of GWAS-Loop Pairs,Number of Unique GWAS SNPs in GL Pairs,Number of Unique loops in GL Pairs,Percentage of Unique GWAS SNPs in GL Pairs,Percentage of Unique loops in GL Pairs
CD4N,114421,145,25,110,25.252525,0.096136
CD8N,84599,161,28,99,28.282828,0.117023
CM,84298,69,30,46,30.30303,0.054568
NB,128288,316,58,192,58.585859,0.149663
NCM,103342,95,28,64,28.282828,0.06193
NK,129890,237,38,168,38.383838,0.12934
TFH,46172,77,16,58,16.161616,0.125617
TH1,63241,154,32,99,32.323232,0.156544
TH17,76270,152,30,108,30.30303,0.141602
TH2,58115,141,33,80,33.333333,0.137658
