In [1]:
import os 
import pandas as pd
import subprocess
import glob
import pybedtools as pbt 

pbt.set_bedtools_path('/mnt/BioHome/jreyna/software/anaconda3/envs/hic_tls/bin/')

gsizes = 'results/refs/hg19/hg19.chrom.sizes'
res = 10000

os.chdir('/mnt/BioHome/jreyna/jreyna/projects/dchallenge/')

# make the directory to save our data
outdir = 'results/main/gwas_pieqtls/2021_chiou_et_al/2021_chandra_et_al/'
os.makedirs(outdir, exist_ok=True)
bedpe_cols = ['chrA', 'startA', 'endA', 'chrB', 'startB', 'endB']

In [2]:
chain="/mnt/BioHome/jreyna/jreyna/projects/dchallenge/results/main/2021_Nikhil_eQTL/Data/refs/ucsc/hg38ToHg19.over.chain.gz"

In [3]:
def liftover(chain, bed, lifted, unmapped):
    cmd = '/mnt/BioHome/jreyna/software/UCSC_Browser_Tools/liftOver -bedPlus=3 -tab {} {} {} {}'
    cmd = cmd.format(bed, chain, lifted, unmapped)
    sp = subprocess.Popen(cmd, stderr=subprocess.STDOUT, shell=True)
    print(sp.communicate())

## Load Coloc Datasets

In [4]:
colocs = 'results/main/coloc/Results/Colocalization_SMKN/T1D_34012112_Gaulton/'
colocs += '*/*/FINAL_Summary_Coloc_Gene_SNP_Pairs.bed'
colocs = glob.glob(colocs)

In [5]:
coloc_data = []
for coloc in colocs:

    cline = coloc.split('/')[7]
    
    lifted = os.path.join(outdir, '{}.lifted.bed'.format(cline))
    unmapped = os.path.join(outdir, '{}.unmapped.bed'.format(cline))
    
    if not os.path.exists(lifted):
        
        df = pd.read_table(coloc, header=0)   
        df['start'] = df['pos'] - 1
        df = df.iloc[:, [0,-1,1] + list(range(2, 24))]

        interm_fn = os.path.join(outdir, '{}.interm.bed'.format(cline))
        df.to_csv(interm_fn, sep='\t', index=False, header=False)    
        liftover(chain, interm_fn, lifted, unmapped)
    
    lifted_df = pd.read_table(lifted, header=None)

    # add cell type
    lifted_df['cline'] = cline

    coloc_data.append(lifted_df) 


Reading liftover chains
Mapping coordinates
(None, None)


In [6]:
coloc_df = pd.concat(coloc_data)

In [7]:
coloc_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,cline
0,chr9,4296429,4296430,4.728672e-289,9.059926e-287,0.001165,0.222490,0.776345,rs10814917,rs10814917:4296430:A:G,...,G,1700,0.339457,5008,21.274727,-0.119714,0.013994,1.180000e-17,520580,monocyte_Pam3CSK4
1,chr9,4296429,4296430,4.728672e-289,9.059926e-287,0.001165,0.222490,0.776345,rs10814917,rs10814917:4296430:A:G,...,G,1700,0.339457,5008,0.320107,-0.119714,0.013994,1.180000e-17,520580,monocyte_Pam3CSK4
2,chr9,4296429,4296430,4.728672e-289,9.059926e-287,0.001165,0.222490,0.776345,rs10814917,rs10814917:4296430:A:G,...,G,1700,0.339457,5008,10.206081,-0.119714,0.013994,1.180000e-17,520580,monocyte_Pam3CSK4
3,chr9,4296429,4296430,4.728672e-289,9.059926e-287,0.001165,0.222490,0.776345,rs10814917,rs10814917:4296430:A:G,...,G,1700,0.339457,5008,0.100496,-0.119714,0.013994,1.180000e-17,520580,monocyte_Pam3CSK4
4,chr9,4296429,4296430,4.728672e-289,9.059926e-287,0.001165,0.222490,0.776345,rs10814917,rs10814917:4296430:A:G,...,G,1700,0.339457,5008,30.427727,-0.119714,0.013994,1.180000e-17,520580,monocyte_Pam3CSK4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36,chr16,75281234,75281235,8.207507e-16,5.333602e-16,0.070681,0.045048,0.884271,rs11317724,rs11317724:75247337:GT:G,...,G,700,0.139776,5008,0.080635,0.170677,0.022578,4.040000e-14,520580,CD8_T-cell_naive
37,chr16,75281234,75281235,8.207507e-16,5.333602e-16,0.070681,0.045048,0.884271,rs11317724,rs11317724:75247337:GT:G,...,G,700,0.139776,5008,3.355752,0.170677,0.022578,4.040000e-14,520580,CD8_T-cell_naive
38,chr16,75281234,75281235,8.207507e-16,5.333602e-16,0.070681,0.045048,0.884271,rs11317724,rs11317724:75247337:GT:G,...,G,700,0.139776,5008,0.100064,0.170677,0.022578,4.040000e-14,520580,CD8_T-cell_naive
39,chr16,75281234,75281235,8.207507e-16,5.333602e-16,0.070681,0.045048,0.884271,rs11317724,rs11317724:75247337:GT:G,...,G,700,0.139776,5008,0.654065,0.170677,0.022578,4.040000e-14,520580,CD8_T-cell_naive


In [8]:
coloc_bed = coloc_df.iloc[:, [0,1,2,-1]]
coloc_bed.columns = ['chr', 'start', 'end', 'cline']
coloc_bed['chr'] = coloc_bed['chr'].str.replace('chr', '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  coloc_bed['chr'] = coloc_bed['chr'].str.replace('chr', '')


## Intersect Colocs and pieQTLs

In [9]:
pieqtls = glob.glob('results/main/pieqtls/2021_chandra_et_al/*/proximal.pieqtls.tsv')

In [10]:
pieqtl_data = []
for pieqtl in pieqtls:
    print(pieqtl)
    
    cline = pieqtl.split('/')[-2]
    df = pd.read_table(pieqtl)    
    df['cline'] = cline

    pieqtl_data.append(df) 

results/main/pieqtls/2021_chandra_et_al/B-cell_naive/proximal.pieqtls.tsv
results/main/pieqtls/2021_chandra_et_al/NK-cell_naive/proximal.pieqtls.tsv
results/main/pieqtls/2021_chandra_et_al/monocyte_naive/proximal.pieqtls.tsv
results/main/pieqtls/2021_chandra_et_al/CD4_T-cell_naive/proximal.pieqtls.tsv
results/main/pieqtls/2021_chandra_et_al/CD8_T-cell_naive/proximal.pieqtls.tsv


In [11]:
pieqtl_df = pd.concat(pieqtl_data)

In [12]:
pieqtl_bed = pieqtl_df.iloc[:, [1,2,2,-1]]
pieqtl_bed.columns = ['chrom', 'start', 'end', 'cline']
pieqtl_bed.loc[:, 'chrom'] = pieqtl_bed['chrom'].str.replace('chr', '')
pieqtl_bed.loc[:, 'start'] = pieqtl_bed['start'] - 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pieqtl_bed.loc[:, 'chrom'] = pieqtl_bed['chrom'].str.replace('chr', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pieqtl_bed.loc[:, 'start'] = pieqtl_bed['start'] - 1


In [45]:
pieqtl_bed.cline.unique()

array(['B-cell_naive', 'NK-cell_naive', 'monocyte_naive',
       'CD4_T-cell_naive', 'CD8_T-cell_naive'], dtype=object)

In [46]:
coloc_bed.cline.unique()

array(['monocyte_Pam3CSK4', 'monocyte_IAV', 'monocyte_R848', 'T-cell',
       'LCL', 'neutrophil', 'monocyte', 'pancreatic_islet',
       'B-cell_naive', 'Th2_memory', 'Th1-17_memory', 'Tfh_memory',
       'Treg_naive', 'CD4_T-cell_anti-CD3-CD28', 'NK-cell_naive',
       'Th1_memory', 'Treg_memory', 'CD4_T-cell_naive',
       'CD8_T-cell_anti-CD3-CD28', 'Th17_memory', 'CD8_T-cell_naive'],
      dtype=object)

In [47]:
pieqtl_to_coloc_comps = [('B-cell_naive', 'B-cell_naive'),
                      ('NK-cell_naive', 'NK-cell_naive'),
                      #('monocyte_naive', 'monocyte_CD16_naive'), 
                      #('monocyte_naive', 'monocyte_naive'),
                      ('monocyte_naive', 'monocyte'),
                      #('monocyte_naive', 'monocyte_LPS'),
                      ('monocyte_naive', 'monocyte_IAV'),
                      ('monocyte_naive', 'monocyte_Pam3CSK4'),
                      ('CD4_T-cell_naive', 'CD4_T-cell_anti-CD3-CD28'), 
                      ('CD4_T-cell_naive', 'CD4_T-cell_naive'),  
                      ('CD8_T-cell_naive', 'CD8_T-cell_naive'), 
                      ('CD8_T-cell_naive', 'CD8_T-cell_anti-CD3-CD28')]

In [48]:
pc_df = pd.DataFrame(pieqtl_to_coloc_comps)

In [49]:
pc_df.columns = ['pieQTL_source', 'coloc_source']

In [50]:
pc_df

Unnamed: 0,pieQTL_source,coloc_source
0,B-cell_naive,B-cell_naive
1,NK-cell_naive,NK-cell_naive
2,monocyte_naive,monocyte
3,monocyte_naive,monocyte_IAV
4,monocyte_naive,monocyte_Pam3CSK4
5,CD4_T-cell_naive,CD4_T-cell_anti-CD3-CD28
6,CD4_T-cell_naive,CD4_T-cell_naive
7,CD8_T-cell_naive,CD8_T-cell_naive
8,CD8_T-cell_naive,CD8_T-cell_anti-CD3-CD28


In [51]:
pieqtl_bed_grps = pieqtl_bed.groupby('cline')
coloc_bed_grps = coloc_bed.groupby('cline')

In [52]:
intersection_data = []
for pieqtl_cell, coloc_cell in pieqtl_to_coloc_comps: 
    
    # skip TBDs
    if 'TBD' in [pieqtl_cell, coloc_cell]:
        continue
    
    pieqtl_cline = pieqtl_bed_grps.get_group(pieqtl_cell)
    coloc_cline = coloc_bed_grps.get_group(coloc_cell)
    
    pieqtl_pbt = pbt.BedTool.from_dataframe(pieqtl_cline)
    coloc_pbt = pbt.BedTool.from_dataframe(coloc_cline)
    
    intersection = pieqtl_pbt.intersect(coloc_pbt.slop(b=0, g=gsizes), wa=True, wb=True).to_dataframe()
    
    if len(intersection) > 0: 
        intersection_data.append(intersection)
        print(pieqtl_cell, coloc_cell)


In [53]:
intersection_df = pd.concat(intersection_data)

ValueError: No objects to concatenate

In [None]:
intersection_df