In [23]:
import os 
import pandas as pd
import subprocess
import glob
import pybedtools as pbt 
pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', 300)


pbt.set_bedtools_path('/mnt/BioHome/jreyna/software/anaconda3/envs/hic_tls/bin/')

gsizes = 'results/refs/hg19/hg19.chrom.sizes'
res = 10000

os.chdir('/mnt/BioHome/jreyna/jreyna/projects/dchallenge/')

# make the directory to save our data
outdir = 'results/main/gwas_pieqtls/2021_chiou_et_al/2021_chandra_et_al/'
os.makedirs(outdir, exist_ok=True)
bedpe_cols = ['chrA', 'startA', 'endA', 'chrB', 'startB', 'endB']

In [2]:
chain = "/mnt/BioHome/jreyna/jreyna/projects/dchallenge/results/main/2021_Nikhil_eQTL/Data/refs/ucsc/hg38ToHg19.over.chain.gz"

In [3]:
def liftover(chain, bed, lifted, unmapped):
    cmd = '/mnt/BioHome/jreyna/software/UCSC_Browser_Tools/liftOver -bedPlus=3 -tab {} {} {} {}'
    cmd = cmd.format(bed, chain, lifted, unmapped)
    sp = subprocess.Popen(cmd, stderr=subprocess.STDOUT, shell=True)
    print(sp.communicate())

## Load Coloc Datasets

In [4]:
colocs = 'results/main/coloc/Results/eQTL_Catalogue/*/*/*/FINAL_Summary_Coloc_Gene_SNP_Pairs.bed'
colocs = glob.glob(colocs)

In [5]:
coloc_data = []
for coloc in colocs:
    
    # getting data source information
    path_info = coloc.split('/')
    gwas_source = coloc.split('/')[5]
    eqtl_source = coloc.split('/')[6]
    ge_source = coloc.split('/')[7]    
    
    # setting paths for liftover
    lifted = os.path.join(outdir, '{}.{}.{}.lifted.bed'.format(gwas_source, eqtl_source, ge_source))
    unmapped = os.path.join(outdir, '{}.{}.{}.unmapped.bed'.format(gwas_source, eqtl_source, ge_source))
    
    if not os.path.exists(lifted):
        
        df = pd.read_table(coloc, header=0)   
        df['start'] = df['pos'] - 1
        df = df.iloc[:, [0,-1,1] + list(range(2, 24))]

        # perform the liftover
        interm_fn = os.path.join(outdir, '{}.interm.bed'.format(cline))
        df.to_csv(interm_fn, sep='\t', index=False, header=False)    
        liftover(chain, interm_fn, lifted, unmapped)
    
    lifted_df = pd.read_table(lifted, header=None)

    # add information about source
    lifted_df['gwas_source'] = gwas_source
    lifted_df['eqtl_source'] = eqtl_source
    lifted_df['ge_source'] = ge_source

    coloc_data.append(lifted_df) 

coloc_df = pd.concat(coloc_data)
coloc_df.iloc[:, 0] = coloc_df.iloc[:, 0].str.replace('chr', '').astype(int)

In [6]:
coloc_bed = coloc_df.iloc[:, [0,1,2,-3,-2,-1]]
coloc_bed.columns = ['chr', 'start', 'end', 'gwas_source', 'eqtl_source', 'ge_source']

In [7]:
coloc_bed

Unnamed: 0,chr,start,end,gwas_source,eqtl_source,ge_source
0,10,6098823,6098824,T1D_32005708,Lepik_2017,blood
0,12,56435928,56435929,T1D_32005708,GTEx,blood
0,12,56435928,56435929,T1D_32005708,GTEx,pancreas
0,10,6098823,6098824,T1D_32005708,GTEx,LCL
0,10,6106265,6106266,T1D_32005708,Quach_2016,monocyte_IAV
...,...,...,...,...,...,...
0,6,90989124,90989125,T1D_34594039_GCST90018925,GENCORD,T-cell
0,6,91014028,91014029,T1D_34594039_GCST90018925,GTEx,LCL
0,6,26483047,26483048,T1D_34594039_GCST90018925,GTEx,pancreas
0,6,90989124,90989125,T1D_34594039_GCST90018925,BLUEPRINT,monocyte


## Load pieQTLs data

In [8]:
pieqtls = glob.glob('results/main/pieqtls/2021_chandra_et_al/*/proximal.pieqtls.tsv')

In [9]:
pieqtl_data = []
for pieqtl in pieqtls:
    print(pieqtl)
    
    cline = pieqtl.split('/')[-2]
    df = pd.read_table(pieqtl)    
    df['cline'] = cline

    pieqtl_data.append(df) 

pieqtl_df = pd.concat(pieqtl_data)
pieqtl_df.loc[:, 'Chromosome'] = pieqtl_df.loc[:, 'Chromosome'].str.replace('chr', '').astype(int)

results/main/pieqtls/2021_chandra_et_al/B-cell_naive/proximal.pieqtls.tsv
results/main/pieqtls/2021_chandra_et_al/NK-cell_naive/proximal.pieqtls.tsv
results/main/pieqtls/2021_chandra_et_al/monocyte_naive/proximal.pieqtls.tsv
results/main/pieqtls/2021_chandra_et_al/CD4_T-cell_naive/proximal.pieqtls.tsv
results/main/pieqtls/2021_chandra_et_al/CD8_T-cell_naive/proximal.pieqtls.tsv


In [10]:
pieqtl_bed = pieqtl_df.iloc[:, [1,2,2,-1]]
pieqtl_bed.columns = ['chrom', 'start', 'end', 'cline']
pieqtl_bed.loc[:, 'chrom'] = pieqtl_bed['chrom']
pieqtl_bed.loc[:, 'start'] = pieqtl_bed['start'] - 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pieqtl_bed.loc[:, 'chrom'] = pieqtl_bed['chrom']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pieqtl_bed.loc[:, 'start'] = pieqtl_bed['start'] - 1


## Intersect Colocs and pieQTLs

In [11]:
pieqtl_to_coloc_comps = [('B-cell_naive', 'B-cell_naive'),
                      ('NK-cell_naive', 'NK-cell_naive'),
                      ('monocyte_naive', 'monocyte_CD16_naive'), 
                      #('monocyte_naive', 'monocyte_naive'),
                      ('monocyte_naive', 'monocyte'),
                      #('monocyte_naive', 'monocyte_LPS'),
                      ('monocyte_naive', 'monocyte_IAV'),
                      ('monocyte_naive', 'monocyte_Pam3CSK4'),
                      ('monocyte_naive', 'monocyte_R848'),
                      ('monocyte_naive', 'monocyte_LPS'),
                      ('CD4_T-cell_naive', 'CD4_T-cell_anti-CD3-CD28'), 
                      ('CD4_T-cell_naive', 'CD4_T-cell_naive'),  
                      ('CD8_T-cell_naive', 'CD8_T-cell_naive'), 
                      ('CD8_T-cell_naive', 'CD8_T-cell_anti-CD3-CD28')]
pc_df = pd.DataFrame(pieqtl_to_coloc_comps)
pc_df.columns = ['pieQTL_source', 'coloc_source']

In [12]:
pc_df

Unnamed: 0,pieQTL_source,coloc_source
0,B-cell_naive,B-cell_naive
1,NK-cell_naive,NK-cell_naive
2,monocyte_naive,monocyte_CD16_naive
3,monocyte_naive,monocyte
4,monocyte_naive,monocyte_IAV
5,monocyte_naive,monocyte_Pam3CSK4
6,monocyte_naive,monocyte_R848
7,monocyte_naive,monocyte_LPS
8,CD4_T-cell_naive,CD4_T-cell_anti-CD3-CD28
9,CD4_T-cell_naive,CD4_T-cell_naive


In [13]:
pieqtl_bed.cline.unique()

array(['B-cell_naive', 'NK-cell_naive', 'monocyte_naive',
       'CD4_T-cell_naive', 'CD8_T-cell_naive'], dtype=object)

In [14]:
coloc_bed.ge_source.unique()

array(['blood', 'pancreas', 'LCL', 'monocyte_IAV', 'T-cell',
       'pancreatic_islet', 'neutrophil', 'monocyte', 'B-cell_naive',
       'Th1_memory', 'Treg_memory', 'Treg_naive', 'Tfh_memory',
       'Th17_memory', 'monocyte_CD16_naive', 'Th1-17_memory',
       'NK-cell_naive', 'CD8_T-cell_anti-CD3-CD28',
       'CD4_T-cell_anti-CD3-CD28', 'monocyte_naive', 'Th2_memory',
       'CD8_T-cell_naive', 'monocyte_R848', 'monocyte_Pam3CSK4',
       'CD4_T-cell_naive', 'test', 'monocyte_LPS'], dtype=object)

In [15]:
pieqtl_bed_grps = pieqtl_bed.groupby('cline')
coloc_bed_grps = coloc_bed.groupby('ge_source')

In [16]:
intersection_data = []
for pieqtl_cell, coloc_cell in pieqtl_to_coloc_comps: 
    
    # skip TBDs
    if 'TBD' in [pieqtl_cell, coloc_cell]:
        continue
    
    pieqtl_cline = pieqtl_bed_grps.get_group(pieqtl_cell)
    coloc_cline = coloc_bed_grps.get_group(coloc_cell)
    
    pieqtl_pbt = pbt.BedTool.from_dataframe(pieqtl_cline)
    coloc_pbt = pbt.BedTool.from_dataframe(coloc_cline)
    
    intersection = pieqtl_pbt.intersect(coloc_pbt.slop(b=0, g=gsizes), wa=True, wb=True).to_dataframe()
    
    if len(intersection) > 0: 
        intersection_data.append(intersection)
        print(pieqtl_cell, coloc_cell)


monocyte_naive monocyte_CD16_naive
CD8_T-cell_naive CD8_T-cell_anti-CD3-CD28


In [17]:
intersection_df = pd.concat(intersection_data)
intersection_df.columns = ['chr', 'pie_start', 'pie_end', 'cline', 'chr2', 
                           'coloc_start', 'coloc_end', 'gwas_source', 'eqtl_source', 'ge_source']

In [20]:
intersection_df

Unnamed: 0,chr,pie_start,pie_end,cline,chr2,coloc_start,coloc_end,gwas_source,eqtl_source,ge_source
0,12,56435928,56435929,monocyte_naive,12,56435928,56435929,T1D_32005708,Schmiedel_2018,monocyte_CD16_naive
0,21,43836009,43836010,CD8_T-cell_naive,21,43836009,43836010,T1D_25751624,Schmiedel_2018,CD8_T-cell_anti-CD3-CD28


In [28]:
coloc_df.loc[coloc_df[2] == 43836010]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,gwas_source,eqtl_source,ge_source
0,21,43836009,43836010,0.0,0.0,0.06735,0.182167,0.750483,rs80054410,21:42415901:T:C,ENSG00000160181,64904,0.069239,0.890968,0.597151,T,C,1090,0.217652,5008,0.004242,0.149023,0.019003,4.430547e-15,42415900,T1D_25751624,van_de_Bunt_2015,pancreatic_islet
3,21,43836009,43836010,0.0,0.0,0.071876,0.127477,0.800648,rs80054410,21:42415901:T:C,ENSG00000228318,970193,0.001071,0.168597,0.644505,T,C,1090,0.217652,5008,0.003558,0.149023,0.019003,4.430547e-15,42415900,T1D_25751624,Schmiedel_2018,CD8_T-cell_anti-CD3-CD28


In [29]:
pieqtl_df.loc[pieqtl_df['pieQTL.Position'] == 43836010]

Unnamed: 0,pieQTL.ID,Chromosome,pieQTL.Position,Target_geneID,Target_geneName,TSS,pvalue,FDR (DICE),beta,ref,alt,Mean.TPM.Homozygous.Reference,Mean.TPM.Heterozygous,Mean.TPM.Homozygous.Alternative,Interaction_type,GWAS.Trait,cline
7737,rs80054410,21,43836010,ENSG00000160183.9,TMPRSS3,43816955,6.2e-05,0.049896,0.67,T,C,1.0,1.41,1.48,Direct_pieQTL,Autoimmune diseases|Diabetes mellitus type 1|...,CD4_T-cell_naive
6041,rs80054410,21,43836010,ENSG00000160183.9,TMPRSS3,43816955,7e-05,0.039679,0.63,T,C,1.5,1.99,2.33,Direct_pieQTL,Autoimmune diseases|Diabetes mellitus type 1|...,CD8_T-cell_naive
