In [1]:
import os 
import pandas as pd
import subprocess
import glob
import pybedtools as pbt 
pd.set_option('display.max_columns', None)

pbt.set_bedtools_path('/mnt/BioApps/bedtools/bin/')

os.chdir('/mnt/BioHome/jreyna/jreyna/projects/dchallenge/')

genome_sizes = 'results/refs/hg19/hg19.chrom.sizes'

# make the directory to save our data
outdir = 'results/main/gwas_pieqtls/2021_chiou_et_al/2021_chandra_et_al/'
os.makedirs(outdir, exist_ok=True)

In [2]:
gs_fn = 'results/refs/hg19/hg19.chrom.nochr.sizes'
gencode_fn = 'results/refs/gencode/v30/gencode.v30.annotation.bed'
gencode = pd.read_table(gencode_fn, header=None)
gencode = gencode.drop_duplicates(5)
gencode_dict = {k:v for k,v in gencode[[5,6]].values.tolist()} 
gencode_dict.update({v:k for k,v in gencode[[5,6]].values.tolist()})

## Load Fine Mapped GWAS

In [3]:
data = []
gwas = 'results/main/finemapping/*/GRCh37/offset_1000000/Summary/sss/FINAL_top_snp_credible_set.txt'
for fn in glob.glob(gwas):
    path_info = fn.split('/')
    df = pd.read_table(fn)
    df['gwas_source'] = path_info[3]
    data.append(df)
gwas_df = pd.concat(data)
gwas_df.loc[:, 'chromosome'] = 'chr' + gwas_df.loc[:, 'chromosome'].astype(str)

# loading finemap data into bedtools
gwas_bed = gwas_df.iloc[:, [4,5,5]]
gwas_bed.columns = ['chrom', 'start', 'end']
gwas_bed['start'] = gwas_bed['start'] - 1
gwas_pbt = pbt.BedTool.from_dataframe(gwas_bed)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gwas_bed['start'] = gwas_bed['start'] - 1


In [4]:
gwas_df.head()

Unnamed: 0,regionID,GWASLoci,index,rsid,chromosome,position,allele1,allele2,maf,beta,se,z,prob,log10bf,mean,sd,mean_incl,sd_incl,pval,gwas_source
0,1,chr1:113310083-115099755,2336,1:114089649,chr1,114089649,A,G,0.016251,-0.1324,0.0749,-1.76769,1.0,13.5586,0.001264,8e-06,0.001264,8e-06,0.9614436,T1D_34594039_GCST90018925
1,1,chr1:113310083-115099755,2829,1:114270326,chr1,114270326,A,C,0.260388,-0.1634,0.0209,-7.81818,1.0,13.5586,0.001264,8e-06,0.001264,8e-06,1.0,T1D_34594039_GCST90018925
2,1,chr1:113310083-115099755,5046,1:114909703,chr1,114909703,T,C,0.131631,-0.0046,0.0261,-0.176245,1.0,13.5586,0.001264,8e-06,0.001264,8e-06,0.5699493,T1D_34594039_GCST90018925
3,1,chr1:113310083-115099755,3035,1:114377568,chr1,114377568,G,A,0.114168,-0.4287,0.0286,-14.9895,1.0,13.5586,0.001264,8e-06,0.001264,8e-06,1.0,T1D_34594039_GCST90018925
4,1,chr1:113310083-115099755,3131,1:114420328,chr1,114420328,T,C,0.325922,0.132,0.0189,6.98413,1.0,13.5586,0.001264,8e-06,0.001264,8e-06,1.433164e-12,T1D_34594039_GCST90018925


In [5]:
gwas_pbt = pbt.BedTool.from_dataframe(gwas_bed)

## Load pieQTLs

In [6]:
pieqtls = glob.glob('results/main/pieqtls/2021_chandra_et_al/*/proximal.pieqtls.tsv')

pieqtl_data = []
for pieqtl in pieqtls:
    print(pieqtl)
    
    cline = pieqtl.split('/')[-2]
    df = pd.read_table(pieqtl)    
    df['cline'] = cline
    pieqtl_data.append(df) 

pieqtl_df = pd.concat(pieqtl_data)
pieqtl_df['Chromosome'] = pieqtl_df['Chromosome']

results/main/pieqtls/2021_chandra_et_al/B-cell_naive/proximal.pieqtls.tsv
results/main/pieqtls/2021_chandra_et_al/NK-cell_naive/proximal.pieqtls.tsv
results/main/pieqtls/2021_chandra_et_al/monocyte_naive/proximal.pieqtls.tsv
results/main/pieqtls/2021_chandra_et_al/CD4_T-cell_naive/proximal.pieqtls.tsv
results/main/pieqtls/2021_chandra_et_al/CD8_T-cell_naive/proximal.pieqtls.tsv


In [7]:
pieqtl_df

Unnamed: 0,pieQTL.ID,Chromosome,pieQTL.Position,Target_geneID,Target_geneName,TSS,pvalue,FDR (DICE),beta,ref,alt,Mean.TPM.Homozygous.Reference,Mean.TPM.Heterozygous,Mean.TPM.Homozygous.Alternative,Interaction_type,GWAS.Trait,cline
0,rs9488914,chr6,116690849,ENSG00000111817.12,DSE,116575336,1.340000e-29,5.210000e-24,-1.22,C,T,21.77,12.57,4.70,Direct_pieQTL,,B-cell_naive
1,rs2304748,chr8,33369944,ENSG00000172728.11,FUT10,33330940,1.080000e-27,2.200000e-22,-1.26,T,C,5.24,3.01,1.16,Direct_pieQTL,,B-cell_naive
2,rs2581897,chr8,33371146,ENSG00000172728.11,FUT10,33330940,1.080000e-27,2.200000e-22,-1.26,T,G,5.24,3.01,1.16,Direct_pieQTL,,B-cell_naive
3,rs2581899,chr8,33371199,ENSG00000172728.11,FUT10,33330940,1.080000e-27,2.200000e-22,-1.26,T,C,5.24,3.01,1.16,Direct_pieQTL,,B-cell_naive
4,rs2732288,chr8,33370757,ENSG00000172728.11,FUT10,33330940,1.080000e-27,2.200000e-22,-1.26,G,A,5.24,3.01,1.16,Direct_pieQTL,,B-cell_naive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6304,rs8100318,chr19,21690009,ENSG00000172687.9,ZNF738,21541732,6.060000e-05,4.953556e-02,-0.70,A,G,32.95,24.97,21.60,Indirect_pieQTL,,CD8_T-cell_naive
6305,rs6708331,chr2,70368923,ENSG00000116005.7,PCYOX1,70484518,7.520000e-05,4.969759e-02,-0.77,G,A,45.91,40.04,33.12,Indirect_pieQTL,,CD8_T-cell_naive
6306,rs402034,chr8,17554157,ENSG00000129422.9,MTUS1,17658426,5.300000e-05,4.976122e-02,0.69,C,A,7.23,8.91,10.10,Indirect_pieQTL,,CD8_T-cell_naive
6307,rs551244,chr19,5961883,ENSG00000187650.3,VMAC,5904869,6.090000e-05,4.976689e-02,0.62,G,C,130.13,147.45,160.35,Indirect_pieQTL,,CD8_T-cell_naive


In [8]:
pieqtl_bed = pieqtl_df.iloc[:, [1,2,2,-1]]
pieqtl_bed.columns = ['chrom', 'start', 'end', 'cline']
#pieqtl_bed.loc[:, 'chrom'] = 'chr' + pieqtl_bed.loc[:, 'chrom'].astype(str)
pieqtl_bed.loc[:, 'start'] = pieqtl_bed['start'] - 1
pieqtl_pbt = pbt.BedTool.from_dataframe(pieqtl_bed)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pieqtl_bed.loc[:, 'start'] = pieqtl_bed['start'] - 1


In [9]:
pieqtl_bed.shape

(35898, 4)

## Intersect Fine Mapped GWAS and pieQTLs

In [10]:
major_cols = ['ge_source',
              'rsid',
             'chrom',
             'pos',
             'geneid',
             'genename',
             'allele1',
             'allele2',
             'maf',
             'beta_x',
             'se',
             'z',
             'prob',
             'log10bf',
             'mean',
             'sd',
             'mean_incl',
             'sd_incl',
             'pval',
             'gwas_source',
             'pieQTL.ID',
             'Chromosome',
             'pieQTL.Position',
             'Target_geneName',
             'TSS',
             'pvalue',
             'FDR (DICE)',
             'beta_y',
             'ref',
             'alt',
             'Mean.TPM.Homozygous.Reference',
             'Mean.TPM.Heterozygous',
             'Mean.TPM.Homozygous.Alternative',
             'Interaction_type',
             'GWAS.Trait',
             'cline',
             'regionID',
             'GWASLoci',
             'index']

### No Slop Version

In [12]:
intersect_pbt = gwas_pbt.intersect(pieqtl_pbt, wa=True, wb=True)
intersect_df = intersect_pbt.to_dataframe()
intersect_df.columns = ['gwas_chr', 'gwas_start', 'gwas_end',
                       'pie_chr', 'pie_start', 'pie_end', 'ge_source']
intersect_df = intersect_df.merge(gwas_df, left_on=['gwas_chr', 'gwas_end'],
                                  right_on=['chromosome', 'position'],
                                  how='left')

intersect_df = intersect_df.merge(pieqtl_df, left_on=['pie_chr', 'pie_end', 'ge_source'],
                                  right_on=['Chromosome', 'pieQTL.Position', 'cline'],
                                  how='left')

In [13]:
intersect_df.loc[:, 'geneid'] = intersect_df['Target_geneID'].str.replace('\.[0-9]*', '', regex=True)
intersect_df.loc[:, 'genename'] = [gencode_dict[x] for x in intersect_df.loc[:, 'geneid'].values.tolist()]

In [14]:
intersect_df

Unnamed: 0,gwas_chr,gwas_start,gwas_end,pie_chr,pie_start,pie_end,ge_source,regionID,GWASLoci,index,rsid,chromosome,position,allele1,allele2,maf,beta_x,se,z,prob,log10bf,mean,sd,mean_incl,sd_incl,pval,gwas_source,pieQTL.ID,Chromosome,pieQTL.Position,Target_geneID,Target_geneName,TSS,pvalue,FDR (DICE),beta_y,ref,alt,Mean.TPM.Homozygous.Reference,Mean.TPM.Heterozygous,Mean.TPM.Homozygous.Alternative,Interaction_type,GWAS.Trait,cline,geneid,genename
0,chr2,242294912,242294913,chr2,242294912,242294913,B-cell_naive,52,chr2:241778007-242778547,3666,2:242294913,chr2,242294913,G,A,0.0977,-0.066868,0.062422,-1.07122,1.0,13.6344,-0.33434,0.539277,-0.33434,0.539277,0.8579658,T1D_34012112_Gaulton,rs3755397,chr2,242294913,ENSG00000168385.13,2020-09-02 00:00:00,242254515,1.7e-09,7.49e-06,1.17,A,G,129.61,160.13,210.54,Direct_pieQTL,Leukemia chronic lymphocytic,B-cell_naive,ENSG00000168385,SEPT2
1,chr2,242294912,242294913,chr2,242294912,242294913,NK-cell_naive,52,chr2:241778007-242778547,3666,2:242294913,chr2,242294913,G,A,0.0977,-0.066868,0.062422,-1.07122,1.0,13.6344,-0.33434,0.539277,-0.33434,0.539277,0.8579658,T1D_34012112_Gaulton,rs3755397,chr2,242294913,ENSG00000168385.13,2020-09-02 00:00:00,242254515,2.14e-07,0.000716649,1.03,A,G,175.26,211.63,229.33,Direct_pieQTL,Leukemia chronic lymphocytic,NK-cell_naive,ENSG00000168385,SEPT2
2,chr2,242294912,242294913,chr2,242294912,242294913,monocyte_naive,52,chr2:241778007-242778547,3666,2:242294913,chr2,242294913,G,A,0.0977,-0.066868,0.062422,-1.07122,1.0,13.6344,-0.33434,0.539277,-0.33434,0.539277,0.8579658,T1D_34012112_Gaulton,rs3755397,chr2,242294913,ENSG00000168385.13,2020-09-02 00:00:00,242254515,1.83e-11,1.63e-07,1.28,A,G,144.53,187.66,224.97,Direct_pieQTL,Leukemia chronic lymphocytic,monocyte_naive,ENSG00000168385,SEPT2
3,chr2,242294912,242294913,chr2,242294912,242294913,CD4_T-cell_naive,52,chr2:241778007-242778547,3666,2:242294913,chr2,242294913,G,A,0.0977,-0.066868,0.062422,-1.07122,1.0,13.6344,-0.33434,0.539277,-0.33434,0.539277,0.8579658,T1D_34012112_Gaulton,rs3755397,chr2,242294913,ENSG00000168385.13,2020-09-02 00:00:00,242254515,1.16e-12,1.21e-08,1.35,A,G,133.93,197.27,244.58,Direct_pieQTL,Leukemia chronic lymphocytic,CD4_T-cell_naive,ENSG00000168385,SEPT2
4,chr2,242294912,242294913,chr2,242294912,242294913,CD8_T-cell_naive,52,chr2:241778007-242778547,3666,2:242294913,chr2,242294913,G,A,0.0977,-0.066868,0.062422,-1.07122,1.0,13.6344,-0.33434,0.539277,-0.33434,0.539277,0.8579658,T1D_34012112_Gaulton,rs3755397,chr2,242294913,ENSG00000168385.13,2020-09-02 00:00:00,242254515,9.46e-13,8.65e-09,1.33,A,G,130.83,183.62,225.51,Direct_pieQTL,Leukemia chronic lymphocytic,CD8_T-cell_naive,ENSG00000168385,SEPT2
5,chr12,56435928,56435929,chr12,56435928,56435929,monocyte_naive,26,chr12:55868078-57109885,1001,12:56435929,chr12,56435929,G,C,0.4197,-0.2461,0.0238,-10.3403,1.0,11.4253,0.838226,2.39604,0.838226,2.39604,1.0,T1D_32005708,rs1131017,chr12,56435929,ENSG00000139531.8,SUOX,56390964,7.51e-07,0.001200625,0.75,C,G,6.69,8.12,9.2,Indirect_pieQTL,Alopecia areata|Ankylosing spondylitis|Asthma|...,monocyte_naive,ENSG00000139531,SUOX
6,chr12,56435411,56435412,chr12,56435411,56435412,monocyte_naive,26,chr12:55868078-57109885,1262,12:56435412,chr12,56435412,A,G,0.3351,0.2353,0.0244,9.64344,0.909011,4.14824,0.733566,2.27471,0.806994,2.37339,2.620026e-22,T1D_32005708,rs705704,chr12,56435412,ENSG00000139531.8,SUOX,56390964,3.21e-06,0.003980193,-0.81,G,A,9.04,7.69,7.15,Indirect_pieQTL,Alopecia areata|Ankylosing spondylitis|Asthma|...,monocyte_naive,ENSG00000139531,SUOX


## Slop Version

In [19]:
intersect_pbt = gwas_pbt.slop(b=10, g=genome_sizes).intersect(pieqtl_pbt, wa=True, wb=True)
intersect_df = intersect_pbt.to_dataframe()
intersect_df.columns = ['gwas_chr', 'gwas_start', 'gwas_end',
                       'pie_chr', 'pie_start', 'pie_end', 'ge_source']
intersect_df = intersect_df.merge(gwas_df, left_on=['gwas_chr', 'gwas_end'],
                                  right_on=['chromosome', 'position'],
                                  how='left')
intersect_df = intersect_df.merge(pieqtl_df, left_on=['pie_chr', 'pie_end', 'ge_source'],
                                  right_on=['Chromosome', 'pieQTL.Position', 'cline'],
                                  how='left')

In [20]:
intersect_df.loc[:, 'geneid'] = intersect_df['Target_geneID'].str.replace('\.[0-9]*', '', regex=True)
intersect_df.loc[:, 'genename'] = [gencode_dict[x] for x in intersect_df.loc[:, 'geneid'].values.tolist()]

In [21]:
intersect_df

Unnamed: 0,gwas_chr,gwas_start,gwas_end,pie_chr,pie_start,pie_end,ge_source,regionID,GWASLoci,index,rsid,chromosome,position,allele1,allele2,maf,beta_x,se,z,prob,log10bf,mean,sd,mean_incl,sd_incl,pval,gwas_source,pieQTL.ID,Chromosome,pieQTL.Position,Target_geneID,Target_geneName,TSS,pvalue,FDR (DICE),beta_y,ref,alt,Mean.TPM.Homozygous.Reference,Mean.TPM.Heterozygous,Mean.TPM.Homozygous.Alternative,Interaction_type,GWAS.Trait,cline,geneid,genename
0,chr2,242294902,242294923,chr2,242294912,242294913,B-cell_naive,,,,,,,,,,,,,,,,,,,,,rs3755397,chr2,242294913,ENSG00000168385.13,2020-09-02 00:00:00,242254515,1.7e-09,7.49e-06,1.17,A,G,129.61,160.13,210.54,Direct_pieQTL,Leukemia chronic lymphocytic,B-cell_naive,ENSG00000168385,SEPT2
1,chr2,242294902,242294923,chr2,242294912,242294913,NK-cell_naive,,,,,,,,,,,,,,,,,,,,,rs3755397,chr2,242294913,ENSG00000168385.13,2020-09-02 00:00:00,242254515,2.14e-07,0.000716649,1.03,A,G,175.26,211.63,229.33,Direct_pieQTL,Leukemia chronic lymphocytic,NK-cell_naive,ENSG00000168385,SEPT2
2,chr2,242294902,242294923,chr2,242294912,242294913,monocyte_naive,,,,,,,,,,,,,,,,,,,,,rs3755397,chr2,242294913,ENSG00000168385.13,2020-09-02 00:00:00,242254515,1.83e-11,1.63e-07,1.28,A,G,144.53,187.66,224.97,Direct_pieQTL,Leukemia chronic lymphocytic,monocyte_naive,ENSG00000168385,SEPT2
3,chr2,242294902,242294923,chr2,242294912,242294913,CD4_T-cell_naive,,,,,,,,,,,,,,,,,,,,,rs3755397,chr2,242294913,ENSG00000168385.13,2020-09-02 00:00:00,242254515,1.16e-12,1.21e-08,1.35,A,G,133.93,197.27,244.58,Direct_pieQTL,Leukemia chronic lymphocytic,CD4_T-cell_naive,ENSG00000168385,SEPT2
4,chr2,242294902,242294923,chr2,242294912,242294913,CD8_T-cell_naive,,,,,,,,,,,,,,,,,,,,,rs3755397,chr2,242294913,ENSG00000168385.13,2020-09-02 00:00:00,242254515,9.46e-13,8.65e-09,1.33,A,G,130.83,183.62,225.51,Direct_pieQTL,Leukemia chronic lymphocytic,CD8_T-cell_naive,ENSG00000168385,SEPT2
5,chr11,108031284,108031305,chr11,108031293,108031294,B-cell_naive,,,,,,,,,,,,,,,,,,,,,rs113714417,chr11,108031294,ENSG00000149311.13,ATM,108093211,3.69e-07,0.001059554,0.72,TC,T,110.56,129.86,137.8,Direct_pieQTL,,B-cell_naive,ENSG00000149311,ATM
6,chr11,108031284,108031305,chr11,108031293,108031294,CD4_T-cell_naive,,,,,,,,,,,,,,,,,,,,,rs113714417,chr11,108031294,ENSG00000149308.12,NPAT,108093369,4.13e-09,1.61e-05,0.83,TC,T,55.6,64.65,71.83,Direct_pieQTL,,CD4_T-cell_naive,ENSG00000149308,NPAT
7,chr11,108031284,108031305,chr11,108031293,108031294,CD4_T-cell_naive,,,,,,,,,,,,,,,,,,,,,rs113714417,chr11,108031294,ENSG00000149311.13,ATM,108093211,4.34e-09,1.68e-05,0.83,TC,T,207.25,227.69,262.77,Direct_pieQTL,,CD4_T-cell_naive,ENSG00000149311,ATM
8,chr11,108031284,108031305,chr11,108031293,108031294,CD4_T-cell_naive,,,,,,,,,,,,,,,,,,,,,rs113714417,chr11,108031294,ENSG00000149308.12,NPAT,108093369,4.13e-09,1.61e-05,0.83,TC,T,55.6,64.65,71.83,Direct_pieQTL,,CD4_T-cell_naive,ENSG00000149308,NPAT
9,chr11,108031284,108031305,chr11,108031293,108031294,CD4_T-cell_naive,,,,,,,,,,,,,,,,,,,,,rs113714417,chr11,108031294,ENSG00000149311.13,ATM,108093211,4.34e-09,1.68e-05,0.83,TC,T,207.25,227.69,262.77,Direct_pieQTL,,CD4_T-cell_naive,ENSG00000149311,ATM


In [None]:
intersect_df.shape