# Extracting Credible Sets

The [Chiou et al., 2021 paper](https://www.nature.com/articles/s41588-021-00823-0) used fine-mapping on a large cohort of GWAS data to identify 81 unique loci associated with T1D, plus an additional ~60 independent signals. I originally tried the Ay lab fine-mapping pipeline, but it yielded only 10 regions (I did not do a direct comparison between the two). Using these published fine-mapped results, we should be able to get a better list of SGLs. In this notebook I extract the credible sets for downstream use; the data come from Supplementary Table 1 of the publication.

In [33]:
import os
import pandas as pd 
import glob
import json
import subprocess as sp
import numpy as np

import coolbox
from coolbox.api import *

# # add tabix to path variable 
# os.environ["PATH"] += ":/mnt/bioadhoc-temp/Groups/vd-ay/jreyna/software/mamba/envs/hichip-db/bin/"

# All relative paths used in this notebook are resolved from the project root,
# so switch the working directory there first.
PROJECT_DIR = '/mnt/BioHome/jreyna/jreyna-temp/projects/dchallenge'
os.chdir(PROJECT_DIR)

In [34]:
# Directory (relative to the working directory) that receives the processed
# fine-mapping table; it is created if missing and left untouched if present.
outdir = 'results/main/chiou_2021/processing/finemapping/'
os.makedirs(name=outdir, exist_ok=True)

## Load the variant data

In [35]:
# Credible-set workbook from the publication's supplementary material.
cs_fn = 'results/main/chiou_2021/Supplemental1.credible_sets.41586_2021_3552_MOESM4_ESM.xlsx'

# The first two spreadsheet rows are skipped (presumably title/caption rows
# above the real header -- confirm against the workbook).
cs_vars = pd.read_excel(cs_fn, skiprows=2)

In [36]:
# Each hg19 unique ID is a colon-delimited string (e.g. "1:25289734:C:G").
# Split every ID into its fields, then transpose the per-row lists into one
# tuple per field: chromosome, position, major allele, minor allele.
hg19_id_fields = cs_vars['Unique ID (hg19)'].str.split(':')
nchroms, npos, nmajor, nminor = zip(*hg19_id_fields)

In [37]:
# Attach the fields parsed from the hg19 unique ID as new columns. The values
# are stored exactly as parsed (strings). The chromosome field is not stored
# because the table already has a 'Chrom.' column.
for col_name, col_values in (('Position (hg19)', npos),
                             ('Major allele', nmajor),
                             ('Minor allele', nminor)):
    cs_vars[col_name] = col_values

In [38]:
# Preview the table, including the hg19 position and allele columns.
cs_vars

Unnamed: 0,Marker,Unique ID (hg38),Unique ID (hg19),Chrom.,Position (hg38),Signal name,PPA,Position (hg19),Major allele,Minor allele
0,rs72657048,1:24963243:C:G,1:25289734:C:G,1,24963243,RUNX3_1:25296743:A:C,0.010221,25289734,C,G
1,rs6672420,1:24964519:A:T,1:25291010:A:T,1,24964519,RUNX3_1:25296743:A:C,0.006615,25291010,A,T
2,rs7528484,1:24966177:C:T,1:25292668:C:T,1,24966177,RUNX3_1:25296743:A:C,0.014956,25292668,C,T
3,rs7536201,1:24966593:T:C,1:25293084:T:C,1,24966593,RUNX3_1:25296743:A:C,0.015091,25293084,T,C
4,rs11249212,1:24966710:A:G,1:25293201:A:G,1,24966710,RUNX3_1:25296743:A:C,0.012456,25293201,A,G
...,...,...,...,...,...,...,...,...,...,...
46733,rs10758593,9:4292083:G:A,9:4292083:G:A,9,4292083,GLIS3_9:4283137:G:T,0.069463,4292083,G,A
46734,rs7867224,9:4292152:A:G,9:4292152:A:G,9,4292152,GLIS3_9:4283137:G:T,0.012633,4292152,A,G
46735,rs10758594,9:4295583:A:G,9:4295583:A:G,9,4295583,GLIS3_9:4283137:G:T,0.031708,4295583,A,G
46736,rs10814917,9:4296430:A:G,9:4296430:A:G,9,4296430,GLIS3_9:4283137:G:T,0.008091,4296430,A,G


In [41]:
# Spot-check one signal: every credible-set variant assigned to this RUNX3 signal.
is_runx3_signal = cs_vars['Signal name'] == 'RUNX3_1:25296743:A:C'
cs_vars.loc[is_runx3_signal]

Unnamed: 0,Marker,Unique ID (hg38),Unique ID (hg19),Chrom.,Position (hg38),Signal name,PPA,Position (hg19),Major allele,Minor allele
0,rs72657048,1:24963243:C:G,1:25289734:C:G,1,24963243,RUNX3_1:25296743:A:C,0.010221,25289734,C,G
1,rs6672420,1:24964519:A:T,1:25291010:A:T,1,24964519,RUNX3_1:25296743:A:C,0.006615,25291010,A,T
2,rs7528484,1:24966177:C:T,1:25292668:C:T,1,24966177,RUNX3_1:25296743:A:C,0.014956,25292668,C,T
3,rs7536201,1:24966593:T:C,1:25293084:T:C,1,24966593,RUNX3_1:25296743:A:C,0.015091,25293084,T,C
4,rs11249212,1:24966710:A:G,1:25293201:A:G,1,24966710,RUNX3_1:25296743:A:C,0.012456,25293201,A,G
5,rs7414934,1:24966725:A:C,1:25293216:A:C,1,24966725,RUNX3_1:25296743:A:C,0.015211,25293216,A,C
6,rs11249213,1:24966746:G:A,1:25293237:G:A,1,24966746,RUNX3_1:25296743:A:C,0.015191,25293237,G,A
7,rs4265380,1:24966865:C:T,1:25293356:C:T,1,24966865,RUNX3_1:25296743:A:C,0.016119,25293356,C,T
8,1:25293775:GC:G,1:24967284:GC:G,1:25293775:GC:G,1,24967284,RUNX3_1:25296743:A:C,0.020549,25293775,GC,G
9,rs4648889,1:24967338:G:A,1:25293829:G:A,1,24967338,RUNX3_1:25296743:A:C,0.015543,25293829,G,A


In [39]:
# Columns kept for downstream use, in the order they are written to disk.
save_cols = ['Chrom.', 'Position (hg19)', 'Marker', 'PPA', 'Signal name',
             'Major allele', 'Minor allele', 'Unique ID (hg38)', 'Position (hg38)']

# Take an explicit copy so that save_data is independent of cs_vars: columns
# are added to save_data later in the notebook, and assigning to a bare column
# slice triggers pandas' SettingWithCopyWarning.
save_data = cs_vars[save_cols].copy()

In [40]:
# Write the selected columns as a tab-separated table with a header row and
# without the DataFrame index.
out_fn = os.path.join(outdir, 'finemapping.gaulton.tsv')
save_data.to_csv(out_fn, sep='\t', header=True, index=False)

In [43]:
# Rank variants from highest to lowest PPA (display only; save_data itself is not reordered).
save_data.sort_values(by='PPA', ascending=False)

Unnamed: 0,Chrom.,Position (hg19),Marker,PPA,Signal name,Major allele,Minor allele,Unique ID (hg38),Position (hg38)
28853,19,10463118,rs34536443,0.999996,TYK2_19:10463118:G:C,G,C,19:10352442:G:C,10352442
46737,9,135936325,rs541856133,0.999987,CEL_9:135936325:C:T,C,T,9:133060938:C:T,133060938
20350,14,101306447,rs56994090,0.998790,DLK1_14:101306447:T:C,T,C,14:100840110:T:C,100840110
37782,21,45714294,rs74203920,0.990127,AIRE_21:45714294:C:T,C,T,21:44294411:C:T,44294411
5309,11,2903060,rs140215710,0.987436,CDKN1C_11:2903060:G:A,G,A,11:2881830:G:A,2881830
...,...,...,...,...,...,...,...,...,...
3694,10,6375385,rs111736851,0.000003,IL2RA_10:6390450:G:A,G,A,10:6333423:G:A,6333423
3171,10,6131029,rs3750669,0.000003,IL2RA_10:6390450:G:A,G,T,10:6089066:G:T,6089066
3897,10,6465660,rs7920162,0.000003,IL2RA_10:6390450:G:A,C,T,10:6423698:C:T,6423698
2804,10,5982153,rs558695705,0.000003,IL2RA_10:6390450:G:A,C,G,10:5940190:C:G,5940190


In [59]:
# Bin each variant's PPA into labelled ranges. Intervals are right-closed,
# e.g. '80-90%' is (0.8, 0.9]; include_lowest=True additionally places a PPA of
# exactly 0 in '0-10%' instead of leaving it unbinned (NaN).
ppa_bin_edges = [0, 0.1, 0.2, 0.5, 0.8, 0.9, 1]
ppa_bin_labels = ['0-10%', '10-20%', '20-50%', '50-80%', '80-90%', '90-100%']

# assign() returns a new frame instead of setting a column on a slice of
# cs_vars, which avoids pandas' SettingWithCopyWarning.
save_data = save_data.assign(
    ppa_pct=pd.cut(save_data['PPA'],
                   bins=ppa_bin_edges,
                   labels=ppa_bin_labels,
                   include_lowest=True))

In [60]:
# Variants whose PPA falls in the '80-90%' bin.
in_80_90_bin = save_data['ppa_pct'] == '80-90%'
save_data.loc[in_80_90_bin]

Unnamed: 0,Chrom.,Position (hg19),Marker,PPA,Signal name,Major allele,Minor allele,Unique ID (hg38),Position (hg38),ppa_pct
441,1,198598389,rs570794153,0.852503,PTPRC_1:198598389:G:GA,G,GA,1:198629259:G:GA,198629259,80-90%
3291,10,6166282,rs77710246,0.806775,IL2RA_10:6166282:T:G,T,G,10:6124319:T:G,6124319,80-90%
3730,10,6390450,rs947474,0.879125,IL2RA_10:6390450:G:A,G,A,10:6348488:G:A,6348488,80-90%
5291,11,2182224,rs689,0.861349,INS_11:2182224:A:T,A,T,11:2160994:A:T,2160994,80-90%
18972,13,100079833,rs9517712,0.833043,GPR183_13:100079833:T:C,T,C,13:99427579:T:C,99427579,80-90%
21548,16,80284024,rs8046043,0.801408,16q23_16:80284024:G:C,G,C,16:80250127:G:C,80250127,80-90%
29502,2,60146784,rs77146844,0.860183,BCL11A_2:60146784:C:G,C,G,2:59919649:C:G,59919649,80-90%
45255,6,91005743,rs6908626,0.821002,BACH2_6:91005743:G:T,G,T,6:90296024:G:T,90296024,80-90%


In [61]:
# Variants whose PPA falls in the '50-80%' bin.
in_50_80_bin = save_data['ppa_pct'] == '50-80%'
save_data.loc[in_50_80_bin]

Unnamed: 0,Chrom.,Position (hg19),Marker,PPA,Signal name,Major allele,Minor allele,Unique ID (hg38),Position (hg38),ppa_pct
817,1,206746125,rs58579536,0.569185,IL10_1:206746125:A:G,A,G,1:206572797:A:G,206572797,50-80%
1210,1,206943968,rs3024493,0.52503,IL10_1:206943968:C:A,C,A,1:206770623:C:A,206770623,50-80%
3025,10,6082953,rs6602398,0.546814,IL2RA_10:6082953:G:T,G,T,10:6040990:G:T,6040990,50-80%
3058,10,6094697,rs61839660,0.779283,IL2RA_10:6094697:C:T,C,T,10:6052734:C:T,6052734,50-80%
3076,10,6098949,rs706778,0.614066,IL2RA_10:6098949:C:T,C,T,10:6056986:C:T,6056986,50-80%
8069,12,56418678,rs71459332,0.628138,IKZF4_12:56418678:C:T,C,T,12:56024894:C:T,56024894,50-80%
20358,14,101308958,rs3783355,0.563428,DLK1_14:101308958:G:A,G,A,14:100842621:G:A,100842621,50-80%
28792,18,12818922,rs80262450,0.546048,PTPN2_18:12818922:G:A,G,A,18:12818923:G:A,12818923,50-80%
28811,18,12857758,rs62097857,0.590824,PTPN2_18:12857758:G:A,G,A,18:12857759:G:A,12857759,50-80%
33709,2,112020548,rs567088138,0.518916,ACOXL_2:112020548:G:A,G,A,2:111262971:G:A,111262971,50-80%
