In [1]:
import os
import glob
import pandas as pd 
from chromolooper import sgls
from time import sleep

# Notebook-wide pandas options.
pd.set_option('mode.chained_assignment', None)  # default='warn'
pd.set_option('display.max_columns', None)      # no cap on displayed columns

# Every relative path used below is resolved against the project directory.
os.chdir('<project-dir>')

# Root directory for all SNP track outputs; created on first run.
outdir = 'results/hg38/finemapping/snps/'
os.makedirs(outdir, exist_ok=True)

## Load all fine-mapped SNP data

In [2]:
# Study-level metadata table; the file has no header row, so columns are
# addressed by position.
causal_metadata_fn = 'workflow/qscripts/finemap/causal_db/init.gwas_study.causal_db.immune_select_samples.tsv'
causal_metadata = pd.read_table(causal_metadata_fn, header=None)

# Column position -> name for the three fields used downstream.
metadata_fields = {2: 'mesh_term', 8: 'author', 18: 'filename'}

causal_metadata_mapper = causal_metadata.iloc[:, list(metadata_fields.keys())]
causal_metadata_mapper.columns = list(metadata_fields.values())

In [3]:
# One credible-set table per study. glob returns files in arbitrary
# (filesystem-dependent) order, so sort to make the row order of the combined
# table -- and any later "keep first" de-duplication -- reproducible.
fns = sorted(glob.glob('results/hg38/finemapping/snps/singles/*_total_credible_set.hg38.txt'))

# Build the lookup of selected studies once; a set gives constant-time
# membership tests instead of re-creating a list for every file.
selected_causaldb_fns = set(causal_metadata_mapper['filename'])

all_data = []
for fn in fns:

    # the study identifier is the part of the basename before the first '_'
    causaldb_fn = os.path.basename(fn).split('_')[0]

    # only keep studies listed in the metadata mapper
    if causaldb_fn not in selected_causaldb_fns:
        continue

    # loading the data; empty tables are skipped
    tdf = pd.read_table(fn)
    if tdf.shape[0] > 0:
        tdf.loc[:, 'causaldb_fn'] = causaldb_fn
        all_data.append(tdf)

In [4]:
# Stack the per-study tables row-wise; each table keeps its own row labels.
all_df = pd.concat(all_data, axis=0, ignore_index=False)

In [5]:
# Rebuild `all_df` from the per-study tables every time this cell runs, so the
# cell is idempotent. Mutating the existing `all_df` in place would, on a
# re-run, merge the metadata a second time and prefix 'chr' twice.
all_df = (
    pd.concat(all_data)
    # attach mesh_term / author / filename for each study
    .merge(causal_metadata_mapper, left_on='causaldb_fn', right_on='filename')
    .assign(
        # start = BP - 1, so (start, BP) spans the single SNP base
        start=lambda df: df['BP'] - 1,
        CHR=lambda df: 'chr' + df['CHR'].astype(str),
        # constant columns used when writing the track files below
        score=1,
        strand='+',
        color='117,117,117',
    )
)

#### Mini-summary 

In [6]:
# Keep one row per (rsID, mesh_term) pair, then count rows per MeSH term.
uniq_snps_mesh_df = all_df.drop_duplicates(subset=['rsID', 'mesh_term'])
uniq_snps_mesh_df.groupby('mesh_term')['CHR'].count().to_frame()

Unnamed: 0_level_0,CHR
mesh_term,Unnamed: 1_level_1
"Arthritis, Rheumatoid",1034
"Dermatitis, Atopic",674
"Diabetes Mellitus, Type 1",6044
Psoriasis,542


#### Checking the Mouri SNP

In [7]:
# Base-pair position (on chr6) of the SNP being checked.
mouri_snp = 90267049

# All fine-mapping rows, across studies, that sit exactly on that position.
is_mouri_snp = all_df['CHR'].eq('chr6') & all_df['BP'].eq(mouri_snp)
mouri_snp_bach2 = all_df.loc[is_mouri_snp]
mouri_snp_bach2

Unnamed: 0,CHR,BP,rsID,MAF,EA,NEA,BETA,SE,P,Zscore,PAINTOR,CAVIARBF,FINEMAP,meta_id,block_id,label,causaldb_fn,mesh_term,author,filename,start,score,strand,color
1771,chr6,90267049,rs72928038,0.1799,A,G,0.192272,0.02778,4.480078e-12,6.921145,0.820454,0.837611,0.825188,CA266,692,7.0,CA266,"Diabetes Mellitus, Type 1",Onengut Gumuscu S,CA266,90267048,1,+,117117117
2606,chr6,90267049,rs72928038,0.1799,A,G,0.172309,0.0202,1.46427e-17,8.529937,0.732125,0.738009,0.736367,GD09063,692,7.0,GD09063,"Diabetes Mellitus, Type 1",Robertson CC,GD09063,90267048,1,+,117117117
3415,chr6,90267049,rs72928038,0.1799,A,G,0.179985,0.023998,6.381783e-14,7.5,0.714601,0.723613,0.715157,CA265,692,7.0,CA265,"Diabetes Mellitus, Type 1",Onengut Gumuscu S,CA265,90267048,1,+,117117117
4876,chr6,90267049,rs72928038,0.166,A,G,0.199873,0.018481,2.93e-27,10.815053,0.183141,0.175869,0.176016,GD08996,692,7.0,GD08996,"Diabetes Mellitus, Type 1",Chiou J,GD08996,90267048,1,+,117117117
9727,chr6,90267049,rs72928038,0.1799,A,G,0.185808,0.028992,1.465434e-10,6.408931,0.04583,0.043448,0.045107,GD09657,692,7.0,GD09657,"Diabetes Mellitus, Type 1",Inshaw JRJ,GD09657,90267048,1,+,117117117


In [None]:
# Median FINEMAP value of the selected SNP across the studies reporting it.
mouri_snp_bach2['FINEMAP'].median()

0.7151569999999999

In [13]:
# Fine-mapped SNPs in the chr11 64 Mb window, highest FINEMAP value first.
# The window is selected numerically: a string-prefix match on BP ('^64')
# would also pick up any shorter position that merely starts with the digits
# "64" (e.g. 6.4 Mb or 640 kb), which are unrelated loci.
chr11_window_start = 64000000
chr11_window_end = 64999999  # inclusive

all_df.loc[
    (all_df.CHR == 'chr11') & all_df.BP.between(chr11_window_start, chr11_window_end), :
].sort_values('FINEMAP', ascending=False)

Unnamed: 0,CHR,BP,rsID,MAF,EA,NEA,BETA,SE,P,Zscore,PAINTOR,CAVIARBF,FINEMAP,meta_id,block_id,label,causaldb_fn,mesh_term,author,filename,start,score,strand,color
6971,chr11,64340263,rs663743,0.349,A,G,-0.099964,0.015092,3.5e-11,-6.623642,0.524181,0.533177,0.533116,GD08996,1131,7.0,GD08996,"Diabetes Mellitus, Type 1",Chiou J,GD08996,64340262,1,+,117117117
6972,chr11,64340005,rs479777,0.351,C,T,-0.099132,0.015066,4.71e-11,-6.579849,0.396264,0.399335,0.399365,GD08996,1131,7.0,GD08996,"Diabetes Mellitus, Type 1",Chiou J,GD08996,64340004,1,+,117117117
6973,chr11,64329761,rs694739,0.394,G,A,-0.08564,0.014436,2.98e-09,-5.932391,0.007866,0.006959,0.006978,GD08996,1131,7.0,GD08996,"Diabetes Mellitus, Type 1",Chiou J,GD08996,64329760,1,+,117117117
6974,chr11,64254133,rs12421615,0.374,A,G,-0.087115,0.014893,4.93e-09,-5.849392,0.004901,0.004268,0.004281,GD08996,1131,7.0,GD08996,"Diabetes Mellitus, Type 1",Chiou J,GD08996,64254132,1,+,117117117
6975,chr11,64341646,rs647152,0.386,G,T,-0.084851,0.014651,6.97e-09,-5.791482,0.003537,0.003047,0.003057,GD08996,1131,7.0,GD08996,"Diabetes Mellitus, Type 1",Chiou J,GD08996,64341645,1,+,117117117
6976,chr11,64338457,rs499425,0.39,A,G,-0.084235,0.014575,7.49e-09,-5.779417,0.003306,0.002842,0.002851,GD08996,1131,7.0,GD08996,"Diabetes Mellitus, Type 1",Chiou J,GD08996,64338456,1,+,117117117
6977,chr11,64335476,rs574087,0.392,G,A,-0.084096,0.014559,7.64e-09,-5.776221,0.003247,0.00279,0.002799,GD08996,1131,7.0,GD08996,"Diabetes Mellitus, Type 1",Chiou J,GD08996,64335475,1,+,117117117
6978,chr11,64337016,rs61886886,0.39,T,C,-0.084134,0.014572,7.76e-09,-5.773676,0.003202,0.002749,0.002758,GD08996,1131,1.0,GD08996,"Diabetes Mellitus, Type 1",Chiou J,GD08996,64337015,1,+,117117117
6979,chr11,64322116,rs646153,0.39,T,C,-0.083898,0.014577,8.65e-09,-5.755505,0.002893,0.002476,0.002484,GD08996,1131,1.0,GD08996,"Diabetes Mellitus, Type 1",Chiou J,GD08996,64322115,1,+,117117117
6980,chr11,64343211,rs479552,0.386,C,G,-0.084312,0.014654,8.74e-09,-5.753514,0.002861,0.002447,0.002456,GD08996,1131,1.0,GD08996,"Diabetes Mellitus, Type 1",Chiou J,GD08996,64343210,1,+,117117117


## Make the Washu Track for Single Finemapped Results

In [None]:
# One BED file per study, written under <outdir>/singles/.
singles_dir = os.path.join(outdir, 'singles/')
os.makedirs(singles_dir, exist_ok=True)

filename_grps = all_df.groupby('filename')

# columns written to each file, in order
bed_columns = ['CHR', 'start', 'BP', 'rsID']

for study_id, study_df in filename_grps:

    print(study_id)

    # clean the data for washu: keep the BED columns, ordered by chromosome and start
    study_bed = study_df.loc[:, bed_columns].sort_values(by=['CHR', 'start'])

    # save (tab-separated, no header, no index), then compress and index;
    # the index step is pointed at the '.gz' path
    bed_path = os.path.join(singles_dir, '{}.finemapped.snps.bed'.format(study_id))
    study_bed.to_csv(bed_path, sep='\t', header=False, index=False)
    sgls.bgzip(bed_path)
    sgls.tabix(bed_path + '.gz')

AT258
AT280
AT452
AT454
AT548
AT551
BE151
CA104
CA265
CA266
CA391
GD08996
GD09063
GD09159
GD09415
GD09519
GD09657
PH378


## Aggregate by MeSH term and make the WashU tracks

### bedgraph Tracks

In [None]:
# One aggregated track per MeSH term, written under <outdir>/agg/.
agg_dir = os.path.join(outdir, 'agg/')
os.makedirs(agg_dir, exist_ok=True)

# columns written to each file, in order
bedgraph_columns = ['CHR', 'start', 'BP', 'score', 'strand', 'rsID']

for mesh, mesh_df in all_df.groupby('mesh_term'):

    print(mesh)

    # file-friendly name: lower-case, commas removed, spaces turned into dashes
    mesh_name = mesh.lower().replace(',', '').replace(' ', '-')

    # clean the data for washu: one row per (CHR, BP, EA, NEA), track columns
    # only, ordered by chromosome and start
    track_df = (
        mesh_df
        .drop_duplicates(['CHR', 'BP', 'EA', 'NEA'])
        .loc[:, bedgraph_columns]
        .sort_values(['CHR', 'start'])
    )

    # save and compress
    out_fn = os.path.join(agg_dir, '{}.finemapped.snps.bedgraph'.format(mesh_name))
    track_df.to_csv(out_fn, sep='\t', header=False, index=False)
    sgls.bgzip(out_fn)
    sgls.tabix(out_fn + '.gz', type='bed')

Arthritis, Rheumatoid
Dermatitis, Atopic
Diabetes Mellitus, Type 1
Psoriasis


### rgbpeak Tracks

In [None]:
# Location of the hg38 chromosome-sizes file (UCSC download server); handed to
# sgls.bed_to_bigbed when the rgbpeak files are converted.
genome_sizes = 'http://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes'

In [None]:
# One rgbpeak file (plus its bigBed conversion) per MeSH term, under <outdir>/agg/.
agg_dir = os.path.join(outdir, 'agg/')
os.makedirs(agg_dir, exist_ok=True)

mesh_grps = all_df.groupby('mesh_term')

# rgbpeak layout as (output column, source column) pairs, in file order;
# thick_start / thick_end reuse the same coordinates as start / end
rgbpeak_layout = [
    ('chrom', 'CHR'),
    ('start', 'start'),
    ('end', 'BP'),
    ('peak_id', 'rsID'),
    ('score', 'score'),
    ('strand', 'strand'),
    ('thick_start', 'start'),
    ('thick_end', 'BP'),
    ('RGB', 'color'),
]

for mesh, mesh_df in mesh_grps:

    print(mesh)

    # file-friendly name: lower-case, commas removed, spaces turned into dashes
    mesh_name = mesh.lower().replace(',', '').replace(' ', '-')

    # clean the data for washu: one row per (CHR, BP, EA, NEA)
    deduped_df = mesh_df.drop_duplicates(['CHR', 'BP', 'EA', 'NEA'])

    # rgbpeak format: pick the source columns, relabel them, order by position
    peak_df = deduped_df[[source for _, source in rgbpeak_layout]]
    peak_df.columns = [target for target, _ in rgbpeak_layout]
    peak_df = peak_df.sort_values(['chrom', 'start'])

    # save the plain-text file, then convert it to bigBed
    peak_fn = os.path.join(agg_dir, '{}.finemapped.snps.rgbpeak'.format(mesh_name))
    peak_df.to_csv(peak_fn, sep='\t', header=False, index=False)

    bigpeak_fn = os.path.join(agg_dir, '{}.finemapped.snps.rgbpeak.bb'.format(mesh_name))
    sgls.bed_to_bigbed(peak_fn, bigpeak_fn, genome_sizes, verbose=True)

    # need time between runs of bed_to_bigbed
    sleep(2)

Arthritis, Rheumatoid



pass1 - making usageList (16 chroms): 34 millis
pass2 - checking and writing primary data (1043 records, 9 fields): 7 millis


Dermatitis, Atopic



pass1 - making usageList (16 chroms): 32 millis
pass2 - checking and writing primary data (674 records, 9 fields): 6 millis


Diabetes Mellitus, Type 1



pass1 - making usageList (22 chroms): 34 millis
pass2 - checking and writing primary data (6155 records, 9 fields): 21 millis


Psoriasis



pass1 - making usageList (20 chroms): 33 millis
pass2 - checking and writing primary data (559 records, 9 fields): 7 millis
