In [1]:
import os
import glob
import pandas as pd 
from chromolooper import sgls
from time import sleep

# Silence pandas' chained-assignment warning (the pandas default is 'warn').
pd.set_option('mode.chained_assignment', None)

# Work from the project root so the relative paths used below resolve.
os.chdir('/mnt/BioHome/jreyna/jreyna-temp/projects/t1d-loop-catalog/')

# Base output directory; created up front if it does not exist yet.
outdir = 'results/hg38/finemapping/snps/'
os.makedirs(outdir, exist_ok=True)

## Load all fine-mapped SNP data

In [2]:
# Study metadata table; the file has no header row, so pandas labels the
# columns with their integer positions.
causal_metadata_fn = 'workflow/qscripts/finemap/causal_db/init.gwas_study.causal_db.immune_select_samples.tsv'
causal_metadata = pd.read_table(causal_metadata_fn, header=None)

# Keep the three columns used downstream and give them readable names.
mapper_columns = {2: 'mesh_term', 8: 'author', 18: 'filename'}
causal_metadata_mapper = (
    causal_metadata
    .loc[:, list(mapper_columns)]
    .rename(columns=mapper_columns)
)

In [3]:
# Per-study credible-set tables. glob returns paths in arbitrary order, so sort
# them to make the load order (and the row order of the combined table)
# reproducible from run to run.
fns = sorted(glob.glob('results/hg38/finemapping/snps/singles/*_total_credible_set.hg38.txt'))

# Identifiers that have a metadata entry; built once rather than once per file.
known_causaldb_fns = set(causal_metadata_mapper['filename'])

all_data = []
for fn in fns:

    # the identifier is the part of the file name before the first underscore
    causaldb_fn = os.path.basename(fn).split('_')[0]

    # skip files whose identifier is not listed in the metadata
    if causaldb_fn not in known_causaldb_fns:
        continue

    # loading the data; tables without rows are left out
    tdf = pd.read_table(fn)
    if tdf.shape[0] > 0:
        tdf['causaldb_fn'] = causaldb_fn
        all_data.append(tdf)

In [4]:
# Stack the per-file tables. ignore_index gives the combined table a fresh
# 0..n-1 index instead of repeating each file's own row labels.
all_df = pd.concat(all_data, ignore_index=True)

In [5]:
# Attach the metadata columns to every row by matching the file identifier
# (default inner join, so only rows with a metadata match are kept).
# NOTE: this cell rebinds `all_df`; re-running it without first re-running the
# concat cell would merge the metadata and prepend 'chr' a second time.
all_df = all_df.merge(causal_metadata_mapper, left_on='causaldb_fn', right_on='filename')

# Build the track columns with `assign` rather than `.loc[:, col] = ...`:
# CHR is replaced by string values, and writing strings into an existing
# (possibly numeric) column through `.loc` relies on implicit upcasting.
all_df = all_df.assign(
    start=all_df['BP'] - 1,                   # start is one less than BP
    CHR='chr' + all_df['CHR'].astype(str),    # prefix chromosome names with 'chr'
    score=1,
    strand='+',
    color='117,117,117',
)

## Make the WashU track for single fine-mapped results

In [6]:
# Output folder for the per-file tracks.
singles_dir = os.path.join(outdir, 'singles/')
os.makedirs(singles_dir, exist_ok=True)

# One group (and one output file) per value of the 'filename' column.
filename_grps = all_df.groupby('filename')

for filename, filename_df in filename_grps:

    print(filename)

    # keep the interval columns plus rsID, ordered by chromosome then start
    filename_df = filename_df.loc[:, ['CHR', 'start', 'BP', 'rsID']]
    filename_df = filename_df.sort_values(by=['CHR', 'start'])

    # write a headerless tab-separated file, then compress and index it;
    # the index step is pointed at out_fn with a '.gz' suffix
    out_fn = os.path.join(singles_dir, '{}.finemapped.snps.bed'.format(filename))
    filename_df.to_csv(out_fn, sep='\t', header=False, index=False)
    sgls.bgzip(out_fn)
    sgls.tabix('{}.gz'.format(out_fn))

AT258
AT280
AT452
AT454
AT548
AT551
BE151
CA104
CA265
CA266
CA391
GD08996
GD09063
GD09159
GD09415
GD09519
GD09657
PH378


## Aggregate by MeSH term and make the WashU tracks

### bedgraph Tracks

In [7]:
# Output folder for the tracks aggregated per mesh term.
agg_dir = os.path.join(outdir, 'agg/')
os.makedirs(agg_dir, exist_ok=True)

for mesh, mesh_df in all_df.groupby('mesh_term'):

    print(mesh)

    # file-friendly name: lowercase, commas removed, spaces turned into dashes
    mesh_name = mesh.lower().replace(',', '').replace(' ', '-')

    # keep the first row per (CHR, BP, EA, NEA), select the columns written to
    # the track file and order the rows by chromosome then start
    # (a qbed variant would additionally carry 'mesh_term' as a last column)
    mesh_df = (
        mesh_df
        .drop_duplicates(subset=['CHR', 'BP', 'EA', 'NEA'])
        .loc[:, ['CHR', 'start', 'BP', 'score', 'strand', 'rsID']]
        .sort_values(by=['CHR', 'start'])
    )

    # write a headerless tab-separated file, then compress and index it
    out_fn = os.path.join(agg_dir, '{}.finemapped.snps.bedgraph'.format(mesh_name))
    mesh_df.to_csv(out_fn, sep='\t', header=False, index=False)
    sgls.bgzip(out_fn)
    sgls.tabix('{}.gz'.format(out_fn), type='bed')

Arthritis, Rheumatoid
Dermatitis, Atopic
Diabetes Mellitus, Type 1
Psoriasis


### rgbpeak Tracks

In [20]:
# Location of the hg38 chromosome sizes table, passed to sgls.bed_to_bigbed
# when the bigBed tracks are built. The UCSC-hosted copy is used; the
# cluster-local path is kept commented out as an alternative.
#genome_sizes = '/mnt/BioHome/jreyna/hichip-db-loop-calling/results/refs/reference_genomes/RefGenome/chrsize/hg38.chrom.sizes'
genome_sizes = 'http://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes'

In [31]:
# Output folder for the tracks aggregated per mesh term.
agg_dir = os.path.join(outdir, 'agg/')
os.makedirs(agg_dir, exist_ok=True)

# rgbpeak layout as (output field, source column) pairs, in file order:
# chrom, start, end, peak_id, score, strand, thick_start, thick_end, RGB value.
# The thick_* fields reuse the same source columns as start/end.
rgbpeak_fields = [
    ('chrom', 'CHR'),
    ('start', 'start'),
    ('end', 'BP'),
    ('peak_id', 'rsID'),
    ('score', 'score'),
    ('strand', 'strand'),
    ('thick_start', 'start'),
    ('thick_end', 'BP'),
    ('RGB', 'color'),
]
rgbpeak_names, rgbpeak_sources = zip(*rgbpeak_fields)

mesh_grps = all_df.groupby('mesh_term')

for mesh, mesh_df in mesh_grps:

    print(mesh)

    # file-friendly name: lowercase, commas removed, spaces turned into dashes
    mesh_name = mesh.lower().replace(',', '').replace(' ', '-')

    # keep the first row per (CHR, BP, EA, NEA)
    mesh_df = mesh_df.drop_duplicates(subset=['CHR', 'BP', 'EA', 'NEA'])

    # rgbpeak format: pick the source columns, relabel them, sort by position
    mesh_df = mesh_df.loc[:, list(rgbpeak_sources)]
    mesh_df.columns = list(rgbpeak_names)
    mesh_df = mesh_df.sort_values(by=['chrom', 'start'])

    # write the headerless tab-separated text version
    peak_fn = os.path.join(agg_dir, '{}.finemapped.snps.rgbpeak'.format(mesh_name))
    mesh_df.to_csv(peak_fn, sep='\t', header=False, index=False)

    # convert it; the converted file sits next to the text file with a '.bb' suffix
    bigpeak_fn = peak_fn + '.bb'
    sgls.bed_to_bigbed(peak_fn, bigpeak_fn, genome_sizes, verbose=True)

    # need time between runs of bed_to_bigbed
    sleep(2)


Arthritis, Rheumatoid



pass1 - making usageList (16 chroms): 34 millis
pass2 - checking and writing primary data (1043 records, 9 fields): 7 millis


Dermatitis, Atopic



pass1 - making usageList (16 chroms): 32 millis
pass2 - checking and writing primary data (674 records, 9 fields): 6 millis


Diabetes Mellitus, Type 1



pass1 - making usageList (22 chroms): 34 millis
pass2 - checking and writing primary data (6155 records, 9 fields): 21 millis


Psoriasis



pass1 - making usageList (20 chroms): 33 millis
pass2 - checking and writing primary data (559 records, 9 fields): 7 millis
