In [1]:
import os 
import json
import glob
import gzip
import pandas as pd 

outdir = 'results/main/hubs/eqtl_hub/'
link_path = 'https://informaticsdata.liai.org/BioAdHoc/Groups/vd-ay/jreyna/projects/dchallenge/results/main/hubs/eqtl_hub/'
os.makedirs(outdir, exist_ok=True)

os.chdir('/mnt/BioHome/jreyna/jreyna/projects/dchallenge/')

## Get all WashU Files

## Making the Hub

In [2]:
def create_washu_longrange_dict(name, file):
    d = {'type': 'longrange',
         'showOnHubLoad': True,
         'name': name,
         'filename': file,
         'options': {'displayMode': 'arc'}}
    return(d)

def create_washu_bed_dict(name, file):
    d = {'type': 'bed',
         'showOnHubLoad': True,
         'name': name,
         'filename': file}
    return(d)

def create_washu_vcf_dict(name, file):
    d = {'type': 'vcf',
         'showOnHubLoad': True,
         'name': name,
         'filename': file,
         'options': {'ensemblStyle': True, 'height': 100, 'displayMode': 'auto'}}
    return(d)

def create_washu_geneAnnotation_dict(name, file):
    d = {'type': 'refbed',
         'showOnHubLoad': True,
         'name': name,
         'filename': file}
    return(d)

In [3]:
def create_remote_washu_vcf_dict(name, url):
    d = {'type': 'vcf',
         'showOnHubLoad': True,
         'name': name,
         'url': url,
         'options': {'ensemblStyle': True, 'height': 100, 'displayMode': 'auto'}}
    return(d)

In [4]:
def create_remote_washu_vcf_dict(name, url):
    d = {'type': 'vcf',
         'showOnHubLoad': True,
         'name': name,
         'url': url,
         'options': {'ensemblStyle': True, 'height': 100, 'displayMode': 'auto'}}
    return(d)

## Making the Hub

## Making the BedRef based-Hub

In [5]:
# setting template names
eqtl_tpl = 'results/main/eqtl/*/ge/*_ge_*.all.dist.fdr.track.bed.gz'

In [6]:
glob.glob(eqtl_tpl.format(eqtl_source='*'))

['results/main/eqtl/GTEx/ge/GTEx_ge_LCL.all.dist.fdr.track.bed.gz',
 'results/main/eqtl/GTEx/ge/GTEx_ge_blood.all.dist.fdr.track.bed.gz',
 'results/main/eqtl/GTEx/ge/GTEx_ge_pancreas.all.dist.fdr.track.bed.gz',
 'results/main/eqtl/Lepik_2017/ge/Lepik_2017_ge_blood.all.dist.fdr.track.bed.gz',
 'results/main/eqtl/Quach_2016/ge/Quach_2016_ge_monocyte_Pam3CSK4.all.dist.fdr.track.bed.gz',
 'results/main/eqtl/Quach_2016/ge/Quach_2016_ge_monocyte_naive.all.dist.fdr.track.bed.gz',
 'results/main/eqtl/Quach_2016/ge/Quach_2016_ge_monocyte_R848.all.dist.fdr.track.bed.gz',
 'results/main/eqtl/Quach_2016/ge/Quach_2016_ge_monocyte_IAV.all.dist.fdr.track.bed.gz',
 'results/main/eqtl/Quach_2016/ge/Quach_2016_ge_monocyte_LPS.all.dist.fdr.track.bed.gz',
 'results/main/eqtl/BLUEPRINT/ge/BLUEPRINT_ge_neutrophil.all.dist.fdr.track.bed.gz',
 'results/main/eqtl/BLUEPRINT/ge/BLUEPRINT_ge_monocyte.all.dist.fdr.track.bed.gz',
 'results/main/eqtl/BLUEPRINT/ge/BLUEPRINT_ge_T-cell.all.dist.fdr.track.bed.gz',
 'res

In [7]:
eqtl_data = []
for eqtl in glob.glob(eqtl_tpl.format(eqtl_source='*')):
        
    run_info = eqtl.split('/')
    eqtl_source = run_info[3]  
    ge_source = run_info[-1].split('.')[0].split('ge')[1].replace('_', '', 1)
    eqtl_data.append([eqtl_source, ge_source, eqtl])
    
eqtl_df = pd.DataFrame(eqtl_data)
eqtl_df.columns = ['eqtl_source', 'ge_source', 'file']

In [8]:
eqtl_df

Unnamed: 0,eqtl_source,ge_source,file
0,GTEx,LCL,results/main/eqtl/GTEx/ge/GTEx_ge_LCL.all.dist...
1,GTEx,blood,results/main/eqtl/GTEx/ge/GTEx_ge_blood.all.di...
2,GTEx,pancreas,results/main/eqtl/GTEx/ge/GTEx_ge_pancreas.all...
3,Lepik_2017,blood,results/main/eqtl/Lepik_2017/ge/Lepik_2017_ge_...
4,Quach_2016,monocyte_Pam3CSK4,results/main/eqtl/Quach_2016/ge/Quach_2016_ge_...
5,Quach_2016,monocyte_naive,results/main/eqtl/Quach_2016/ge/Quach_2016_ge_...
6,Quach_2016,monocyte_R848,results/main/eqtl/Quach_2016/ge/Quach_2016_ge_...
7,Quach_2016,monocyte_IAV,results/main/eqtl/Quach_2016/ge/Quach_2016_ge_...
8,Quach_2016,monocyte_LPS,results/main/eqtl/Quach_2016/ge/Quach_2016_ge_...
9,BLUEPRINT,neutrophil,results/main/eqtl/BLUEPRINT/ge/BLUEPRINT_ge_ne...


In [9]:
def create_remote_washu_refbed_dict(name, url):
    d = {'type': 'refbed',
         'showOnHubLoad': True,
         'name': name,
         'url': url,
         'options': {'ensemblStyle': True,
                     'height': 100,
                     'displayMode': 'auto',
                     'categoryColors': {
                        'Sig': 'rgb(255,0,0)',
                        'Not Sig.': 'rgb(0,0,255)'}}}
    return(d)

In [10]:
json_files = []

for i, sr in eqtl_df.iterrows():
    
    print(sr.file)
    
    basename = os.path.basename(sr.file)
    
    old_path = sr.file
    old_index_path = sr.file + '.tbi'
    
    new_basename = '{}.{}'.format(sr.eqtl_source, basename)
    new_path = os.path.join(outdir, new_basename)
    new_index_path = new_path + '.tbi'
    
    print('new_path:', new_path)
    print()

    if os.path.exists(old_path) and os.path.exists(old_index_path):
        if not os.path.exists(new_path) and not os.path.exists(new_index_path):
                
            os.link(old_path, new_path)
            os.link(old_index_path, new_index_path)

        # add the file to the json hub config
        name = 'eQTL SNPs - {} {}'.format(sr.eqtl_source, sr.ge_source)
        url = os.path.join(link_path, new_basename)
        d = create_remote_washu_refbed_dict(name, url)
        json_files.append(d)

results/main/eqtl/GTEx/ge/GTEx_ge_LCL.all.dist.fdr.track.bed.gz
new_path: results/main/hubs/eqtl_hub/GTEx.GTEx_ge_LCL.all.dist.fdr.track.bed.gz

results/main/eqtl/GTEx/ge/GTEx_ge_blood.all.dist.fdr.track.bed.gz
new_path: results/main/hubs/eqtl_hub/GTEx.GTEx_ge_blood.all.dist.fdr.track.bed.gz

results/main/eqtl/GTEx/ge/GTEx_ge_pancreas.all.dist.fdr.track.bed.gz
new_path: results/main/hubs/eqtl_hub/GTEx.GTEx_ge_pancreas.all.dist.fdr.track.bed.gz

results/main/eqtl/Lepik_2017/ge/Lepik_2017_ge_blood.all.dist.fdr.track.bed.gz
new_path: results/main/hubs/eqtl_hub/Lepik_2017.Lepik_2017_ge_blood.all.dist.fdr.track.bed.gz

results/main/eqtl/Quach_2016/ge/Quach_2016_ge_monocyte_Pam3CSK4.all.dist.fdr.track.bed.gz
new_path: results/main/hubs/eqtl_hub/Quach_2016.Quach_2016_ge_monocyte_Pam3CSK4.all.dist.fdr.track.bed.gz

results/main/eqtl/Quach_2016/ge/Quach_2016_ge_monocyte_naive.all.dist.fdr.track.bed.gz
new_path: results/main/hubs/eqtl_hub/Quach_2016.Quach_2016_ge_monocyte_naive.all.dist.fdr.trac

In [11]:
hub_path = os.path.join(outdir, 'hub.config.json')
with open(hub_path, 'w') as fw:
    print(json.dumps(json_files, indent='\t'), file=fw)