In [62]:
import os 
import json
import glob
import gzip
import pandas as pd 

outdir = 'results/main/hubs/coloc_hub/'
link_path = 'https://informaticsdata.liai.org/BioAdHoc/Groups/vd-ay/jreyna/projects/dchallenge/results/main/hubs/coloc_hub/'

os.makedirs(outdir, exist_ok=True)

In [63]:
os.chdir('/mnt/BioHome/jreyna/jreyna/projects/dchallenge/')

In [64]:
# setting template names
coloc_tpl = 'results/main/GRCh37/coloc/Results/*/*/*/*/FINAL_Summary_Coloc_Gene_SNP_Pairs.longrange.bed.gz'

## Getting all WashU Files

#### Coloc Files

In [65]:
coloc_data = []
for coloc in glob.glob(coloc_tpl):
    
    run_info = coloc.split('/')
    gwas_source = run_info[6]
    eqtl_source = run_info[7]
    ge_source = run_info[8]   
    if eqtl_source == 'ImmuNexUT':
        eqtl_db = 'ImmuNexUT'
    else:
        eqtl_db = 'eQTL_Catalogue'    
    
    coloc_data.append([gwas_source, eqtl_db, eqtl_source, ge_source, coloc])
coloc_df = pd.DataFrame(coloc_data)
coloc_df.columns = ['gwas_source', 'eqtl_db', 'eqtl_source', 'ge_source', 'file']

In [66]:
coloc_df.shape

(165, 5)

## Making the Hub

In [67]:
def create_washu_longrange_dict(name, file):
    d = {"type": "longrange",
         "showOnHubLoad": True,
         "name": name,
         "filename": file,
         "options": {"displayMode": "arc"}}
    return(d)

def create_washu_bed_dict(name, file):
    d = {"type": "bed",
         "showOnHubLoad": True,
         "name": name,
         "filename": file}
    return(d)

def create_washu_vcf_dict(name, file):
    d = {"type": "vcf",
         "showOnHubLoad": True,
         "name": name,
         "filename": file}
    return(d)

def create_washu_geneAnnotation_dict(name, file):
    d = {"type": "refbed",
         "showOnHubLoad": True,
         "name": name,
         "filename": file}
    return(d)



In [68]:
def create_remote_washu_longrange_dict(name, url):
    d = {'type': 'longrange',
         'showOnHubLoad': True,
         'name': name,
         'url': url,
         'options': {'ensemblStyle': True, 'height': 100, 'displayMode': 'arc'}}
    return(d)

def create_remote_washu_refbed_dict(name, url):
    d = {'type': 'refbed',
         'showOnHubLoad': True,
         'name': name,
         'url': url,
         'options': {'ensemblStyle': True,
                     'height': 100,
                     'displayMode': 'auto',
                     'categoryColors': {
                        'Left': 'rgb(255,0,0)',
                        'Right': 'rgb(0,0,255)',
                        'Middle': 'rgb(255,0,0)'}}}
    return(d)

## Make Hub For SNP-Gene Pair Arcs

In [69]:
def zcount(filename):
    with gzip.open(filename) as f:
        return(len(f.readlines()))

In [70]:
json_files = []
curr_outdir = os.path.join(outdir, 'snp_gene_pairs')
os.makedirs(curr_outdir, exist_ok=True)

curr_linkpath = os.path.join(link_path, 'snp_gene_pairs')

for i, sr in coloc_df.iterrows():

    #print(sr.gwas_source, sr.eqtl_db, sr.eqtl_source, sr.ge_source)
                
    # get coloc files (ready)
    new_basename = 'colocs.{}.{}.{}.{}.longrange.bed.gz'.format(sr. gwas_source, sr.eqtl_db,
                                                      sr.eqtl_source, sr.ge_source)
    new_path = os.path.join(curr_outdir, new_basename)

    # hard link over the file
    old_path = sr.file

    num_lines = zcount(sr.file)
    if num_lines == 0:
        continue

    old_index_path = sr.file + '.tbi'
    new_path = os.path.join(curr_outdir, new_basename)
    new_index_path = new_path + '.tbi'

    if os.path.exists(old_path) and os.path.exists(old_index_path):

        if not os.path.exists(new_path) and not os.path.exists(new_index_path):

            os.link(old_path, new_path)
            os.link(old_index_path, new_index_path)

        # add the file to the json hub config
        url = os.path.join(curr_linkpath, new_basename)

        d = create_remote_washu_longrange_dict(new_basename, url)
        json_files.append(d)
                
hub_path = os.path.join(curr_outdir, 'hub.config.json')
with open(hub_path, 'w') as fw:
    print(json.dumps(json_files, indent='\t'), file=fw)

## Make Hub For SNP-Gene Pair Flat Representations

In [77]:
# setting template names
coloc_tpl = 'results/main/GRCh37/coloc/Results/*/*/*/*/FINAL_Summary_Coloc_Gene_SNP_Pairs.gene_sorted.with_gene_meta.bed.gz'

In [78]:
coloc_data = []
for coloc in glob.glob(coloc_tpl):
    
    run_info = coloc.split('/')
    gwas_source = run_info[6]
    eqtl_source = run_info[7]
    ge_source = run_info[8]   
    if eqtl_source == 'ImmuNexUT':
        eqtl_db = 'ImmuNexUT'
    else:
        eqtl_db = 'eQTL_Catalogue'    
    
    coloc_data.append([gwas_source, eqtl_db, eqtl_source, ge_source, coloc])
        
coloc_df = pd.DataFrame(coloc_data)
coloc_df.columns = ['gwas_source', 'eqtl_db', 'eqtl_source', 'ge_source', 'file']

In [79]:
coloc_df

Unnamed: 0,gwas_source,eqtl_db,eqtl_source,ge_source,file
0,T1D_34012112_Gaulton,eQTL_Catalogue,BLUEPRINT,T-cell,results/main/GRCh37/coloc/Results/eQTL_Catalog...
1,T1D_34012112_Gaulton,eQTL_Catalogue,BLUEPRINT,neutrophil,results/main/GRCh37/coloc/Results/eQTL_Catalog...
2,T1D_34012112_Gaulton,eQTL_Catalogue,Quach_2016,monocyte_R848,results/main/GRCh37/coloc/Results/eQTL_Catalog...
3,T1D_34012112_Gaulton,eQTL_Catalogue,Quach_2016,monocyte_Pam3CSK4,results/main/GRCh37/coloc/Results/eQTL_Catalog...
4,T1D_34012112_Gaulton,eQTL_Catalogue,van_de_Bunt_2015,pancreatic_islet,results/main/GRCh37/coloc/Results/eQTL_Catalog...
...,...,...,...,...,...
160,T1D_25751624,ImmuNexUT,ImmuNexUT,NC_Mono,results/main/GRCh37/coloc/Results/ImmuNexUT/T1...
161,T1D_25751624,ImmuNexUT,ImmuNexUT,CD16p_Mono,results/main/GRCh37/coloc/Results/ImmuNexUT/T1...
162,T1D_25751624,ImmuNexUT,ImmuNexUT,CM_CD8,results/main/GRCh37/coloc/Results/ImmuNexUT/T1...
163,T1D_25751624,ImmuNexUT,ImmuNexUT,Plasmablast,results/main/GRCh37/coloc/Results/ImmuNexUT/T1...


In [80]:
json_files = []
curr_outdir = os.path.join(outdir, 'snp_gene_pairs_flat')
os.makedirs(curr_outdir, exist_ok=True)

curr_linkpath = os.path.join(link_path, 'snp_gene_pairs_flat')

for i, sr in coloc_df.iterrows():

    #print(sr.gwas_source, sr.eqtl_db, sr.eqtl_source, sr.ge_source)
                
    # get coloc files (ready)
    new_basename = 'colocs.{}.{}.{}.{}.refbed.bed.gz'.format(sr. gwas_source, sr.eqtl_db,
                                                      sr.eqtl_source, sr.ge_source)
    new_path = os.path.join(curr_outdir, new_basename)

    # hard link over the file
    old_path = sr.file

    num_lines = zcount(sr.file)
    if num_lines == 0:
        continue

    old_index_path = sr.file + '.tbi'
    new_path = os.path.join(curr_outdir, new_basename)
    new_index_path = new_path + '.tbi'

    if os.path.exists(old_path) and os.path.exists(old_index_path):

        if not os.path.exists(new_path) and not os.path.exists(new_index_path):

            os.link(old_path, new_path)
            os.link(old_index_path, new_index_path)

        # add the file to the json hub config
        url = os.path.join(curr_linkpath, new_basename)

        d = create_remote_washu_refbed_dict(new_basename, url)
        json_files.append(d)
                
hub_path = os.path.join(curr_outdir, 'hub.config.json')
with open(hub_path, 'w') as fw:
    print(json.dumps(json_files, indent='\t'), file=fw)