In [2]:
import os 
import json
import glob
import gzip
import pandas as pd 

outdir = 'results/main/hubs/gwas_hub/'
link_path = 'https://informaticsdata.liai.org/BioAdHoc/Groups/vd-ay/jreyna/projects/dchallenge/results/main/hubs/gwas_hub/'
os.makedirs(outdir, exist_ok=True)

os.chdir('/mnt/BioHome/jreyna/jreyna/projects/dchallenge/')

## Get all WashU Files

In [4]:
# setting template names
gwas_tpl = 'results/main/gwas/source/{gwas_source}/GRCh37/GWAS_input_colocalization_pval_lt_5eMinus8.onekg.anno.vcf.gz'

In [5]:
gwas_data = []
for gwas in glob.glob(gwas_tpl.format(gwas_source='*')):
        
    run_info = gwas.split('/')
    gwas_source = run_info[4]    
    gwas_data.append([gwas_source, gwas])
    
gwas_df = pd.DataFrame(gwas_data)
gwas_df.columns = ['gwas_source', 'file']

## Get GWAS Meta Data

In [26]:
gwas_meta = pd.read_table('config/gwas_samplesheets/gwas.samplesheet.tsv')
gwas_meta.set_index('gwas_id', inplace=True)
gwas_meta

Unnamed: 0_level_0,authorship,link
gwas_id,Unnamed: 1_level_1,Unnamed: 2_level_1
T1D_34012112_Gaulton,"Chiou et al., 2021",https://www.ebi.ac.uk/gwas/publications/34012112
T1D_34594039_GCST90018925,"Sakaue et al., 2021",https://www.ebi.ac.uk/gwas/publications/34594039
T1D_32005708,"Forgetta et al., 2020",https://www.ebi.ac.uk/gwas/publications/32005708
T1D_25751624,"Onengut-Gumuscu et al., 2015",https://www.ebi.ac.uk/gwas/publications/25751624


## Making the Hub

### Creating a remote hub

In [45]:
def create_washu_longrange_dict(name, file):
    d = {'type': 'longrange',
         'showOnHubLoad': True,
         'name': name,
         'filename': file,
         'options': {'displayMode': 'arc'}}
    return(d)

def create_washu_bed_dict(name, file):
    d = {'type': 'bed',
         'showOnHubLoad': True,
         'name': name,
         'filename': file}
    return(d)

def create_washu_vcf_dict(name, file):
    d = {'type': 'vcf',
         'showOnHubLoad': True,
         'name': name,
         'filename': file,
         'options': {'ensemblStyle': True, 'height': 100, 'displayMode': 'auto'}}
    return(d)

def create_washu_geneAnnotation_dict(name, file):
    d = {'type': 'refbed',
         'showOnHubLoad': True,
         'name': name,
         'filename': file}
    return(d)

### Creating a Remote Hub

In [49]:
def create_remote_washu_vcf_dict(name, url):
    d = {'type': 'vcf',
         'showOnHubLoad': True,
         'name': name,
         'url': url,
         'options': {'ensemblStyle': True, 'height': 100, 'displayMode': 'auto'}}
    return(d)

In [53]:
json_files = []

for i, sr in gwas_df.iterrows():
    
    print(sr.file)
    
    basename = os.path.basename(sr.file)
    
    old_path = sr.file
    old_index_path = sr.file + '.tbi'
    
    new_basename = '{}.{}'.format(sr.gwas_source, basename)
    new_path = os.path.join(outdir, new_basename)
    new_index_path = new_path + '.tbi'
    
    print('new_path:', new_path)
    print()

    if os.path.exists(old_path) and os.path.exists(old_index_path):
        if not os.path.exists(new_path) and not os.path.exists(new_index_path):
                
            os.link(old_path, new_path)
            os.link(old_index_path, new_index_path)

        # add the file to the json hub config
        authorship = gwas_meta.loc[sr.gwas_source, 'authorship']
        name = 'OneKG GWAS SNPs (Sigs Only) - {}'.format(authorship)
        url = os.path.join(link_path, new_basename)
        d = create_remote_washu_vcf_dict(name, url)
        json_files.append(d)

results/main/gwas/source/T1D_34012112_Gaulton/GRCh37/GWAS_input_colocalization_pval_lt_5eMinus8.onekg.anno.vcf.gz
new_path: results/main/hubs/gwas_hub/T1D_34012112_Gaulton.GWAS_input_colocalization_pval_lt_5eMinus8.onekg.anno.vcf.gz

results/main/gwas/source/T1D_25751624/GRCh37/GWAS_input_colocalization_pval_lt_5eMinus8.onekg.anno.vcf.gz
new_path: results/main/hubs/gwas_hub/T1D_25751624.GWAS_input_colocalization_pval_lt_5eMinus8.onekg.anno.vcf.gz

results/main/gwas/source/T1D_32005708/GRCh37/GWAS_input_colocalization_pval_lt_5eMinus8.onekg.anno.vcf.gz
new_path: results/main/hubs/gwas_hub/T1D_32005708.GWAS_input_colocalization_pval_lt_5eMinus8.onekg.anno.vcf.gz

results/main/gwas/source/T1D_34594039_GCST90018925/GRCh37/GWAS_input_colocalization_pval_lt_5eMinus8.onekg.anno.vcf.gz
new_path: results/main/hubs/gwas_hub/T1D_34594039_GCST90018925.GWAS_input_colocalization_pval_lt_5eMinus8.onekg.anno.vcf.gz



In [54]:
hub_path = os.path.join(outdir, 'hub.config.json')
with open(hub_path, 'w') as fw:
    print(json.dumps(json_files, indent='\t'), file=fw)

In [57]:
!zcat results/main/hubs/gwas_hub/T1D_34012112_Gaulton.GWAS_input_colocalization_pval_lt_5eMinus8.onekg.anno.vcf.gz | head -n 100

##fileformat=VCFv4.1
##FILTER=<ID=PASS,Description="All filters passed">
##fileDate=20210319
##source=ensembl;version=104;url=https://e104.ensembl.org/homo_sapiens
##reference=ftp://ftp.ensembl.org/pub/release-104/fasta/homo_sapiens/dna/
##INFO=<ID=COSMIC_92,Number=0,Type=Flag,Description="Somatic mutations found in human cancers from the COSMIC catalogue">
##INFO=<ID=ClinVar_202012,Number=0,Type=Flag,Description="Variants of clinical significance imported from ClinVar">
##INFO=<ID=dbSNP_154,Number=0,Type=Flag,Description="Variants (including SNPs and indels) imported from dbSNP">
##INFO=<ID=HGMD-PUBLIC_20204,Number=0,Type=Flag,Description="Variants from HGMD-PUBLIC dataset December 2020">
##INFO=<ID=TSA,Number=1,Type=String,Description="Type of sequence alteration. Child of term sequence_alteration as defined by the sequence ontology project.">
##INFO=<ID=E_Cited,Number=0,Type=Flag,Description="Cited.https://www.ensembl.org/info/genome/variation/prediction/variant_quality.ht