In [1]:
# Commandline Specification
# Input
# ----------------------------------------------
# 1) colocalization file  x
# 2) gencode gene annotation file in bed format x
# 3) loop data x 
# 4) spp network data x 
# 5) reference chrom sizes x 
# Output
# ----------------------------------------------
# 1) snp-gene loop summary file
# 2) snp-gene pairs longrange file with index
# 3) snp-gene loops longrange file with index

In [7]:
import os 
import sys
import pybedtools as pbt
import pandas as pd
import numpy as np
import subprocess as sp
import json
os.chdir('/mnt/BioHome/jreyna/jreyna/projects/dchallenge/')
pbt.set_bedtools_path('/mnt/BioApps/bedtools/bin/')
bgzip = '/mnt/BioApps/tabix/tabix-0.2.6/bgzip'
tabix = '/mnt/BioApps/tabix/tabix-0.2.6/tabix'

bedpe_6cols = ['chrA', 'startA', 'endA', 'chrB', 'startB', 'endB']
bedpe_10cols = ['chrA', 'startA', 'endA', 'chrB', 'startB', 'endB', 'name', 'score', 'strand1', 'strand2']

# default values for the command line
sys.argv = [0] * 7
sys.argv[1] =  'results/main/2021_Nikhil_eQTL/Results/Colocalization/T1D_34012112_Gaulton/'
sys.argv[1] += 'BLUEPRINT_eQTL_Monocyte/FINAL_Summary_Coloc_Gene_SNP_Pairs.bed'
sys.argv[2] = 'results/refs/ensembl/gencode.v19.annotation.bed'
sys.argv[3] = 'results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/CM/FitHiChIP_L/FitHiChIP.interactions_FitHiC_Q0.01.bed'
sys.argv[4] = 'results/refs/spp/SPP_D-Challenge_networks.xlsx'
sys.argv[5] = 'results/refs/hg19/hg19.chrom.sizes'
sys.argv[6] = 'results/main/loop_analysis/washU/'

# parsing the commandline arguments
coloc_fn = sys.argv[1]
genes_fn = sys.argv[2]
loop_fn = sys.argv[3]
spp_fn = sys.argv[4]
gs_fn = sys.argv[5]
outdir = sys.argv[6]

# setting the output file names
os.makedirs(outdir, exist_ok=True)
summary_fn = os.path.join(outdir, 'sgloop_summary.xlsx')
sg_pairs_fn = os.path.join(outdir, 'gs_pairs.longrange.bed')
sg_loops_fn = os.path.join(outdir, 'gs_loops.longrange.bed')

## Load the colocalization data

In [9]:
# load the colocalization data
coloc = pd.read_table(coloc_fn)

# extract the most significant according the H4 
coloc_sig_df = coloc[coloc['pp_H4_Coloc_Summary'] > 0.75]
coloc_sig_full = coloc_sig_df.copy(deep=True)
coloc_sig_df = coloc_sig_df[['chr', 'pos', 'rs_id', 'variant_id', 'geneName']]
coloc_sig_df.rename(columns={'pos': 'end'}, inplace=True)
coloc_sig_df.loc[:, 'start'] = coloc_sig_df.loc[:, 'end'] - 1
coloc_sig_df = coloc_sig_df[['chr', 'start', 'end', 'rs_id', 'variant_id', 'geneName']]
coloc_sig_pbt = pbt.BedTool.from_dataframe(coloc_sig_df).sort()

In [10]:
coloc_sig_df.head()

Unnamed: 0,chr,start,end,rs_id,variant_id,geneName
0,chr1,114426000,114426001,rs11102694,rs11102694:114426001:G:A,AP4B1
1,chr1,192538495,192538496,rs2760530,rs2760530,RGS1
2,chr1,114426000,114426001,rs11102694,rs11102694:114426001:G:A,AP4B1
3,chr6,26383249,26383250,rs9467740,rs9467740:26383250:A:T,BTN2A2
4,chr6,33548089,33548090,rs3216621,rs3216621:33548090:A:AG,BAK1


In [11]:
# list of SNP-GENE colocalized pairs 
sg_coloc_set = coloc_sig_full[['rs_id', 'geneName']].values.tolist()
sg_coloc_set = set([tuple(x) for x in sg_coloc_set])

In [12]:
csnp_slop_pbt = coloc_sig_pbt.slop(b=500000, g=gs_fn)

In [13]:
print('In total there are {} colocalized SNPs.'.format(len(coloc_sig_df)))

In total there are 18 colocalized SNPs.


## Load the gene data

In [14]:
# load the gencode coords
cols = ['chrom', 'start', 'end', 'strand', 'type', 'gene_id', 'gene_name']
gencode = genes_df = pd.read_table(genes_fn, header=None, names=cols)

# extract just the genes
genes_df = gencode.loc[gencode.type.isin(['gene'])]
genes_df.loc[:, 'chrom'] = genes_df['chrom'].astype(str)
genes_df = genes_df.iloc[:, [0,1,2,6,5]]
genes_pbt = pbt.BedTool.from_dataframe(genes_df).sort()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


## Intersect SNPs and Genes

In [15]:
# bedtools intersect for SNP (with 500kb slop) and genes
csnps_gene_pairs = csnp_slop_pbt.intersect(genes_pbt, wa=True, wb=True)

# extract the SNP and gene data
csnps_gene_pairs_df = csnps_gene_pairs.to_dataframe()
csnps_gene_pairs_df = csnps_gene_pairs_df.iloc[:, [0,1,2,6,7,8,3,5,9]]

# remove the slop that you added previously. This has to be done before 
# intersecting the snp-gene pairs with the loops
csnps_gene_pairs_df['start'] += 500000
csnps_gene_pairs_df['end'] -= 500000

# converting to bedpe10 plus data format (allows pairtopair to work)
csnps_gene_pairs_df.columns = bedpe_6cols + ['rs_id', 'coloc_gene', 'sg_gene']
csnps_gene_pairs_df['name'] = '.'
csnps_gene_pairs_df['score'] = '.'
csnps_gene_pairs_df['strand1'] = '.'
csnps_gene_pairs_df['strand2'] = '.'
csnps_gene_pairs_df = csnps_gene_pairs_df[bedpe_10cols + ['rs_id', 'coloc_gene', 'sg_gene']]
csnps_gene_pairs_pbt = pbt.BedTool.from_dataframe(csnps_gene_pairs_df)

In [16]:
csnps_gene_pairs_df.head()

Unnamed: 0,chrA,startA,endA,chrB,startB,endB,name,score,strand1,strand2,rs_id,coloc_gene,sg_gene
0,chr1,114426000,114426001,chr1,113992322,113993350,.,.,.,.,rs11102694,AP4B1,RP11-512F24.1
1,chr1,114426000,114426001,chr1,114119379,114119521,.,.,.,.,rs11102694,AP4B1,MTND5P20
2,chr1,114426000,114426001,chr1,114241506,114242253,.,.,.,.,rs11102694,AP4B1,RP4-730K3.3
3,chr1,114426000,114426001,chr1,114447763,114456708,.,.,.,.,rs11102694,AP4B1,DCLRE1B
4,chr1,114426000,114426001,chr1,114466622,114472114,.,.,.,.,rs11102694,AP4B1,RP5-1073O3.7


## Loading the Loop Data

In [17]:
# load the loop data
loops = pd.read_table(loop_fn)
tmp_loops = loops[['chr1', 's1', 'e1', 'chr2', 's2', 'e2']]
tmp_loops.rename(columns={'p': 'score'}, inplace=True)
tmp_loops.loc[:, 'name'] = '.'
tmp_loops.loc[:, 'score'] = loops['p']
tmp_loops.loc[:, 'strand1'] = '.'
tmp_loops.loc[:, 'strand2'] = '.'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.or

In [18]:
tmp_loops

Unnamed: 0,chr1,s1,e1,chr2,s2,e2,name,score,strand1,strand2
0,chr1,710000,715000,chr1,735000,740000,.,4.895006e-07,.,.
1,chr1,710000,715000,chr1,750000,755000,.,3.771037e-07,.,.
2,chr1,710000,715000,chr1,755000,760000,.,3.431682e-07,.,.
3,chr1,710000,715000,chr1,760000,765000,.,3.114478e-07,.,.
4,chr1,710000,715000,chr1,765000,770000,.,2.822039e-07,.,.
...,...,...,...,...,...,...,...,...,...,...
498662,chr9,140525000,140530000,chr9,140550000,140555000,.,4.895006e-07,.,.
498663,chr9,140640000,140645000,chr9,140755000,140760000,.,1.109468e-07,.,.
498664,chr9,140955000,140960000,chr9,140990000,140995000,.,4.129931e-07,.,.
498665,chr9,140970000,140975000,chr9,140990000,140995000,.,5.290671e-07,.,.


In [19]:
loops = pbt.BedTool.from_dataframe(tmp_loops)

In [20]:
print('FitHiChIP found {} significant loops.'.format(tmp_loops.shape[0]))

FitHiChIP found 498667 significant loops.


## Intersect SNP-Gene Pairs with Loops

In [21]:
# extract loops contain a snp and gene
sgloops_both = csnps_gene_pairs_pbt.pair_to_pair(loops, type='both', **{'is':True})
sgloops_both_df = sgloops_both.to_dataframe(disable_auto_names=True, header=None)

# extract coordinate and meta data columns
cols = list(range(0,6)) + list(range(10,19)) + [20]
sgloops_both_df = sgloops_both_df[cols]

sgloops_both_df.columns = ['chrS', 'startS', 'endS', 'chrG', 'startG', 'endG', 'rs_id', 'coloc_gene', 'sg_gene', 
                           'chrLA', 'startLA', 'endLA', 'chrRA', 'startRA', 'endRA', 'p-value']
                           #'nameL', 'scoreL', 'strandLA', 'strandLB']
sgloops_both_df

Unnamed: 0,chrS,startS,endS,chrG,startG,endG,rs_id,coloc_gene,sg_gene,chrLA,startLA,endLA,chrRA,startRA,endRA,p-value
0,chr1,114426000,114426001,chr1,114447763,114456708,rs11102694,AP4B1,DCLRE1B,chr1,114425000,114430000,chr1,114445000,114450000,5.290671e-07
1,chr1,114426000,114426001,chr1,114466622,114472114,rs11102694,AP4B1,RP5-1073O3.7,chr1,114425000,114430000,chr1,114470000,114475000,3.431682e-07
2,chr1,114426000,114426001,chr1,114304454,114355098,rs11102694,AP4B1,RSBN1,chr1,114300000,114305000,chr1,114425000,114430000,1.011634e-07
3,chr1,114426000,114426001,chr1,114356433,114414381,rs11102694,AP4B1,PTPN22,chr1,114410000,114415000,chr1,114425000,114430000,5.684843e-07
4,chr1,114426000,114426001,chr1,114437370,114447823,rs11102694,AP4B1,AP4B1,chr1,114425000,114430000,chr1,114445000,114450000,5.290671e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
393,chr6,33548089,33548090,chr6,33588142,33664351,rs3216621,BAK1,ITPR3,chr6,33545000,33550000,chr6,33610000,33615000,2.321081e-07
394,chr6,33548089,33548090,chr6,33662070,33679504,rs3216621,BAK1,UQCC2,chr6,33545000,33550000,chr6,33670000,33675000,1.011634e-07
395,chr6,33548089,33548090,chr6,33662070,33679504,rs3216621,BAK1,UQCC2,chr6,33545000,33550000,chr6,33675000,33680000,9.701485e-08
396,chr6,33548089,33548090,chr6,33286335,33297046,rs3216621,BAK1,DAXX,chr6,33290000,33295000,chr6,33545000,33550000,4.594176e-08


## Characterize the GSLoops

In [22]:
# loading SPP Differentially expressed genes 
spp = pd.read_excel(spp_fn, sheet_name='3. Experiment DEGs', usecols=[5])
spp_genes = set([x[0] for x in spp.iloc[7:].values])

In [23]:
# getting the SNP-GENE pairs which overlap a fithic loop
sg_hichip_set = sgloops_both_df[['rs_id', 'sg_gene']].values.tolist()
sg_hichip_set = set([tuple(x) for x in sg_hichip_set])

char_data = []
for i, sr in csnps_gene_pairs_df.iterrows():
    
    # check whether this snp-gene pair has a colocalized record
    if (sr.rs_id, sr.sg_gene) in sg_coloc_set:
        coloc_sg = 1
    else:
        coloc_sg = 0
        
    # check whether this snp-gene pair has a sgloop
    if (sr.rs_id, sr.sg_gene) in sg_hichip_set:
        fithichip = 1
    else:
        fithichip = 0
        
    # check whether this gene is differntially expressed according to SPP
    if sr.sg_gene in spp_genes:
        spp = 1
    else:
        spp = 0
    
    char_data.append([coloc_sg, fithichip, spp])

char_data = pd.DataFrame(char_data)
char_data = pd.concat([csnps_gene_pairs_df, char_data], axis=1)
char_data.columns = bedpe_10cols + ['rs_id', 'coloc_egene', 'sg_gene', 'coloc', 'fithichip', 'spp']

In [24]:
final_data = char_data[['rs_id', 'coloc_egene', 'sg_gene', 'coloc', 'fithichip', 'spp']]
final_data = final_data.sort_values(['rs_id', 'coloc', 'fithichip', 'coloc_egene', 'sg_gene'], 
                                    ascending=[True, False, False, True, True])
final_data = final_data.set_index(['rs_id', 'coloc_egene', 'sg_gene'])

# remove duplicate SNP-GENE combination which are tested twice by 
# the colocalization analysis
final_data = final_data[~final_data.reset_index().duplicated(subset=['rs_id', 'sg_gene']).values]
final_data.to_excel(summary_fn, index=True, header=True)

In [25]:
final_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,coloc,fithichip,spp
rs_id,coloc_egene,sg_gene,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
rs10085721,SKAP2,SKAP2,1,0,1
rs10085721,SKAP2,AC004079.1,0,0,0
rs10085721,SKAP2,AC004540.4,0,0,0
rs10085721,SKAP2,AC004540.5,0,0,0
rs10085721,SKAP2,AC004947.2,0,0,0
...,...,...,...,...,...
rs9926245,SULT1A2,SPNS1,0,0,1
rs9926245,SULT1A2,TPRKBP2,0,0,0
rs9926245,SULT1A2,TUFM,0,0,1
rs9926245,SULT1A2,XPO6,0,0,1


## Convert all results to output and WashU visualization results

In [26]:
def bedpe_to_WashU_longrange(fn, df):
    """
        Convert from a loop bedpe file into WashU longrange, 
        includes bgzip and tabix of the fn. 
        
        Params
        -------
        fn: str
            path to the longrange output file (without gz)
            
        df: dataframe
            columns 1-6 are as expected and column 7 is the p or q-value. 
            
        Output
        ------
        gzfn: str
            path to the longrange with bgzip compression
        tabix_fn: str
            path to the index of the longrange file
            
    """

    # parsing the data into WashU longrage format
    data = []
    for sr in df.values.tolist():

        # calculate the -log(FDR)
        qval = -np.log(sr[6])

        # get the first pair data
        second_pair_str = '{}:{}-{},{:.5f}'.format(*sr[3:6], qval)
        first_row = sr[0:3] + [second_pair_str]

        # get the second pair data
        first_pair_str = '{}:{}-{},{:.5f}'.format(*sr[0:3], qval)
        second_row = sr[3:6] + [first_pair_str]

        # add each data row
        data.append(first_row)
        data.append(second_row)

    data = sorted(data, key=lambda x: (x[0], x[1], x[2]))

    # writing out the data
    with open(fn, 'w') as f:
        for line in data:
            info = [str(x) for x in line]
            info = '\t'.join(info)
            f.write(info + '\n')
            
    # run bgzip
    cmd = '{} {}'.format(bgzip, fn)
    print(cmd)
    job = sp.Popen(cmd, stderr=sp.PIPE,stdout=sp.PIPE, shell=True)

    out, err = job.communicate()
    print('out:', out.decode())
    print('err:', err.decode())
    
    # run tabix
    lrange_gzfn = fn + '.gz'
    cmd = '{} {}'.format(tabix, lrange_gzfn)
    print(cmd)
    job = sp.Popen(cmd, stderr=sp.PIPE,stdout=sp.PIPE, shell=True)

    out, err = job.communicate()
    print('out:', out.decode())
    print('err:', err.decode())

    print('Created the gzfn: {}'.format(fn + '.gz'))
    print('Created the tabix: {}'.format(fn + '.gz.tbi'))

In [68]:
def bed_WashU_bedgz(fn, df):
    """
        Convert from a bed dataframe into WashU longrange file 
        includes bgzip and tabix of the fn. 
        
        Params
        -------
        fn: str
            path to the longrange output file (without gz)
            
        df: dataframe
            columns 1-3 are as expected and column 7 is the p or q-value. 
            
        Output
        ------
        gzfn: str
            path to the longrange with bgzip compression
        tabix_fn: str
            path to the index of the longrange file
            
    """

    # parsing the data into WashU longrage format
    data = []
    for sr in df.values.tolist():
        data.append(sr[0:4])
    data = sorted(data, key=lambda x: (x[0], x[1], x[2]))

    # writing out the data
    with open(fn, 'w') as f:
        for line in data:
            info = [str(x) for x in line]
            info = '\t'.join(info)
            f.write(info + '\n')
            
    # run bgzip
    cmd = '{} {}'.format(bgzip, fn)
    print(cmd)
    job = sp.Popen(cmd, stderr=sp.PIPE,stdout=sp.PIPE, shell=True)

    out, err = job.communicate()
    print('out:', out.decode())
    print('err:', err.decode())
    
    # run tabix
    gzfn = fn + '.gz'
    cmd = '{} {}'.format(tabix, gzfn)
    print(cmd)
    job = sp.Popen(cmd, stderr=sp.PIPE,stdout=sp.PIPE, shell=True)

    out, err = job.communicate()
    print('out:', out.decode())
    print('err:', err.decode())

    print('Created the gzfn: {}'.format(fn + '.gz'))
    print('Created the tabix: {}'.format(fn + '.gz.tbi'))

In [27]:
# make the longrange file
final_sg_pairs = sgloops_both_df.iloc[:, [0,1,2,3,4,5,-1]]
bedpe_to_WashU_longrange(sg_pairs_fn, final_sg_pairs)

/mnt/BioApps/tabix/tabix-0.2.6/bgzip results/main/loop_analysis/washU/gs_pairs.longrange.bed
out: 
err: [bgzip] results/main/loop_analysis/washU/gs_pairs.longrange.bed.gz already exists; do you wish to overwrite (y or n)? [bgzip] not overwritten

/mnt/BioApps/tabix/tabix-0.2.6/tabix results/main/loop_analysis/washU/gs_pairs.longrange.bed.gz
out: 
err: [tabix] the index file exists. Please use '-f' to overwrite.

Created the gzfn: results/main/loop_analysis/washU/gs_pairs.longrange.bed.gz
Created the tabix: results/main/loop_analysis/washU/gs_pairs.longrange.bed.gz.tbi


In [112]:
# make the longrange file
final_sg_loops = sgloops_both_df.iloc[:, [9,10,11,12,13,14,-1]]
bedpe_to_WashU_longrange(sg_loops_fn, final_sg_loops)

/mnt/BioApps/tabix/tabix-0.2.6/bgzip results/main/loop_analysis/washU/gs_loops.longrange.bed
out: 
err: [bgzip] results/main/loop_analysis/washU/gs_loops.longrange.bed.gz already exists; do you wish to overwrite (y or n)? [bgzip] not overwritten

/mnt/BioApps/tabix/tabix-0.2.6/tabix results/main/loop_analysis/washU/gs_loops.longrange.bed.gz
out: 
err: [tabix] the index file exists. Please use '-f' to overwrite.

Created the gzfn: results/main/loop_analysis/washU/gs_loops.longrange.bed.gz
Created the tabix: results/main/loop_analysis/washU/gs_loops.longrange.bed.gz.tbi


In [86]:
# add the snps as elements to plot
final_sg_snps = sgloops_both_df.iloc[:, [0,1,2,6]]
final_sg_snps = final_sg_snps.loc[~final_sg_snps.duplicated()]
final_sg_snps = pbt.BedTool.from_dataframe(final_sg_snps)
final_sg_snps = final_sg_snps.slop(b=500, g=gs_fn)
final_sg_snps = final_sg_snps.to_dataframe()

In [87]:
sg_snps_fn = 'results/main/loop_analysis/washU/gs_snps.bed'
bed_WashU_bedgz(sg_snps_fn, final_sg_snps)

/mnt/BioApps/tabix/tabix-0.2.6/bgzip results/main/loop_analysis/washU/gs_genes.longrange.bed
out: 
err: [bgzip] results/main/loop_analysis/washU/gs_genes.longrange.bed.gz already exists; do you wish to overwrite (y or n)? [bgzip] not overwritten

/mnt/BioApps/tabix/tabix-0.2.6/tabix results/main/loop_analysis/washU/gs_genes.longrange.bed.gz
out: 
err: [tabix] the index file exists. Please use '-f' to overwrite.

Created the gzfn: results/main/loop_analysis/washU/gs_genes.longrange.bed.gz
Created the tabix: results/main/loop_analysis/washU/gs_genes.longrange.bed.gz.tbi


In [126]:
final_sg_genes = csnps_gene_pairs_df.iloc[:, [3,4,5,12]]
final_sg_genes = final_sg_genes.loc[~final_sg_genes.duplicated()]
final_sg_genes['chr'] = final_sg_genes['chrG'] 
final_sg_genes['transcript_start'] = final_sg_genes['startG']
final_sg_genes['transcript_stop'] = final_sg_genes['endG']
final_sg_genes['translation_start'] = final_sg_genes['startG']
final_sg_genes['translation_stop'] = final_sg_genes['endG']
final_sg_genes['strand'] = '+'
final_sg_genes['gene_name'] = final_sg_genes['coloc_gene']
final_sg_genes['transcript_id'] = final_sg_genes['coloc_gene']
final_sg_genes['type'] = 'coding'
final_sg_genes['exon_startG'] = final_sg_genes['startG']
final_sg_genes['exon_stops'] = final_sg_genes['endG']
refcols = ['chr', 'transcript_start', 'transcript_stop', 'translation_start',
           'translation_stop', 'strand', 'gene_name', 'transcript_id',
           'type', 'exon_startG', 'exon_stops']
final_sg_genes = final_sg_genes.loc[:, refcols]

In [69]:
sg_genes_fn = 'results/main/loop_analysis/washU/gs_genes.refbed'
bed_WashU_bedgz(sg_genes_fn, final_sg_genes)

/mnt/BioApps/tabix/tabix-0.2.6/bgzip results/main/loop_analysis/washU/gs_genes.longrange.bed
out: 
err: [bgzip] results/main/loop_analysis/washU/gs_genes.longrange.bed.gz already exists; do you wish to overwrite (y or n)? [bgzip] not overwritten

/mnt/BioApps/tabix/tabix-0.2.6/tabix results/main/loop_analysis/washU/gs_genes.longrange.bed.gz
out: 
err: [tabix] the index file exists. Please use '-f' to overwrite.

Created the gzfn: results/main/loop_analysis/washU/gs_genes.longrange.bed.gz
Created the tabix: results/main/loop_analysis/washU/gs_genes.longrange.bed.gz.tbi


In [30]:
sg_pairs_json = {'type': 'longrange', 
                 'filename': sg_pairs_fn,
                 'name': 'SNP-Gene Pairs'}

sg_loops_json = {'type': 'longrange', 
                 'filename': sg_loops_fn,
                 'name': 'SNP-Gene Loops'}

hub_json = [sg_pairs_json, sg_loops_json]

hub_json_fn = os.path.join(outdir, 'hub.config.json')
with open(hub_json_fn, 'w') as f:
    f.write(json.dumps(hub_json, indent=4))

In [47]:
loops_link = os.path.join(outdir, os.path.basename(loop_fn + '.gz'))
os.link(loop_fn, loops_link)

In [48]:
json_encoder.encode()

TypeError: encode() missing 1 required positional argument: 'o'

In [None]:
[
    {
    "type": "bigwig",
    "url": "https://vizhub.wustl.edu/public/tmp/TW463_20-5-bonemarrow_MeDIP.bigWig",
    "name": "MeDIP",
    "options": {
        "color": "red",
        "backgroundColor":"#FFE7AB"
        },
    "metadata": {
        "sample": "bone",
        "assay": "MeDIP"
        }
    },
    {
    "type": "bigwig",
    "url": "https://vizhub.wustl.edu/public/tmp/TW551_20-5-bonemarrow_MRE.CpG.bigWig",
    "name": "MRE",
    "options": {
        "color": "blue",
        "backgroundColor":"#C0E3CC"
        },
    "metadata": {
        "sample": "bone",
        "assay": "MRE"
        }
    }
]

In [94]:
s = 'chr, transcript_start, transcript_stop, translation_start, translation_stop, strand, gene_name, transcript_id, type, exon(including UTR bases) starts, exon(including UTR bases) stops, and additional gene info (optional)'.split(',')

In [95]:
s

['chr',
 ' transcript_start',
 ' transcript_stop',
 ' translation_start',
 ' translation_stop',
 ' strand',
 ' gene_name',
 ' transcript_id',
 ' type',
 ' exon(including UTR bases) starts',
 ' exon(including UTR bases) stops',
 ' and additional gene info (optional)']

In [96]:
len(s)

12