In [90]:
import os
import pandas as pd 
import glob
import json
import subprocess as sp
import numpy as np
os.chdir('/mnt/BioHome/jreyna/jreyna-temp/projects/dchallenge')

outdir = 'results/main/pc_hic/2016_javierre/processing/'
os.makedirs(outdir, exist_ok=True)

## load the cell name conversion for Javierre et al 2016

In [54]:
cell_meta = pd.read_table('results/refs/meta/cell_types.dict.tsv')
cell_dict = cell_meta.loc[cell_meta.source == 'Javierre et al., 2016']
cell_dict = cell_dict[['source-name', 'project-name']]
cell_dict.set_index('source-name', inplace=True)
cell_dict = cell_dict['project-name'].to_dict()

In [97]:
# loading the cell colors
cell_colors = pd.read_table('results/refs/dice_colors/colors.revised.t1d-project.V2.tsv')
cell_colors.set_index('color-name', inplace=True)

In [98]:
cell_colors

Unnamed: 0_level_0,dice-name,hex,rgb,full-name
color-name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
monocyte,CM,#B22222,1783434,Monocyte
b-cell,BN,#32CD32,5020550,B cell
cd4t-cell,CD4N,#00BFFF,191255,CD4 T-cell
cd8t-cell,CD8N,#EE82EE,238130238,CD8 T-cells
nk-cell,NK,#FFD700,2552150,NK cell


## Load and extract necessary data

In [3]:
loops = pd.read_table('results/main/pc_hic/2016_javierre/data/PCHiC_peak_matrix_cutoff5.tsv')

# reformatting dataframe into bedpe format
bedpe_reformat_cols = ['baitChr',
         'baitStart',
         'baitEnd',
         'oeChr',
         'oeStart',
         'oeEnd',
         'Mon',
         'Mac0',
         'Mac1',
         'Mac2',
         'Neu',
         'MK',
         'EP',
         'Ery',
         'FoeT',
         'nCD4',
         'tCD4',
         'aCD4',
         'naCD4',
         'nCD8',
         'tCD8',
         'nB',
         'tB',
         'baitID',
         'baitName',
         'oeID',
         'oeName',
         'dist',
         'clusterID',
         'clusterPostProb']

main_bedpe = loops.loc[:, bedpe_reformat_cols]

  loops = pd.read_table('results/main/pc_hic/2016_javierre/data/PCHiC_peak_matrix_cutoff5.tsv')


In [4]:
# add chr to the chromosome column names
main_bedpe.iloc[:, 0] = 'chr' + main_bedpe.iloc[:, 0].astype(str)
main_bedpe.iloc[:, 3] = 'chr' + main_bedpe.iloc[:, 3].astype(str)

# add the longrange 4th column
main_bedpe['lr-right'] = main_bedpe['oeChr'].astype(str) + ':' + \
                            main_bedpe['oeStart'].astype(str) + '-' + \
                            main_bedpe['oeEnd'].astype(str)

## Save interactions as bedpe

In [5]:
# listing bedpe columns with pc-hic columns 
bedpe_cols = ['baitChr', 'baitStart', 'baitEnd',
                 'oeChr', 'oeStart', 'oeEnd']

for cell_name, proj_name in cell_dict.items():
    if cell_name in main_bedpe.columns:
        cell_bedpe = main_bedpe.loc[:, bedpe_cols + [cell_name]]
        bedpe_fn = os.path.join(outdir, '{}.bedpe'.format(proj_name))
        cell_bedpe.to_csv(bedpe_fn, sep='\t', header=False, index=False)

## Save interactions as longrange

In [6]:
# listing bedpe columns with pc-hic columns 
longrange_cols = ['baitChr', 'baitStart', 'baitEnd', 'lr-right']

for cell_name, proj_name in cell_dict.items():
    if cell_name in main_bedpe.columns:

        cell_bedpe = main_bedpe.loc[:, longrange_cols + [cell_name]]
        cell_bedpe['lr-right'] = cell_bedpe['lr-right'] + ',' + cell_bedpe[cell_name].astype(str)
        
        bedpe_fn = os.path.join(outdir, '{}.longrange.bed'.format(proj_name))
        cell_bedpe.to_csv(bedpe_fn, sep='\t', header=False, index=False)

In [34]:
# also make an index file for reach longrange file

In [39]:
tabix = '/mnt/BioApps/tabix/tabix-0.2.6/tabix'
bgzip = '/mnt/BioApps/tabix/tabix-0.2.6/bgzip'

In [40]:
for fn in glob.glob(os.path.join(outdir, '*.longrange.bed')):
    
    # make a gz file
    cmd = '{0} -c {1} > {1}.gz'.format(bgzip, fn)
    j = sp.check_output(cmd, shell=True)
    print(j)
    
    # make an index file
    cmd = '{0} {1}.gz'.format(tabix, fn)
    g = sp.check_output(cmd, shell=True)
    print(g)

b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''


## Make a hub for longrange loops

In [110]:
# collect json formatted information
url_root = 'https://informaticsdata.liai.org/bioadhoc-temp/Groups/vd-ay/jreyna/projects/dchallenge/results/main/pc_hic/2016_javierre/processing/'
hub = []
for fn in glob.glob(os.path.join(outdir, '*.longrange.bed.gz')):
    
    # get basename for url
    bn = os.path.basename(fn)
    
    # create the whole URL
    full_url = os.path.join(url_root, bn)                       
    print(full_url)
    
    # get cell name 
    cell_name = bn.split('.')[0]
    
    # use cell name to get cell metadata info
    cell_info = cell_meta.loc[(cell_meta.source == 'Javierre et al., 2016') & \
                              (cell_meta['project-name'] == cell_name)]
    cell_info = cell_info.iloc[0] 
    
    # fill in the JSON details
    entry = {}
    entry['type'] = 'longrange'
    entry['name'] = 'PC HiC Loops for {}'.format(cell_name)
    entry['url'] = full_url
    
    print(cell_info)
    
    
    # add color name if cell has been pre-assigned a color
    if str(cell_info['color-name']) == 'nan': # starting with nan cases
        entry['showOnHubLoad'] = False
        entry['options'] = {'ensembleStyle': True, 'height': 100, 'displayMode': 'arc'}
    else:
        entry['showOnHubLoad'] = True
        color = cell_colors.loc[cell_info['color-name'], 'hex']
        entry['options'] = {'ensembleStyle': True, 'height': 100, 'displayMode': 'arc', 'color': color}

    hub.append(entry)

https://informaticsdata.liai.org/bioadhoc-temp/Groups/vd-ay/jreyna/projects/dchallenge/results/main/pc_hic/2016_javierre/processing/neutrophil.longrange.bed.gz
source          Javierre et al., 2016
source-name                       Neu
full-name                  Neutrophil
project-name               neutrophil
color-name                        NaN
Name: 10, dtype: object
https://informaticsdata.liai.org/bioadhoc-temp/Groups/vd-ay/jreyna/projects/dchallenge/results/main/pc_hic/2016_javierre/processing/monocyte.longrange.bed.gz
source          Javierre et al., 2016
source-name                       Mon
full-name                    Monocyte
project-name                 monocyte
color-name                   monocyte
Name: 6, dtype: object
https://informaticsdata.liai.org/bioadhoc-temp/Groups/vd-ay/jreyna/projects/dchallenge/results/main/pc_hic/2016_javierre/processing/total-b-cell.longrange.bed.gz
source          Javierre et al., 2016
source-name                        tB
full-name        

In [111]:
## save the hub
hub_fn = os.path.join(outdir, 'hub.config.txt')
with open(hub_fn, 'w') as fw:
    s = json.dumps(hub,  indent=True)
    fw.write(s) 

In [112]:
hub_link = os.path.join(url_root, os.path.basename(hub_fn))

In [113]:
print(hub_link)

https://informaticsdata.liai.org/bioadhoc-temp/Groups/vd-ay/jreyna/projects/dchallenge/results/main/pc_hic/2016_javierre/processing/hub.config.txt
