In [None]:
import os
import pandas as pd 
import glob
import json
import subprocess as sp
import numpy as np
os.chdir('/mnt/BioHome/jreyna/jreyna-temp/projects/dchallenge')

Data was processed using GRCh37.

In [4]:
outdir = 'results/main/chiou_2021/processing/'
os.makedirs(outdir, exist_ok=True)

## Loading the cell meta-data

In [38]:
# getting the cell name dictionary
cell_names = pd.read_table('results/refs/meta/cell_types.dict.tsv')
cell_names = cell_names.loc[cell_names['source'] == 'Chiou et al., 2021 - SuppData3']
t1d_cells = cell_names.loc[~cell_names['color-name'].isna()]

In [60]:
t1d_cells.set_index('project-name', inplace=True)
t1d_cells

Unnamed: 0_level_0,source,source-name,full-name,color-name
project-name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
activated-cd4t-cell,"Chiou et al., 2021 - SuppData3",activated CD4 T,Activated CD4+ T-cell,cd4t-cell
adaptive-nk-cell,"Chiou et al., 2021 - SuppData3",adaptive NK,Adaptive Natural Killer cell,nk-cell
monocyte,"Chiou et al., 2021 - SuppData3",classical monocyte,Monocyte,monocyte
dendrictic,"Chiou et al., 2021 - SuppData3",conventional dendritic,Dendritic cell,monocyte
cyto-nk-cell,"Chiou et al., 2021 - SuppData3",cytotoxic NK,Cytotoxic Natural Killer cell,nk-cell
mem-b-cell,"Chiou et al., 2021 - SuppData3",memory B,Memory B cell,b-cell
mem-cd8t-cell,"Chiou et al., 2021 - SuppData3",memory CD8 T,Memory CD8+ T-cell,cd8t-cell
b-cell,"Chiou et al., 2021 - SuppData3",naive B,B cell,b-cell
non-class-monocyte,"Chiou et al., 2021 - SuppData3",non-classical monocyte,Non-classical Monocyte,monocyte
pancreatic-cd8t-cell,"Chiou et al., 2021 - SuppData3",pancreatic CD8 T,Pancreatic CD8+ T-cell,cd8t-cell


In [40]:
# loading the cell colors
cell_colors = pd.read_table('results/refs/dice_colors/colors.revised.t1d-project.V2.tsv')
cell_colors.set_index('color-name', inplace=True)

In [41]:
cell_colors

Unnamed: 0_level_0,dice-name,hex,rgb,full-name
color-name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
monocyte,CM,#B22222,1783434,Monocyte
b-cell,BN,#32CD32,5020550,B cell
cd4t-cell,CD4N,#00BFFF,191255,CD4 T-cell
cd8t-cell,CD8N,#EE82EE,238130238,CD8 T-cells
nk-cell,NK,#FFD700,2552150,NK cell


## Processing Supplementary table 3

In [12]:
# supp table 4
# table = pd.read_excel('results/main/chiou_2021/Supplemental4.celltype.cCREs.41586_2021_3552_MOESM7_ESM.xlsx',
#                      skiprows=2)

# supp table 3
table = pd.read_excel('results/main/chiou_2021/Supplemental3.cCREs.41586_2021_3552_MOESM6_ESM.xlsx', skiprows=2)

In [69]:
table['chr'] = 'chr' + table['chrom.'].astype(str)

#### Extract the cell peaks and save to bed

In [72]:
for project_name, sr in t1d_cells.iterrows():
    
    # extract the peaks
    source_name = sr['source-name']
    cell_peaks = table.loc[:, ['chr', 'start', 'end', source_name]]
    cell_peaks = cell_peaks.loc[cell_peaks[source_name] == 1]
    cell_peaks.drop(source_name, axis=1, inplace=True)
    
    # save the peaks to bed file
    bed_fn = os.path.join(outdir, '{}.bed'.format(project_name))
    
    cell_peaks.to_csv(bed_fn, sep='\t', index=False, header=False)

#### Make a bgzip version of files

In [73]:
tabix = '/mnt/BioApps/tabix/tabix-0.2.6/tabix'
bgzip = '/mnt/BioApps/tabix/tabix-0.2.6/bgzip'

In [74]:
for fn in glob.glob(os.path.join(outdir, '*.bed')):
    
    # make a gz file
    cmd = '{0} -c {1} > {1}.gz'.format(bgzip, fn)
    j = sp.check_output(cmd, shell=True)
    print(j)
    
    # make an index file
    cmd = '{0} -f {1}.gz'.format(tabix, fn)
    g = sp.check_output(cmd, shell=True)
    print(g)

b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''


#### Make a hub for longrange loops

In [79]:
# collect json formatted information
url_root = 'https://informaticsdata.liai.org/bioadhoc-temp/Groups/vd-ay/jreyna/projects/dchallenge/results/main/chiou_2021/processing'
hub = []
for fn in glob.glob(os.path.join(outdir, '*.bed.gz')):
    
    # get basename for url
    bn = os.path.basename(fn)
    
    # create the whole URL
    full_url = os.path.join(url_root, bn)                       
    print(full_url)
    
    # get cell name 
    cell_name = bn.split('.')[0]
    
    # use cell name to get cell metadata info
    cell_info = t1d_cells.loc[cell_name]
        
    # fill in the JSON details
    entry = {}
    entry['type'] = 'bed'
    entry['name'] = 'snATACseq peaks for {}'.format(cell_info['full-name'])
    entry['url'] = full_url
    
    print(cell_info)
    
    
    # add color name if cell has been pre-assigned a color
    if str(cell_info['color-name']) == 'nan': # starting with nan cases
        entry['showOnHubLoad'] = False
        entry['options'] = {'ensembleStyle': True, 'height': 200}
    else:
        entry['showOnHubLoad'] = True
        color = cell_colors.loc[cell_info['color-name'], 'hex']
        entry['options'] = {'ensembleStyle': True, 'height': 200, 'color': color}

    hub.append(entry)

https://informaticsdata.liai.org/bioadhoc-temp/Groups/vd-ay/jreyna/projects/dchallenge/results/main/chiou_2021/processing/activated-cd4t-cell.bed.gz
source         Chiou et al., 2021 - SuppData3
source-name                   activated CD4 T
full-name               Activated CD4+ T-cell
color-name                          cd4t-cell
Name: activated-cd4t-cell, dtype: object
https://informaticsdata.liai.org/bioadhoc-temp/Groups/vd-ay/jreyna/projects/dchallenge/results/main/chiou_2021/processing/mem-cd8t-cell.bed.gz
source         Chiou et al., 2021 - SuppData3
source-name                      memory CD8 T
full-name                  Memory CD8+ T-cell
color-name                          cd8t-cell
Name: mem-cd8t-cell, dtype: object
https://informaticsdata.liai.org/bioadhoc-temp/Groups/vd-ay/jreyna/projects/dchallenge/results/main/chiou_2021/processing/pancreatic-macrophage.bed.gz
source         Chiou et al., 2021 - SuppData3
source-name             pancreatic macrophage
full-name            

In [80]:
## save the hub
hub_fn = os.path.join(outdir, 'hub.config.txt')
with open(hub_fn, 'w') as fw:
    s = json.dumps(hub,  indent=True)
    fw.write(s) 

In [81]:
hub_link = os.path.join(url_root, os.path.basename(hub_fn))

In [82]:
hub_link

'https://informaticsdata.liai.org/bioadhoc-temp/Groups/vd-ay/jreyna/projects/dchallenge/results/main/chiou_2021/processing/hub.config.txt'