In [1]:
import os 
import numpy as np
import pandas as pd
import glob
import subprocess as sp
import json

# change the working directory
os.chdir('<project-dir>')

pd.set_option('display.max_columns', 1000)

In [2]:
outdir = 'results/main/loop_catalog/processing/'
os.makedirs(outdir, exist_ok=True)

In [3]:
resolutions = [5000, 10000, 25000]

stringency_no = [0, 1]
stringency_str = ['S5', 'S10', 'S25', 'L5', 'L10', 'L25']

# getting the template paths for different loops
paths = {}
paths['HiCCUPS'] = '<loops-dir>/loops/hiccups/{sample_name}/postprocessed_pixels_{res}.bedpe'
paths['FitHiChIP using ChIP-Seq peaks'] = '<loops-dir>/loops/fithichip/{sample_name}_chipseq.peaks/{stg_str}/FitHiChIP_Peak2ALL_b{res}_L20000_U2000000/P2PBckgr_{stg_no}/Coverage_Bias/FitHiC_BiasCorr/FitHiChIP-{stg_str}.interactions_FitHiC_Q0.01.bed'
paths['FitHiChIP using HiChIP-Peaks peaks'] = '<loops-dir>/loops/fithichip/{sample_name}_hichip-peaks.peaks/{stg_str}/FitHiChIP_Peak2ALL_b{res}_L20000_U2000000/P2PBckgr_{stg_no}/Coverage_Bias/FitHiC_BiasCorr/FitHiChIP-{stg_str}.interactions_FitHiC_Q0.01.bed'

In [4]:
# loading sample data
samples = pd.read_excel('results/main/loop_catalog/master-table.immune-samples-only.xlsx')
cline, gse, species, pulldown, biorep = zip(*samples['full.sample.name'].str.split('.'))

samples['cline'] = cline
samples['gse'] = gse
samples['pulldown'] = pulldown
samples['biorep'] = biorep

In [5]:
samples.pulldown.value_counts()

H3K27ac    22
CTCF       11
SMC1A       2
H3K4me1     1
KLF4        1
NANOG       1
OCT4        1
Rad21       1
Name: pulldown, dtype: int64

In [6]:
enhancer_samples = samples.loc[samples['pulldown'] == 'H3K27ac']

In [7]:
enhancer_samples.cline.unique().tolist()

['GM12878',
 'HARA',
 'MyLa',
 'Naive_Tcells',
 'Th17',
 'Treg',
 'lgs000379',
 'lgs101645',
 'lgs102580',
 'lgs102943',
 'lgs301283',
 'lgs301315']

## Looking at 5kb data

In [8]:
# extra FitHiChIP loops for best filtering parameters
res = 5000
stg_no = 1
stg_str = 'S5'
full_data = []
for sample_name in enhancer_samples['full.sample.name']:
    
    hiccups = paths['HiCCUPS'].format(sample_name=sample_name,
                                      stg_str=stg_str,
                                      stg_no=stg_no,
                                      res=res)
    
    if os.path.exists(hiccups):
        
        hiccups_df = pd.read_table(hiccups)
        hiccups_df = hiccups_df.loc[~hiccups_df['#chr1'].str.match('^#')]
        hiccups_df.loc[:, '#chr1'] = 'chr' + hiccups_df.loc[:, '#chr1']
        hiccups_df.loc[:, 'chr2'] = 'chr' + hiccups_df.loc[:, 'chr2']
        hiccups_df.loc[:, 'x1'] = hiccups_df['x1'].astype(int)
        hiccups_df.loc[:, 'x2'] = hiccups_df['x1'].astype(int)
        hiccups_df.loc[:, 'y1'] = hiccups_df['x1'].astype(int)
        hiccups_df.loc[:, 'y2'] = hiccups_df['x1'].astype(int)

        hiccups_lr = hiccups_df[['#chr1', 'x1', 'x2', 'chr2', 'y1', 'y2', 'fdrH']]
        #hiccups_lr.loc[hiccups_lr['fdrH'] == 0, 'fdrH'] = 1 * 10**(-8) # assign super sign to 0 p-value regions
        
        hiccups_lr['-log10(fdrH)'] =  -1 * np.log10(hiccups_lr['fdrH'])

        # assign max score to inf
        s = hiccups_lr['-log10(fdrH)']
        s = s[s != np.inf]     
        hiccups_lr.loc[hiccups_lr['-log10(fdrH)'] == np.inf, '-log10(fdrH)'] = max(s)
        
        
        full_data.append(hiccups_lr)
         


  result = getattr(ufunc, method)(*inputs, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hiccups_lr['-log10(fdrH)'] =  -1 * np.log10(hiccups_lr['fdrH'])
  result = getattr(ufunc, method)(*inputs, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hiccups_lr['-log10(fdrH)'] =  -1 * np.log10(hiccups_lr['fdrH'])
  result = getattr(ufunc, method)(*inputs, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata

In [9]:
full_df = pd.concat(full_data)

In [11]:
full_df['dist'] = full_df['y1'] - full_df['x1']

In [13]:
full_df.loc[full_df['dist'] > 0 ]

Unnamed: 0,#chr1,x1,x2,chr2,y1,y2,fdrH,-log10(fdrH),dist


## Looking at 10kb data

In [14]:
# extra FitHiChIP loops for best filtering parameters
res = 10000
stg_no = 1
stg_str = 'S5'
full_data = []
for sample_name in enhancer_samples['full.sample.name']:
    
    hiccups = paths['HiCCUPS'].format(sample_name=sample_name,
                                      stg_str=stg_str,
                                      stg_no=stg_no,
                                      res=res)
    
    if os.path.exists(hiccups):
        
        hiccups_df = pd.read_table(hiccups)
        hiccups_df = hiccups_df.loc[~hiccups_df['#chr1'].str.match('^#')]
        hiccups_df.loc[:, '#chr1'] = 'chr' + hiccups_df.loc[:, '#chr1']
        hiccups_df.loc[:, 'chr2'] = 'chr' + hiccups_df.loc[:, 'chr2']
        hiccups_df.loc[:, 'x1'] = hiccups_df['x1'].astype(int)
        hiccups_df.loc[:, 'x2'] = hiccups_df['x1'].astype(int)
        hiccups_df.loc[:, 'y1'] = hiccups_df['x1'].astype(int)
        hiccups_df.loc[:, 'y2'] = hiccups_df['x1'].astype(int)

        hiccups_lr = hiccups_df[['#chr1', 'x1', 'x2', 'chr2', 'y1', 'y2', 'fdrH']]
        #hiccups_lr.loc[hiccups_lr['fdrH'] == 0, 'fdrH'] = 1 * 10**(-8) # assign super sign to 0 p-value regions
        
        hiccups_lr['-log10(fdrH)'] =  -1 * np.log10(hiccups_lr['fdrH'])

        # assign max score to inf
        s = hiccups_lr['-log10(fdrH)']
        s = s[s != np.inf]     
        hiccups_lr.loc[hiccups_lr['-log10(fdrH)'] == np.inf, '-log10(fdrH)'] = max(s)
        
        
        full_data.append(hiccups_lr)
         


  result = getattr(ufunc, method)(*inputs, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hiccups_lr['-log10(fdrH)'] =  -1 * np.log10(hiccups_lr['fdrH'])
  result = getattr(ufunc, method)(*inputs, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hiccups_lr['-log10(fdrH)'] =  -1 * np.log10(hiccups_lr['fdrH'])
  result = getattr(ufunc, method)(*inputs, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata

In [15]:
full_df = pd.concat(full_data)

In [16]:
full_df['dist'] = full_df['y1'] - full_df['x1']

In [17]:
full_df.loc[full_df['dist'] > 0 ]

Unnamed: 0,#chr1,x1,x2,chr2,y1,y2,fdrH,-log10(fdrH),dist


## Looking at 25kb data

In [18]:
# extra FitHiChIP loops for best filtering parameters
res = 25000
stg_no = 1
stg_str = 'S5'
full_data = []
for sample_name in enhancer_samples['full.sample.name']:
    
    hiccups = paths['HiCCUPS'].format(sample_name=sample_name,
                                      stg_str=stg_str,
                                      stg_no=stg_no,
                                      res=res)
    
    if os.path.exists(hiccups):
        
        hiccups_df = pd.read_table(hiccups)
        hiccups_df = hiccups_df.loc[~hiccups_df['#chr1'].str.match('^#')]
        hiccups_df.loc[:, '#chr1'] = 'chr' + hiccups_df.loc[:, '#chr1']
        hiccups_df.loc[:, 'chr2'] = 'chr' + hiccups_df.loc[:, 'chr2']
        hiccups_df.loc[:, 'x1'] = hiccups_df['x1'].astype(int)
        hiccups_df.loc[:, 'x2'] = hiccups_df['x1'].astype(int)
        hiccups_df.loc[:, 'y1'] = hiccups_df['x1'].astype(int)
        hiccups_df.loc[:, 'y2'] = hiccups_df['x1'].astype(int)

        hiccups_lr = hiccups_df[['#chr1', 'x1', 'x2', 'chr2', 'y1', 'y2', 'fdrH']]
        #hiccups_lr.loc[hiccups_lr['fdrH'] == 0, 'fdrH'] = 1 * 10**(-8) # assign super sign to 0 p-value regions
        
        hiccups_lr['-log10(fdrH)'] =  -1 * np.log10(hiccups_lr['fdrH'])

        # assign max score to inf
        s = hiccups_lr['-log10(fdrH)']
        s = s[s != np.inf]     
        hiccups_lr.loc[hiccups_lr['-log10(fdrH)'] == np.inf, '-log10(fdrH)'] = max(s)
        
        
        full_data.append(hiccups_lr)
         


  result = getattr(ufunc, method)(*inputs, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hiccups_lr['-log10(fdrH)'] =  -1 * np.log10(hiccups_lr['fdrH'])
  result = getattr(ufunc, method)(*inputs, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hiccups_lr['-log10(fdrH)'] =  -1 * np.log10(hiccups_lr['fdrH'])
  result = getattr(ufunc, method)(*inputs, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata

In [19]:
full_df = pd.concat(full_data)

In [20]:
full_df['dist'] = full_df['y1'] - full_df['x1']

In [21]:
full_df.loc[full_df['dist'] > 0 ]

Unnamed: 0,#chr1,x1,x2,chr2,y1,y2,fdrH,-log10(fdrH),dist
