In [1]:
import os 
import pandas as pd
import numpy as np
import glob

## Loading the samplesheet data

In [2]:
# Read the tab-separated sample sheet (no header row): one row per loop file,
# holding the file path ('sample') and its number of loops ('num_loops').
wc = pd.read_csv(
    '../samplesheets/samplesheet.fithichip_cp.wc.txt',
    sep='\t',
    header=None,
    names=['sample', 'num_loops'],
)

def get_meta(x):
    """Extract run metadata from a '/'-separated loop-file path.

    The path components at positions 2, 4, 5, 6 and 7 are taken as the
    reference, loop source, peak source, peak caller and stringency. The
    resolution is the sixth '.'-separated token of the file name (the last
    path component). The loop caller is 'fithichip' when component 8
    contains 'FitHiC' and 'other' otherwise.

    Parameters
    ----------
    x : str
        Path of a loop file with at least nine '/'-separated components.

    Returns
    -------
    tuple
        (ref, res, loop_source, peak_source, peak_caller, stringency,
        loop_caller), all strings.

    Raises
    ------
    IndexError
        If the path or the file name has fewer components than expected.
    """
    parts = x.split('/')

    # resolution lives in the file name itself, not in the directory layout
    resolution = parts[-1].split('.')[5]

    if 'FitHiC' in parts[8]:
        caller = 'fithichip'
    else:
        caller = 'other'

    return (parts[2], resolution, parts[4], parts[5], parts[6], parts[7], caller)
# Parse every sample path, then transpose the per-sample tuples so that
# there is one tuple per metadata field.
per_sample = wc['sample'].apply(get_meta).values.tolist()
metadata = list(zip(*per_sample))
(refs, res, loop_sources, peak_sources,
 peak_callers, stringencies, loop_callers) = metadata

# attach each metadata field to the sample sheet as its own column
new_columns = {
    'ref': refs,
    'res': res,
    'loop_source': loop_sources,
    'peak_source': peak_sources,
    'peak_caller': peak_callers,
    'stringency': stringencies,
    'loop_caller': loop_callers,
}
for column_name, column_values in new_columns.items():
    wc[column_name] = column_values

## Sample-level Summary For Loop Counts


In [3]:
# Per configuration, tally how many samples have zero loops and how many
# have at least one.
group_keys = ['ref', 'res', 'stringency', 'peak_caller', 'loop_caller']
grps = wc.groupby(group_keys)

# bin num_loops at 0 within each group and count the samples in each bin
binned_counts = grps.num_loops.value_counts(bins=[-np.inf, 0, np.inf])
value_counts = binned_counts.to_frame()
value_counts.columns = ['count']
value_counts = value_counts.reset_index()

# reshape so that each bin becomes its own column under the 'count' header
value_counts = value_counts.pivot(index=group_keys,
                                  columns=['num_loops'],
                                  values=['count'])

# swap the interval labels for readable names
readable_columns = value_counts.columns.set_levels(['zero', 'non-zero'], level=1)
value_counts.columns = readable_columns

# add a total column
value_counts[('count', 'total')] = value_counts.sum(axis=1)

In [4]:
# display the zero / non-zero / total sample counts for each configuration
value_counts

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,count,count,count
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,num_loops,zero,non-zero,total
ref,res,stringency,peak_caller,loop_caller,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
hg38,10000,loose,macs2,fithichip,14,229,243
hg38,10000,stringent,macs2,fithichip,13,230,243
hg38,25000,loose,macs2,fithichip,4,239,243
hg38,25000,stringent,macs2,fithichip,6,237,243
hg38,5000,loose,macs2,fithichip,20,223,243
hg38,5000,stringent,macs2,fithichip,20,223,243


In [5]:
# column-wise totals over all configurations, shown as a single-row frame
value_counts.sum(axis=0).to_frame().T

Unnamed: 0_level_0,count,count,count
num_loops,zero,non-zero,total
0,77,1381,1458


The most important number is the non-zero count: we use it to verify that the correct
number of visualization files was produced.

## Check visualization files

In [6]:
def find_shortcut_files(suffix, stringency='*', root_dir='../../../../'):
    """Glob the hg38 HiChIP/macs2 shortcut directory for files ending in `suffix`.

    The three file types counted below share one directory layout and differ
    only in their suffix, so the pattern is built in one place. `stringency`
    is a real parameter here; the earlier code called
    `.format(stringency='*')` on a pattern with no `{stringency}` field,
    which did nothing.

    Parameters
    ----------
    suffix : str
        Part of the file name that follows the ``*.*.`` prefix,
        e.g. ``'interaction.bb'``.
    stringency : str
        Name (or glob pattern) of the directory level directly below
        ``macs2``. The default ``'*'`` matches every directory at that level.
    root_dir : str
        Directory that contains ``results/``.

    Returns
    -------
    list of str
        Matching paths, as returned by ``glob.glob``.
    """
    pattern = os.path.join(root_dir,
                           'results/shortcuts/hg38/loops/hichip/chip-seq/macs2',
                           stringency,
                           '*.*.' + suffix)
    return glob.glob(pattern)

# count the main interactions files
num_main = len(find_shortcut_files('interactions_FitHiC_Q0.01.bed'))

# count the biginteractions files
biginter_list = find_shortcut_files('interaction.bb')
num_biginter = len(biginter_list)

# count the longrange files
longrange_list = find_shortcut_files('longrange.bed.gz')
num_longrange = len(longrange_list)

# print the counts
print(num_main, num_biginter, num_longrange)

1458 1381 1381


## Further investigate problematic samples

In [7]:
# derive the expected bigInteract path by swapping the trailing 'bed' of each sample path
wc['biginter_file'] = wc['sample'].replace(to_replace='bed$',
                                           value='interaction.bb',
                                           regex=True)
def check_file(x, root_dir='../../../../'):
    """Report whether the path `x`, joined onto `root_dir`, exists.

    Parameters
    ----------
    x : str
        Path to look up; it is joined onto `root_dir` with `os.path.join`.
    root_dir : str
        Base directory against which `x` is resolved.

    Returns
    -------
    bool
        True when the joined path exists, False otherwise.
    """
    return os.path.exists(os.path.join(root_dir, x))
# flag, per sample, whether its expected bigInteract file exists
wc['biginter_exists'] = wc['biginter_file'].map(check_file)

In [8]:
# samples that produced at least one loop
has_loops = wc['num_loops'] > 0
nonzero_data = wc.loc[has_loops]

# of those, the samples whose bigInteract file was not found
missing_data = wc.loc[has_loops & (wc['biginter_exists'] == False)]

# 1-based ids built from the row index
# (presumably job-array ids for resubmission; confirm against the pipeline)
missing_arrayids = missing_data.index.values + 1
print(missing_arrayids)

# comma-separated id list, empty when nothing is missing
missing_arrayids = missing_arrayids.astype(str)
rerun_ids = ','.join(missing_arrayids)
print(rerun_ids)

[]



: 