In [1]:
import os 
import pandas as pd
import numpy as np
import glob

## Loading the samplesheet data

In [2]:
# read the tab-separated, header-less sample sheet; its two columns are
# named 'sample' (the file path) and 'num_loops' (the loop count)
wc = pd.read_csv(
    '../samplesheets/samplesheet.fithichip_cp.wc.txt',
    sep='\t',
    header=None,
    names=['sample', 'num_loops'],
)

# extract metadata from the file names
def get_meta(x):
    """Parse the metadata encoded in a sample path.

    The '/'-separated components at positions 2, 4, 5, 6 and 7 supply the
    reference, loop source, peak source, peak caller and stringency. The
    sixth '.'-separated field of the file name supplies the resolution.

    Parameters
    ----------
    x : str
        Sample path with at least nine '/'-separated components and a file
        name with at least six '.'-separated fields.

    Returns
    -------
    tuple of str
        (ref, res, loop_source, peak_source, peak_caller, stringency,
        loop_caller); loop_caller is 'fithichip' when component 8 contains
        'FitHiC' and 'other' otherwise. `res` is returned as a string.

    Raises
    ------
    IndexError
        If the path or the file name has fewer fields than expected.
    """
    parts = x.split('/')
    name_fields = parts[-1].split('.')

    # resolution is read from the file name, everything else from the directories
    res = name_fields[5]
    ref = parts[2]
    loop_source, peak_source, peak_caller, stringency = parts[4], parts[5], parts[6], parts[7]

    if 'FitHiC' in parts[8]:
        loop_caller = 'fithichip'
    else:
        loop_caller = 'other'

    return ref, res, loop_source, peak_source, peak_caller, stringency, loop_caller
# parse every sample path, then transpose the per-sample tuples so that each
# entry of `metadata` holds one field's values across all samples
metadata = list(zip(*wc['sample'].apply(get_meta).values.tolist()))
(refs, res, loop_sources, peak_sources,
 peak_callers, stringencies, loop_callers) = metadata

# store each parsed field as its own column of the sample sheet
for column, values in (('ref', refs),
                       ('res', res),
                       ('loop_source', loop_sources),
                       ('peak_source', peak_sources),
                       ('peak_caller', peak_callers),
                       ('stringency', stringencies),
                       ('loop_caller', loop_callers)):
    wc[column] = values

## Sample-level Summary For Loop Counts


In [3]:
# count the number of samples with zero and non-zero number of loops
# (one group per ref / resolution / stringency / peak caller / loop caller)
grps = wc.groupby(['ref', 'res', 'stringency', 'peak_caller', 'loop_caller'])

# perform the value count across groups
# the three bin edges split num_loops into two intervals: one ending at 0
# and one starting above 0
value_counts = grps.num_loops.value_counts(bins=[-np.inf, 0, np.inf]).to_frame()
value_counts.columns = ['count']
# move the group keys and the interval out of the index so they can be pivoted
value_counts.reset_index(inplace=True)
# reshape to one row per group with one 'count' column per interval
value_counts = value_counts.pivot(index=['ref', 'res', 'stringency', 'peak_caller', 'loop_caller'],
                   columns=['num_loops'],
                   values=['count'])
# relabel the two interval levels with readable names
# NOTE(review): assumes the interval ending at 0 is the first level -- confirm
value_counts.columns = value_counts.columns.set_levels(['zero', 'non-zero'], level=1)

# add a total column
# row-wise sum over the 'zero' and 'non-zero' counts
value_counts[('count', 'total')] = value_counts.sum(axis=1)

In [4]:
# display the per-group zero / non-zero / total sample counts
value_counts

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,count,count,count
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,num_loops,zero,non-zero,total
ref,res,stringency,peak_caller,loop_caller,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
hg38,10000,loose,macs2,fithichip,14,229,243
hg38,10000,stringent,macs2,fithichip,13,230,243
hg38,25000,loose,macs2,fithichip,4,239,243
hg38,25000,stringent,macs2,fithichip,6,237,243
hg38,5000,loose,macs2,fithichip,20,223,243
hg38,5000,stringent,macs2,fithichip,20,223,243


The most important number is the `non-zero` count; we will use it to check that we are
producing the correct number of visualization files.

## Check visualization files

In [10]:
# count the files under the shortcuts directory that match the FitHiChIP
# loop-file pattern; the directory-level `*` (presumably the stringency
# folder) is matched by glob itself, so no string formatting is needed
# NOTE(review): this pattern matches the `.bed` loop files, whereas the
# bigInteract files checked later in this notebook end in `interaction.bb`
# -- confirm that counting the `.bed` files is what is intended here
glob_str = '../../../../'
glob_str += 'results/shortcuts/hg38/loops/hichip/chip-seq/macs2/*/*.*.interactions_FitHiC_Q0.01.bed'
num_biginter = len(glob.glob(glob_str))

In [11]:
# count the files under the shortcuts directory that match the longrange
# pattern; the directory-level `*` (presumably the stringency folder) is
# matched by glob itself, so no string formatting is needed
glob_str = '../../../../'
glob_str += 'results/shortcuts/hg38/loops/hichip/chip-seq/macs2/*/*.*.longrange.bed.gz'
num_longrange = len(glob.glob(glob_str))

In [12]:
# show both file counts side by side for comparison with the table above
print(num_biginter, num_longrange)

1458 0


## Further investigate problematic samples

In [9]:
# derive the expected bigInteract path by swapping the trailing 'bed' of each
# sample path for 'interaction.bb'
wc['bb_file'] = wc['sample'].replace(to_replace='bed$',
                                     value='interaction.bb',
                                     regex=True)

# check if the file exists
def check_file(x, root_dir='../../../../'):
    """Report whether `x`, resolved against `root_dir`, exists on disk.

    Parameters
    ----------
    x : str
        Path joined onto `root_dir`.
    root_dir : str, optional
        Directory that `x` is joined onto (default '../../../../').

    Returns
    -------
    bool
        True if the joined path exists, False otherwise.
    """
    return os.path.exists(os.path.join(root_dir, x))
# flag, per sample, whether its expected bigInteract file is present
# (paths are resolved against check_file's default root_dir)
wc['bb_file_exists'] = wc['bb_file'].apply(check_file)

In [10]:
# keep only the samples that have at least one loop
nonzero_data = wc.loc[wc['num_loops'].gt(0)]

In [11]:
# samples that have loops but whose bigInteract file was not found on disk;
# the boolean column is negated with `~` rather than compared to False
has_loops = wc['num_loops'] > 0
missing_data = wc.loc[has_loops & ~wc['bb_file_exists']]

In [12]:
# shift the 0-based row labels of the missing samples to 1-based ids
# (presumably the job-array task ids used to re-run them -- confirm)
missing_arrayids = missing_data.index.values + 1

In [13]:
# display the 1-based ids of the samples whose file is missing
missing_arrayids

array([], dtype=int64)

In [14]:
# convert the ids to strings so they can be joined into a single list below
missing_arrayids = missing_arrayids.astype(str)

In [15]:
# comma-separated list of the ids to re-run; prints an empty line when no
# file is missing
rerun_ids = ','.join(missing_arrayids)
print(rerun_ids)




In [16]:
# number of samples that still need to be re-run
len(missing_arrayids)

0

In [17]:
# (rows, columns) of the samples that have at least one loop
nonzero_data.shape

(1381, 11)

In [18]:
# (rows, columns) of the samples with loops but no bigInteract file
missing_data.shape

(0, 11)

In [19]:
# display the annotated sample sheet (pandas truncates the middle rows)
wc

Unnamed: 0,sample,num_loops,ref,res,loop_source,peak_source,peak_caller,stringency,loop_caller,bb_file,bb_file_exists
0,results/shortcuts/hg38/loops/hichip/chip-seq/m...,0,hg38,10000,hichip,chip-seq,macs2,loose,fithichip,results/shortcuts/hg38/loops/hichip/chip-seq/m...,False
1,results/shortcuts/hg38/loops/hichip/chip-seq/m...,0,hg38,25000,hichip,chip-seq,macs2,loose,fithichip,results/shortcuts/hg38/loops/hichip/chip-seq/m...,False
2,results/shortcuts/hg38/loops/hichip/chip-seq/m...,0,hg38,5000,hichip,chip-seq,macs2,loose,fithichip,results/shortcuts/hg38/loops/hichip/chip-seq/m...,False
3,results/shortcuts/hg38/loops/hichip/chip-seq/m...,0,hg38,10000,hichip,chip-seq,macs2,loose,fithichip,results/shortcuts/hg38/loops/hichip/chip-seq/m...,False
4,results/shortcuts/hg38/loops/hichip/chip-seq/m...,0,hg38,25000,hichip,chip-seq,macs2,loose,fithichip,results/shortcuts/hg38/loops/hichip/chip-seq/m...,False
...,...,...,...,...,...,...,...,...,...,...,...
1453,results/shortcuts/hg38/loops/hichip/chip-seq/m...,14155,hg38,25000,hichip,chip-seq,macs2,stringent,fithichip,results/shortcuts/hg38/loops/hichip/chip-seq/m...,True
1454,results/shortcuts/hg38/loops/hichip/chip-seq/m...,4238,hg38,5000,hichip,chip-seq,macs2,stringent,fithichip,results/shortcuts/hg38/loops/hichip/chip-seq/m...,True
1455,results/shortcuts/hg38/loops/hichip/chip-seq/m...,8768,hg38,10000,hichip,chip-seq,macs2,stringent,fithichip,results/shortcuts/hg38/loops/hichip/chip-seq/m...,True
1456,results/shortcuts/hg38/loops/hichip/chip-seq/m...,8879,hg38,25000,hichip,chip-seq,macs2,stringent,fithichip,results/shortcuts/hg38/loops/hichip/chip-seq/m...,True
