In [1]:
import os 
import pandas as pd
import numpy as np
import glob

## Loading the samplesheet data

In [2]:
# read the tab-separated, header-less sample sheet: one row per loop file
# with the file path ('sample') and its loop count ('num_loops')
wc = pd.read_table(
    '../samplesheets/samplesheet.fithichip_hp.wc.txt',
    header=None,
    names=['sample', 'num_loops'],
)

# extract metadata from the file names
def get_meta(x):
    """Parse run metadata out of a '/'-separated loop file path.

    The path is split on '/' and the file name (last component) on '.'.
    Fields are taken by fixed position, so the path must have at least nine
    components and the file name at least six dot-separated fields, e.g.
    results/shortcuts/<ref>/loops/<loop_source>/<peak_source>/<peak_caller>/<stringency>/<file>.

    Parameters
    ----------
    x : str
        Path of a loop file as listed in the sample sheet.

    Returns
    -------
    tuple of str
        (ref, res, loop_source, peak_source, peak_caller, stringency, loop_caller)

    Raises
    ------
    IndexError
        If the path or the file name has fewer fields than expected.
    """
    parts = x.split('/')
    name_fields = parts[-1].split('.')

    # the ninth path component names the loop caller when it mentions FitHiC
    if 'FitHiC' in parts[8]:
        loop_caller = 'fithichip'
    else:
        loop_caller = 'other'

    return (
        parts[2],        # ref
        name_fields[5],  # res (kept as a string)
        parts[4],        # loop_source
        parts[5],        # peak_source
        parts[6],        # peak_caller
        parts[7],        # stringency
        loop_caller,
    )
# parse every sample path, then transpose the per-row tuples into one tuple per field
metadata = list(zip(*wc['sample'].apply(get_meta).values.tolist()))
refs, res, loop_sources, peak_sources, peak_callers, stringencies, loop_callers = metadata

# attach each parsed field to the sample sheet as its own column
# (names follow the order of the tuple returned by get_meta)
meta_columns = ['ref', 'res', 'loop_source', 'peak_source',
                'peak_caller', 'stringency', 'loop_caller']
for column, values in zip(meta_columns, metadata):
    wc[column] = values

## Sample-level Summary For Loop Counts


In [3]:
# group the samples by run configuration
group_cols = ['ref', 'res', 'stringency', 'peak_caller', 'loop_caller']
grps = wc.groupby(group_cols)

# within each group, count samples whose loop count falls in (-inf, 0] vs (0, inf],
# then spread the two bins out into columns
value_counts = (
    grps['num_loops']
    .value_counts(bins=[-np.inf, 0, np.inf])
    .to_frame('count')
    .reset_index()
    .pivot(index=group_cols, columns=['num_loops'], values=['count'])
)

# relabel the two interval bins with readable names
value_counts.columns = value_counts.columns.set_levels(['zero', 'non-zero'], level=1)

# add a total column (zero + non-zero samples per group)
value_counts[('count', 'total')] = value_counts.sum(axis=1)

In [4]:
# display the zero / non-zero / total sample counts per group
value_counts

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,count,count,count
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,num_loops,zero,non-zero,total
ref,res,stringency,peak_caller,loop_caller,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
hg38,10000,loose,fithichip-utility,fithichip,23,455,478
hg38,10000,stringent,fithichip-utility,fithichip,56,422,478
hg38,25000,loose,fithichip-utility,fithichip,14,464,478
hg38,25000,stringent,fithichip-utility,fithichip,44,434,478
hg38,5000,loose,fithichip-utility,fithichip,34,444,478
hg38,5000,stringent,fithichip-utility,fithichip,81,397,478


The most important number is the non-zero count; we will use it to verify that we are
producing the correct number of visualization files.

After this, the next most important number is the total. For:
- hg38 we should have 478 samples * 3 resolutions = 1434 files
- B
- C

Count the loop visualization files (`.bb`) that exist on disk for each stringency

In [5]:
# glob template for the bigBed interaction files, relative to this notebook;
# {stringency} is filled in per stringency level below
glob_str = (
    '../../../../'
    'results/shortcuts/hg38/loops/hichip/hichip/fithichip-utility/{stringency}/*.interactions_FitHiC_Q0.01.interaction.bb'
)

# number of matching files on disk for each stringency
num_stringent, num_loose = (
    len(glob.glob(glob_str.format(stringency=level)))
    for level in ('stringent', 'loose')
)

In [6]:
# display the number of .bb files found for (stringent, loose)
num_stringent, num_loose

(1253, 1363)

## Further investigate problematic samples

In [7]:
# derive the expected bigBed path by swapping a trailing 'bed' for 'interaction.bb'
wc['bb_file'] = wc['sample'].str.replace('bed$', 'interaction.bb', regex=True)

# check if the file exists
def check_file(x, root_dir='../../../../'):
    """Return True if path ``x``, joined onto ``root_dir``, exists on disk.

    Parameters
    ----------
    x : str
        Path to test, joined onto ``root_dir`` with ``os.path.join``.
    root_dir : str, optional
        Directory that ``x`` is resolved against.

    Returns
    -------
    bool
        Result of ``os.path.exists`` on the joined path.
    """
    return os.path.exists(os.path.join(root_dir, x))
# flag, per sample, whether its expected bigBed file is present on disk
wc['bb_file_exists'] = wc['bb_file'].map(check_file)

In [8]:
# samples that have at least one loop
nonzero_data = wc[wc['num_loops'] > 0]

In [9]:
# samples that have loops but whose bigBed file was not found on disk
missing_data = wc[wc['num_loops'].gt(0) & ~wc['bb_file_exists'].astype(bool)]

In [10]:
# shift the 0-based row labels to 1-based ids
# NOTE(review): presumably these match the samplesheet line numbers used as job-array ids — confirm
missing_arrayids = missing_data.index.to_numpy() + 1

In [11]:
# display the 1-based ids of the samples with a missing bigBed file
missing_arrayids

array([], dtype=int64)

In [12]:
# cast the ids to strings so they can be joined into a single string
missing_arrayids = missing_arrayids.astype(str)

In [13]:
# join the ids into one comma-separated string and print it
# NOTE(review): presumably pasted into a job-array resubmission command — confirm
rerun_ids = ','.join(missing_arrayids)
print(rerun_ids)




In [14]:
# number of samples whose bigBed file is missing
len(missing_arrayids)

0

In [15]:
# (rows, columns) of the subset with at least one loop
nonzero_data.shape

(2616, 11)

In [16]:
# (rows, columns) of the subset with loops but no bigBed file
missing_data.shape

(0, 11)

In [17]:
# display the annotated sample sheet (pandas truncates the middle rows)
wc

Unnamed: 0,sample,num_loops,ref,res,loop_source,peak_source,peak_caller,stringency,loop_caller,bb_file,bb_file_exists
0,results/shortcuts/hg38/loops/hichip/hichip/fit...,0,hg38,10000,hichip,hichip,fithichip-utility,loose,fithichip,results/shortcuts/hg38/loops/hichip/hichip/fit...,False
1,results/shortcuts/hg38/loops/hichip/hichip/fit...,0,hg38,25000,hichip,hichip,fithichip-utility,loose,fithichip,results/shortcuts/hg38/loops/hichip/hichip/fit...,False
2,results/shortcuts/hg38/loops/hichip/hichip/fit...,0,hg38,5000,hichip,hichip,fithichip-utility,loose,fithichip,results/shortcuts/hg38/loops/hichip/hichip/fit...,False
3,results/shortcuts/hg38/loops/hichip/hichip/fit...,0,hg38,10000,hichip,hichip,fithichip-utility,loose,fithichip,results/shortcuts/hg38/loops/hichip/hichip/fit...,False
4,results/shortcuts/hg38/loops/hichip/hichip/fit...,0,hg38,25000,hichip,hichip,fithichip-utility,loose,fithichip,results/shortcuts/hg38/loops/hichip/hichip/fit...,False
...,...,...,...,...,...,...,...,...,...,...,...
2863,results/shortcuts/hg38/loops/hichip/hichip/fit...,150,hg38,25000,hichip,hichip,fithichip-utility,stringent,fithichip,results/shortcuts/hg38/loops/hichip/hichip/fit...,True
2864,results/shortcuts/hg38/loops/hichip/hichip/fit...,39,hg38,5000,hichip,hichip,fithichip-utility,stringent,fithichip,results/shortcuts/hg38/loops/hichip/hichip/fit...,True
2865,results/shortcuts/hg38/loops/hichip/hichip/fit...,216,hg38,10000,hichip,hichip,fithichip-utility,stringent,fithichip,results/shortcuts/hg38/loops/hichip/hichip/fit...,True
2866,results/shortcuts/hg38/loops/hichip/hichip/fit...,424,hg38,25000,hichip,hichip,fithichip-utility,stringent,fithichip,results/shortcuts/hg38/loops/hichip/hichip/fit...,True
