In [63]:
import os 
import pandas as pd
import numpy as np
import glob

## Loading the samplesheet data

In [13]:
# read the tab-separated, header-less sample sheet: one loop file path and
# its loop count per row
wc = pd.read_csv(
    'samplesheet.fithichip.wc.txt',
    sep='\t',
    header=None,
    names=['sample', 'num_loops'],
)

# extract metadata from the file names
def get_meta(x):
    """
    Pull the metadata fields out of a '/'-separated loop file path.

    The path is split on '/' and the fields are read by position:
    index 2 -> ref, 4 -> loop_source, 5 -> peak_source, 6 -> peak_caller,
    7 -> stringency. Index 8 is only inspected for the substring 'FitHiC'
    to decide the loop caller.

    Parameters
    ----------
    x : str
        Path with at least nine '/'-separated components.

    Returns
    -------
    tuple
        (ref, loop_source, peak_source, peak_caller, stringency, loop_caller),
        where loop_caller is 'fithichip' or 'other'.

    Raises
    ------
    IndexError
        If the path has fewer than nine components.
    """
    parts = x.split('/')
    ref, loop_source, peak_source, peak_caller, stringency = (
        parts[2], parts[4], parts[5], parts[6], parts[7]
    )

    # the loop caller is recognised from the ninth path component
    if 'FitHiC' in parts[8]:
        loop_caller = 'fithichip'
    else:
        loop_caller = 'other'

    return (ref, loop_source, peak_source, peak_caller, stringency, loop_caller)
# run get_meta over every sample path, then transpose the per-row tuples
# into one tuple per metadata field
metadata = list(zip(*wc['sample'].apply(get_meta).values.tolist()))
refs, loop_sources, peak_sources, peak_callers, stringencies, loop_callers = metadata

# attach one column per field, in the order get_meta returns them
meta_columns = ['ref', 'loop_source', 'peak_source',
                'peak_caller', 'stringency', 'loop_caller']
for column_name, column_values in zip(meta_columns, metadata):
    wc[column_name] = column_values

In [14]:
# show the path of the first sample (row 0, column 0) to see the layout that get_meta splits
wc.iloc[0,0]

'results/shortcuts/hg38/loops/hichip/hichip/fithichip-utility/loose/293T.GSE128106.Homo_Sapiens.YY1.b1.10000.interactions_FitHiC_Q0.01.bed'

In [15]:
# preview the first rows of the sample sheet together with the parsed metadata columns
wc.head()

Unnamed: 0,sample,num_loops,ref,loop_source,peak_source,peak_caller,stringency,loop_caller
0,results/shortcuts/hg38/loops/hichip/hichip/fit...,0,hg38,hichip,hichip,fithichip-utility,loose,fithichip
1,results/shortcuts/hg38/loops/hichip/hichip/fit...,0,hg38,hichip,hichip,fithichip-utility,loose,fithichip
2,results/shortcuts/hg38/loops/hichip/hichip/fit...,0,hg38,hichip,hichip,fithichip-utility,loose,fithichip
3,results/shortcuts/hg38/loops/hichip/hichip/fit...,0,hg38,hichip,hichip,fithichip-utility,loose,fithichip
4,results/shortcuts/hg38/loops/hichip/hichip/fit...,0,hg38,hichip,hichip,fithichip-utility,loose,fithichip


## Sample-level Summary For Loop Counts


In [16]:
# group the samples by reference, stringency, peak caller and loop caller
group_keys = ['ref', 'stringency', 'peak_caller', 'loop_caller']
grps = wc.groupby(group_keys)

# within each group, bin num_loops into (-inf, 0] (no loops) and (0, inf] (has loops)
binned_counts = grps.num_loops.value_counts(bins=[-np.inf, 0, np.inf]).to_frame()
binned_counts.columns = ['count']

# one row per group, one column per bin
value_counts = binned_counts.reset_index().pivot(
    index=group_keys,
    columns=['num_loops'],
    values=['count'],
)

# relabel the two bins, in sorted order, with readable names
value_counts.columns = value_counts.columns.set_levels(['zero', 'non-zero'], level=1)

# total number of samples per group (zero + non-zero)
value_counts[('count', 'total')] = value_counts.sum(axis=1)

In [17]:
# display the zero / non-zero / total sample counts for each group
value_counts

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count,count,count
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,num_loops,zero,non-zero,total
ref,stringency,peak_caller,loop_caller,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
hg38,loose,fithichip-utility,fithichip,71,1363,1434
hg38,stringent,fithichip-utility,fithichip,181,1253,1434


The most important number is the non-zero count: we will use it to check that we are
producing the correct number of visualization files.

After this, the next most important number is the total. For:
- hg38 we should have 478 samples * 3 resolutions = 1434 files
- B
- C

Count the loop visualization files (`.interaction.bb`) that actually exist on disk for the loops listed in the sample sheet

In [45]:
# glob pattern for the .bb loop files, relative to the directory four levels up;
# {stringency} is filled in per stringency level below
glob_str = ''.join([
    '../../../../',
    'results/shortcuts/hg38/loops/hichip/hichip/fithichip-utility/{stringency}/*.interactions_FitHiC_Q0.01.interaction.bb',
])

# count the matching files for each stringency level
num_stringent, num_loose = (
    len(glob.glob(glob_str.format(stringency=level)))
    for level in ('stringent', 'loose')
)

In [46]:
# number of .bb files found on disk, shown as (stringent, loose)
num_stringent, num_loose

(452, 1320)

## Further investigate problematic samples

In [58]:
# derive the expected .bb path by swapping a trailing "bed" for "interaction.bb"
wc['bb_file'] = wc['sample'].replace(
    to_replace='bed$',
    value='interaction.bb',
    regex=True,
)

# check if the file exists
def check_file(x, root_dir='../../../../'):
    """
    Report whether a path exists on disk once joined onto a root directory.

    Parameters
    ----------
    x : str
        Path to look for, joined onto `root_dir` with os.path.join.
    root_dir : str
        Directory that `x` is resolved against.

    Returns
    -------
    bool
        Result of os.path.exists on the joined path.
    """
    return os.path.exists(os.path.join(root_dir, x))
# flag, per sample, whether its expected .bb file is present on disk
wc['bb_file_exists'] = wc['bb_file'].map(check_file)

In [68]:
# samples that have at least one loop but whose .bb file is not on disk
has_loops = wc['num_loops'] > 0
bb_missing = ~wc['bb_file_exists']
missing_data = wc.loc[has_loops & bb_missing]

In [70]:
# shift the row labels of the missing samples by one
# (presumably 0-based rows -> 1-based job-array ids; confirm against the job script)
missing_arrayids = missing_data.index.to_numpy() + 1

In [71]:
# display the ids (row label + 1) of the samples that have loops but no .bb file
missing_arrayids

array([ 112,  113,  114,  136,  328,  329,  341,  344,  361,  433,  434,
        435,  451,  452,  466,  467,  468,  472,  473,  474,  475,  476,
        481,  482,  483,  778,  779,  782,  886,  887,  888,  889,  890,
        891, 1183, 1184, 1185, 1186, 1187, 1188, 1189, 1190, 1298, 1763,
       1775, 1867, 1868, 2213, 2226, 2227, 2228, 2229, 2230, 2231, 2232,
       2233, 2234, 2235, 2236, 2237, 2238, 2239, 2240, 2241, 2242, 2243,
       2244, 2245, 2246, 2247, 2248, 2249, 2250, 2251, 2252, 2253, 2254,
       2255, 2256, 2257, 2258, 2259, 2260, 2261, 2262, 2263, 2264, 2265,
       2266, 2267, 2268, 2269, 2270, 2271, 2272, 2292, 2293, 2294, 2295,
       2296, 2297, 2298, 2299, 2300, 2301, 2305, 2306, 2307, 2308, 2309,
       2310, 2311, 2312, 2313, 2314, 2315, 2316, 2317, 2318, 2319, 2320,
       2321, 2322, 2323, 2324, 2325, 2326, 2327, 2328, 2329, 2330, 2331,
       2332, 2333, 2334, 2335, 2336, 2337, 2338, 2339, 2340, 2341, 2343,
       2344, 2345, 2346, 2347, 2348, 2349, 2350, 23