In [1]:
import os 
import pandas as pd
import numpy as np
import glob

In [2]:
wc = pd.read_csv('samplesheet.hiccup.wc.txt',
                 sep='\t',
                 names=['sample', 'num_loops'],
                 header=None)
wc

Unnamed: 0,sample,num_loops
0,results/shortcuts/hg38/loops/hichip/hiccups/A6...,2116
1,results/shortcuts/hg38/loops/hichip/hiccups/A6...,3880
2,results/shortcuts/hg38/loops/hichip/hiccups/A6...,1056
3,results/shortcuts/hg38/loops/hichip/hiccups/Ao...,6026
4,results/shortcuts/hg38/loops/hichip/hiccups/Ao...,5584
...,...,...
433,results/shortcuts/hg38/loops/hichip/hiccups/Tr...,2551
434,results/shortcuts/hg38/loops/hichip/hiccups/Tr...,1339
435,results/shortcuts/hg38/loops/hichip/hiccups/Tr...,3050
436,results/shortcuts/hg38/loops/hichip/hiccups/Tr...,4330


## Notes on Files used to create Inventory for hiccup

## Loading the samplesheet data

In [4]:
# load the sample sheet with the number of loops
wc = pd.read_csv('samplesheet.hiccups.wc.txt',
                 sep='\t',
                 names=['sample', 'num_loops'],
                 header=None)

# extract metadata from the file names
def get_meta(x):
    meta = x.split('/')
    ref = meta[2]
    loop_source = meta[4]
    peak_source = meta[5]
    #peak_caller = meta[6]
    #stringency = meta[7]
    loop_caller = 'hiccups' if 'hiccups' in meta[5] else 'other'
    return((ref, loop_source, peak_source, loop_caller))
metadata = wc['sample'].apply(get_meta).values.tolist()
metadata = list(zip(*metadata))
refs, loop_sources, peak_sources, loop_callers = metadata

wc['ref'] = refs
wc['loop_source'] = loop_sources
wc['peak_source'] = peak_sources
#wc['peak_caller'] = peak_callers
#wc['stringency'] = stringencies
wc['loop_caller'] = loop_callers

In [5]:
wc.iloc[0,0]

'results/shortcuts/hg38/loops/hichip/hiccups/A673_SA1m1.GSE133227.Homo_Sapiens.CTCF.b2.10000.post_processed.hiccups.bed'

In [6]:
grps = wc.groupby(['ref'])#, 'loop_caller'])

# perform the value count across groups
#value_counts = grps.num_loops.value_counts(bins=[-np.inf, 0, np.inf])#.to_frame()
value_counts = wc.num_loops.value_counts(bins=[-np.inf, 0, np.inf]).to_frame()

In [7]:
value_counts

Unnamed: 0,num_loops
"(0.0, inf]",438
"(-inf, 0.0]",0


In [None]:
grps.num_loops.value_counts(bins=[-np.inf,0,np.inf])

ref   loop_caller  num_loops  
hg38  hiccups      (-inf, 0.0]    438
                   (0.0, inf]       0
Name: num_loops, dtype: int64

In [None]:
grps.num_loops.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
ref,loop_caller,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
hg38,hiccups,438.0,5007.840183,6309.19432,258.0,2218.0,3398.0,5756.25,78984.0


In [None]:
wc["num_loops"] = pd.to_numeric(wc["num_loops"])

## Sample-level Summary For Loop Counts


In [None]:
# count the number of samples with zero and non-zero number of loops
grps = wc.groupby(['ref', 'loop_caller'])

# perform the value count across groups
value_counts = grps.num_loops.value_counts(bins=[-np.inf, 0, np.inf]).to_frame()
value_counts.columns = ['count']
value_counts.reset_index(inplace=True)
value_counts = value_counts.pivot(index=['ref', 'loop_caller'],
                   columns=['num_loops'],
                   values=['count'])
value_counts.columns = value_counts.columns.set_levels(['zero', 'non-zero'], level=1)

# add a total column
value_counts[('count', 'total')] = value_counts.sum(axis=1)

In [None]:
value_counts

Unnamed: 0_level_0,count,count,count
num_loops,zero,non-zero,total
ref,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
hg38,438,0,438


The most important number will be non-zero, we will use this number to ensure that we correctly
producing the correct number of visualization files.

After this, the next most important number is the total. For:
- hg38 we should have 478 samples * 3 resolutions = 1434 files
- B
- C

Count the existence of loops in the sample sheet

In [None]:
glob_str = '../../../../'
glob_str += 'results/shortcuts/hg38/loops/hichip/hichip/fithichip-utility/{stringency}/*.interactions_FitHiC_Q0.01.interaction.bb'
num_stringent = len(glob.glob(glob_str.format(stringency='stringent')))
num_loose = len(glob.glob(glob_str.format(stringency='loose')))

In [None]:
num_stringent, num_loose

(1253, 1363)

In [None]:
1363 + 1253

2616

## Further investigate problematic samples

In [None]:
wc['bb_file'] = wc['sample'].replace('bed$', 'interaction.bb', regex=True)

# check if the file exists
def check_file(x, root_dir='../../../../'):
    fn = os.path.join(root_dir, x)
    return(os.path.exists(fn))
wc['bb_file_exists'] = wc['bb_file'].apply(check_file)

In [None]:
nonzero_data = wc.loc[(wc.num_loops > 0)]

In [None]:
missing_data = wc.loc[(wc.num_loops > 0) & (wc.bb_file_exists == False)]

In [None]:
missing_arrayids = missing_data.index.values + 1

In [None]:
missing_arrayids

array([], dtype=int64)

In [None]:
missing_arrayids = missing_arrayids.astype(str)

In [None]:
rerun_ids = ','.join(missing_arrayids)
print(rerun_ids)

TypeError: sequence item 0: expected str instance, numpy.int64 found

In [None]:
len(missing_arrayids)

8

In [None]:
nonzero_data.shape

(438, 4)

In [None]:
missing_data.shape

(8, 4)

In [None]:
wc

Unnamed: 0,sample,num_loops,bb_file,bb_file_exists
0,results/shortcuts/hg38/loops/hichip/hiccups/A6...,2116,results/shortcuts/hg38/loops/hichip/hiccups/A6...,True
1,results/shortcuts/hg38/loops/hichip/hiccups/A6...,3880,results/shortcuts/hg38/loops/hichip/hiccups/A6...,True
2,results/shortcuts/hg38/loops/hichip/hiccups/A6...,1056,results/shortcuts/hg38/loops/hichip/hiccups/A6...,True
3,results/shortcuts/hg38/loops/hichip/hiccups/Ao...,6026,results/shortcuts/hg38/loops/hichip/hiccups/Ao...,True
4,results/shortcuts/hg38/loops/hichip/hiccups/Ao...,5584,results/shortcuts/hg38/loops/hichip/hiccups/Ao...,True
...,...,...,...,...
433,results/shortcuts/hg38/loops/hichip/hiccups/Tr...,2551,results/shortcuts/hg38/loops/hichip/hiccups/Tr...,True
434,results/shortcuts/hg38/loops/hichip/hiccups/Tr...,1339,results/shortcuts/hg38/loops/hichip/hiccups/Tr...,True
435,results/shortcuts/hg38/loops/hichip/hiccups/Tr...,3050,results/shortcuts/hg38/loops/hichip/hiccups/Tr...,True
436,results/shortcuts/hg38/loops/hichip/hiccups/Tr...,4330,results/shortcuts/hg38/loops/hichip/hiccups/Tr...,True


In [None]:
for i in missing_arrayids:
    out = '../../../../results/shortcuts/logs/interactions_to_bigInteract.o6088215-{}'.format(i)
    err = '../../../../results/shortcuts/logs/interactions_to_bigInteract.e6088215-{}'.format(i)
    os.remove(out)
    os.remove(err)