In [1]:
from ConfigDataset import ConfigDataset
from HiCDataset import HiCDataset
from HiCMap import HiCMap
from DataLoader_HiC import DataLoader
import matplotlib.pyplot as plt 
import torch
from Sample import Sample
from tqdm.auto import tqdm

In [2]:
# ---- Input files -------------------------------------------------------------
# Training data
config_fp = '../../data/processed_data.hdf5'
hic_fp = '../../data/outside/GM12878_hg19.mcool'

# Supporting data (precomputed statistics handed to ConfigDataset)
mean_dist_fp = '../../data/mean_dists.pt'
mean_sq_dist_fp = '../../data/squares.pt'

# ---- Chromosomes -------------------------------------------------------------
# Only chromosome 22 is used here; the full set would be
# [str(k) for k in range(1,23)].
training_chroms = ['22']

# ---- Iteration details -------------------------------------------------------
segment_length = 64
batch_size = 128  # values tried earlier: 16, 64
shuffle_data = True

In [3]:
# Build the configuration dataset used by the rest of this notebook.
# Keyword arguments are grouped for readability; the values are unchanged.
config_ds = ConfigDataset(
    config_fp,
    # Segment handling
    segment_length=segment_length,
    allow_overlap=False,  # True was tried previously
    # Map representation
    remove_diagonal=False,
    two_channels=False,
    normalize_distances=False,  # True apparently doesn't work
    # Selection: only the chromosome list is set, the rest are left at None
    chroms=training_chroms,
    geos=None,
    organisms=None,
    cell_types=None,
    cell_numbers=None,
    replicates=None,
    # Iteration settings of the dataset itself
    batch_size=0,
    shuffle=True,
    # Runtime and supporting files
    try_GPU=True,
    mean_dist_fp=mean_dist_fp,
    mean_sq_dist_fp=mean_sq_dist_fp,
)

In [4]:
# Experimental Hi-C dataset, constructed with its default arguments.
# NOTE(review): `hic_fp` is defined in the settings cell but is not passed here;
# confirm that HiCDataset's default source is the intended .mcool file.
exp_hic = HiCDataset() 

In [5]:
# Loader combining the configuration dataset with the experimental Hi-C data.
# (`drop_unmatched_pairs=True` was used at some point and is currently disabled.)
dl = DataLoader(
    config_ds,
    exp_hic,
    batch_size=batch_size,
    shuffle=shuffle_data,
    interp_hic_nans=True,
)

In [6]:
def remove_diagonal(mat):
    """Drop the main diagonal from the trailing two dimensions of ``mat``.

    For trailing size ``n`` the result has trailing shape ``(n-1, n-1)``:
    every strictly-upper entry ``(r, c)`` moves to ``(r, c-1)`` and every
    strictly-lower entry ``(c, r)`` moves to ``(c-1, r)``. Leading (batch)
    dimensions, dtype and device are carried over from ``mat``.
    """
    size = mat.shape[-1]
    rows, cols = torch.triu_indices(size, size, 1)

    # Every slot of the output is written below, so uninitialised storage is fine.
    reduced = mat.new_empty((*mat.shape[:-2], size - 1, size - 1))

    # Upper triangle: shift one column to the left.
    reduced[..., rows, cols - 1] = mat[..., rows, cols]
    # Lower triangle: shift one row up. This is written second, so the slots
    # shared by both assignments (the new diagonal) end up holding the
    # lower-triangle values.
    reduced[..., cols - 1, rows] = mat[..., cols, rows]
    return reduced

def a_in_b(dist_map,all_dist_maps):
    """Return True if ``dist_map`` is (numerically) one of ``all_dist_maps``.

    ``all_dist_maps`` is walked along its first dimension and each entry is
    compared to ``dist_map`` with ``torch.allclose`` (default tolerances);
    the search stops at the first match.
    """
    # A vectorised alternative that was left disabled:
    #   bool(torch.isclose(dist_map,all_dist_maps).all(-1).all(-1).any())
    return any(torch.allclose(dist_map, candidate) for candidate in all_dist_maps)

def all_a_in_b():
    """Consistency check between a fresh DataLoader and its ConfigDataset.

    A new loader is built from the notebook-level ``config_ds`` / ``exp_hic``
    (it deliberately shadows the global ``dl``). For each ``i`` in
    ``range(len(dl))`` three things are counted:

    * ``n_errant_nexts``: ``next(dl)`` differs from ``dl.configs.fetch`` of
      the i-th entry of ``dl.coord_idx``;
    * ``n_errant_calls_next``: the map from ``next(dl)`` is not among the
      distance maps rebuilt from ``fetch_specific_coords`` for the i-th
      (chromosome, genomic index) row of ``dl.index``;
    * ``n_errant_calls_fetch``: same membership test for the fetched map.

    The running total is shown in the progress bar and the three counters are
    printed at the end. Nothing is returned.

    NOTE(review): the i-th ``next(dl)`` result is paired with the i-th entry of
    ``dl.index`` / ``dl.coord_idx``. Whether that pairing holds for
    ``batch_size > 1`` depends on DataLoader internals not visible here;
    please confirm.
    """
    dl = DataLoader(
        config_ds,
        exp_hic,
        shuffle = shuffle_data,
        batch_size=batch_size,
        interp_hic_nans = True
    )

    n_errant_nexts = 0
    n_errant_calls_next = 0
    n_errant_calls_fetch = 0

    progress = tqdm(range(len(dl)))
    for i in progress:
        # The description used to be an f-string passed once to tqdm(), so it
        # was frozen at "0 errors". Refresh it on every iteration instead.
        n_errors = n_errant_nexts + n_errant_calls_next + n_errant_calls_fetch
        progress.set_description(f"Current errors: {n_errors}. Progress: ", refresh=False)

        dist_map,_ = next(dl)

        chrom, start_idx = dl.index[['Chromosome','Genomic_Index']].iloc[i].values
        # Compare on whatever device the loader actually produced, rather than
        # guessing from torch.cuda.is_available().
        dist_map1 = dl.configs.fetch(dl.coord_idx[i:i+1].tolist()).to(dist_map.device)
        n_errant_nexts += int(not torch.allclose(dist_map,dist_map1))

        # Get all distance maps associated with this region from the ConfigDataset class
        all_configs = dl.configs.fetch_specific_coords(chrom,start_idx)[1].to(dist_map.device)
        all_configs = torch.cdist(all_configs,all_configs).to(dist_map.dtype)
        # Force exact zeros on the diagonal (cdist may leave tiny residuals).
        ii = torch.arange(all_configs.shape[-1])
        all_configs[...,ii,ii] = 0

        n_errant_calls_next += int(not a_in_b(dist_map,all_configs))
        n_errant_calls_fetch += int(not a_in_b(dist_map1,all_configs))

    print(f'n_errant_nexts: {n_errant_nexts}')
    print(f'n_errant_calls_next: {n_errant_calls_next}')
    print(f'n_errant_calls_fetch: {n_errant_calls_fetch}')
    print(f'Total size: {len(dl)}')

def plot_index(index,dl=dl):
    """Plot the experimental Hi-C map for one loader entry next to the contact
    probabilities derived from every configuration of the same region.

    Parameters
    ----------
    index : int
        Position into ``dl.index`` / ``dl.coord_idx``.
    dl : DataLoader
        Loader to inspect. The default is the notebook-level ``dl`` as it
        existed when this cell was executed (default arguments bind once).

    Raises
    ------
    AssertionError
        If the distance map fetched for ``index`` is not among the distance
        maps rebuilt from the coordinates of the stated region.

    NOTE(review): the recorded output of ``plot_index(0)`` in this notebook is
    an AssertionError about a probability map of shape [1, 1, 2080]. Which call
    (``HiCMap(...)`` or ``plot_with``) raised it is not visible from here.
    """
    # Get Hi-C data as performed in dl class
    chrom, start_idx = dl.index[['Chromosome','Genomic_Index']].iloc[index].values
    dist_map = dl.configs.fetch(dl.coord_idx[index:index+1].tolist())
    prob_map = HiCMap(dl.get_hic_map(chrom,start_idx))

    # Get all distance maps associated with this region from the ConfigDataset class
    coords = dl.configs.fetch_specific_coords(chrom,start_idx)[1].to(dist_map.device)
    all_configs = torch.cdist(coords,coords).to(dist_map.dtype)
    # Force exact zeros on the diagonal (cdist may leave tiny residuals).
    ii = torch.arange(all_configs.shape[-1])
    all_configs[...,ii,ii] = 0

    # Ensure the og config is in the called configs
    assert a_in_b(dist_map,all_configs), "Configuration doesn't match the stated region!"

    # Convert to probabilities & plot.
    # `all_configs` already holds pairwise distances, so it is passed on directly;
    # running cdist on it a second time (as was done before) would measure
    # distances between rows of the distance maps, not between monomers.
    probs2 = Sample(data=remove_diagonal(all_configs).unsqueeze(1)).contact_probabilities(r_c=1.6,sigma=6)
    _,ax,*_ = prob_map.plot_with(probs2)
    ax.set_title(f'NSamples: {all_configs.shape[0]}')


In [7]:
# Plot the first loader entry. The recorded run of this cell ended in the
# AssertionError shown below (probability map of shape [1, 1, 2080]).
plot_index(0)

AssertionError: The provided probability map should have exactly two nonsingleton dimensions, but has dimensions torch.Size([1, 1, 2080])

In [None]:
# Same plot for the second loader entry (no output recorded for this cell).
plot_index(1)

In [None]:
# Run the loader/dataset consistency check; prints the three error counters
# and the loader length when done.
all_a_in_b()

In [11]:
import time

# Time how long it takes for the loader's internal index to wrap back to 0.
# The first batch is drawn before the clock starts, so it is neither timed
# nor counted in `n`.
dl.internal_idx = 0
_ = next(dl)

n = 0
started = time.time()
while dl.internal_idx != 0:
    _ = next(dl)
    n += 1
    print(dl.internal_idx)
elapsed = time.time() - started

print(f'{elapsed} seconds')
n

256
384
512
640
768
896
1024
1152
1280
1408
1536
1664
1792
1920
2048
2176
2304
0
1.0611422061920166 seconds


18

In [9]:
# Draw one item from the loader. Elsewhere in this notebook the first element
# of next(dl) is used as the distance map(s); the second is presumably the
# matching Hi-C data (TODO confirm).
a,b = next(dl) 

In [None]:
# Shape of the first element returned by next(dl).
a.shape

In [None]:
# Shape of the second element returned by next(dl).
b.shape

In [None]:
# Fetch directly from the Hi-C dataset for chromosome '1'.
# NOTE(review): 0 and 20_000*64 look like start/stop positions spanning 64 bins
# of 20 kb (64 == segment_length); confirm against HiCDataset.fetch.
c = exp_hic.fetch('1',0,20_000*64)

In [None]:
# Shape of the object returned by exp_hic.fetch above.
c.shape

In [None]:
# torch.prod only accepts a Tensor; a bare tuple raises TypeError, so wrap it first.
torch.prod(torch.tensor((1,2,3)))

In [None]:
# Product of the tuple's entries as a plain Python int (via a 0-d tensor).
int(torch.prod(torch.tensor((1,2,3))))

In [None]:
import os
# Number of CPUs SLURM allotted to this task, as a string.
# .get() yields None instead of raising KeyError when the notebook is run
# outside a SLURM job (or the job did not set --cpus-per-task).
os.environ.get('SLURM_CPUS_PER_TASK')

In [None]:
# Current position of the loader's internal index.
dl.internal_idx

In [None]:
# Call the loader's reset_index() and display internal_idx afterwards.
# Presumably this rewinds it to 0 (TODO confirm).
dl.reset_index()
dl.internal_idx