In [1]:
import cooler
import cooltools
import bioframe
import pandas as pd
import numpy as np

### Calculate P(s) and derivative for RCMC, Micro-C and Hi-C data
The analyses in this section closely follow the contacts-vs-distance tutorial in the [cooltools](https://cooltools.readthedocs.io/en/latest/notebooks/contacts_vs_distance.html) documentation.  
Plotting was performed in the `contact_map_comparisons_plotting.Rmd` file.

In [2]:
# Cell types analysed in this notebook. Order matters: the Micro-C loading
# code slices the first three entries with celltypes[:3].
celltypes = [
    'GM12878',
    'HCT116',
    'K562',
    'H1',
]

In [3]:
# Resolution group (as named inside the .mcool files) opened for every map below.
res = 1000

# RCMC maps: one cooler per entry of `celltypes`.
clrs = {
    celltype: cooler.Cooler(f'/mnt/md0/clarice/realigned_rcmc_merged/{celltype}_merged_realigned.50.mcool::resolutions/{res}')
    for celltype in celltypes
}

# Micro-C maps: the first three cell types share one directory layout ...
microc_clrs = {
    celltype: cooler.Cooler(f'/mnt/md0/clarice/realigned_microc/{celltype}_microc.50.mcool::resolutions/{res}')
    for celltype in celltypes[:3]
}

# ... while the H1 Micro-C map is read from the comparison_datasets directory.
microc_clrs['H1'] = cooler.Cooler(f'/mnt/md0/clarice/comparison_datasets/krietenstein_H1_microc.50.mcool::resolutions/{res}')

# Single GM12878 Hi-C comparison map.
harris_hic = cooler.Cooler(f'/mnt/md0/clarice/comparison_datasets/GM12878_Harris_hg38_hic_200bp_matrix.mcool::resolutions/{res}')

In [4]:
# hg38 chromosome sizes, passed below as the view used for sorting.
hg38_chromsizes = bioframe.fetch_chromsizes('hg38')

# Region table: the 'region_id' column is renamed to 'name', then the rows are
# sorted with bioframe against the chromosome-size view.
region_df = bioframe.sort_bedframe(
    pd.read_csv('/mnt/md0/clarice/src/region_idx.txt', sep='\t').rename(
        columns={'region_id': 'name'}
    ),
    view_df=hg38_chromsizes,
)

In [5]:
def calculate_cvd(clr):
    """
    Run ``cooltools.expected_cis`` on one cooler with this notebook's settings.

    The call always uses the module-level ``region_df`` as ``view_df`` and
    passes ``smooth=True``, ``aggregate_smoothed=True``, ``smooth_sigma=0.1``
    and ``nproc=8``.

    Parameters
    ----------
    clr
        Object forwarded unchanged as the ``clr`` argument.

    Returns
    -------
    The value returned by ``cooltools.expected_cis`` (the callers in this
    notebook write it out with ``.to_csv``).
    """
    settings = dict(
        view_df=region_df,
        smooth=True,
        aggregate_smoothed=True,
        smooth_sigma=0.1,
        nproc=8,
    )
    return cooltools.expected_cis(clr=clr, **settings)

In [None]:
# Compute the P(s) table of every RCMC map and save it as a TSV under region_ps/.
for celltype in clrs:
    clr = clrs[celltype]
    cvd_path = f'region_ps/{celltype}_rcmc_cvd_table.txt'
    cvd = calculate_cvd(clr)
    cvd.to_csv(cvd_path, sep='\t', index=False)

In [None]:
# Same P(s) computation for every Micro-C map, saved as a TSV under region_ps/.
for celltype in microc_clrs:
    clr = microc_clrs[celltype]
    cvd_path = f'region_ps/{celltype}_microc_regions_cvd_table.txt'
    cvd = calculate_cvd(clr)
    cvd.to_csv(cvd_path, sep='\t', index=False)

In [None]:
# P(s) table for the single Hi-C comparison map, saved as a TSV under region_ps/.
harris_cvd_path = f'region_ps/harris_hic_regions_cvd_table.txt'
cvd = calculate_cvd(harris_hic)
cvd.to_csv(harris_cvd_path, sep='\t', index=False)

In [None]:
# Reload the saved P(s) tables from region_ps/, keyed by cell type.
rcmc_cvds = {
    celltype: pd.read_csv(f'region_ps/{celltype}_rcmc_cvd_table.txt', sep='\t')
    for celltype in celltypes
}

microc_cvds = {
    celltype: pd.read_csv(f'region_ps/{celltype}_microc_regions_cvd_table.txt', sep='\t')
    for celltype in celltypes
}

# The Hi-C comparison dataset has a single table.
harris_cvd = pd.read_csv(f'region_ps/harris_hic_regions_cvd_table.txt', sep='\t')

In [6]:
def calculate_der(cvd):
    """
    Compute the slope of the smoothed P(s) curve in log-log space.

    Parameters
    ----------
    cvd : pandas.DataFrame
        Table with at least the columns ``dist``, ``dist_bp`` and
        ``balanced.avg.smoothed.agg``. The frame is NOT modified.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``dist`` (first occurrence kept) with columns
        ``dist_bp``, ``balanced.avg.smoothed.agg`` (NaN where ``dist < 2``)
        and ``der``, the gradient of log(value) with respect to log(dist_bp).

    Raises
    ------
    ValueError
        Raised by ``np.gradient`` when fewer than two distinct ``dist`` rows
        are available.
    """
    value_col = 'balanced.avg.smoothed.agg'

    # Keep the first row seen for each diagonal.
    # NOTE(review): this presumes the aggregated value is identical across
    # regions for a given `dist` - confirm against how the table was produced.
    per_dist = cvd.drop_duplicates(subset=['dist'])

    # Blank out the first two diagonals. Series.mask returns a new Series, so
    # this neither relies on chained assignment (a silent no-op under pandas
    # copy-on-write) nor writes NaNs into the caller's table.
    masked_values = per_dist[value_col].mask(per_dist['dist'] < 2)

    # Assemble a fresh frame so adding 'der' below never touches a slice of `cvd`.
    cvd_merged = pd.DataFrame({
        'dist_bp': per_dist['dist_bp'],
        value_col: masked_values,
    })

    # Calculate derivative in log-log space. dist_bp == 0 gives log(0) = -inf;
    # those rows are already masked to NaN, so only the warnings are silenced.
    with np.errstate(divide='ignore', invalid='ignore'):
        der = np.gradient(
            np.log(cvd_merged[value_col].to_numpy(dtype=float)),
            np.log(cvd_merged['dist_bp'].to_numpy(dtype=float)),
        )

    cvd_merged['der'] = der

    return cvd_merged

In [None]:
# Derive the log-log slope table for every RCMC P(s) table and save it as TSV.
for celltype in rcmc_cvds:
    cvd = rcmc_cvds[celltype]
    der_path = f'region_ps/{celltype}_rcmc_der_table.txt'
    current_der = calculate_der(cvd)
    current_der.to_csv(der_path, sep='\t', index=False)

In [None]:
# Derive the log-log slope table for every Micro-C P(s) table and save it as TSV.
for celltype in microc_cvds:
    cvd = microc_cvds[celltype]
    der_path = f'region_ps/{celltype}_microc_der_table.txt'
    current_der = calculate_der(cvd)
    current_der.to_csv(der_path, sep='\t', index=False)

In [None]:
# Log-log slope table for the single Hi-C comparison P(s) table, saved as TSV.
harris_der_path = f'region_ps/harris_hic_der_table.txt'
harris_der = calculate_der(harris_cvd)
harris_der.to_csv(harris_der_path, sep='\t', index=False)

### Calculate fraction of bins with at least one read (count > 0) for each distance from the diagonal in RCMC, Micro-C and Hi-C data
Plotting was performed in the contact_map_comparisons_plotting.Rmd file

In [None]:
# Paths of the multi-resolution cooler files, keyed by cell type. Only the
# paths are stored here; nothing is opened in this cell.
rcmc_coolers = {
    celltype: f'/mnt/md0/clarice/realigned_rcmc_merged/{celltype}_merged_realigned.50.mcool'
    for celltype in ['GM12878', 'HCT116', 'K562', 'H1']
}

microc_coolers = {
    celltype: f'/mnt/md0/clarice/realigned_microc/{celltype}_microc.50.mcool'
    for celltype in ['GM12878', 'HCT116', 'K562']
}

# The H1 Micro-C file lives in the comparison_datasets directory instead.
microc_coolers['H1'] = '/mnt/md0/clarice/comparison_datasets/krietenstein_H1_microc.50.mcool'

hic_cooler = {'GM12878': '/mnt/md0/clarice/comparison_datasets/GM12878_Harris_hg38_hic_200bp_matrix.mcool'}

In [None]:
def calculate_distance_frac(clrs, regions, distances, res, save=False):
    """
    Calculate fraction of bins filled for each distance. Code modified from Goel et al., 2013

    For every cooler in ``clrs`` and every entry of ``distances``, one
    off-diagonal of the raw (``balance=False``) matrix is read in each region.
    The fraction of its pixels with a value > 0 is pooled over all regions.

    Parameters
    ----------
    clrs : dict
        Maps a label to the path of a cooler file; each one is opened as
        ``<path>::resolutions/<res>``.
    regions : iterable of str
        Region strings passed to ``clr.matrix(balance=False).fetch``.
    distances : iterable of int
        Contact distances in bp. Repeated values are evaluated once.
    res : int
        Resolution in bp: selects the resolution group and converts a
        distance into a diagonal offset.
    save : str or False, optional
        If falsy (default), the table is returned. Otherwise it is written to
        this path as a tab-separated file (index included) and nothing is
        returned.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``clr``, ``distance``, ``frac_filled``; one row per
        (cooler, distance). ``frac_filled`` is NaN when no region is long
        enough to contain the requested diagonal.

    Raises
    ------
    ValueError
        If a distance is smaller than ``res`` (negative diagonal offset).
    """
    regions = list(regions)
    # De-duplicate while keeping order, so a repeated distance is counted once.
    distances = list(dict.fromkeys(distances))

    # NOTE(review): with the "- 1", a distance of exactly k * res maps to the
    # (k - 1)-th off-diagonal. Kept exactly as in the original code - confirm
    # this is the intended convention.
    offsets = {}
    for distance in distances:
        offset = distance // res - 1
        if offset < 0:
            raise ValueError(
                f'distance {distance} is smaller than the resolution {res}'
            )
        offsets[distance] = offset

    records = []
    for clr_name, clr_filename in clrs.items():

        clr = cooler.Cooler(clr_filename + '::resolutions/' + str(res))
        total_bins = dict.fromkeys(distances, 0)
        nonzero_bins = dict.fromkeys(distances, 0)

        for reg in regions:
            # The dense region matrix does not depend on the distance, so it
            # is fetched once and reused for every requested diagonal.
            reg_mat = np.asarray(clr.matrix(balance=False).fetch(reg))

            for distance in distances:
                # np.diagonal is empty when the offset exceeds the region
                # size, so short regions contribute 0 pixels rather than a
                # negative count.
                diag = np.diagonal(reg_mat, offsets[distance])
                total_bins[distance] += int(diag.size)
                nonzero_bins[distance] += int(np.count_nonzero(diag > 0))

        for distance in distances:
            total = total_bins[distance]
            frac = nonzero_bins[distance] / total if total else np.nan
            records.append(
                {'clr': clr_name, 'distance': distance, 'frac_filled': frac}
            )

    # Build the table in one go instead of growing it with pd.concat.
    full_df = pd.DataFrame(records, columns=['clr', 'distance', 'frac_filled'])

    if not save:
        return full_df
    full_df.to_csv(save, sep='\t')

In [None]:
# input the desired contact distance diagonal in bp
distances = [
    1000, 2500, 5000, 10000, 25000,
    50000, 100000, 250000, 500000, 1000000,
]

# Regions over which the filled-pixel fraction is pooled.
regions = [
    'chr6:25111952-28621868',
    'chr5:157036231-160147469',
    'chr1:207626481-210338298',
    'chr4:61369236-64434279',
    'chr19:36027639-39756265',
    'chr8:124988367-129795630',
    'chr6:29678698-32256197',
    'chrX:47081617-49442070',
    'chr1:237286814-240531042',
    'chr7:10480243-13506262',
    'chr8:62918949-66566105',
    'chr4:181289951-184014354',
    'chr3:118624552-121533938',
    'chr9:106603383-109909256',
]

# One output table per dataset family, all evaluated at the 200 resolution group.
for cooler_paths, out_tsv in (
    (rcmc_coolers, 'RCMC_regions_all_fraction_filled_by_distance.tsv'),
    (microc_coolers, 'microc_regions_all_fraction_filled_by_distance.tsv'),
    (hic_cooler, 'harris_hic_fraction_filled_by_distance.tsv'),
):
    calculate_distance_frac(cooler_paths, regions, distances, 200, save = out_tsv)