In [None]:
import cooltools
from collections import defaultdict
import cooler
import numpy as np
import pandas as pd

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Tab-separated table of the regions to compare. Later cells read its
# 'chrom', 'start', 'end' and 'region_id' columns.
REGION_INDEX_PATH = '/mnt/md0/clarice/src/region_idx.txt'
regions = pd.read_csv(REGION_INDEX_PATH, sep='\t')

Example: GM12878 RCMC versus holdout-model predictions. The same workflow can be applied to any other pair of coolers.

In [None]:
# Resolution group to open inside each .mcool file.
res = 500

# Cooler URI prefixes; the resolution is appended to select the matrix.
RCMC_URI_PREFIX = '/mnt/md0/clarice/fan_prediction_analyses/RCMC_matrices_to_clrs/GM12878/GM12878_500kb_RCMC_smoothed.mcool::resolutions/'
PRED_URI_PREFIX = '/mnt/md0/clarice/fan_prediction_analyses/holdout_model_coolers/25_GM_newTestSet_43/25_GM_newTestSet_43_merged.mcool::resolutions/'

gm12878_rcmc = cooler.Cooler(f'{RCMC_URI_PREFIX}{res}')
gm12878_pred = cooler.Cooler(f'{PRED_URI_PREFIX}{res}')

In [None]:
def calculate_corrs(rcmc_clr, pred_clr, region, res=500, max_range=1000, min_offset=20):
    """
    Calculate the Pearson correlation between two contact matrices for each
    distance from the diagonal.

    Parameters
    ----------
    rcmc_clr, pred_clr : objects exposing ``matrix(balance=False).fetch(region)``
        In this notebook these are ``cooler.Cooler`` instances. Both fetched
        matrices must yield diagonals of equal length.
    region : str
        Region specifier passed unchanged to ``fetch``.
    res : int, default 500
        Bin size; only used to convert a diagonal offset into a distance
        (``offset * res``).
    max_range : int, default 1000
        Exclusive upper bound on the diagonal offset.
    min_offset : int, default 20
        First diagonal offset that is evaluated.

    Returns
    -------
    pandas.DataFrame
        One row per offset, indexed by distance, with columns ``corr`` and
        ``distance``. ``corr`` is NaN when no usable pixels remain on a
        diagonal (e.g. the offset lies outside the matrix).
    """
    rcmc_mat = rcmc_clr.matrix(balance=False).fetch(region)
    pred_mat = pred_clr.matrix(balance=False).fetch(region)

    distances = []
    correlations = []
    for offset in range(min_offset, max_range):
        diag_df = pd.DataFrame({
            'rcmc': np.diagonal(rcmc_mat, offset=offset),
            'pred': np.diagonal(pred_mat, offset=offset),
        })
        # Pixels where the prediction is exactly zero are excluded from the
        # correlation.
        diag_df = diag_df[diag_df['pred'] != 0]
        distances.append(offset * res)
        correlations.append(diag_df['rcmc'].corr(diag_df['pred'], method='pearson'))

    df = pd.DataFrame({'corr': correlations}, index=distances)
    # Distance is kept both as the index and as an explicit column.
    df['distance'] = df.index

    return df


In [None]:
def calculate_corrs_by_diag(rcmc, pred, res=500, max_range=1000, save=False, region_table=None):
    """
    Calculate per-distance correlations for all regions that we are comparing.

    Parameters
    ----------
    rcmc, pred : objects accepted by ``calculate_corrs``
        Passed through unchanged as the two matrices to compare.
    res : int, default 500
        Forwarded to ``calculate_corrs``.
    max_range : int, default 1000
        Forwarded to ``calculate_corrs``.
    save : str or False, default False
        When truthy, the combined table is written to this path as a
        tab-separated file without the row index.
    region_table : pandas.DataFrame, optional
        Table with ``chrom``, ``start``, ``end`` and ``region_id`` columns.
        Defaults to the notebook-level ``regions`` table.

    Returns
    -------
    pandas.DataFrame
        Columns ``index``, ``corr``, ``distance`` and ``region_id``;
        ``index`` is the former per-region index produced by
        ``calculate_corrs``.
    """
    if region_table is None:
        # Fall back to the table loaded at notebook level.
        region_table = regions

    per_region = []
    for _, region in region_table.iterrows():
        region_str = f"{region['chrom']}:{region['start']}-{region['end']}"
        region_corrs = calculate_corrs(rcmc, pred, region_str, res=res, max_range=max_range)
        region_corrs['region_id'] = region['region_id']
        per_region.append(region_corrs)

    if per_region:
        # Concatenate once instead of growing a frame inside the loop.
        all_corrs = pd.concat(per_region).reset_index()
    else:
        # pd.concat raises on an empty list; keep the usual column layout.
        all_corrs = pd.DataFrame(columns=['index', 'corr', 'distance', 'region_id'])

    if save:
        all_corrs.to_csv(save, sep='\t', index=False)

    return all_corrs

In [None]:
# Write the per-distance correlations to a TSV that is plotted in
# cleopatra_correlations.Rmd. `res` is passed explicitly so the distance
# column always matches the resolution the coolers were opened at.

gm_pred_model_corrs = calculate_corrs_by_diag(
    gm12878_rcmc,
    gm12878_pred,
    res=res,
    save='diag_corrs/GM12878_model_25_diag_corrs.tsv',
)