In [None]:
import cooler
import cooltools
import coolpuppy
from coolpuppy import coolpup
from coolpuppy import plotpup
import pandas as pd
import bioframe
import matplotlib.pyplot as plt
import numpy as np
import pickle
import warnings
# Silence FutureWarning messages for every cell that runs after this one.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Set the PDF backend's `fonttype` rcParam to 42 for all figures saved below.
# NOTE(review): 42 is presumably the TrueType-embedding setting (keeps text
# editable in the exported PDF) - confirm against the matplotlib rcParams docs.
plt.rc('pdf',fonttype = 42)
from matplotlib.font_manager import FontProperties

# Font loaded from a local TTF file; it is passed as `fontproperties` to the
# titles, annotations and colorbar labels drawn later in this notebook.
default_font = FontProperties(fname="/mnt/md0/clarice/packages/fonts/Helvetica.ttf")

In [2]:
# Dataset labels. Each label is substituted into the
# `GM12878_{epi}_500bp_merged.mcool` file name when the coolers are loaded and
# also serves as the key under which that cooler is stored.
epis = ['3epi', '5epi', '6epi', 'less_input']

In [None]:
# Open every cooler at 500 bp resolution, keyed by dataset label.
# NOTE(review): later cells read rcmc_clrs['experimental'] (expected
# calculation and the first figure panel), but no 'experimental' entry is
# created here - the experimental cooler has to be added for a clean
# Restart & Run All.
rcmc_clrs = {}

for epi in epis:
    mcool_uri = f'/mnt/md0/clarice/rcmc_data/less_input_mcools/GM12878_{epi}_500bp_merged.mcool::resolutions/500'
    rcmc_clrs[epi] = cooler.Cooler(mcool_uri)

# The 'full' cooler is stored under a different directory and file name.
rcmc_clrs['full'] = cooler.Cooler(
    '/mnt/md0/clarice/rcmc_data/predicted_mcools/25_GM_newTestSet_43_merged.mcool::resolutions/500'
)

In [4]:
# hg38 chromosome sizes and centromere positions, combined into chromosome arms.
hg38_chromsizes = bioframe.fetch_chromsizes('hg38')
hg38_cens = bioframe.fetch_centromeres('hg38')
hg38_arms = bioframe.make_chromarms(hg38_chromsizes, hg38_cens)

# Keep only arms on chromosomes present in the cooler, excluding chrY.
# Both filters are applied before the index is reset, so the final frame
# always carries a contiguous 0..n-1 index (previously the chrY filter ran
# after the reset and could leave gaps).
keep_arm = (
    hg38_arms['chrom'].isin(rcmc_clrs['less_input'].chromnames)
    & (hg38_arms['chrom'] != 'chrY')
)
hg38_arms = hg38_arms[keep_arm].reset_index(drop=True)

In [5]:
# Read the tab-separated region table and expose its `region_id` column as
# `name`, then sort the regions using the hg38 chromosome sizes as the view.
region_df = bioframe.sort_bedframe(
    pd.read_csv(
        '/mnt/md0/clarice/src/500kb_prediction_regions.txt',
        sep='\t',
    ).rename(columns={'region_id': 'name'}),
    view_df=hg38_chromsizes,
)

In [6]:
# Subset of the region table used as holdout regions, sorted the same way as
# `region_df`.
holdout_regions = bioframe.sort_bedframe(
    region_df[region_df['name'].isin(['region3', 'region5', 'region10'])],
    view_df=hg38_chromsizes,
)

In [7]:
# Annotated loops for all cell types (tab-separated, read from the working
# directory), then the rows whose `celltype` is GM12878.
all_loops = pd.read_csv('all_rcmc_annotated_loops.tsv', sep='\t')
is_gm12878 = all_loops['celltype'] == 'GM12878'
GM12878_loops = all_loops[is_gm12878]

In [8]:
# Represent every loop as one interval running from the start of anchor 1 to
# the start of anchor 2, so it can be intersected with the holdout regions.
span_columns = {'chrom1': 'chrom', 'start1': 'start', 'start2': 'end'}
GM12878_loops_for_subset = (
    GM12878_loops[['chrom1', 'start1', 'start2', 'name']]
    .rename(columns=span_columns)
)

# Inner overlap: one row for each (loop span, holdout region) pair that overlaps.
GM12878_loops_holdout_names = bioframe.overlap(
    GM12878_loops_for_subset,
    holdout_regions,
    how='inner',
)

In [9]:
# Full loop records (all original columns) for the loops whose span overlapped
# a holdout region, matched by loop name.
in_holdout = GM12878_loops['name'].isin(GM12878_loops_holdout_names['name'])
GM12878_loops_in_holdout = GM12878_loops[in_holdout]

In [10]:
# Split every loop into its two anchors, each as a chrom/start/end frame that
# keeps the loop `name` so the anchors can be re-paired later.
GM12878_loops_in_holdout_anch1 = (
    GM12878_loops_in_holdout[['chrom1', 'start1', 'end1', 'name']]
    .rename(columns={'chrom1': 'chrom', 'start1': 'start', 'end1': 'end'})
)

GM12878_loops_in_holdout_anch2 = (
    GM12878_loops_in_holdout[['chrom2', 'start2', 'end2', 'name']]
    .rename(columns={'chrom2': 'chrom', 'start2': 'start', 'end2': 'end'})
)

In [11]:
# CTCF ChIP peaks: the file is read as ten header-less tab-separated columns,
# of which only the interval columns are kept.
ctcf_file_columns = ['chrom', 'start', 'end'] + [f'crap{i}' for i in range(1, 8)]
CTCF_ChIP = pd.read_csv(
    '/mnt/md0/clarice/GM12878_genomic_datasets/CTCF_ChIP.bed.gz',
    sep='\t',
    names=ctcf_file_columns,
)[['chrom', 'start', 'end']]

In [12]:
# Intersect each anchor set with the CTCF peaks; the inner overlap keeps only
# anchors that touch at least one peak.
GM12878_loops_in_holdout_anch1_CTCF, GM12878_loops_in_holdout_anch2_CTCF = (
    bioframe.overlap(anchors, CTCF_ChIP, how='inner')
    for anchors in (GM12878_loops_in_holdout_anch1, GM12878_loops_in_holdout_anch2)
)

In [13]:
# Re-pair the anchors by loop name: a loop survives only if BOTH of its anchors
# overlapped a CTCF peak. Anchor columns get the suffixes '1' and '2'.
GM12878_holdout_ctcf_loops = pd.merge(
    GM12878_loops_in_holdout_anch1_CTCF,
    GM12878_loops_in_holdout_anch2_CTCF,
    how='inner',
    on='name',
    suffixes=['1', '2'],
)

# Drop every column whose name contains '_' (the peak coordinates appended by
# bioframe.overlap), leaving chrom1/start1/end1/name/chrom2/start2/end2.
# The inner overlap emits one row per (anchor, peak) pair, so a loop whose
# anchors touch several peaks appears several times after the merge; collapse
# those copies so each loop contributes to the pileup exactly once.
GM12878_holdout_ctcf_loops = (
    GM12878_holdout_ctcf_loops
    .drop(columns=list(GM12878_holdout_ctcf_loops.filter(regex='_')))
    .drop_duplicates()
    .reset_index(drop=True)
)

In [164]:
# H3K4me1 ChIP peaks: the file is read as ten header-less tab-separated
# columns, of which only the interval columns are kept.
h3k4me1_file_columns = ['chrom', 'start', 'end'] + [f'crap{i}' for i in range(1, 8)]
H3K4me1_ChIP = pd.read_csv(
    '/mnt/md0/clarice/GM12878_genomic_datasets/H3K4me1_ChIP.bed.gz',
    sep='\t',
    names=h3k4me1_file_columns,
)[['chrom', 'start', 'end']]

In [165]:
# Intersect each anchor set with the H3K4me1 peaks; the inner overlap keeps
# only anchors that touch at least one peak.
GM12878_loops_in_holdout_anch1_H3K4me1, GM12878_loops_in_holdout_anch2_H3K4me1 = (
    bioframe.overlap(anchors, H3K4me1_ChIP, how='inner')
    for anchors in (GM12878_loops_in_holdout_anch1, GM12878_loops_in_holdout_anch2)
)

In [167]:
# Re-pair the anchors by loop name: a loop survives only if BOTH of its anchors
# overlapped an H3K4me1 peak. Anchor columns get the suffixes '1' and '2'.
GM12878_holdout_H3K4me1_loops = pd.merge(
    GM12878_loops_in_holdout_anch1_H3K4me1,
    GM12878_loops_in_holdout_anch2_H3K4me1,
    how='inner',
    on='name',
    suffixes=['1', '2'],
)

# Drop every column whose name contains '_' (the peak coordinates appended by
# bioframe.overlap), leaving chrom1/start1/end1/name/chrom2/start2/end2.
# The inner overlap emits one row per (anchor, peak) pair, so a loop whose
# anchors touch several peaks appears several times after the merge; collapse
# those copies so each loop contributes to the pileup exactly once.
GM12878_holdout_H3K4me1_loops = (
    GM12878_holdout_H3K4me1_loops
    .drop(columns=list(GM12878_holdout_H3K4me1_loops.filter(regex='_')))
    .drop_duplicates()
    .reset_index(drop=True)
)

In [173]:
# H3K4me3 ChIP peaks: read ten header-less tab-separated columns and keep only
# the interval columns.
H3K4me3_ChIP = pd.read_csv('/mnt/md0/clarice/GM12878_genomic_datasets/H3K4me3_ChIP.bed.gz', sep = '\t',
names = ['chrom', 'start', 'end', 'crap1', 'crap2', 'crap3', 'crap4', 'crap5', 'crap6', 'crap7'])
H3K4me3_ChIP = H3K4me3_ChIP[['chrom', 'start', 'end']]

# Anchors that touch at least one H3K4me3 peak (one row per anchor/peak pair).
GM12878_loops_in_holdout_anch1_H3K4me3 = bioframe.overlap(GM12878_loops_in_holdout_anch1, H3K4me3_ChIP, how = 'inner')
GM12878_loops_in_holdout_anch2_H3K4me3 = bioframe.overlap(GM12878_loops_in_holdout_anch2, H3K4me3_ChIP, how = 'inner')

# Re-pair the anchors by loop name: a loop survives only if BOTH anchors
# overlapped a peak. Anchor columns get the suffixes '1' and '2'.
GM12878_holdout_H3K4me3_loops = pd.merge(
    GM12878_loops_in_holdout_anch1_H3K4me3,
    GM12878_loops_in_holdout_anch2_H3K4me3,
    how='inner',
    on='name',
    suffixes=['1', '2'],
)

# Drop the peak columns (every name containing '_') and collapse the duplicate
# rows produced when an anchor overlaps more than one peak, so each loop
# contributes to the pileup exactly once.
GM12878_holdout_H3K4me3_loops = (
    GM12878_holdout_H3K4me3_loops
    .drop(columns=list(GM12878_holdout_H3K4me3_loops.filter(regex='_')))
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
# no loops
# NOTE(review): presumably this means the final frame is empty for H3K9me3
# (no loop has both anchors on a peak) - confirm before relying on it.

# H3K9me3 ChIP peaks: read ten header-less tab-separated columns and keep only
# the interval columns.
H3K9me3_ChIP = pd.read_csv('/mnt/md0/clarice/GM12878_genomic_datasets/H3K9me3_ChIP.bed.gz', sep = '\t',
names = ['chrom', 'start', 'end', 'crap1', 'crap2', 'crap3', 'crap4', 'crap5', 'crap6', 'crap7'])
H3K9me3_ChIP = H3K9me3_ChIP[['chrom', 'start', 'end']]

# Anchors that touch at least one H3K9me3 peak (one row per anchor/peak pair).
GM12878_loops_in_holdout_anch1_H3K9me3 = bioframe.overlap(GM12878_loops_in_holdout_anch1, H3K9me3_ChIP, how = 'inner')
GM12878_loops_in_holdout_anch2_H3K9me3 = bioframe.overlap(GM12878_loops_in_holdout_anch2, H3K9me3_ChIP, how = 'inner')

# Re-pair the anchors by loop name: a loop survives only if BOTH anchors
# overlapped a peak. Anchor columns get the suffixes '1' and '2'.
GM12878_holdout_H3K9me3_loops = pd.merge(
    GM12878_loops_in_holdout_anch1_H3K9me3,
    GM12878_loops_in_holdout_anch2_H3K9me3,
    how='inner',
    on='name',
    suffixes=['1', '2'],
)

# Drop the peak columns (every name containing '_') and collapse the duplicate
# rows produced when an anchor overlaps more than one peak, so each loop
# contributes to the pileup exactly once.
GM12878_holdout_H3K9me3_loops = (
    GM12878_holdout_H3K9me3_loops
    .drop(columns=list(GM12878_holdout_H3K9me3_loops.filter(regex='_')))
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
# no loops
# NOTE(review): presumably this means the final frame is empty for H3K27me3
# (no loop has both anchors on a peak) - confirm before relying on it.

# H3K27me3 ChIP peaks: read ten header-less tab-separated columns and keep
# only the interval columns.
H3K27me3_ChIP = pd.read_csv('/mnt/md0/clarice/GM12878_genomic_datasets/H3K27me3_ChIP.bed.gz', sep = '\t',
names = ['chrom', 'start', 'end', 'crap1', 'crap2', 'crap3', 'crap4', 'crap5', 'crap6', 'crap7'])
H3K27me3_ChIP = H3K27me3_ChIP[['chrom', 'start', 'end']]

# Anchors that touch at least one H3K27me3 peak (one row per anchor/peak pair).
GM12878_loops_in_holdout_anch1_H3K27me3 = bioframe.overlap(GM12878_loops_in_holdout_anch1, H3K27me3_ChIP, how = 'inner')
GM12878_loops_in_holdout_anch2_H3K27me3 = bioframe.overlap(GM12878_loops_in_holdout_anch2, H3K27me3_ChIP, how = 'inner')

# Re-pair the anchors by loop name: a loop survives only if BOTH anchors
# overlapped a peak. Anchor columns get the suffixes '1' and '2'.
GM12878_holdout_H3K27me3_loops = pd.merge(
    GM12878_loops_in_holdout_anch1_H3K27me3,
    GM12878_loops_in_holdout_anch2_H3K27me3,
    how='inner',
    on='name',
    suffixes=['1', '2'],
)

# Drop the peak columns (every name containing '_') and collapse the duplicate
# rows produced when an anchor overlaps more than one peak, so each loop
# contributes to the pileup exactly once.
GM12878_holdout_H3K27me3_loops = (
    GM12878_holdout_H3K27me3_loops
    .drop(columns=list(GM12878_holdout_H3K27me3_loops.filter(regex='_')))
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
# Need this to replace the expected vectors with what we used to generate the coolers.
#
# Steps:
#   1. load the per-region normalisation vector saved as .npy for each holdout region;
#   2. compute a regular cis-expected table, used only as a correctly shaped template;
#   3. overwrite its 'count.avg' / 'balanced.avg' columns, per holdout region,
#      with the saved vector (zero-padded to the template length).

# The template is computed from the experimental cooler. Fail early with a
# clear message if it was never loaded instead of surfacing a bare KeyError
# after the .npy files have already been read.
if 'experimental' not in rcmc_clrs:
    raise KeyError(
        "rcmc_clrs has no 'experimental' entry; load the experimental cooler "
        "into rcmc_clrs before running this cell"
    )

oe_vecs = {}

for idx, row in holdout_regions.iterrows():
    region_name = row['name']
    # Files are keyed by the bare region id ('region3' -> '3'). Remove the
    # literal prefix; str.strip('region') would strip a *set of characters*
    # from both ends rather than the prefix.
    if region_name.startswith('region'):
        region_id = region_name[len('region'):]
    else:
        region_id = region_name
    f_oe = f'/mnt/md0/clarice/florence_tmp/RCMC_norm_factors/500/GM12878_{region_id}.npy'
    oe_vecs[region_name] = list(np.load(f_oe))

expected_cis = cooltools.expected_cis(
    clr=rcmc_clrs['experimental'],
    view_df=region_df,
    smooth=True,
    aggregate_smoothed=True,
    smooth_sigma=0.1,
    nproc=16,
)

# Build one patched frame per holdout region and concatenate once at the end
# (no repeated concat onto an initially empty frame).
region_frames = []

for idx, row in holdout_regions.iterrows():
    region_name = row['name']
    region_expected = expected_cis[expected_cis['region1'] == region_name]

    oe = np.asarray(oe_vecs[region_name], dtype=float)
    # Pad with zeros up to the number of rows of the template. The pad size
    # is derived from the vector itself rather than assuming 4000 entries.
    n_missing = len(region_expected) - len(oe)
    if n_missing < 0:
        raise ValueError(
            f"normalisation vector for {region_name} has {len(oe)} entries but the "
            f"expected table only has {len(region_expected)} rows"
        )
    oe_padded = np.pad(oe, (0, n_missing))

    # .assign returns a new frame, so the filtered slice of expected_cis is
    # never written to (avoids SettingWithCopyWarning).
    region_frames.append(
        region_expected.assign(**{'count.avg': oe_padded, 'balanced.avg': oe_padded})
    )

# reset_index() keeps the original row labels in an 'index' column, exactly as
# the previous in-place reset did.
if region_frames:
    fake_expected_all_regions = pd.concat(region_frames).reset_index()
else:
    fake_expected_all_regions = pd.DataFrame().reset_index()

INFO:root:creating a Pool of 16 workers


In [None]:
# One pileup per cooler, all centred on the same loop set and normalised with
# the substituted expected table.
all_pups = {}

# Arguments shared by every pileup call.
# NOTE(review): clr_weight_name=False presumably selects raw (unbalanced)
# counts - confirm against the coolpuppy documentation.
pileup_kwargs = dict(
    features_format='bedpe',
    view_df=holdout_regions,
    expected_df=fake_expected_all_regions,
    nproc=8,
    flank=20000,
    clr_weight_name=False,
)

# change for different loops
for clr_type, clr in rcmc_clrs.items():
    pup = coolpup.pileup(clr, GM12878_holdout_ctcf_loops, **pileup_kwargs)
    # Keep only the matrix stored in the 'data' column at index label 0.
    all_pups[clr_type] = pup['data'][0]

In [None]:
def get_enrichment(amap, n, dec=2):
    """
    Mean of the central window of a pileup matrix (taken from coolpuppy).

    The window is centred on pixel ``floor(amap.shape[0] / 2)`` and extends
    ``n // 2`` pixels to each side on both axes. NaN pixels are ignored and
    the mean is rounded to ``dec`` decimals.
    """
    centre = int(np.floor(amap.shape[0] / 2))
    half_width = n // 2
    lo = centre - half_width
    hi = centre + half_width + 1
    window = amap[lo:hi, lo:hi]
    return np.round(np.nanmean(window), decimals=dec)

def annotate_enrichment(ax, amap, n, dec=2, size=6, bold=False):
    """
    Write the central enrichment of ``amap`` onto ``ax``.

    Modified from the coolpuppy package to control where the enrichment
    annotation goes: the value is drawn at data coordinates (10, 40),
    left/bottom aligned.

    Parameters
    ----------
    ax : matplotlib axes to annotate.
    amap : 2D array passed to ``get_enrichment``.
    n : size of the central window passed to ``get_enrichment``.
    dec : number of decimals the enrichment is rounded to.
    size : font size of the annotation.
    bold : if truthy, use the bold Helvetica font file; otherwise the
        notebook-wide ``default_font``.
    """
    enr = get_enrichment(amap, n, dec)
    # Pick the font once instead of duplicating the ax.text call per branch;
    # the bold font file is only opened when it is actually requested.
    if bold:
        font = FontProperties(fname="/mnt/md0/clarice/packages/fonts/helvetica-bold.ttf")
    else:
        font = default_font
    ax.text(10, 40, enr, ha='left', va='bottom', fontsize=size, fontproperties=font)

In [None]:
# One panel per dataset, left to right in this order.
panel_order = ['experimental', 'full', 'less_input', '5epi', '3epi']

fig, axs = plt.subplots(nrows=1, ncols=5, figsize=[5, 10])

# limits empirically determined - change for different plot
# limits for CTCF/H3K4me1 loops: (-1.5, 1.5)
# limits for H3K4me3 loops: (-1, 1)
amin, amax = (-1.5, 1.5)

for ax, epi_type in zip(axs, panel_order):
    data = all_pups[epi_type]

    # log2 of the pileup matrix on a shared, symmetric colour scale.
    m = ax.imshow(np.log2(data), cmap='coolwarm', vmax=amax, vmin=amin)
    ax.set_yticks([], [])
    ax.set_xticks([], [])

    # Mean of the central 3-pixel window, printed on the panel.
    annotate_enrichment(ax, data, 3, size=10)

    ax.set_title(epi_type, rotation=0, size=12, fontproperties=default_font)

# Define a new axis for the colorbar using `add_axes`, placed just to the
# right of the last panel and matching its vertical extent.
last_panel = ax.get_position()
cax = fig.add_axes([
    last_panel.x1 + 0.01,  # x-position of colorbar
    last_panel.y0,         # y-position of colorbar
    0.01,                  # width of colorbar
    last_panel.height,     # height of colorbar
])

# Add the colorbar to the specified axis, with ticks at the limits and midpoint.
cb = plt.colorbar(m, cax=cax)
mid = round((amin + amax)/2, 2)
tick_values = [amin, mid, amax]
cb.set_ticks(tick_values)
cb.set_ticklabels(tick_values, fontproperties=default_font)
cb.ax.minorticks_off()

plt.savefig('figures/CTCF_less_input_pileup.pdf')