# README

This notebook contains a pipeline to process and visualise Hi‑C contact data
for a set of samples, tissues and chromosomes.

The steps performed here are:

    1. load the per‑sample spot labels and, for every sample, tissue and
         chromosome, extract the intra‑chromosomal contact pairs of the
         matching spots from the per‑spot compressed TSV files;
    2. convert the extracted `.pairs` files to `.hic` using Juicer Tools and
         then to `.cool` format with hic2cool, running these conversions in
         parallel using multiprocessing;
    3. balance the resulting coolers and draw contact heatmaps for all
         chromosomes for every sample‑tissue combination;
    4. as an example, focus on chromosome 1 and apply smoothing / imputation
         before plotting a combined figure across samples and tissues.

Variables such as `sampleid_list`, `chrom_id_list`, `tissue_list`,
`label_dir`, `save_dir` etc. are defined at the top of the notebook and
used throughout.

The purpose of the notebook is to automate the extraction, format conversion
and visual exploration of spatial interaction data, producing
publication‑ready heatmaps and facilitating downstream analysis.

Usage:

    • adjust the directory paths and lists of samples/tissues/chromosomes
        as needed,
    • execute the cells sequentially – earlier cells set up imports and
        parameters used by later ones,
    • inspect or modify plotting parameters for custom figures.

In [None]:
import os
import math
import gzip
import subprocess
import numpy as np
import pandas as pd
import multiprocessing

import cooler
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from mpl_toolkits.axes_grid1 import make_axes_locatable
from cooltools.lib.numutils import adaptive_coarsegrain, interp_nan


# Value handed to Juicer Tools via `-r` when the .hic files are built.
resolution = 1_000_000

# Sample identifiers, one sub-list per name prefix (E11.5, E12.5, E13.5, E14.5).
sampleid_list = (
    ['E11.5L1', 'E11.5L2']
    + ['E12.5L5', 'E12.5L6', 'E12.5L9', 'E12.5S1']
    + ['E13.5C1', 'E13.5C4', 'E13.5C6']
    + ['E14.5F5', 'E14.5F6', 'E14.5L7', 'E14.5L8', 'E14.5L10']
)

# chr1 .. chr19 followed by chrX.
chrom_id_list = ['chr{0}'.format(number) for number in range(1, 20)] + ['chrX']

# Labels looked up in the 'hagashi_leiden_anno_man' column of the label tables.
tissue_list = ['Brain', 'Liver']

# Directory holding the '<sample>_hagashi_obs_sub.tsv' label tables.
label_dir = '/home/xuyuetong/CRICK_Data_v3/Paper_Fig/'
# Root directory for everything this notebook writes.
save_dir = '/home/xuyuetong/CRICK_Data_v3/Paper_Fig/Contact_Heatmap/'
# Directory holding the per-spot '<sample>/<spot>.contact.tsv.gz' files.
pairs_dir = '/home/goubo/CRICK/CRICK/spaceA/hagash_pre_v2/'


In [None]:
def extract_tissue_pairs(sample_id, chrom_id, save_path, spotid_list, pairs_dir):
    """Pool the contacts of one chromosome across spots into one pairs file.

    For every spot in ``spotid_list`` the gzip-compressed, tab-separated file
    ``<pairs_dir><sample_id>/<spot_id>.contact.tsv.gz`` is read.  Each row
    whose first column equals ``chrom_id`` is written to ``save_path`` as
    ``. <chrom_id> <column 1> <chrom_id> <column 3> . .`` -- the read ID and
    both strands are placeholders and the same chromosome is written for
    both ends of the contact.

    Args:
        sample_id: Name of the sample sub-directory inside ``pairs_dir``.
        chrom_id: Chromosome name to keep (compared with column 0).
        save_path: Output file; it is created or overwritten and always
            receives the two header lines, even when no row matches.
        spotid_list: Iterable of spot IDs whose contact files are pooled.
        pairs_dir: Directory containing the sample sub-directories.  It is
            concatenated as-is, so it must end with a path separator.

    Errors raised while opening or reading the files are not caught.
    """
    # NOTE(review): column 2 (presumably the chromosome of the second end) is
    # never inspected -- confirm the inputs only hold intra-chromosomal rows.
    with open(save_path, 'w') as pairs_out:
        pairs_out.write('## pairs format v1.0\n')
        pairs_out.write('#columns: readID chr1 position1 chr2 position2 strand1 strand2\n')
        for spot_id in spotid_list:
            pairs_spot_path = '{0}{1}/{2}.contact.tsv.gz'.format(pairs_dir, sample_id, spot_id)
            with gzip.open(pairs_spot_path, 'rt') as spot_file:
                for line in spot_file:
                    fields = line.strip().split('\t')
                    if fields[0] != chrom_id:
                        continue
                    pairs_out.write('. {0} {1} {0} {2} . .\n'.format(
                        chrom_id, fields[1], fields[3]))

                    
# Build one extraction task per (sample, tissue, chromosome).  Every task
# carries the spot IDs whose 'hagashi_leiden_anno_man' label equals the tissue.
params_list = []
for sample_id in sampleid_list:
    label_path = '{0}{1}_hagashi_obs_sub.tsv'.format(label_dir, sample_id)
    label_data = pd.read_csv(label_path, header=0, index_col=None, sep='\t')
    label_data = label_data[['spot_id', 'sample', 'hagashi_leiden_anno_man']]
    for tissue_id in tissue_list:
        label_idata = label_data[label_data['hagashi_leiden_anno_man'] == tissue_id]
        spotid_ilist = label_idata['spot_id'].values
        for chrom_id in chrom_id_list:
            params_list.append((sample_id, tissue_id, chrom_id, spotid_ilist))

# The workers open their output with open(..., 'w'), which does not create
# missing directories, so make sure the target directory exists up front.
os.makedirs('{0}Tissue_Pairs/'.format(save_dir), exist_ok=True)

# Run the tasks in batches of at most `max_processes` worker processes; a
# batch is joined completely before the next one is started.
processes_num = len(params_list)
max_processes = 100
processes = []
index = 0
while index < processes_num:
    for _ in range(min(max_processes, processes_num - index)):
        sample_id, tissue_id, chrom_id, spotid_ilist = params_list[index]
        save_ipath = '{0}Tissue_Pairs/{1}_{2}_{3}.pairs'.format(
            save_dir, sample_id, tissue_id, chrom_id)
        # The process is named after its output file so that a failure can be
        # traced back to the task that caused it.
        p = multiprocessing.Process(
            target=extract_tissue_pairs,
            args=(sample_id, chrom_id, save_ipath, spotid_ilist, pairs_dir),
            name=save_ipath)
        p.start()
        processes.append(p)
        index += 1
    for p in processes:
        p.join()
        # A worker that raised leaves an incomplete .pairs file behind; report
        # it instead of letting it flow silently into the next steps.
        if p.exitcode != 0:
            print('WARNING: extraction failed (exit code {0}): {1}'.format(
                p.exitcode, p.name))
    processes = []

In [None]:
# Juicer Tools Pre and hic2cool to convert the pairs into cool format.

def pairs2cool(pairs_path, save_hic, save_cool, resolution):
    """Convert a pairs file to ``.hic`` (Juicer Tools) and then to ``.cool``.

    Runs ``juicer_tools pre`` with genome ``mm10``, ``-r <resolution>`` and
    ``-j 30``, followed by ``hic2cool convert <save_hic> <save_cool> -p 30``.

    Args:
        pairs_path: Input pairs file.
        save_hic: Path of the ``.hic`` file written by Juicer Tools.
        save_cool: Path of the cooler file written by hic2cool.
        resolution: Value passed to Juicer Tools' ``-r`` option.

    Raises:
        subprocess.CalledProcessError: If either command exits with a
            non-zero status.  hic2cool is not run when Juicer Tools failed,
            so a stale ``.hic`` from an earlier run is never converted.
        FileNotFoundError: If ``java`` or ``hic2cool`` is not on ``PATH``.
    """
    juicer_jar = '/home/xuyuetong/Tools/JuicerTools/juicer_tools_1.22.01.jar'

    # Argument lists (no shell) keep paths containing spaces or shell
    # metacharacters intact.
    juicer_cmd = [
        'java', '-Xmx2g', '-jar', juicer_jar, 'pre',
        str(pairs_path), str(save_hic), 'mm10',
        '-r', str(resolution), '-j', '30',
    ]
    hic2cool_cmd = [
        'hic2cool', 'convert', str(save_hic), str(save_cool), '-p', '30',
    ]

    subprocess.run(juicer_cmd, check=True)
    subprocess.run(hic2cool_cmd, check=True)


# Directory written by the extraction step.  It gets its own name so that the
# `pairs_dir` setting from the configuration cell (the raw per-spot contact
# files) is not overwritten; re-running the extraction cell stays safe.
tissue_pairs_dir = '{0}Tissue_Pairs/'.format(save_dir)

# Only '*.pairs' files located directly in the directory are converted: the
# names are parsed as '<sample>_<tissue>_<chrom>.pairs' and joined back onto
# `tissue_pairs_dir`, which is only correct for top-level files.
pairs_path_list = sorted(
    file_name for file_name in os.listdir(tissue_pairs_dir)
    if file_name.endswith('.pairs'))

# Make sure the output directories of both conversion steps exist.
for out_subdir in ('Tissue_HiC', 'Tissue_Cool'):
    os.makedirs('{0}{1}/'.format(save_dir, out_subdir), exist_ok=True)

# Run the conversions in batches of at most `max_processes` worker processes;
# a batch is joined completely before the next one is started.
processes_num = len(pairs_path_list)
max_processes = 100
processes = []
index = 0
while index < processes_num:
    for _ in range(min(max_processes, processes_num - index)):
        pairs_path_i = pairs_path_list[index]
        name_parts = pairs_path_i.split('_')
        sample_id = name_parts[0]
        tissue_id = name_parts[1]
        # Drop the '.pairs' extension from the last component.
        chrom_id = name_parts[2].split('.')[0]
        pairs_path_i = os.path.join(tissue_pairs_dir, pairs_path_i)
        save_hic = '{0}Tissue_HiC/{1}_{2}_{3}.hic'.format(save_dir, sample_id, tissue_id, chrom_id)
        save_cool = '{0}Tissue_Cool/{1}_{2}_{3}.cool'.format(save_dir, sample_id, tissue_id, chrom_id)
        # The process is named after its input file so that a failure can be
        # traced back to the task that caused it.
        p = multiprocessing.Process(
            target=pairs2cool,
            args=(pairs_path_i, save_hic, save_cool, resolution),
            name=pairs_path_i)
        p.start()
        processes.append(p)
        index += 1
    for p in processes:
        p.join()
        if p.exitcode != 0:
            print('WARNING: conversion failed (exit code {0}): {1}'.format(
                p.exitcode, p.name))
    processes = []


In [None]:
# Plotting Heatmap

# White-to-red colour map shared by all panels.
fruitpunch = sns.blend_palette(['white', 'red'], as_cmap=True)

# One panel per chromosome, five panels per row.  The number of rows follows
# the chromosome list so the grid still fits after the list is edited
# (20 chromosomes give the 4 x 5 layout).
fig_ncol = 5
fig_nrow = max(1, math.ceil(len(chrom_id_list) / fig_ncol))

os.makedirs('{0}Contact_Heatmap/'.format(save_dir), exist_ok=True)

for sample_id in sampleid_list:
    for tissue_id in tissue_list:

        # squeeze=False keeps `ax` two-dimensional even for a single row.
        f, ax = plt.subplots(figsize=(6*fig_ncol, 5*fig_nrow), ncols=fig_ncol, nrows=fig_nrow,
                             dpi=100, squeeze=False)

        for c, chrom_id in enumerate(chrom_id_list):

            cool_path = '{0}Tissue_Cool/{1}_{2}_{3}.cool'.format(save_dir, sample_id, tissue_id, chrom_id)
            clr = cooler.Cooler(cool_path)
            # Balancing is recomputed on every run; store=True is passed so
            # that matrix(balance=True) below can use the result.
            cooler.balance_cooler(clr, store=True)

            # Plot: panels are filled row by row.
            row_idx = c // fig_ncol
            col_idx = c % fig_ncol
            ax_tissue = ax[row_idx, col_idx]
            im = ax_tissue.matshow(clr.matrix(balance=True).fetch(chrom_id), cmap=fruitpunch)
            ax_tissue.set_title(chrom_id, fontsize=16)
            ax_tissue.xaxis.set_visible(False)

            # Individual colour bar attached to the right of each panel.
            divider = make_axes_locatable(ax_tissue)
            ax_colorbar = divider.append_axes("right", size="5%", pad=0.1)
            plt.colorbar(im, cax=ax_colorbar)

        # Hide grid cells that did not receive a chromosome.
        for unused_ax in ax.ravel()[len(chrom_id_list):]:
            unused_ax.set_visible(False)

        plot_path = '{0}Contact_Heatmap/{1}_{2}_Heatmap.png'.format(save_dir, sample_id, tissue_id)
        # Save through the figure object instead of relying on pyplot's
        # "current figure" state.
        f.savefig(plot_path)
        plt.close(f)

In [None]:
chrom_id = 'chr1'
# Shared logarithmic colour scale so that all panels are comparable.
norm = LogNorm(vmin=0.0035, vmax=0.030)
fruitpunch = sns.blend_palette(['white', 'red'], as_cmap=True)

# Layout: three samples per row, each sample occupying one panel per tissue
# (with two tissues this is the 6-column grid).
samples_per_row = 3
n_tissues = len(tissue_list)
fig_ncol = samples_per_row * n_tissues
fig_nrow = max(1, math.ceil(len(sampleid_list) / samples_per_row))
# squeeze=False keeps `ax` two-dimensional even when there is a single row.
f, ax = plt.subplots(figsize=(6*fig_ncol, 5*fig_nrow), ncols=fig_ncol, nrows=fig_nrow,
                     dpi=100, squeeze=False)
for s, sample_id in enumerate(sampleid_list):
    for t, tissue_id in enumerate(tissue_list):

        cool_path = '{0}Tissue_Cool/{1}_{2}_{3}.cool'.format(save_dir, sample_id, tissue_id, chrom_id)
        clr = cooler.Cooler(cool_path)
        cooler.balance_cooler(clr, store=True)
        # Smooth and Imputation: coarse-grain the balanced matrix using the
        # raw counts, then fill the remaining NaNs by interpolation.
        cg = adaptive_coarsegrain(clr.matrix(balance=True).fetch(chrom_id),
                                  clr.matrix(balance=False).fetch(chrom_id),
                                  cutoff=3, max_levels=8)
        cgi = interp_nan(cg, pad_zeros=True)

        # Plot: the panels of one sample sit next to each other, samples are
        # placed row by row.
        panel_idx = s * n_tissues + t
        row_idx = panel_idx // fig_ncol
        col_idx = panel_idx % fig_ncol
        ax_tissue = ax[row_idx, col_idx]
        im = ax_tissue.matshow(cgi, cmap=fruitpunch, norm=norm)
        ax_tissue.set_title('{0}  {1}'.format(sample_id, tissue_id), fontsize=16)
        ax_tissue.xaxis.set_visible(False)

        divider = make_axes_locatable(ax_tissue)
        ax_colorbar = divider.append_axes("right", size="5%", pad=0.1)
        plt.colorbar(im, cax=ax_colorbar)

# Hide trailing grid cells that did not receive a panel, so the saved figure
# contains no empty frames.
for unused_ax in ax.ravel()[len(sampleid_list) * n_tissues:]:
    unused_ax.set_visible(False)

os.makedirs('{0}Contact_Heatmap/'.format(save_dir), exist_ok=True)
plot_path = '{0}Contact_Heatmap/Heatmap_Chrom1.pdf'.format(save_dir)
f.savefig(plot_path)
plt.close(f)