# README

This notebook is part of a project to investigate the relationship between
compartment eigenvector (E1) values and epigenetic signals in mouse
embryonic tissues at the E14.5 stage.  Earlier analyses showed large
systematic differences between time points, so the current focus is on
within‑sample comparisons of different organs (liver vs. brain) and
chromosomes.

## Purpose

* compute Pearson correlation coefficients between E1 values and
    ChIP‑seq signals (H3K27ac, H3K27me3, H3K9me3) for each sample,
    chromosome and tissue,
* generate line plots of E1, GC content and selected histone marks for a
    representative sample (E14.5F5) on a given chromosome (chr1, chr7, …),
    and
* overlay “switch” regions where compartment status changes.

The notebook reads pre‑computed E1 eigenvectors, GC coverage and
histone‑mark bed files at 1 Mb resolution, builds a combined `h3k_df`
DataFrame, evaluates correlations, and exports publication‑quality PDFs
showing the profiles.

## Usage

1. Adjust the directory paths in the first code cell to point to your
     data repositories.
2. Modify `sample_list`, `tissue_list`, `h3k_list` and `time` as needed.
3. Run each cell in order; the correlation results are printed in the
     cell output and the line‑plot PDFs are saved in `save_dir`.

The analysis supports further exploration of epigenetic‑compartment
relationships and selection of high‑correlation samples/regions for
figures or downstream analysis.

In [17]:
import cooler
import numpy as np
import pandas as pd
import matplotlib as mpl
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
# Write PDF text as Type 42 (TrueType) fonts rather than matplotlib's default
# Type 3 fonts.
mpl.rcParams['pdf.fonttype'] = 42


# All directories are joined to file names by plain string formatting, so each
# one must keep its trailing '/'.
# Root folder with one sub-folder per sample holding the per-tissue E1 tables
# ('<sample>/<sample>_fasthigashi_leiden_anno_man_<tissue>.cis_eigs.csv').
dir_path = '/home/goubo/CRICK/CRICK/spaceA/higashi_v2/higashi/'
# Folder containing the GC-content table 'mm10_gc_cov_1MB.tsv'.
gc_dir = '/home/xuyuetong/CRICK_Data_v4/Split_Cluster/'
# Folder containing the 'ChIP-seq_<mark>_<organ>_e<time>.1mb.bed' tables.
h3k_dir = '/home/goubo/CRICK/CRICK/spaceA/onlineData/epi_nature_BingRen_mouseEmbryo/bed_1mb/'
# Folder containing the 'SwitchRange_<sample>.csv' tables.
switch_dir = '/home/xuyuetong/CRICK_Data_v3/Paper_Fig/NoSwitchRange_Expression/'
# Output folder for the exported PDF figures.
save_dir = '/home/xuyuetong/CRICK_Data_v3/Paper_Fig/Lineplot_E1value/'

# Time point as a string: it is inserted into the bed file names ('e14.5') and
# stored in / compared against the 'time' column of `h3k_df`.
time = '14.5'
# Samples evaluated by the correlation cell.
sample_list = ['E14.5F5', 'E14.5F6']
# 'Liver' reads the liver bed file; any other entry is handled as brain
# (forebrain + midbrain + hindbrain combined) when `h3k_df` is built.
tissue_list = ['Liver', 'Brain']
# Histone marks loaded into `h3k_df`, correlated with E1 and plotted.
h3k_list = ['H3K27ac', 'H3K27me3', 'H3K9me3']


In [6]:
# prepare h3k data
#
# Builds `h3k_df`, a long-format table with one row per
# (histone mark, chromosome, tissue, 1 Mb window) and the columns
# time / chrom / window_start / tissue / class / value.
#   * 'Liver' uses the `mean_signal` column of the liver bed file directly.
#   * Any other tissue is handled as brain: `total_signal` and `peak_count` of
#     the forebrain, midbrain and hindbrain files are summed per window and
#     the value is summed signal / summed peak count (0.0 where the summed
#     peak count is 0).
# NOTE(review): the three brain tables are added element-wise, which assumes
# they list the same windows in the same order for every chromosome - confirm.

h3k_frames = []
for h3k_i in h3k_list:
    # Bed tables of the current mark, keyed by organ name.  A table is parsed
    # the first time it is needed, so each file is read once per mark instead
    # of once per chromosome.
    bed_by_organ = {}
    for chrom_i in range(1, 20):   # chr1 .. chr19
        chrom_id = 'chr{0}'.format(chrom_i)
        for tissue in tissue_list:
            if tissue == 'Liver':
                organ_list = ['liver']
            else:
                organ_list = ['forebrain', 'midbrain', 'hindbrain']

            # Rows of the current chromosome, one filtered table per organ.
            chrom_rows = []
            for organ in organ_list:
                if organ not in bed_by_organ:
                    bed_path = '{0}ChIP-seq_{1}_{2}_e{3}.1mb.bed'.format(h3k_dir, h3k_i, organ, time)
                    bed_by_organ[organ] = pd.read_csv(bed_path, header=0, index_col=None, sep='\t')
                organ_df = bed_by_organ[organ]
                chrom_rows.append(organ_df[organ_df['chrom'] == chrom_id])

            if tissue == 'Liver':
                h3k_array = chrom_rows[0]['mean_signal'].values
            else:
                fore_rows, mid_rows, hind_rows = chrom_rows
                h3k_brain_sign = (fore_rows['total_signal'].values
                                  + mid_rows['total_signal'].values
                                  + hind_rows['total_signal'].values)
                h3k_brain_peak = (fore_rows['peak_count'].values
                                  + mid_rows['peak_count'].values
                                  + hind_rows['peak_count'].values)
                # Windows without any peak keep the pre-filled 0.0 instead of
                # triggering a division by zero.
                h3k_array = np.divide(h3k_brain_sign, h3k_brain_peak,
                                      out=np.zeros_like(h3k_brain_sign, dtype=float),
                                      where=h3k_brain_peak != 0)
            # Window coordinates come from the first table (liver / forebrain).
            win_start = chrom_rows[0]['window_start'].values

            h3k_frames.append(pd.DataFrame({
                'time': time,
                'chrom': chrom_id,
                'window_start': win_start,
                'tissue': tissue,
                'class': h3k_i,
                'value': h3k_array
            }))

# Concatenate once at the end; growing a DataFrame with concat inside the loop
# copies all previously collected rows on every iteration.
h3k_df = pd.concat(h3k_frames, ignore_index=True) if h3k_frames else pd.DataFrame()

print(h3k_df)


      time  chrom  window_start tissue     class     value
0     14.5   chr1             0  Liver   H3K27ac  0.000000
1     14.5   chr1       1000000  Liver   H3K27ac  0.000000
2     14.5   chr1       2000000  Liver   H3K27ac  0.000000
3     14.5   chr1       3000000  Liver   H3K27ac  0.000000
4     14.5   chr1       4000000  Liver   H3K27ac  4.973663
...    ...    ...           ...    ...       ...       ...
9887  14.5  chr19      57000000  Brain  H3K27me3  4.451700
9888  14.5  chr19      58000000  Brain  H3K27me3  3.724736
9889  14.5  chr19      59000000  Brain  H3K27me3  4.225715
9890  14.5  chr19      60000000  Brain  H3K27me3  3.900072
9891  14.5  chr19      61000000  Brain  H3K27me3  3.999140

[9892 rows x 6 columns]


In [None]:
# Calculate the Pearson correlation coefficient between E1 values and epigenetic signals for each chromosome
#
# For every sample, chromosome (chr1 .. chr19) and tissue, the E1 track is
# paired element-wise with each histone-mark track stored in `h3k_df`; bins
# where either value is NaN are dropped before `np.corrcoef` is evaluated.
# One line is printed per (sample, chromosome, tissue, mark).

for sample_id in sample_list:
    # The E1 file depends only on sample and tissue, so it is read once here
    # instead of once per chromosome.
    e1_by_tissue = {}
    for tissue in tissue_list:
        e1_path = '{0}{1}/{1}_fasthigashi_leiden_anno_man_{2}.cis_eigs.csv'.format(dir_path, sample_id, tissue)
        e1_by_tissue[tissue] = pd.read_csv(e1_path, header=0, index_col=0, sep=',')

    for chrom_i in range(1, 20):
        chrom_id = 'chr{0}'.format(chrom_i)
        for tissue in tissue_list:
            e1_df = e1_by_tissue[tissue]
            e1_chr_df = e1_df[e1_df['chrom'] == chrom_id].copy()
            e1_array = e1_chr_df['E1'].values

            # prepare h3k data
            for h3k_i in h3k_list:
                h3k_array = h3k_df[(h3k_df['time'] == time) &
                                   (h3k_df['chrom'] == chrom_id) &
                                   (h3k_df['tissue'] == tissue) &
                                   (h3k_df['class'] == h3k_i)]['value'].values
                # The two tracks are paired by position only, so they must
                # cover the same number of bins.  Fail with a message that
                # names the offending combination rather than with a bare
                # broadcasting error.
                # NOTE(review): equal length does not prove both tracks start
                # at the same genomic window - confirm the bin alignment.
                if e1_array.shape != h3k_array.shape:
                    raise ValueError(
                        'E1 track ({0} bins) and {1} track ({2} bins) differ in length '
                        'for sample {3}, {4}, tissue {5}'.format(
                            e1_array.shape[0], h3k_i, h3k_array.shape[0],
                            sample_id, chrom_id, tissue))
                valid_mask = ~np.isnan(e1_array) & ~np.isnan(h3k_array)
                e1_array_clean = e1_array[valid_mask]
                h3k_array_clean = h3k_array[valid_mask]
                # Off-diagonal entry of the 2x2 correlation matrix.
                corr_value = np.corrcoef(e1_array_clean, h3k_array_clean)[0, 1]
                print(f'sample id: {sample_id}\t chrom id: {chrom_id}\t\t tissue: {tissue}\t\t Pearson between E1 and {h3k_i}:{round(corr_value, 5)}')
            print()

In [15]:
# Profile figure for one sample and chromosome, exported as a one-page PDF.
# Grid: first row GC, second row E1, then one row per histone mark; the
# tissues of `tissue_list` are placed side by side in the two columns.
sample_id = 'E14.5F5'
chrom_id = 'chr1'

plot_path = '{0}Lineplot_E1_GC_H3K_1MB.pdf'.format(save_dir)

# Spines switched off on every panel; only the left axis line stays visible.
hidden_spines = ('right', 'top', 'bottom')

with PdfPages(plot_path) as pdf:

    plot_col = 2
    plot_row = len(h3k_list) + 2        # GC + E1 + one row per mark
    fig = plt.figure(figsize=(6 * plot_col, 2 * plot_row))

    # GC track of the selected chromosome.
    gc_cov = pd.read_csv('{0}mm10_gc_cov_1MB.tsv'.format(gc_dir), header=0, index_col=None, sep='\t')
    # NOTE(review): the first and last three bins are dropped, presumably so
    # the GC track matches the E1 track - confirm.
    gc_cov_chr = gc_cov.loc[gc_cov['chrom'] == chrom_id, 'GC'].values[3:-3]

    # The same GC curve is drawn at the top of both columns.
    for col_idx in range(plot_col):
        ax_gc = plt.subplot(plot_row, plot_col, col_idx + 1)
        ax_gc.plot(gc_cov_chr, label='GC')
        ax_gc.set_ylabel('GC', fontsize=16)
        ax_gc.set_xticks([])
        ax_gc.set_title(chrom_id, fontsize=20)
        for spine_name in hidden_spines:
            ax_gc.spines[spine_name].set_visible(False)

    for tissue_idx, tissue in enumerate(tissue_list):
        e1_table = pd.read_csv(
            '{0}{1}/{1}_fasthigashi_leiden_anno_man_{2}.cis_eigs.csv'.format(dir_path, sample_id, tissue),
            header=0, index_col=0, sep=',')
        e1_array = e1_table.loc[e1_table['chrom'] == chrom_id, 'E1'].values
        bin_index = range(len(e1_array))

        # E1 panel: second grid row, column chosen by the tissue position.
        plot_start_idx = plot_col + tissue_idx + 1
        ax_e1 = plt.subplot(plot_row, plot_col, plot_start_idx)
        # Positive E1 is shaded red, zero or negative E1 blue.
        ax_e1.fill_between(bin_index, e1_array, 0, where=(e1_array > 0), facecolor='red', alpha=0.5)
        ax_e1.fill_between(bin_index, e1_array, 0, where=(e1_array <= 0), facecolor='blue', alpha=0.5)
        ax_e1.plot(e1_array, label='E1', color='black', alpha=1, lw=0.2)
        ax_e1.set_ylabel('E1', fontsize=16)
        ax_e1.set_title('{0} {1}'.format(sample_id, tissue), fontsize=20)
        ax_e1.set_xticks([])
        for spine_name in hidden_spines:
            ax_e1.spines[spine_name].set_visible(False)

        # One panel per histone mark, stacked below the E1 panel of this tissue.
        for mark_idx, h3k_i in enumerate(h3k_list):
            mark_rows = ((h3k_df['time'] == time) &
                         (h3k_df['chrom'] == chrom_id) &
                         (h3k_df['tissue'] == tissue) &
                         (h3k_df['class'] == h3k_i))
            h3k_array = h3k_df.loc[mark_rows, 'value'].values

            ax_h3k = plt.subplot(plot_row, plot_col, plot_start_idx + (mark_idx + 1) * plot_col)
            ax_h3k.plot(h3k_array)
            ax_h3k.set_ylabel(h3k_i, fontsize=16)
            ax_h3k.set_xticks([])
            for spine_name in hidden_spines:
                ax_h3k.spines[spine_name].set_visible(False)

    fig.subplots_adjust(hspace=0.5, wspace=0.2)
    pdf.savefig(fig)
    plt.close(fig)


In [None]:
# Profile figure for one sample and chromosome without H3K9me3, with the
# compartment "switch" regions shaded on the E1 panels.
# Layout (single column): GC on top, then for every tissue one E1 panel
# followed by one panel per histone mark.

sample_id = 'E14.5F5'
chrom_id = 'chr7'
# Marks shown in this figure only.  A cell-local name is used on purpose:
# reassigning the shared `h3k_list` here would change which marks the
# data-preparation, correlation and profile cells process when they are
# re-run afterwards.
switch_h3k_list = ['H3K27ac', 'H3K27me3']

switch_region = pd.read_csv('{0}SwitchRange_{1}.csv'.format(switch_dir, sample_id), header=0, index_col=None)
switch_region = switch_region[(switch_region['chrom'] == chrom_id)]
# Base-pair coordinates converted to 1 Mb bin indices, computed once for all
# tissues.
# NOTE(review): the E1 panels use the row position within the chromosome as
# x coordinate; these bin indices line up with it only if the E1 table starts
# at window 0 of the chromosome - confirm.
switch_spans = [(start_bp // 1_000_000, end_bp // 1_000_000)
                for start_bp, end_bp in zip(switch_region['start'], switch_region['end'])]

plot_path = '{0}Lineplot_E1_GC_H3K_1MB.switch.pdf'.format(save_dir)

with PdfPages(plot_path) as pdf:

    # Panels per tissue (E1 + marks) and total rows (GC + all tissue panels)
    # are derived from the list lengths so the grid stays consistent when
    # marks or tissues are added or removed.
    rows_per_tissue = len(switch_h3k_list) + 1
    plot_row = len(tissue_list) * rows_per_tissue + 1
    plot_col = 1
    fig = plt.figure(figsize=(6*plot_col, 2*plot_row))

    gc_path = '{0}mm10_gc_cov_1MB.tsv'.format(gc_dir)
    gc_cov = pd.read_csv(gc_path, header=0, index_col=None, sep='\t')
    gc_cov_chr = gc_cov.loc[gc_cov['chrom'] == chrom_id, 'GC'].values
    # NOTE(review): drops the first and last three bins, presumably so the GC
    # track matches the E1 track - confirm.
    gc_cov_chr = gc_cov_chr[3: -3]
    ax_gc = plt.subplot(plot_row, plot_col, 1)
    ax_gc.plot(gc_cov_chr, label='GC')
    ax_gc.set_ylabel('GC', fontsize=16)
    ax_gc.set_xticks([])
    ax_gc.set_title(chrom_id, fontsize=20)
    ax_gc.spines['right'].set_visible(False)
    ax_gc.spines['top'].set_visible(False)
    ax_gc.spines['bottom'].set_visible(False)

    for t, tissue in enumerate(tissue_list):
        e1_path = '{0}{1}/{1}_fasthigashi_leiden_anno_man_{2}.cis_eigs.csv'.format(dir_path, sample_id, tissue)
        e1_df = pd.read_csv(e1_path, header=0, index_col=0, sep=',')
        e1_chr_df = e1_df[e1_df['chrom'] == chrom_id].copy()
        e1_array = e1_chr_df['E1'].values

        # First panel of this tissue: skip the GC row and the panels of the
        # tissues drawn before it.
        plot_start_idx = plot_col + t*rows_per_tissue + 1
        ax1 = plt.subplot(plot_row, plot_col, plot_start_idx)
        # Positive E1 is shaded red, zero or negative E1 blue.
        ax1.fill_between(range(e1_chr_df.shape[0]), e1_array, 0, where=(e1_array > 0), facecolor='red', alpha=0.5)
        ax1.fill_between(range(e1_chr_df.shape[0]), e1_array, 0, where=(e1_array <= 0), facecolor='blue', alpha=0.5)
        ax1.plot(e1_array, label='E1', color='black', alpha=1, lw=0.2)
        ax1.set_ylabel('E1', fontsize=16)
        ax1.set_title('{0} {1}'.format(sample_id, tissue), fontsize=20)
        ax1.set_xticks([])
        ax1.spines['right'].set_visible(False)
        ax1.spines['top'].set_visible(False)
        ax1.spines['bottom'].set_visible(False)
        # Shade every switch region of this chromosome on the E1 panel.
        for start, end in switch_spans:
            ax1.axvspan(start, end, color='gray', alpha=0.3, label='switch')

        # prepare h3k data
        for h, h3k_i in enumerate(switch_h3k_list):
            h3k_array = h3k_df[(h3k_df['time'] == time) &
                               (h3k_df['chrom'] == chrom_id) &
                               (h3k_df['tissue'] == tissue) &
                               (h3k_df['class'] == h3k_i)]['value'].values

            # Mark panels follow directly below the E1 panel of this tissue.
            ax_h3k = plt.subplot(plot_row, plot_col, plot_start_idx+(h+1))
            ax_h3k.plot(h3k_array)
            ax_h3k.set_ylabel(h3k_i, fontsize=16)
            ax_h3k.set_xticks([])
            ax_h3k.spines['right'].set_visible(False)
            ax_h3k.spines['top'].set_visible(False)
            ax_h3k.spines['bottom'].set_visible(False)

    fig.subplots_adjust(hspace=0.5, wspace=0.2)
    pdf.savefig(fig)
    plt.close(fig)