# Get consensus peaks from cell type specific peaks
**Authorship:** Adam Klie (last updated: 08/18/2023)<br>
***
**Description:** Notebook to get consensus peaks from cell type specific peaks
***

# Set-up

In [7]:
import os
import sys
import glob
import yaml
import pandas as pd
import pyranges as pr
import requests

from pycisTopic.iterative_peak_calling import get_consensus_peaks

In [9]:
def load_narrow_peak(filename):
    """
    Load MACS2 narrow peak files as :class:`pr.PyRanges`.
    """
    narrow_peak = pd.read_csv(
        filename,
        sep="\t",
        header=None,
    )
    narrow_peak.columns = [
        "Chromosome",
        "Start",
        "End",
        "Name",
        "Score",
        "Strand",
        "FC_summit",
        "-log10_pval",
        "-log10_qval",
        "Summit",
    ]
    narrow_peak_pr = pr.PyRanges(narrow_peak)
    return narrow_peak_pr

In [10]:
# Other param
peaks_dir = "/cellar/users/aklie/data/igvf/beta_cell_networks/peaks/igvf_sc-islet_10X-Multiome/10Aug23/pycistopic"
peak_half_width = 250
chromsizes_path = "/cellar/users/aklie/data/igvf/references/genomes/hg38/GRCh38_EBV.chrom.sizes"
blacklist_path = "/cellar/users/aklie/data/igvf/references/blacklists/hg38/ENCFF356LFX.bed"
output_dir = os.path.join(peaks_dir, "consensus_peaks")
params_file = os.path.join(output_dir, "params.yaml")
params = {
    "peaks_dir": peaks_dir,
    "peak_half_width": peak_half_width,
    "chromsizes_path": chromsizes_path,
    "blacklist_path": blacklist_path,
    "output_dir": output_dir,
}

# Make output dir
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Dump yaml
with open(params_file, "w") as f:
    yaml.dump(params, f)

In [19]:
# Get chromosome sizes (for hg38 here). We need them to ensure that extending the summits we don't fall out of the chromosome.
chromsizes = pd.read_csv(chromsizes_path, sep='\t', header=None)
chromsizes.columns = ['Chromosome', 'End']
chromsizes['Start'] = [0]*chromsizes.shape[0]
chromsizes = chromsizes.loc[:,['Chromosome', 'Start', 'End']]
chromsizes['Chromosome'] = [chromsizes['Chromosome'][x].replace('v', '.') for x in range(len(chromsizes['Chromosome']))]
chromsizes['Chromosome'] = [chromsizes['Chromosome'][x].split('_')[1] if len(chromsizes['Chromosome'][x].split('_')) > 1 else chromsizes['Chromosome'][x] for x in range(len(chromsizes['Chromosome']))]
chromsizes.tail()

Unnamed: 0,Chromosome,Start,End
451,chrUn_KI270754v1,0,40191
452,chrUn_KI270755v1,0,36723
453,chrUn_KI270756v1,0,79590
454,chrUn_KI270757v1,0,71251
455,chrEBV,0,171823


In [11]:
# Get all the peak files
peaks_glob = os.path.join(peaks_dir, "*.narrowPeak")
peaks_files = glob.glob(peaks_glob)
peaks_files

['/cellar/users/aklie/data/igvf/beta_cell_networks/peaks/igvf_sc-islet_10X-Multiome/10Aug23/pycistopic/progenitor_peaks.narrowPeak',
 '/cellar/users/aklie/data/igvf/beta_cell_networks/peaks/igvf_sc-islet_10X-Multiome/10Aug23/pycistopic/SC_EC_peaks.narrowPeak',
 '/cellar/users/aklie/data/igvf/beta_cell_networks/peaks/igvf_sc-islet_10X-Multiome/10Aug23/pycistopic/SC_alpha_peaks.narrowPeak',
 '/cellar/users/aklie/data/igvf/beta_cell_networks/peaks/igvf_sc-islet_10X-Multiome/10Aug23/pycistopic/SC_beta_peaks.narrowPeak',
 '/cellar/users/aklie/data/igvf/beta_cell_networks/peaks/igvf_sc-islet_10X-Multiome/10Aug23/pycistopic/pre_SC_alpha_peaks.narrowPeak',
 '/cellar/users/aklie/data/igvf/beta_cell_networks/peaks/igvf_sc-islet_10X-Multiome/10Aug23/pycistopic/SC_delta_peaks.narrowPeak']

In [15]:
# Get cell type names from consensus peaks
celltypes = [os.path.basename(i).split("/")[0].split(".")[0].split("_peaks")[0] for i in peaks_files]
celltypes

['progenitor', 'SC_EC', 'SC_alpha', 'SC_beta', 'pre_SC_alpha', 'SC_delta']

In [16]:
# Load the narrowPeak files as pyranges
peaks_prs = [load_narrow_peak(file) for file in peaks_files]

In [17]:
# Create a dictionary 
peaks_dict = dict(zip(celltypes, peaks_prs))
peaks_dict

{'progenitor': +--------------+-----------+-----------+------------------------+-------+
 | Chromosome   | Start     | End       | Name                   | +6    |
 | (category)   | (int32)   | (int32)   | (object)               | ...   |
 |--------------+-----------+-----------+------------------------+-------|
 | GL000195.1   | 16297     | 16636     | progenitor_peak_1      | ...   |
 | GL000195.1   | 17081     | 17346     | progenitor_peak_2      | ...   |
 | GL000195.1   | 24129     | 24382     | progenitor_peak_3      | ...   |
 | GL000195.1   | 30441     | 30955     | progenitor_peak_4a     | ...   |
 | ...          | ...       | ...       | ...                    | ...   |
 | chrY         | 56829308  | 56829703  | progenitor_peak_139485 | ...   |
 | chrY         | 56830714  | 56831059  | progenitor_peak_139486 | ...   |
 | chrY         | 56832541  | 56832943  | progenitor_peak_139487 | ...   |
 | chrY         | 56833555  | 56834370  | progenitor_peak_139488 | ...   |
 +---------

In [21]:
# Get consensus peaks
consensus_peaks = get_consensus_peaks(
    peaks_dict,
    peak_half_width, 
    chromsizes=chromsizes, 
    path_to_blacklist=blacklist_path
)

2023-08-22 12:07:56,523 cisTopic     INFO     Extending and merging peaks per class
2023-08-22 12:12:29,660 cisTopic     INFO     Normalizing peak scores
2023-08-22 12:12:30,585 cisTopic     INFO     Merging peaks
2023-08-22 12:16:29,098 cisTopic     INFO     Done!


In [22]:
consensus_peaks.to_bed(os.path.join(output_dir, "consensus_peaks.bed.gz"), keep=True, compression='infer', chain=False)

In [23]:
len(consensus_peaks)

442874

# DONE!

---