In [None]:
# This notebook has been superseded; point anyone running it at the
# refactored module.
import warnings

warnings.warn(
    "Use the refactored 'screpairutils' module instead",
    category=UserWarning,  # the default category, spelled out
)

---

In [None]:
import re

import h5py
import numpy as np
import pandas as pd

In [None]:
def is_valid_chrom(s):
    """Tell whether chromosome name `s` should be kept.

    A name is rejected when it is exactly "MT" or "Y", when it starts with
    "ERCC", or when it starts with "GL" or "JH". Everything else is accepted.
    """
    if s in ("MT", "Y"):
        return False
    if s.startswith("ERCC"):
        return False
    return not s.startswith(("GL", "JH"))

In [None]:
def natural_sort_key(s):
    """Build a sort key that orders embedded numbers numerically.

    The string is split into alternating non-digit and digit runs; digit runs
    are converted to int so that e.g. "2" sorts before "10".
    See http://www.codinghorror.com/blog/archives/001018.html
    """
    key = []
    for piece in re.split(r'(\d+)', s):
        if piece.isdigit():
            key.append(int(piece))
        else:
            key.append(piece)
    return key

---

### Load GATC positions and mappability data, and bin the mappability

#### BONUS: mask "problematic" regions ("DAC Blacklisted Regions")

Source: https://www.nature.com/articles/s41598-019-45839-z (paper) -> https://github.com/Boyle-Lab/Blacklist/ (code repository) -> https://www.encodeproject.org/annotations/ENCSR636HFF/ (ENCODE annotation record)

In [1]:
# HDF5 file with one dataset per chromosome; the cumulative sum of each
# dataset is used below as the GATC positions on that chromosome.
POSFN = "/data/zfs/references/human/hg19/posarray/Homo_sapiens.GRCh37.dna.primary_assembly.with_ERCC.GATC.posarray.hdf5"
# HDF5 file with one dataset per chromosome; entries > 0 are treated below as
# mappable.
MAPFN = "/data/zfs/references/human/hg19/mappability/Homo_sapiens.GRCh37.dna.primary_assembly.with_ERCC.GATC.bowtie2_very_sensitive_N1.readlength_65.counts.pos.hdf5"
# Gzipped, tab-separated, header-less table of blacklisted regions; read below
# with the columns chrom, start, end, type, score, strand.
BLACKLISTFN = "/data/zfs/references/human/hg19/mappability/ENCFF001TDO.bed.gz"

In [None]:
# Read the per-chromosome position datasets. Only chromosomes accepted by
# is_valid_chrom are kept, in natural order ("2" before "10").
with h5py.File(POSFN, 'r') as f:
    chroms = [name for name in f.keys() if is_valid_chrom(name)]
    chroms.sort(key=natural_sort_key)
    # The cumulative sum of each stored dataset is used as the positions.
    pos = {chrom: np.cumsum(f[chrom][:]) for chrom in chroms}

# Chromosome size is taken as the last position plus one.
chromsizes = {chrom: int(pos[chrom][-1]) + 1 for chrom in chroms}

In [None]:
# Number of BINSIZE-wide bins needed to cover each chromosome (rounded up).
binned_chromsizes = dict(
    (chrom, int(np.ceil(chromsizes[chrom] / BINSIZE)))
    for chrom in chroms
)

In [None]:
# Index of the BINSIZE-wide bin that each position falls into.
binned_pos = {
    chrom: np.floor_divide(pos[chrom], BINSIZE)
    for chrom in chroms
}

In [None]:
# Read the per-chromosome mappability datasets and reduce them to booleans:
# an entry counts as mappable when its stored value is greater than zero.
with h5py.File(MAPFN, 'r') as f:
    mapab = {}
    for chrom in chroms:
        mapab[chrom] = np.greater(f[chrom][:], 0)

Load and apply the blacklist table

In [None]:
# The blacklist file is a gzipped, tab-separated table without a header row.
blacklisttbl = pd.read_csv(
    BLACKLISTFN,
    sep="\t",
    compression="gzip",
    header=None,
)

blacklisttbl.columns = [
    "chrom",
    "start",
    "end",
    "type",
    "score",
    "strand",
]

# The blacklist uses "chr"-prefixed chromosome names, unlike the names read
# from the position file; make sure that holds for every row before renaming.
assert all(name.startswith("chr") for name in blacklisttbl["chrom"])

In [None]:
# Map the chromosome names used by the position file (e.g. "1", "X") to the
# "chr"-prefixed names used by the blacklist table.
CHRNAME_MAP = {
    chrom: "chr%s" % chrom
    for chrom in chroms
    # Equality, not substring membership: `chrom not in "MT"` would also
    # exclude chromosomes named "M", "T" or "".
    if chrom != "MT"
}

# "MT" is the one name whose counterpart is not a plain "chr" prefix.
CHRNAME_MAP['MT'] = "chrM"

INV_CHRNAME_MAP = {
    v: k for (k, v) in CHRNAME_MAP.items()
}

# Rename the blacklist chromosomes to the position-file naming. Names without
# a counterpart in INV_CHRNAME_MAP become NaN. Values that already are
# position-file names are left untouched, so re-running this cell does not
# turn the whole column into NaN.
already_renamed = blacklisttbl["chrom"].isin(list(CHRNAME_MAP))
blacklisttbl["chrom"] = blacklisttbl["chrom"].map(INV_CHRNAME_MAP).where(
    ~already_renamed, blacklisttbl["chrom"]
)

In [None]:
# Mark every position that falls inside a (padded) blacklisted region as
# unmappable.
#
# Group by the column *name* rather than a one-element list: with a list
# grouper, pandas >= 2.0 yields 1-tuples such as ("1",) as group keys, which
# never match an entry of `chroms`, so every chromosome would be skipped.
# Rows whose chromosome is NaN are dropped by groupby.
for chrom, chromsubdf in blacklisttbl.groupby("chrom"):
    if chrom not in chroms:
        print("Skipping %s" % chrom)
        continue

    # Add some padding, since a GATC may fall outside a blacklisted region
    # while the majority of the read sequence still falls into it. The padded
    # interval is clipped to [0, chromsizes[chrom]].
    padded_starts = np.maximum(0, chromsubdf["start"].values - 100).astype(int)
    padded_ends = np.minimum(chromsizes[chrom], chromsubdf["end"].values + 100).astype(int)

    # Translate coordinates into index ranges into pos[chrom]: one
    # (first index, one-past-last index) row per blacklisted region.
    segments = np.array([
        np.searchsorted(pos[chrom], padded_starts),
        np.searchsorted(pos[chrom], padded_ends),
    ]).T

    for start, end in segments:
        mapab[chrom][start:end] = False

In [None]:
# Per chromosome, count the mappable entries that land in each bin.
binned_mapab = {}
for chrom in chroms:
    # Mappable entries per position, summed over the last axis.
    mappable_per_pos = mapab[chrom].sum(axis=-1).astype(int)
    bin_counts = np.zeros(binned_chromsizes[chrom], dtype=int)
    # Unbuffered add, so positions sharing a bin all contribute to it.
    np.add.at(bin_counts, binned_pos[chrom], mappable_per_pos)
    binned_mapab[chrom] = bin_counts

In [None]:
# Minimum binned mappability count for a bin to pass: 2 per kb of bin width.
CUTOFF_MAPAB = (BINSIZE / 1000) * 2

In [None]:
# Boolean mask per chromosome: True for bins whose binned mappability count
# reaches CUTOFF_MAPAB.
w_mapab = {
    chrom: np.greater_equal(binned_mapab[chrom], CUTOFF_MAPAB)
    for chrom in chroms
}

---