In [1]:
#!/bin/env python

"""
Created on Nov 17, 2017

Module that helps containerize the CLIP peak information.

@author: Brian
"""
import pandas as pd
import pybedtools
import pyBigWig
import intervals

class Peak():
    """
    ReadDensity class
    Attributes:
        self.pos(positive *.bw file)
        self.neg(negative *.bw file)
    """

    def __init__(self, peaks, name=None, stranded=True):
        try:
            self.peaks = pyBigWig.open(peaks)
            self.name = name if name is not None else ''
            self.stranded = stranded

        except Exception as e:
            print("couldn't open the peak files!")
            print(e)

    def overlaps(self, chrom, start, end, strand, flatten=False):
        """
        Returns true if there is a peak that overlaps the defined region.
        False otherwise.

        Parameters
        ----------
        chrom
        start
        end
        strand
        flatten

        Returns
        -------

        """
        region = pybedtools.create_interval_from_list(
            [
                chrom, str(start), str(end), '.', '0', strand
            ]
        )
        series = pd.Series(data=0, index=range(len(region)))
        try:
            overlapped_peaks = self.peaks.entries(chrom, start, end, strand)
        except RuntimeError as e:
            print(
            "weird entry (this can happen if the peak bb does not contain this chromosome, or if the region is invalid)"
            ": {}:{}-{}:{}".format(chrom, start, end, strand), e)
            return False
        if overlapped_peaks is None:
            return False
        else:
            return True

    def values(self, chrom, start, end, strand, flatten=False):
        """

        Parameters
        ----------
        chrom : basestring
            (eg. chr1)
        start : int
            0-based start (first position in chromosome is 0)
        end : int
            1-based end (last position is not included)
        strand : str
            either '+' or '-'
        flatten : bool
            in the case where multiple peaks overlap a region,
            scores will be summed over these regions. If flatten = True,
            scores will be the minimum of the multiple peaks.

        Returns
        -------
        densities : list
            values corresponding to density over specified positions.
        """

        # Get all overlapping values
        region = pybedtools.create_interval_from_list(
            [
                chrom, str(start), str(end), '.', '0', strand
            ]
        )
        series = pd.Series(data=0, index=range(len(region)))
        try:
            overlapped_peaks = self.peaks.entries(chrom, start, end, strand)
        except RuntimeError as e:
            print("weird entry (this can happen if the peak bb does not contain this chromosome, or if the region is invalid)"
                  ": {}:{}-{}:{}".format(chrom, start, end, strand), e)
            return series

        if overlapped_peaks is None:
            return series
        else:
            print(overlapped_peaks)
            for p in overlapped_peaks:
                bed_list = [chrom, str(p[0]), str(p[1])] + p[2].split('\t')
                if self.stranded:
                    if bed_list[5] == strand:
                        peak = pybedtools.create_interval_from_list(bed_list)
                        if flatten:
                            print('not implemented or important yet')  # TODO: implement flatten
                        else:
                            series += intervals.get_overlap(peak, region)
                else:
                    peak = pybedtools.create_interval_from_list(bed_list)
                    if flatten:
                        print('not implemented or important yet')  # TODO: implement flatten
                    else:
                        series += intervals.get_overlap(peak, region)
            return series

    def pseudocount(self):
        return 0

In [2]:
bb_file = '/projects/ps-yeolab3/bay001/maps/current_annotations/se_peak_bigbeds/204_01.basedon_204_01.peaks.l2inputnormnew.bed.compressed.bed.p3f1.bed.sorted.bed.bb'

In [3]:
stranded_test = Peak(peaks=bb_file, name='204_01_test')
unstranded_test = Peak(peaks=bb_file, name='204_01_test', stranded=False)

In [4]:
chrom = 'chr1'
start = 17451
end = 17528
strand = '-'
stranded_neg_test = stranded_test.values(chrom=chrom, start=start, end=end, strand=strand)[:5]
stranded_neg_test

[(17451, 17528, 'peak\t0\t-')]


0    1
1    1
2    1
3    1
4    1
dtype: int64

In [5]:
chrom = 'chr1'
start = 17451
end = 17528
strand = '+'
stranded_test.values(chrom=chrom, start=start, end=end, strand=strand)[:5]

[(17451, 17528, 'peak\t0\t-')]


0    0
1    0
2    0
3    0
4    0
dtype: int64

In [12]:
from pandas.testing import assert_series_equal

In [14]:
x = unstranded_test.values(chrom=chrom, start=start, end=end, strand="+")
y = unstranded_test.values(chrom=chrom, start=start, end=end, strand="-")
assert_series_equal(x, y)

[(17451, 17528, 'peak\t0\t-')]
[(17451, 17528, 'peak\t0\t-')]


In [17]:
z = stranded_test.values(chrom=chrom, start=start, end=end, strand="-")
assert_series_equal(x, z)

[(17451, 17528, 'peak\t0\t-')]


In [None]:
test_bigwig = pyBigWig.open(bb_file)

In [None]:
test_bigwig.entries(chrom, start, end, strand)