In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os
import glob
import pybedtools
import pyBigWig
from collections import defaultdict
from tqdm import tnrange, tqdm_notebook

pd.set_option('display.max_rows', 1000)

In [2]:
# Peak calls (BED) and the regions (BED) they get intersected with below.
test_peaks = (
    '/projects/ps-yeolab3/bay001/maps/rbfox2/'
    '204_01.basedon_204_01.peaks.l2inputnormnew.bed.compressed.bed'
)
test_regions = (
    '/projects/ps-yeolab3/bay001/annotations/data/regions/'
    'hg19_v19_exons.bed'
)

In [3]:
# Load both BED files as BedTools, then report (rows, columns) of each.
peaks, regions = (
    pybedtools.BedTool(bed_path) for bed_path in (test_peaks, test_regions)
)

print(peaks.to_dataframe().shape, regions.to_dataframe().shape, sep='\n')

(150490, 6)
(324422, 6)


In [4]:
def intersect_peaks(peaks, regions):
    """
    Wraps pybedtools intersect and labels the paired peak/region columns.

    Parameters
    ----------
    peaks : pybedtools.BedTool
        6-column intervals; reported first (columns prefixed with 'p').
    regions : pybedtools.BedTool
        6-column intervals; reported second (columns prefixed with 'r').

    Returns
    -------
    df : pandas.DataFrame
        One row per same-strand peak/region overlap (wa, wb, s), with the
        12 columns named below. Empty (but still labelled) when nothing
        overlaps.
    bt : pybedtools.BedTool
        The raw intersection.
    """
    columns = [
        'pchrom', 'pstart', 'pend', 'pl2fc', 'pl10p', 'pstrand',
        'rchrom', 'rstart', 'rend', 'rgene', 'rscore', 'rstrand'
    ]
    bt = peaks.intersect(
        regions,
        wa=True,
        wb=True,
        s=True,
    )
    try:
        df = bt.to_dataframe()
    except pd.errors.EmptyDataError:
        # Presumably only older pybedtools releases hand the empty file
        # straight to pandas; treat that the same as an empty frame.
        df = pd.DataFrame()
    if df.shape[1] == 0:
        # Nothing overlapped: assigning 12 names to a column-less frame
        # would raise, so return an empty frame with the expected schema.
        return pd.DataFrame(columns=columns), bt
    df.columns = columns
    return df, bt

def get_overlap(peak, region):
    """
    Returns a per-position series over the region: 1 where the peak
    covers the position, 0 elsewhere.

    Parameters
    ----------
    peak : pybedtools.Interval
    region : pybedtools.Interval
        Must be on the same strand as the peak.

    Returns
    -------
    series : pandas.Series
        len(region) integers indexed by offset from region.start. For '-'
        strand intervals the series (values and index) is reversed.

    Raises
    ------
    AssertionError
        If the peak and the region are on different strands.
    """
    assert peak.strand == region.strand
    series = pd.Series(data=0, index=range(len(region)))

    # Clamp the peak to the region and express it as offsets from the
    # region start. This covers every overlap shape (equal, left, right,
    # spanning, contained) without a branch per case.
    first = max(peak.start, region.start) - region.start
    last = min(peak.end, region.end) - region.start
    if last > first:  # otherwise the intervals do not overlap at all
        series.iloc[first:last] = 1

    if peak.strand == '-':
        return series.iloc[::-1]
    return series

def create_intervals(series):
    """
    Splits one row of the intersect dataframe into its two intervals.

    The 'p*' fields become the peak interval and the 'r*' fields the
    region interval; every field is stringified before being handed to
    pybedtools. Returns (peak_interval, region_interval).
    """
    def _as_interval(field_names):
        fields = list(map(str, series[field_names]))
        return pybedtools.create_interval_from_list(fields)

    peak_interval = _as_interval(
        ['pchrom', 'pstart', 'pend', 'pl2fc', 'pl10p', 'pstrand']
    )
    region_interval = _as_interval(
        ['rchrom', 'rstart', 'rend', 'rgene', 'rscore', 'rstrand']
    )
    return peak_interval, region_interval

def create_matrix(peaks, regions):
    """
    Creates the peak overlap matrix.

    Intersects the peaks with the regions and, for each region that has
    at least one overlapping peak, sums the per-position overlap series
    of all its peaks.

    Parameters
    ----------
    peaks : pybedtools.BedTool
    regions : pybedtools.BedTool

    Returns
    -------
    overlaps : defaultdict
        Maps str(region) to the summed overlap series for that region.
    """
    overlaps = defaultdict()
    df, bt = intersect_peaks(peaks, regions)
    progress = tnrange(df.shape[0])
    # Iterating the progress bar advances it once per row.
    for i in progress:
        peak, region = create_intervals(df.iloc[i])
        try:
            series = get_overlap(peak, region)
        except AssertionError:
            print(peak, region)
            # Skip the pair: falling through would add the previous row's
            # series to this region (or fail on the very first row).
            continue
        key = str(region)
        overlaps[key] = overlaps[key] + series if key in overlaps else series
    return overlaps

def get_score_over_region(peak_interval, region_interval, overlap):
    """
    Returns a value-less series with one slot per position of the region.

    Only region_interval is used; peak_interval and overlap are accepted
    but currently ignored.
    """
    region_length = region_interval.end - region_interval.start
    return pd.Series(index=range(region_length))
    

In [5]:
# Pair every peak with each same-strand region it overlaps: the p* columns
# describe the peak, the r* columns the region. Preview the first rows.
df, bt = intersect_peaks(peaks, regions)
df.head()

Unnamed: 0,pchrom,pstart,pend,pl2fc,pl10p,pstrand,rchrom,rstart,rend,rgene,rscore,rstrand
0,chr7,99949487,99949575,68.945425,5.091127,+,chr7,99948875,99949523,ENSG00000242294.2,0,+
1,chr7,99949345,99949407,46.955617,4.872148,+,chr7,99948875,99949523,ENSG00000242294.2,0,+
2,chr7,1028070,1028148,41.087353,3.512011,+,chr7,1027913,1029276,ENSG00000073067.9,0,+
3,chr7,99949314,99949345,35.864263,4.517305,+,chr7,99948875,99949523,ENSG00000242294.2,0,+
4,chr7,99949052,99949121,32.616651,4.394294,+,chr7,99948875,99949523,ENSG00000242294.2,0,+


In [7]:
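# Disabled: builds the overlap series for every intersected region.
# The cells below read `d`, so uncomment and run this line before using them.
# d = create_matrix(peaks, regions)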

In [None]:
# Look up one region's summed overlap series; keys of `d` are str(region),
# i.e. the tab-separated fields followed by a newline. Requires `d` from
# create_matrix.
s1 = d['chr12	53564961	53565772	ENSG00000139631.14	0	-\n']
pd.DataFrame(s1).plot()

In [None]:
# Same series with its row order reversed.
pd.DataFrame(s1.iloc[::-1]).plot()

In [None]:
# NOTE(review): `s2` is not assigned in any cell shown here; define it
# before running this cell.
s1.fillna(0) + s2.fillna(0)

In [101]:
#!/bin/env python

"""
Created on Nov 17, 2017

Module that helps containerize the CLIP read information.

@author: Brian
"""
import pandas as pd
import numpy as np
import pybedtools
from collections import defaultdict

def intersect_peaks(peaks, regions):
    """
    Wraps pybedtools intersect and names the resulting columns.

    Parameters
    ----------
    peaks : pybedtools.BedTool
        6-column intervals, reported in the 'p*' columns.
    regions : pybedtools.BedTool
        6-column intervals, reported in the 'r*' columns.

    Returns
    -------
    df : pandas.DataFrame
        One row per same-strand overlap (wa=True, wb=True, s=True). When
        nothing overlaps, an empty frame that still carries the 12 column
        names.
    bt : pybedtools.BedTool
        The intersection itself.
    """
    column_names = [
        'pchrom', 'pstart', 'pend', 'pl2fc', 'pl10p', 'pstrand',
        'rchrom', 'rstart', 'rend', 'rgene', 'rscore', 'rstrand'
    ]
    bt = peaks.intersect(
        regions,
        wa=True,
        wb=True,
        s=True,
    )
    try:
        df = bt.to_dataframe()
    except pd.errors.EmptyDataError:
        # Presumably raised by older pybedtools releases, which pass the
        # empty result file directly to pandas.
        df = pd.DataFrame()
    if df.shape[1] == 0:
        # No overlaps: renaming a column-less frame with 12 names raises
        # a length-mismatch error, so build the empty frame explicitly.
        return pd.DataFrame(columns=column_names), bt
    df.columns = column_names
    return df, bt


def get_overlap(peak, region, score_type='simple'):
    """
    Returns a per-position series over the region, holding the peak's
    score at every position the peak covers and 0 elsewhere.

    Parameters
    ----------
    peak : pybedtools.Interval
    region : pybedtools.Interval
    score_type : str
        Passed to score(); 'simple' marks covered positions with 1.

    Returns
    -------
    series : pandas.Series
        len(region) values indexed by offset from region.start. Reversed
        (values and index) when the peak is on the '-' strand, except for
        the all-zero 'no_overlap' result, which is returned as built.
        Returns -1 if determine_overlap reports an overlap type this
        function does not know (kept for backward compatibility).
    """
    series = pd.Series(data=0, index=range(len(region)))

    overlap_type, overlap = determine_overlap(peak, region)

    if overlap_type == 'no_overlap':
        return series
    elif overlap_type in ('equal', 'whole_region'):
        if overlap_type == 'whole_region':
            assert overlap == len(series)
        first, last = 0, len(series)
    elif overlap_type == 'left':
        assert peak.end - overlap == region.start
        first, last = 0, overlap
    elif overlap_type == 'right':
        assert peak.start - region.start + overlap == len(series)
        first, last = len(series) - overlap, len(series)
    elif overlap_type == 'whole_peak':
        left_offset = peak.start - region.start
        right_offset = region.end - peak.end
        assert left_offset + overlap + right_offset == len(series)
        first, last = left_offset, left_offset + overlap
    else:
        return -1

    if last > first:
        # Every overlap type uses the requested score; the contained-peak
        # case previously always wrote 1.
        value = score(score_type, peak, region)
        if isinstance(value, float):
            # Fractional scores do not fit the integer zero-series.
            series = series.astype(float)
        series.iloc[first:last] = value

    # assert peak.strand == region.strand
    if peak.strand == '-':
        return series.iloc[::-1]
    else:
        return series

def create_intervals(series):
    """
    Converts a single row of the intersect dataframe into two intervals:
    the peak (built from the 'p*' fields) followed by the region (built
    from the 'r*' fields). All fields are passed to pybedtools as strings.
    """
    field_groups = (
        ['pchrom', 'pstart', 'pend', 'pl2fc', 'pl10p', 'pstrand'],
        ['rchrom', 'rstart', 'rend', 'rgene', 'rscore', 'rstrand'],
    )
    return tuple(
        pybedtools.create_interval_from_list(
            [str(value) for value in series[group]]
        )
        for group in field_groups
    )

def determine_overlap(peak, region):
    """
    Classifies how a peak overlaps a region and measures the overlap.

    Parameters
    ----------
    peak : pybedtools.Interval
    region : pybedtools.Interval

    Returns
    -------
    overlap_type : str
        'no_overlap', 'equal', 'left', 'right', 'whole_region' or
        'whole_peak'. The checks run in that order, so a peak sharing one
        edge with the region is reported as 'left' or 'right'.
    overlap : int
        Number of overlapping positions (0 for 'no_overlap'; -1 if no case
        matched, which is reported with a printed warning).
    """
    # assert(peak.strand == region.strand)
    p_start, p_end = peak.start, peak.end
    r_start, r_end = region.start, region.end

    # peak and region don't overlap
    if p_start >= r_end or r_start >= p_end:
        return 'no_overlap', 0
    # peak and region sizes are equal (completely overlap)
    if (p_start, p_end) == (r_start, r_end):
        return 'equal', p_end - p_start

    starts_not_after = p_start <= r_start
    starts_not_before = p_start >= r_start
    ends_not_after = p_end <= r_end
    ends_not_before = p_end >= r_end

    # (label, condition, overlap length); the first match wins
    cases = (
        ('left', starts_not_after and ends_not_after, p_end - r_start),
        ('right', starts_not_before and ends_not_before, r_end - p_start),
        ('whole_region', starts_not_after and ends_not_before, r_end - r_start),
        ('whole_peak', starts_not_before and ends_not_after, p_end - p_start),
    )
    for label, matched, overlap in cases:
        if matched:
            return label, overlap

    print("warning: {}, {} overlaps in an unexpected way.".format(
        peak, region
    ))
    return 'no_overlap', -1

def score(score_type='simple', peak=None, region=None):
    """
    Returns the value written at each position a peak covers.

    'simple' gives 1, 'fraction_region' gives 1/len(region) and
    'fraction_peak' gives 1/len(peak); any other score_type gives 0.
    """
    if score_type == 'simple':
        return 1
    # the remaining known types divide by the length of one of the intervals
    for label, interval in (('fraction_region', region), ('fraction_peak', peak)):
        if score_type == label:
            return 1.0 / len(interval)
    return 0

class Peak2():
    """
    Peak container backed by pybedtools.

    Attributes:
        self.peaks : defaultdict(dict)
            self.peaks[chrom][strand] is a BedTool holding the peaks read
            from the file for that chromosome and strand.
        self.name : str
            optional label ('' when none is given).
    """

    def __init__(self, peaks, name=None):
        # Set the attributes first so a failed load still leaves an
        # (empty) usable object instead of one missing its attributes.
        self.peaks = defaultdict(dict)
        self.name = name if name is not None else ''
        try:
            df = pd.read_table(peaks, names=[
                'chrom', 'start', 'end', 'name', 'score', 'strand'
            ])
            for chromosome in set(df['chrom']):
                for strand in ['+', '-']:
                    self.peaks[chromosome][strand] = pybedtools.BedTool.from_dataframe(
                        df[(df['chrom'] == chromosome) & (df['strand'] == strand)]
                    )
        except Exception as e:
            print("couldn't open the peak files!")
            print(e)

    @staticmethod
    def _iter_overlaps(bedtool, regions):
        """
        Yields one (peak, region) interval pair per same-strand overlap
        between bedtool and regions.
        """
        try:
            df, bt = intersect_peaks(bedtool, regions)
        except ValueError:
            # NOTE(review): assumed to mean "nothing overlapped" (the empty
            # intersection cannot be parsed or relabelled); confirm no other
            # ValueError needs to surface here.
            return
        for i in range(df.shape[0]):
            yield create_intervals(df.iloc[i])

    def values(self, regions):
        """
        Sums, per region, the overlap series of every peak in this object.

        Parameters
        ----------
        regions : pybedtools.BedTool

        Returns
        -------
        overlaps : defaultdict
            Maps str(region) to the summed overlap series; regions without
            any overlapping peak are absent.
        """
        overlaps = defaultdict()
        # self.peaks is a chrom -> strand -> BedTool mapping, so intersect
        # each BedTool in turn rather than the mapping itself.
        for by_strand in self.peaks.values():
            for bedtool in by_strand.values():
                for peak, region in self._iter_overlaps(bedtool, regions):
                    series = get_overlap(peak, region)
                    key = str(region)
                    overlaps[key] = overlaps[key] + series if key in overlaps else series
        return overlaps

    def value(self, chrom, start, end, strand):
        """
        Sums the overlap series of every peak overlapping one region.

        Parameters
        ----------
        chrom : basestring
            (eg. chr1)
        start : int
            0-based start (first position in chromosome is 0)
        end : int
            1-based end (last position is not included)
        strand : str
            either '+' or '-'

        Returns
        -------
        densites : pandas.Series
            one value per position of the region (all zeros when no peak
            overlaps it, including when the chromosome/strand holds no
            peaks). A list of NaN of the same length is returned if a
            RuntimeError is raised while intersecting.
        """
        try:
            # .get() avoids inserting unknown chromosomes into self.peaks
            bedtool = self.peaks.get(chrom, {}).get(strand)
            total = None
            if bedtool is not None:
                query = pybedtools.BedTool("{}\t{}\t{}\t{}\t{}\t{}".format(
                    chrom, str(start), str(end), '.', '0', strand
                ),
                    from_string=True)
                for peak, region in self._iter_overlaps(bedtool, query):
                    series = get_overlap(peak, region)
                    # accumulate instead of keeping only the last peak
                    total = series if total is None else total + series

            if total is None:
                total = pd.Series(data=0, index=range(end - start))
                if strand == '-':
                    # same orientation get_overlap uses for '-' intervals
                    total = total.iloc[::-1]
            return total
        except RuntimeError:
            return [np.nan] * abs(start - end)

class Peak():
    """
    Peak container backed by a file opened with pyBigWig.

    Attributes:
        self.peaks : handle returned by pyBigWig.open (None if opening
            the file failed)
        self.name : str
            optional label ('' when none is given).
    """

    def __init__(self, peaks, name=None):
        # Define both attributes even when the file cannot be opened.
        self.peaks = None
        self.name = name if name is not None else ''
        try:
            self.peaks = pyBigWig.open(peaks)
        except Exception as e:
            print("couldn't open the peak files!")
            print(e)

    def values(self, chrom, start, end, strand):
        """
        Sums the overlap series of every same-strand peak over a region.

        Parameters
        ----------
        chrom : basestring
            (eg. chr1)
        start : int
            0-based start (first position in chromosome is 0)
        end : int
            1-based end (last position is not included)
        strand : str
            either '+' or '-'

        Returns
        -------
        densites : pandas.Series
            one value per position of the region; all zeros when no
            same-strand peak overlaps it.
        """
        # The 4th argument of entries() is `withString`, not a strand; the
        # strand filter happens below on the entry's own strand field.
        overlapped_peaks = self.peaks.entries(chrom, start, end, withString=True)
        region = pybedtools.create_interval_from_list(
            [
                chrom, str(start), str(end), '.', '0', strand
            ]
        )
        series = pd.Series(data=0, index=range(len(region)))
        # entries() may return None rather than an empty list when nothing
        # overlaps, so fall back to an empty iteration.
        for p in overlapped_peaks or []:
            # p is (start, end, "name\tscore\tstrand...")
            bed_list = [chrom, str(p[0]), str(p[1])] + p[2].split('\t')
            if bed_list[5] == strand:
                peak = pybedtools.create_interval_from_list(bed_list)
                # NOTE(review): `+=` aligns on index labels, so the '-'
                # strand reversal applied by get_overlap does not appear to
                # carry through to the result; confirm which orientation
                # callers expect.
                series += get_overlap(peak, region)
        return series

In [106]:
# Open the sorted peak file (.bb) through the Peak wrapper, which hands the
# path to pyBigWig.
peak_file = '/home/bay001/projects/codebase/rbp-maps/maps/peak/test/test_intervals/204_01.basedon_204_01.peaks.l2inputnormnew.bed.compressed.sorted.bb'
peakobj = Peak(peaks=peak_file)

In [108]:
# Per-position overlap values over one '-' strand region on chr12 (the same
# coordinates as the `d[...]` key looked up earlier).
peakobj.values('chr12',53564961,53565772,'-')

0      0
1      0
2      0
3      0
4      0
5      0
6      0
7      0
8      0
9      0
10     0
11     0
12     0
13     0
14     0
15     0
16     0
17     0
18     0
19     0
20     0
21     0
22     0
23     0
24     0
25     0
26     0
27     0
28     0
29     0
30     0
31     0
32     0
33     0
34     0
35     0
36     0
37     0
38     0
39     0
40     0
41     0
42     0
43     0
44     0
45     0
46     0
47     0
48     0
49     0
50     0
51     0
52     0
53     0
54     0
55     0
56     0
57     0
58     0
59     0
60     0
61     0
62     0
63     0
64     0
65     0
66     0
67     0
68     0
69     0
70     0
71     0
72     0
73     0
74     0
75     0
76     0
77     0
78     0
79     0
80     0
81     0
82     0
83     0
84     0
85     0
86     0
87     0
88     0
89     0
90     0
91     0
92     0
93     0
94     0
95     0
96     0
97     0
98     0
99     0
100    0
101    0
102    0
103    0
104    0
105    0
106    0
107    0
108    0
109    0
110    0
1

In [None]:
# Turn the first '@'-separated element of each MISO id (chrom:start:end:strand)
# into a 6-column BedTool.
miso_file = '/home/bay001/projects/codebase/rbp-maps/maps/peak/test/test_intervals/RBFOX2-BGHLV26-HepG2-excluded-upon-knockdown'
annotation = (
    pd.read_table(miso_file, names=['miso', 'gene'])
    .assign(
        all_upstream_regions=lambda frame: frame['miso'].apply(lambda x: x.split('@')[0])
    )
    .assign(
        chrom=lambda frame: frame['all_upstream_regions'].apply(lambda x: x.split(':')[0]),
        start=lambda frame: frame['all_upstream_regions'].apply(lambda x: x.split(':')[1]),
        end=lambda frame: frame['all_upstream_regions'].apply(lambda x: x.split(':')[2]),
        name='.',
        score=0,
        strand=lambda frame: frame['all_upstream_regions'].apply(lambda x: x.split(':')[3]),
    )
    .loc[:, ['chrom', 'start', 'end', 'name', 'score', 'strand']]
)
annotation = pybedtools.BedTool.from_dataframe(annotation)

In [None]:
# Blank out the name/score columns, sort the peaks by position and write
# them back out as a headerless, tab-separated BED file.
df = (
    pd.read_table(
        '/home/bay001/projects/codebase/rbp-maps/maps/peak/test/test_intervals/204_01.basedon_204_01.peaks.l2inputnormnew.bed.compressed.bed',
        names=['chrom', 'start', 'end', 'name', 'score', 'strand']
    )
    .assign(name='.', score=0)
    .sort_values(by=['chrom', 'start', 'end'])
)
df.to_csv(
    '/home/bay001/projects/codebase/rbp-maps/maps/peak/test/test_intervals/204_01.basedon_204_01.peaks.l2inputnormnew.bed.compressed.sorted.bed',
    sep='\t', index=False, header=False
)