In [1]:
import pybedtools
import pandas as pd

In [2]:
# BED file holding the CLIP peaks that the rest of the notebook annotates
peak_file = '/projects/ps-yeolab3/clip_not_encode/ecwheele/20170614_SRSF2_eirini_all_clip/clipper_intjob_test/EW42_SRSF2_C1-8_diff_IP.merged.r2.peaks.for.kmers.bed'

# wrap the file in a BedTool and sort it in one chained expression
peaks = pybedtools.BedTool(peak_file).sort()

# preview the first rows as a DataFrame (displayed as the cell output)
peaks.to_dataframe().head()

Unnamed: 0,chrom,start,end,name,score,strand
0,chr1,17460,17568,3.5,6.136029,-
1,chr1,18350,18366,3.5,-3.478688,-
2,chr1,24737,24749,3.5,-3.457346,-
3,chr1,135156,135268,3.5,1.270319,-
4,chr1,135160,135268,3.5,-0.387692,-


# count number of peaks

In [3]:
# Record the input size up front; the per-region counts computed later in the
# notebook are compared against this total.
peaks.count()  # total number of peaks (one per line) in the BED file

141858

# intersect with a given feature
- we will intersect our peaks with CDS regions to see which peaks overlap CDS

In [4]:
# CDS annotation in BED format.
# NOTE: the variable is named `exons`, but the file it wraps contains CDS regions.
cds_bed_path = '/home/gpratt/clipper/clipper/data/regions/hg19_v19_cds.bed'
exons = pybedtools.BedTool(cds_bed_path).sort()

# preview the first rows as a DataFrame (displayed as the cell output)
exons.to_dataframe().head()

Unnamed: 0,chrom,start,end,name,score,strand
0,chr1,69091,70005,ENSG00000186092.4,0,+
1,chr1,138533,139309,ENSG00000237683.5,0,-
2,chr1,367659,368594,ENSG00000235249.1,0,+
3,chr1,621099,622034,ENSG00000185097.2,0,-
4,chr1,738532,738618,ENSG00000269831.1,0,-


In [5]:
# Peaks that overlap a CDS region.
# s=True -> only same-strand overlaps count; u=True -> each peak is reported
# at most once, however many CDS regions it hits.
cds_peaks = peaks.intersect(exons, s=True, u=True)

# convert once, then report the count and preview the rows
cds_peaks_df = cds_peaks.to_dataframe()
print(len(cds_peaks_df))  # the number of CDS-overlapping peaks
cds_peaks_df.head()

57547


Unnamed: 0,chrom,start,end,name,score,strand
0,chr1,139037,139088,3.5,-0.768221,-
1,chr1,139291,139377,3.5,-1.48175,-
2,chr1,880129,880174,3.5,3.421289,-
3,chr1,880458,880473,3.5,-0.454948,-
4,chr1,880910,880953,3.5,0.283972,-


In [6]:
# Peaks that do NOT overlap any CDS region on the same strand.
# v=True inverts the intersect, so this is the complement of the u=True call
# made with the same inputs.
non_cds_peaks = peaks.intersect(exons, s=True, v=True)

# convert once, then report the count and preview the rows
non_cds_peaks_df = non_cds_peaks.to_dataframe()
print(len(non_cds_peaks_df))  # the number of non-CDS-overlapping peaks
non_cds_peaks_df.head()

84311


Unnamed: 0,chrom,start,end,name,score,strand
0,chr1,17460,17568,3.5,6.136029,-
1,chr1,18350,18366,3.5,-3.478688,-
2,chr1,24737,24749,3.5,-3.457346,-
3,chr1,135156,135268,3.5,1.270319,-
4,chr1,135160,135268,3.5,-0.387692,-


## What if we want to see whether the rest of the peaks (those that didn't overlap CDS) overlap with 3'UTRs?

In [7]:
# 3'UTR annotation in BED format
utr3_bed_path = '/home/gpratt/clipper/clipper/data/regions/hg19_v19_three_prime_utrs.bed'
utr3 = pybedtools.BedTool(utr3_bed_path).sort()

# preview the first rows as a DataFrame (displayed as the cell output)
utr3.to_dataframe().head()

Unnamed: 0,chrom,start,end,name,score,strand
0,chr1,70006,70008,ENSG00000186092.4,0,+
1,chr1,134901,135802,ENSG00000237683.5,0,-
2,chr1,137621,138532,ENSG00000237683.5,0,-
3,chr1,368595,368634,ENSG00000235249.1,0,+
4,chr1,621059,621098,ENSG00000185097.2,0,-


In [8]:
# Of the peaks that missed CDS, these are the ones that overlap a 3'UTR region.
# s=True -> only same-strand overlaps count; u=True -> each peak is reported
# at most once.
utr3_peaks = non_cds_peaks.intersect(utr3, s=True, u=True)

# convert once, then report the count and preview the rows
utr3_peaks_df = utr3_peaks.to_dataframe()
print(len(utr3_peaks_df))  # the number of 3'UTR-overlapping (non-CDS) peaks
utr3_peaks_df.head()

4104


Unnamed: 0,chrom,start,end,name,score,strand
0,chr1,135156,135268,3.5,1.270319,-
1,chr1,135160,135268,3.5,-0.387692,-
2,chr1,137793,137852,3.5,0.13758,-
3,chr1,137794,137844,3.5,-2.145547,-
4,chr1,137963,137997,3.5,-3.835166,-


In [9]:
# Peaks that overlap neither CDS nor 3'UTR regions: start from the non-CDS
# peaks and keep those with no same-strand 3'UTR overlap (v=True).
non_utr3_peaks = non_cds_peaks.intersect(utr3, s=True, v=True)

# convert once, then report the count and preview the rows
non_utr3_peaks_df = non_utr3_peaks.to_dataframe()
print(len(non_utr3_peaks_df))  # the number of non-3'utr-overlapping peaks
non_utr3_peaks_df.head()

80207


Unnamed: 0,chrom,start,end,name,score,strand
0,chr1,17460,17568,3.5,6.136029,-
1,chr1,18350,18366,3.5,-3.478688,-
2,chr1,24737,24749,3.5,-3.457346,-
3,chr1,169258,169264,3.5,-3.435477,-
4,chr1,172556,172591,3.5,-2.749305,-


# All the numbers should add up
- total peak count: 141858
- CDS peaks count: 57547
- 3'UTR peaks count: 4104
- "other" peaks count: 80207
- check: 57547 + 4104 + 80207 = 141858