In [1]:
import pybedtools
import os

# Comparing the similarity between two bed files
- As an example, I want to compare the peak files generated by Gabe's GATK version of the eCLIP pipeline with those generated by my version (0.2.1+) to make sure the peaks are similar. They are not expected to be identical, because the preliminary demultiplexing step differs between the two pipelines.

# Read in two bed files

In [2]:
# Output directories of the two pipeline runs being compared.
gatk = '/home/bay001/projects/michelle_msi_20180420/permanent_data/GATK/analysis_v1/'
eclip = '/home/bay001/projects/michelle_msi_20180420/permanent_data/MSI/results/'

# Full path to the peak file taken from each directory.
gatk_peaks_path = os.path.join(
    gatk, 'NSC201cb_Msi1_AI_rep1_Msi1.merged.r2.peaks.bed'
)
eclip_peaks_path = os.path.join(
    eclip,
    'Msi.NSC201cb_Msi1_AIP_rep1.X2A.r1.fqTrTr.sorted.STARUnmapped.out.sorted.STARAligned.outSo.rmDupSo.merged.r2.peakClusters.bed'
)

# bed1 wraps the GATK-directory peaks, bed2 the eclip-directory peaks.
bed1 = pybedtools.BedTool(gatk_peaks_path)
bed2 = pybedtools.BedTool(eclip_peaks_path)

# Merge potentially overlapping peaks within each bed file

In [3]:
# bedtools merge requires input sorted by chromosome and then by start
# position, so sort explicitly rather than relying on the order in which
# the peak files happen to be written.
# s=True merges only features on the same strand; columns 4, 5 and 6 are
# carried through, each collapsed with the 'distinct' operation.
bed1 = bed1.sort().merge(s=True, c='4,5,6', o='distinct,distinct,distinct')
bed2 = bed2.sort().merge(s=True, c='4,5,6', o='distinct,distinct,distinct')

# Number of peaks after merging:

# These totals are the denominators for the similarity ratios computed later.
num_peaks_1 = bed1.count()
num_peaks_2 = bed2.count()
# Report both counts on one line; the parenthesis opened in the message is closed.
print("Number of peaks after merging (bed1: {}, bed2: {})".format(num_peaks_1, num_peaks_2))

Number of peaks after merging (bed1: 87821, bed2: 87628


# Perform reciprocal intersection 
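- apply the minimum-overlap filter (`-f 0.50`: the overlap must cover at least 50% of the A peak)
- apply the strandedness filter (`-s`: only peaks on the same strand are compared)
- apply the reciprocal filter (`-r`: the overlap must also cover at least 50% of the B peak, so the 50% requirement holds in both directions)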

In [4]:
# Options for the intersection: 50% minimum overlap (f), required
# reciprocally (r), on the same strand only (s).
intersect_options = dict(f=0.50, r=True, s=True)

# Intersect bed1 against bed2 and count the reported records.
reciprocal_hits = bed1.intersect(bed2, **intersect_options)
num_intersecting = reciprocal_hits.count()

# If the number of reciprocally overlapping regions is at least 90% of the number of original A regions and of original B regions, the two peak sets are similar.

In [5]:
# Fraction of each file's merged peaks that has a reciprocal match:
# first relative to bed1, then relative to bed2.
for total_peaks in (num_peaks_1, num_peaks_2):
    # float() keeps this a true division even under Python 2 integer semantics.
    print(num_intersecting / float(total_peaks))

0.996640894547
0.998835988497
