In [1]:
import pybedtools
import os
# Tell pybedtools which directory holds the bedtools executables to use.
bedtools_bin_dir = '/projects/ps-yeolab4/software/yeolabconda3/envs/python3essential-0.0.1/bin/'
pybedtools.helpers.set_bedtools_path(bedtools_bin_dir)

# Comparing the similarity between two bed files
- As an example, I want to compare the peak files generated by Gabe's GATK version of the eCLIP pipeline with those from my version (0.2.1+) to make sure the peaks are similar. They are not expected to be identical because the preliminary demultiplexing step differs between the two versions.

# Read in two bed files

In [2]:
# Result directories of the two pipeline runs being compared.
bed1_dir = '/home/bay001/projects/eric_chimclip_20200418/temporary_data/v4/R_chi/results'
# bed2_dir = '/projects/ps-yeolab4/seqdata/20210929_eclipse_chimclips/GEO'
bed2_dir = '/home/bay001/projects/eric_chimclip_20200418/temporary_data/v5/R_chi/results'

# The peak file has the same name in both result directories.
bed1_filename = 'eclipse.R_Chi_2_1g.umi.r1.fqTrTr.fq.sorted.eclip.genome-mappedSoSo.rmDupSo.peakClusters.normed.compressed.bed'
bed2_filename = bed1_filename
# Alternative second file (presumably pairs with the commented-out bed2_dir above -- TODO confirm):
# bed2_filename = 'R_CC_A3_01.basedon.peaks.l2inputnormnew.bed.compressed.bed'

# Sort both files so the downstream merge/intersect steps operate on sorted input.
bed1 = pybedtools.BedTool(os.path.join(bed1_dir, bed1_filename)).sort()
bed2 = pybedtools.BedTool(os.path.join(bed2_dir, bed2_filename)).sort()

num_peaks_1 = bed1.count()
num_peaks_2 = bed2.count()
# The message previously lacked its closing parenthesis.
print("Number of peaks before merging (bed1: {}, bed2: {})".format(num_peaks_1, num_peaks_2))

Number of peaks before merging (bed1: 353280, bed2: 353280


In [3]:
def sigfilter(feature):
    """Decide whether a peak passes the significance cutoffs.

    The 'name' and 'score' fields of ``feature`` are converted to float, in
    that order; the peak is rejected as soon as one of them is below 3.
    Presumably these fields hold the peak's significance / enrichment
    statistics -- TODO confirm against the peak-file format.

    Args:
        feature: a mapping-like record (e.g. a pybedtools Interval) exposing
            'name' and 'score'.

    Returns:
        False if either field is numerically below 3, True otherwise.
    """
    for column in ('name', 'score'):
        if float(feature[column]) < 3:
            return False
    return True

# Keep only the peaks accepted by sigfilter in each file. saveas() writes the
# filtered features out so the result can be read repeatedly by later cells.
bed1, bed2 = [bed.filter(sigfilter).saveas() for bed in (bed1, bed2)]

# Merge potentially overlapping peaks within each bed file

In [4]:
# Merge overlapping peaks on the same strand (s=True); columns 4-6 are carried
# through as the distinct values found within each merged group.
# NOTE: bed1/bed2 are reassigned here, so re-running this cell merges the
# already-merged peaks again.
merge_options = dict(s=True, c='4,5,6', o='distinct,distinct,distinct')
bed1 = bed1.merge(**merge_options)
bed2 = bed2.merge(**merge_options)

# Number of peaks after merging:

num_peaks_1 = bed1.count()
num_peaks_2 = bed2.count()
# The message previously lacked its closing parenthesis.
print("Number of peaks after merging (bed1: {}, bed2: {})".format(num_peaks_1, num_peaks_2))

Number of peaks after merging (bed1: 12476, bed2: 12476


# Perform reciprocal intersection 
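- apply overlap filter (`-f`: minimum fraction of each A peak that must be covered by a B peak; `0.5` would require at least 50%). Note that the cell below currently passes `f=0.01`, i.e. only 1%.
- apply strandedness filter (`-s`: only overlaps between peaks on the same strand are reported)
- apply reciprocal filter (`-r`: the `-f` fraction must hold in both directions, i.e. the overlap covers at least that fraction of the A peak and of the B peak)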

In [5]:
# Same-strand (s=True), reciprocal (r=True) overlaps between the two peak sets.
# f=0.01 sets the minimum overlap fraction to 1% of each peak.
# NOTE(review): confirm 0.01 is the intended threshold for this comparison.
reciprocal_hits = bed1.intersect(bed2, f=0.01, r=True, s=True)

# NOTE(review): count() tallies reported overlap records; without u=True a peak
# that overlaps several partners may be counted more than once -- confirm this
# is acceptable for the ratios computed later.
num_intersecting = reciprocal_hits.count()

# If the number of intersecting regions is at least 90% of the number of original A and B regions, the two files are considered similar.

In [6]:
# Intersection count relative to the merged peak count of bed1, then of bed2.
for total_peaks in (num_peaks_1, num_peaks_2):
    print(num_intersecting / float(total_peaks))

1.0
1.0
