In [1]:
import pysam
import os
import pandas as pd
import numpy as np
import time
import argparse
import sys
import pybedtools


In [2]:
# Point pybedtools at a local 'tmp' directory for its intermediate files.
# makedirs(exist_ok=True) creates the directory when it is missing and is a
# no-op when it already exists, so the cell can be re-run safely without a
# separate existence check.
os.makedirs('tmp', exist_ok=True)
pybedtools.set_tempdir('tmp')

In [3]:
# Label used when building the output file name.
name = 'k100_minus_exclusion_lists'

# Inputs: the mappability track (bedGraph) and the chromosome sizes file.
mappability_bedGraph_path = '../k100.Umap.MultiTrackMappability.hg38.bedGraph'
chrom_sizes_path = '/fh/fast/ha_g/grp/reference/GRCh38/hg38.standard.chrom.sizes'

# BED files whose intervals are added to the exclusion set, in the order
# they are applied. Only the alternative-haplotype path also stays
# available under its own name.
alternative_haplotype_path = '../hg38_alternative_haplotypes.bed'
exclude_paths = [
    '../encode_unified_GRCh38_exclusion_list.bed',
    '../hg38_centromeres.bed',
    '../hg38_gaps.bed',
    '../hg38_fix_patches.bed',
    alternative_haplotype_path,
]


In [4]:
# Restrict the analysis to the autosomes: chr1 through chr22.
chroms = ['chr{}'.format(number) for number in range(1, 23)]

In [5]:
# Wall-clock timer; the elapsed seconds are printed at the end of the cell.
start_time = time.time()

# Load the mappability track.
mapability = pybedtools.BedTool(mappability_bedGraph_path)


def _has_perfect_mappability(interval):
    """Return True when the interval's score field is exactly '1'.

    The input is a bedGraph, so the score is read from the column that
    pybedtools exposes as ``name``.
    """
    return interval.name == '1'


def _is_on_kept_chrom(interval):
    """Return True when the interval lies on a chromosome listed in ``chroms``."""
    return interval.chrom in chroms


# Keep only perfectly mappable intervals, then only those on the autosomes.
filtered_map = mapability.filter(_has_perfect_mappability)
filtered_map = filtered_map.filter(_is_on_kept_chrom)

# The complement of the kept intervals, relative to the chromosome sizes
# file, gives the unmappable regions.
unmappable_regions = filtered_map.complement(g=chrom_sizes_path)
print(time.time()-start_time)

245.97432589530945


In [6]:
# Build the full exclusion set: start from the unmappable regions and
# concatenate each exclusion BED file onto it in turn.
merged_exclude_regions = unmappable_regions
for path in exclude_paths:
    print('excluding:',path)
    merged_exclude_regions = merged_exclude_regions.cat(pybedtools.BedTool(path))

# Round-trip through pandas to drop intervals on chromosomes outside `chroms`,
# then convert the filtered table back into a BedTool.
merged_exclude_regions_df = (
    merged_exclude_regions
    .to_dataframe()
    .loc[lambda frame: frame['chrom'].isin(chroms)]
)
merged_exclude_regions = pybedtools.BedTool.from_dataframe(merged_exclude_regions_df)


excluding: ../encode_unified_GRCh38_exclusion_list.bed
excluding: ../hg38_centromeres.bed
excluding: ../hg38_gaps.bed
excluding: ../hg38_fix_patches.bed
excluding: ../hg38_alternative_haplotypes.bed


In [7]:
# Report the size of the exclusion set.
excluded_count = len(merged_exclude_regions)
excluded_bp = merged_exclude_regions.total_coverage()
print('excluding',excluded_count,'regions, covering',excluded_bp,'bp')

# The retained regions are the complement of the exclusion set (relative to
# the chromosome sizes file), limited to the chromosomes in `chroms`.
# saveas() is called before the result is counted and exported.
mappable_regions = (
    merged_exclude_regions
    .complement(g=chrom_sizes_path)
    .filter(lambda interval: interval.chrom in chroms)
    .saveas()
)
retained_count = len(mappable_regions)
retained_bp = mappable_regions.total_coverage()
print('retaining',retained_count,'regions, covering',retained_bp,'bp')


# Export the retained regions as a headerless, tab-separated file.
output_bed_path = '../'+name+'.mappable_regions.hg38.bed'
mappable_regions_df = mappable_regions.to_dataframe()
mappable_regions_df.to_csv(output_bed_path, sep='\t', index=False, header=False)

excluding 467980 regions, covering 408170579 bp
retaining 467958 regions, covering 2466830943 bp


In [8]:
# Show the final lines of the retained-regions BedTool as the cell output.
mappable_regions.tail()

chr9	138287819	138287820
chr9	138295898	138295899
chr9	138305748	138305772
chr9	138307875	138307876
chr9	138311405	138311406
chr9	138312342	138312343
chr9	138314012	138314013
chr9	138314553	138314554
chr9	138325043	138325044
chr9	138327329	138327330



In [9]:
# Delete the temp files pybedtools tracked during this session, printing each
# path as it is removed, then remove the scratch directory.
# NOTE: cleanup()'s first positional parameter is `verbose`, so it is passed
# by keyword here to make clear that only verbose reporting is being enabled.
# Removing files left by other sessions would require remove_all=True, which
# is deliberately not used.
pybedtools.cleanup(verbose=True)
# os.rmdir only succeeds when the directory is empty.
os.rmdir('tmp')

removing /fh/fast/ha_g/user/adoebley/projects/griffin_revisions_1/genome/scripts/tmp/pybedtools.sw7x_0gs.tmp
removing /fh/fast/ha_g/user/adoebley/projects/griffin_revisions_1/genome/scripts/tmp/pybedtools.uwv3yso_.tmp
removing /fh/fast/ha_g/user/adoebley/projects/griffin_revisions_1/genome/scripts/tmp/pybedtools.y4ezc4ej.tmp
removing /fh/fast/ha_g/user/adoebley/projects/griffin_revisions_1/genome/scripts/tmp/pybedtools.qkz0d62f.tmp
removing /fh/fast/ha_g/user/adoebley/projects/griffin_revisions_1/genome/scripts/tmp/pybedtools.wme4wxy3.tmp
removing /fh/fast/ha_g/user/adoebley/projects/griffin_revisions_1/genome/scripts/tmp/pybedtools.h86m_p9x.tmp
removing /fh/fast/ha_g/user/adoebley/projects/griffin_revisions_1/genome/scripts/tmp/pybedtools.6x90a1bz.tmp
removing /fh/fast/ha_g/user/adoebley/projects/griffin_revisions_1/genome/scripts/tmp/pybedtools.81zwd_hu.tmp
removing /fh/fast/ha_g/user/adoebley/projects/griffin_revisions_1/genome/scripts/tmp/pybedtools.yuv1y26w.tmp
removing /fh/fast/h