# We need bigwig files for fast access to coverage (used by the score_edits_* scripts)
- The bigwig files are used when computing the edit/C metric.
- This notebook assumes a reverse-stranded single-end (SE) library. For other library types you may need to swap the ```-strand +``` and ```-strand -``` params.

In [7]:
import glob
import os
from qtools import Submitter
from tqdm import tnrange, tqdm_notebook

In [8]:
# Root directory that holds the per-trial SAILOR output folders (note the trailing slash).
input_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/sailor_outputs_groups_deep/'
# All filtered BAMs, bedGraphs and bigWigs produced by this notebook are written here,
# in a sub-directory of input_dir.
output_dir = os.path.join(input_dir, 'combined_outputs_w_cov_info')

### Inputs should be the two ```*.sorted.rmdup.readfiltered.bam``` files (fwd and rev) that are intermediates from SAILOR. These are BAM files, split into their respective strands, and filtered to remove any low-quality or unexpected (non-C/T) reads.

In [9]:
# Find every forward-strand filtered BAM under the per-trial results folders.
# The matching reverse-strand BAM is derived from each of these paths later on.
fwd_bam_pattern = os.path.join(input_dir, '*rand*trial*/results/*.fwd.sorted.rmdup.readfiltered.bam')
all_fwd_bams = glob.glob(fwd_bam_pattern)
all_fwd_bams.sort()  # glob order is arbitrary; sort for a reproducible command list
print(len(all_fwd_bams))
all_fwd_bams[:3]

590


['/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/sailor_outputs_groups_deep/RBFOX2-TIA1-STAMP_round2E_read1_feature_FB_possorted_genome_bam_MD-34_barcodes.rand0.trial-0.txt/results/RBFOX2-TIA1-STAMP_round2E_read1_feature_FB_possorted_genome_bam_MD-34_barcodes.rand0.trial-0.txt.fwd.sorted.rmdup.readfiltered.bam',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/sailor_outputs_groups_deep/RBFOX2-TIA1-STAMP_round2E_read1_feature_FB_possorted_genome_bam_MD-34_barcodes.rand0.trial-1.txt/results/RBFOX2-TIA1-STAMP_round2E_read1_feature_FB_possorted_genome_bam_MD-34_barcodes.rand0.trial-1.txt.fwd.sorted.rmdup.readfiltered.bam',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/sailor_outputs_groups_deep/RBFOX2-TIA1-STAMP_round2E_read1_feature_FB_possorted_genome_bam_MD-34_barcodes.rand0.trial-2.txt/results/RBFOX2-TIA1-STAMP_round2E_read1_feature_FB_possorted_genome_bam_MD-34_barcodes.rand0.trial-2.txt.fwd.sorted

# Write the BAM-to-bigWig commands

In [10]:
chrom_sizes_file = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/inputs/refdata-cellranger-hg19_lenti_common-3.0.0/star/chrNameLength.txt'

# Contigs to keep: 1-22, MT, X and Y. Some of these bam files have a 'lenti common'
# extra contig but we don't care about it. The list is expanded here in Python rather
# than with the bash-only "{1..22}" brace expansion, so the generated command does not
# depend on which shell ends up running the job.
human_regions = ' '.join([str(chrom) for chrom in range(1, 23)] + ['MT', 'X', 'Y'])

fwd_suffix = '.fwd.sorted.rmdup.readfiltered.bam'
rev_suffix = '.rev.sorted.rmdup.readfiltered.bam'


def _bam_to_bigwig_cmd(bam, strand, out_dir, chrom_sizes, regions):
    """Build the shell command that converts one strand-specific BAM into a bigWig.

    Parameters
    ----------
    bam : str
        Path to the input BAM; it is indexed in place by ``samtools index``.
    strand : str
        Value passed to ``bedtools genomecov -strand`` ('+' or '-').
    out_dir : str
        Directory that receives the filtered BAM, bedGraph and bigWig files.
    chrom_sizes : str
        Chromosome sizes file handed to ``bedtools genomecov -g`` and ``bedGraphToBigWig``.
    regions : str
        Space-separated contig names to retain with ``samtools view``.

    Returns
    -------
    tuple of (str, str)
        The path of the final ``.sorted.bw`` file and the ';'-joined command that creates it.
    """
    # Every output shares the input's basename (minus its final '.bam') inside out_dir.
    prefix = os.path.join(out_dir, os.path.splitext(os.path.basename(bam))[0])
    filtered_bam = prefix + ".fx.bam"
    bedgraph = prefix + ".bg"
    sorted_bedgraph = prefix + ".sorted.bg"
    sorted_bigwig = prefix + ".sorted.bw"

    # NOTE(review): steps are chained with ';', so a failing step does not stop the
    # following ones -- confirm this is intended before relying on the skip-if-exists check.
    cmd = 'module load makebigwigfiles;'
    cmd += 'samtools index {};'.format(bam)
    cmd += 'samtools view -b {} {} > {};'.format(bam, regions, filtered_bam)
    cmd += 'bedtools genomecov -split -strand {} -g {} -bg -ibam {} > {};'.format(strand, chrom_sizes, filtered_bam, bedgraph)
    cmd += 'bedtools sort -i {} > {};'.format(bedgraph, sorted_bedgraph)
    cmd += 'bedGraphToBigWig {} {} {};'.format(sorted_bedgraph, chrom_sizes, sorted_bigwig)
    return sorted_bigwig, cmd


# Queue all forward-strand BAMs ('+') first, then their reverse-strand partners ('-'),
# whose paths are obtained by swapping the filename suffix.
strand_jobs = [(fwd_bam, '+') for fwd_bam in all_fwd_bams]
strand_jobs += [(fwd_bam.replace(fwd_suffix, rev_suffix), '-') for fwd_bam in all_fwd_bams]

cmds = []
for bam, strand in strand_jobs:
    bigwig, cmd = _bam_to_bigwig_cmd(bam, strand, output_dir, chrom_sizes_file, human_regions)
    # Skip BAMs whose bigWig already exists so the cell can be re-run safely.
    if not os.path.exists(bigwig):
        cmds.append(cmd)
print(len(cmds))

0


In [5]:
# Preview the first few generated commands; an empty list means no commands were queued.
cmds[:4]

[]

In [6]:
# Hand the command list to qtools as a single array job (one task per command).
# With submit=False the job script is written (bamToBigWig.sh) -- presumably without
# being queued, so it can be inspected and submitted manually.
Submitter(
    commands=cmds,
    job_name='bamToBigWig',
    array=True,
    nodes=1,
    ppn=1,
    submit=False,
    walltime='8:00:00',
)

Writing 0 tasks as an array-job.
Wrote commands to bamToBigWig.sh.


<qtools.submitter.Submitter at 0x2afed4dfe490>