# We need bigwig files for fast access to coverage (used by the score_edits_* scripts)

In [1]:
import glob
import os
from qtools import Submitter
from tqdm import tnrange, tqdm_notebook

In [2]:
# Root of the scRNA analysis tree; both locations below sit directly underneath it.
scrna_root = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA'
# This is a directory-name *prefix*, not a directory: a two-letter suffix is
# appended to it when the per-directory BAM files are globbed.
input_prefix_dir = scrna_root + '/sailor_outputs_individual_barcodes_'
# Directory that receives the bedGraph and bigWig files (trailing slash kept).
output_dir = scrna_root + '/bigwig_files/'

In [3]:
# Directory holding the STAR index files of the reference used for this dataset.
star_index_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/inputs/refdata-cellranger-hg19_lenti_common-3.0.0/star'
# Chromosome name/length table; passed to both `bedtools genomecov -g` and
# `bedGraphToBigWig` in the commands built further down.
chrom_sizes_file = star_index_dir + '/chrNameLength.txt'
# Accumulator for the shell command strings; each later loop appends one string
# per input directory. Re-run this cell to reset it before rebuilding commands.
cmds = list()

# Again look at each directory, find all fwd bam files, and convert to fwd bigwigs
- There are three steps involved. 
    - convert BAM files into BEDGRAPH files
    - sort BEDGRAPH files
    - convert BEDGRAPH to binary BIGWIG format. 
    
# NOTE: since 10X datasets are forward stranded, use ```-strand +``` for fwd and ```-strand -``` for rev in our ```bedtools genomecov``` command.

In [4]:
# Build one shell command string per dinucleotide input directory (AA ... TT).
# Each string loads the `makebigwigfiles` module once and then, for every
# forward-strand BAM found in that directory, chains the three conversion steps
# BAM -> bedGraph -> sorted bedGraph -> bigWig.
#
# NOTE: this cell appends to the shared `cmds` list, so re-running it without
# first resetting `cmds` queues the same work twice.
dinucleotides = [(nt1, nt2) for nt1 in 'ACGT' for nt2 in 'ACGT']

# Iterating *through* tqdm_notebook (rather than tnrange + manual update(1))
# lets each bar close itself as soon as its loop is exhausted, and the outer
# total follows the real number of directories instead of a hard-coded 16.
for nt1, nt2 in tqdm_notebook(dinucleotides):
    input_dir = input_prefix_dir + "{}{}".format(nt1, nt2)
    all_fwd_bams = glob.glob(os.path.join(input_dir, '*/results/*.fwd.sorted.rmdup.readfiltered.bam'))
    cmd = 'module load makebigwigfiles;'
    bam_progress = tqdm_notebook(
        all_fwd_bams,
        leave=False,
        desc="{}:{}:{}".format(len(all_fwd_bams), nt1, nt2),
    )
    for fwd_bam in bam_progress:
        # All three outputs share the BAM's file name minus its final extension.
        stem = os.path.join(output_dir, os.path.splitext(os.path.basename(fwd_bam))[0])
        fwd_bg = stem + ".bg"
        fwd_sorted_bg = stem + ".sorted.bg"
        fwd_sorted_bw = stem + ".sorted.bw"
        # Only queue BAMs whose final bigWig does not exist yet (checked now,
        # when the command is built, not when the job runs).
        if not os.path.exists(fwd_sorted_bw):
            # NOTE(review): the steps are joined with ';', so the shell still runs
            # the later steps when an earlier one fails; the intermediate .bg files
            # are also left behind in output_dir.
            cmd += 'bedtools genomecov -split -strand + -g {} -bg -ibam {} > {};'.format(chrom_sizes_file, fwd_bam, fwd_bg)
            cmd += 'bedtools sort -i {} > {};'.format(fwd_bg, fwd_sorted_bg)
            cmd += 'bedGraphToBigWig {} {} {};'.format(fwd_sorted_bg, chrom_sizes_file, fwd_sorted_bw)
    # One entry per directory, even when every bigWig already exists (the entry
    # is then just the `module load` prefix).
    cmds.append(cmd)

HBox(children=(IntProgress(value=0, max=16), HTML(value=u'')))

SEJveChjaGlsZHJlbj0oSW50UHJvZ3Jlc3ModmFsdWU9MCwgZGVzY3JpcHRpb249dScxMDEyOkE6QScsIG1heD0xMDEyLCBzdHlsZT1Qcm9ncmVzc1N0eWxlKGRlc2NyaXB0aW9uX3dpZHRoPXXigKY=


SEJveChjaGlsZHJlbj0oSW50UHJvZ3Jlc3ModmFsdWU9MCwgZGVzY3JpcHRpb249dScxMDgwOkE6QycsIG1heD0xMDgwLCBzdHlsZT1Qcm9ncmVzc1N0eWxlKGRlc2NyaXB0aW9uX3dpZHRoPXXigKY=


SEJveChjaGlsZHJlbj0oSW50UHJvZ3Jlc3ModmFsdWU9MCwgZGVzY3JpcHRpb249dScxNDQ2OkE6RycsIG1heD0xNDQ2LCBzdHlsZT1Qcm9ncmVzc1N0eWxlKGRlc2NyaXB0aW9uX3dpZHRoPXXigKY=


SEJveChjaGlsZHJlbj0oSW50UHJvZ3Jlc3ModmFsdWU9MCwgZGVzY3JpcHRpb249dScxMjIyOkE6VCcsIG1heD0xMjIyLCBzdHlsZT1Qcm9ncmVzc1N0eWxlKGRlc2NyaXB0aW9uX3dpZHRoPXXigKY=


SEJveChjaGlsZHJlbj0oSW50UHJvZ3Jlc3ModmFsdWU9MCwgZGVzY3JpcHRpb249dScxNDk2OkM6QScsIG1heD0xNDk2LCBzdHlsZT1Qcm9ncmVzc1N0eWxlKGRlc2NyaXB0aW9uX3dpZHRoPXXigKY=


SEJveChjaGlsZHJlbj0oSW50UHJvZ3Jlc3ModmFsdWU9MCwgZGVzY3JpcHRpb249dSc5NzQ6QzpDJywgbWF4PTk3NCwgc3R5bGU9UHJvZ3Jlc3NTdHlsZShkZXNjcmlwdGlvbl93aWR0aD11J2nigKY=


SEJveChjaGlsZHJlbj0oSW50UHJvZ3Jlc3ModmFsdWU9MCwgZGVzY3JpcHRpb249dSc5Mjg6QzpHJywgbWF4PTkyOCwgc3R5bGU9UHJvZ3Jlc3NTdHlsZShkZXNjcmlwdGlvbl93aWR0aD11J2nigKY=


SEJveChjaGlsZHJlbj0oSW50UHJvZ3Jlc3ModmFsdWU9MCwgZGVzY3JpcHRpb249dScxNDAyOkM6VCcsIG1heD0xNDAyLCBzdHlsZT1Qcm9ncmVzc1N0eWxlKGRlc2NyaXB0aW9uX3dpZHRoPXXigKY=


SEJveChjaGlsZHJlbj0oSW50UHJvZ3Jlc3ModmFsdWU9MCwgZGVzY3JpcHRpb249dScxMzIwOkc6QScsIG1heD0xMzIwLCBzdHlsZT1Qcm9ncmVzc1N0eWxlKGRlc2NyaXB0aW9uX3dpZHRoPXXigKY=


SEJveChjaGlsZHJlbj0oSW50UHJvZ3Jlc3ModmFsdWU9MCwgZGVzY3JpcHRpb249dSc3OTM6RzpDJywgbWF4PTc5Mywgc3R5bGU9UHJvZ3Jlc3NTdHlsZShkZXNjcmlwdGlvbl93aWR0aD11J2nigKY=


SEJveChjaGlsZHJlbj0oSW50UHJvZ3Jlc3ModmFsdWU9MCwgZGVzY3JpcHRpb249dScxMTI4Okc6RycsIG1heD0xMTI4LCBzdHlsZT1Qcm9ncmVzc1N0eWxlKGRlc2NyaXB0aW9uX3dpZHRoPXXigKY=


SEJveChjaGlsZHJlbj0oSW50UHJvZ3Jlc3ModmFsdWU9MCwgZGVzY3JpcHRpb249dScxNDg4Okc6VCcsIG1heD0xNDg4LCBzdHlsZT1Qcm9ncmVzc1N0eWxlKGRlc2NyaXB0aW9uX3dpZHRoPXXigKY=


SEJveChjaGlsZHJlbj0oSW50UHJvZ3Jlc3ModmFsdWU9MCwgZGVzY3JpcHRpb249dScxMDc3OlQ6QScsIG1heD0xMDc3LCBzdHlsZT1Qcm9ncmVzc1N0eWxlKGRlc2NyaXB0aW9uX3dpZHRoPXXigKY=


SEJveChjaGlsZHJlbj0oSW50UHJvZ3Jlc3ModmFsdWU9MCwgZGVzY3JpcHRpb249dScxNjg5OlQ6QycsIG1heD0xNjg5LCBzdHlsZT1Qcm9ncmVzc1N0eWxlKGRlc2NyaXB0aW9uX3dpZHRoPXXigKY=


SEJveChjaGlsZHJlbj0oSW50UHJvZ3Jlc3ModmFsdWU9MCwgZGVzY3JpcHRpb249dScxNDM2OlQ6RycsIG1heD0xNDM2LCBzdHlsZT1Qcm9ncmVzc1N0eWxlKGRlc2NyaXB0aW9uX3dpZHRoPXXigKY=


SEJveChjaGlsZHJlbj0oSW50UHJvZ3Jlc3ModmFsdWU9MCwgZGVzY3JpcHRpb249dScxMzc4OlQ6VCcsIG1heD0xMzc4LCBzdHlsZT1Qcm9ncmVzc1N0eWxlKGRlc2NyaXB0aW9uX3dpZHRoPXXigKY=


# Do the same for reverse bigwigs

In [5]:
# Reverse-strand counterpart of the forward-strand cell: build one shell command
# string per dinucleotide input directory (AA ... TT) that converts every
# reverse-strand BAM in it via BAM -> bedGraph -> sorted bedGraph -> bigWig.
#
# NOTE: this cell appends to the shared `cmds` list, so re-running it without
# first resetting `cmds` queues the same work twice.
dinucleotides = [(nt1, nt2) for nt1 in 'ACGT' for nt2 in 'ACGT']

# Iterating *through* tqdm_notebook (rather than tnrange + manual update(1))
# lets each bar close itself as soon as its loop is exhausted, and the outer
# total follows the real number of directories instead of a hard-coded 16.
for nt1, nt2 in tqdm_notebook(dinucleotides):
    input_dir = input_prefix_dir + "{}{}".format(nt1, nt2)
    all_rev_bams = glob.glob(os.path.join(input_dir, '*/results/*.rev.sorted.rmdup.readfiltered.bam'))
    print(len(all_rev_bams), nt1, nt2)
    cmd = 'module load makebigwigfiles;'
    bam_progress = tqdm_notebook(
        all_rev_bams,
        leave=False,
        desc="{}:{}:{}".format(len(all_rev_bams), nt1, nt2),
    )
    for rev_bam in bam_progress:
        # All three outputs share the BAM's file name minus its final extension.
        stem = os.path.join(output_dir, os.path.splitext(os.path.basename(rev_bam))[0])
        rev_bg = stem + ".bg"
        rev_sorted_bg = stem + ".sorted.bg"
        rev_sorted_bw = stem + ".sorted.bw"
        # Only queue BAMs whose final bigWig does not exist yet (checked now,
        # when the command is built, not when the job runs).
        if not os.path.exists(rev_sorted_bw):
            # NOTE(review): the steps are joined with ';', so the shell still runs
            # the later steps when an earlier one fails; the intermediate .bg files
            # are also left behind in output_dir.
            cmd += 'bedtools genomecov -split -strand - -g {} -bg -ibam {} > {};'.format(chrom_sizes_file, rev_bam, rev_bg)
            cmd += 'bedtools sort -i {} > {};'.format(rev_bg, rev_sorted_bg)
            cmd += 'bedGraphToBigWig {} {} {};'.format(rev_sorted_bg, chrom_sizes_file, rev_sorted_bw)
    # One entry per directory, even when every bigWig already exists (the entry
    # is then just the `module load` prefix).
    cmds.append(cmd)

HBox(children=(IntProgress(value=0, max=16), HTML(value=u'')))

(1012, 'A', 'A')


SEJveChjaGlsZHJlbj0oSW50UHJvZ3Jlc3ModmFsdWU9MCwgZGVzY3JpcHRpb249dScxMDEyOkE6QScsIG1heD0xMDEyLCBzdHlsZT1Qcm9ncmVzc1N0eWxlKGRlc2NyaXB0aW9uX3dpZHRoPXXigKY=


(1080, 'A', 'C')


SEJveChjaGlsZHJlbj0oSW50UHJvZ3Jlc3ModmFsdWU9MCwgZGVzY3JpcHRpb249dScxMDgwOkE6QycsIG1heD0xMDgwLCBzdHlsZT1Qcm9ncmVzc1N0eWxlKGRlc2NyaXB0aW9uX3dpZHRoPXXigKY=


(1446, 'A', 'G')


SEJveChjaGlsZHJlbj0oSW50UHJvZ3Jlc3ModmFsdWU9MCwgZGVzY3JpcHRpb249dScxNDQ2OkE6RycsIG1heD0xNDQ2LCBzdHlsZT1Qcm9ncmVzc1N0eWxlKGRlc2NyaXB0aW9uX3dpZHRoPXXigKY=


(1222, 'A', 'T')


SEJveChjaGlsZHJlbj0oSW50UHJvZ3Jlc3ModmFsdWU9MCwgZGVzY3JpcHRpb249dScxMjIyOkE6VCcsIG1heD0xMjIyLCBzdHlsZT1Qcm9ncmVzc1N0eWxlKGRlc2NyaXB0aW9uX3dpZHRoPXXigKY=


(1496, 'C', 'A')


SEJveChjaGlsZHJlbj0oSW50UHJvZ3Jlc3ModmFsdWU9MCwgZGVzY3JpcHRpb249dScxNDk2OkM6QScsIG1heD0xNDk2LCBzdHlsZT1Qcm9ncmVzc1N0eWxlKGRlc2NyaXB0aW9uX3dpZHRoPXXigKY=


(974, 'C', 'C')


SEJveChjaGlsZHJlbj0oSW50UHJvZ3Jlc3ModmFsdWU9MCwgZGVzY3JpcHRpb249dSc5NzQ6QzpDJywgbWF4PTk3NCwgc3R5bGU9UHJvZ3Jlc3NTdHlsZShkZXNjcmlwdGlvbl93aWR0aD11J2nigKY=


(928, 'C', 'G')


SEJveChjaGlsZHJlbj0oSW50UHJvZ3Jlc3ModmFsdWU9MCwgZGVzY3JpcHRpb249dSc5Mjg6QzpHJywgbWF4PTkyOCwgc3R5bGU9UHJvZ3Jlc3NTdHlsZShkZXNjcmlwdGlvbl93aWR0aD11J2nigKY=


(1402, 'C', 'T')


SEJveChjaGlsZHJlbj0oSW50UHJvZ3Jlc3ModmFsdWU9MCwgZGVzY3JpcHRpb249dScxNDAyOkM6VCcsIG1heD0xNDAyLCBzdHlsZT1Qcm9ncmVzc1N0eWxlKGRlc2NyaXB0aW9uX3dpZHRoPXXigKY=


(1320, 'G', 'A')


SEJveChjaGlsZHJlbj0oSW50UHJvZ3Jlc3ModmFsdWU9MCwgZGVzY3JpcHRpb249dScxMzIwOkc6QScsIG1heD0xMzIwLCBzdHlsZT1Qcm9ncmVzc1N0eWxlKGRlc2NyaXB0aW9uX3dpZHRoPXXigKY=


(793, 'G', 'C')


SEJveChjaGlsZHJlbj0oSW50UHJvZ3Jlc3ModmFsdWU9MCwgZGVzY3JpcHRpb249dSc3OTM6RzpDJywgbWF4PTc5Mywgc3R5bGU9UHJvZ3Jlc3NTdHlsZShkZXNjcmlwdGlvbl93aWR0aD11J2nigKY=


(1128, 'G', 'G')


SEJveChjaGlsZHJlbj0oSW50UHJvZ3Jlc3ModmFsdWU9MCwgZGVzY3JpcHRpb249dScxMTI4Okc6RycsIG1heD0xMTI4LCBzdHlsZT1Qcm9ncmVzc1N0eWxlKGRlc2NyaXB0aW9uX3dpZHRoPXXigKY=


(1488, 'G', 'T')


SEJveChjaGlsZHJlbj0oSW50UHJvZ3Jlc3ModmFsdWU9MCwgZGVzY3JpcHRpb249dScxNDg4Okc6VCcsIG1heD0xNDg4LCBzdHlsZT1Qcm9ncmVzc1N0eWxlKGRlc2NyaXB0aW9uX3dpZHRoPXXigKY=


(1077, 'T', 'A')


SEJveChjaGlsZHJlbj0oSW50UHJvZ3Jlc3ModmFsdWU9MCwgZGVzY3JpcHRpb249dScxMDc3OlQ6QScsIG1heD0xMDc3LCBzdHlsZT1Qcm9ncmVzc1N0eWxlKGRlc2NyaXB0aW9uX3dpZHRoPXXigKY=


(1689, 'T', 'C')


SEJveChjaGlsZHJlbj0oSW50UHJvZ3Jlc3ModmFsdWU9MCwgZGVzY3JpcHRpb249dScxNjg5OlQ6QycsIG1heD0xNjg5LCBzdHlsZT1Qcm9ncmVzc1N0eWxlKGRlc2NyaXB0aW9uX3dpZHRoPXXigKY=


(1436, 'T', 'G')


SEJveChjaGlsZHJlbj0oSW50UHJvZ3Jlc3ModmFsdWU9MCwgZGVzY3JpcHRpb249dScxNDM2OlQ6RycsIG1heD0xNDM2LCBzdHlsZT1Qcm9ncmVzc1N0eWxlKGRlc2NyaXB0aW9uX3dpZHRoPXXigKY=


(1378, 'T', 'T')


SEJveChjaGlsZHJlbj0oSW50UHJvZ3Jlc3ModmFsdWU9MCwgZGVzY3JpcHRpb249dScxMzc4OlQ6VCcsIG1heD0xMzc4LCBzdHlsZT1Qcm9ncmVzc1N0eWxlKGRlc2NyaXB0aW9uX3dpZHRoPXXigKY=


# Write and submit the BAM-to-bigWig commands

In [8]:
# Hand every accumulated command string to the cluster as a single array job.
# `submit=True` means the script is not only written but also queued right away.
Submitter(
    commands=cmds,
    job_name='bedToBigWig',
    array=True,
    nodes=1,
    ppn=1,
    walltime='8:00:00',
    submit=True
)

Writing 32 tasks as an array-job.
Wrote commands to bedToBigWig.sh.
Submitted script to queue home.
 Job ID: 21117547


<qtools.submitter.Submitter at 0x2b84c8d12350>

# Since each job's command string pastes together three shell commands for every BAM file in its directory (roughly 3 x 40,000 individual commands across all 32 jobs) and is over a million characters long, we shouldn't try to list out the entire command. Let's "preview" the first 5000 chars instead.

In [6]:
# Character count of the first job's full command string.
len(cmds[0])

1483014

# Maybe run one or two barcodes to make sure all jobs won't fail for a dumb reason.

- i.e., the first barcode from the first command looks something like this: 


```module load makebigwigfiles;bedtools genomecov -split -strand + -g /home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/inputs/refdata-cellranger-hg19_lenti_common-3.0.0/star/chrNameLength.txt -bg -ibam /home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/sailor_outputs_individual_barcodes_AA/Apo_Control_possorted_genome_bam_MD-AAGCCATTCTCCTGTG-1/results/Apo_Control_possorted_genome_bam_MD-AAGCCATTCTCCTGTG-1.fwd.sorted.rmdup.readfiltered.bam > /home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/bigwig_files/Apo_Control_possorted_genome_bam_MD-AAGCCATTCTCCTGTG-1.fwd.sorted.rmdup.readfiltered.bg;bedtools sort -i /home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/bigwig_files/Apo_Control_possorted_genome_bam_MD-AAGCCATTCTCCTGTG-1.fwd.sorted.rmdup.readfiltered.bg > /home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/bigwig_files/Apo_Control_possorted_genome_bam_MD-AAGCCATTCTCCTGTG-1.fwd.sorted.rmdup.readfiltered.sorted.bg;bedGraphToBigWig /home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/bigwig_files/Apo_Control_possorted_genome_bam_MD-AAGCCATTCTCCTGTG-1.fwd.sorted.rmdup.readfiltered.sorted.bg /home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/inputs/refdata-cellranger-hg19_lenti_common-3.0.0/star/chrNameLength.txt /home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/bigwig_files/Apo_Control_possorted_genome_bam_MD-AAGCCATTCTCCTGTG-1.fwd.sorted.rmdup.readfiltered.sorted.bw;```

In [7]:
# Display only the first 5000 characters of the first job's command string.
cmds[0][:5000]

'module load makebigwigfiles;bedtools genomecov -split -strand + -g /home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/inputs/refdata-cellranger-hg19_lenti_common-3.0.0/star/chrNameLength.txt -bg -ibam /home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/sailor_outputs_individual_barcodes_AA/Apo_Control_possorted_genome_bam_MD-AAGCCATTCTCCTGTG-1/results/Apo_Control_possorted_genome_bam_MD-AAGCCATTCTCCTGTG-1.fwd.sorted.rmdup.readfiltered.bam > /home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/bigwig_files/Apo_Control_possorted_genome_bam_MD-AAGCCATTCTCCTGTG-1.fwd.sorted.rmdup.readfiltered.bg;bedtools sort -i /home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/bigwig_files/Apo_Control_possorted_genome_bam_MD-AAGCCATTCTCCTGTG-1.fwd.sorted.rmdup.readfiltered.bg > /home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/bigwig_files/Apo_Control_possorted_genome_bam_MD-AAGCCATTCTCCTGTG-1.fwd.sorted.rmdup.readfiltere