# Generates the JSON manifests that are used to run the SAILOR pipeline
- Use sailor/1.1.0 so that we can capture C/T edits; the currently released version only detects A/G edits.

In [1]:
import yaml
import os
import pandas as pd
import glob
from qtools import Submitter
from tqdm import tnrange, tqdm_notebook
import time

In [2]:
# Directory that is searched for the per-barcode '*-1.bam' input files.
input_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/bam_files/'
# Common path prefix of the SAILOR output directories; a '_<two-letter barcode prefix>'
# suffix (e.g. '_AA') is appended to it to form each actual directory name.
output_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/sailor_outputs_individual_barcodes'

# Directory holding the reference genome FASTA and the known-SNP BED file.
ref_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/inputs'

In [3]:
# NOTE: this notebook is re-run several times, each time on a different group of bam files.
bam_pattern = os.path.join(input_dir, '*-1.bam')
bams = glob.glob(bam_pattern)
print(len(bams))
bams[:3]

20617


['/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/bam_files/possorted_genome_bam_MD-GCCAGGTGTCATAAAG-1.bam',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/bam_files/possorted_genome_bam_MD-GGGATGAGTCTTGCGG-1.bam',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/bam_files/possorted_genome_bam_MD-AGTGCCGCACCGCTGA-1.bam']

In [4]:
# Create one output directory per two-letter barcode prefix (AA, AC, ..., TT).
# os.mkdir is used instead of a `! mkdir` shell escape so that a failure raises
# OSError right here (a failing shell escape does not raise a Python exception),
# and so that the path is never subject to shell word-splitting.
for nt in ['A', 'C', 'G', 'T']:
    for nt2 in ['A', 'C', 'G', 'T']:
        split_output = output_dir + '_{}{}'.format(nt, nt2)
        if not os.path.exists(split_output):
            os.mkdir(split_output)

# This block generates SAILOR json manifest runner files.

In [5]:
# Reference inputs shared by every manifest.
genome_file = os.path.join(ref_dir, 'refdata-cellranger-hg19_lenti_common-3.0.0/fasta/genome.fa')
snp_file = os.path.join(ref_dir, 'hg19.commonSNPs147.bed3')

# Stop right away if either reference file is missing.
assert os.path.exists(genome_file)
assert os.path.exists(snp_file)

skip_rmdup = "true"
progress = tnrange(len(bams))
for bam in bams:
    bam_name = os.path.basename(bam)
    # The first two characters after '_MD-' pick the per-prefix output directory.
    prefix = bam_name.split('_MD-')[1][:2]
    split_output_dir = output_dir + "_{}".format(prefix)

    # File-typed entries of the manifest.
    sample = {"input_bam": {'class': 'File', 'path': bam}}
    fa = {"reference": {'class': 'File', 'path': genome_file}}
    known_snp = {"known_snp": {'class': 'File', 'path': snp_file}}

    # The manifest is named after the bam file (everything before the first '.').
    manifest_name = '{}.json'.format(bam_name.split('.')[0])
    manifest_path = os.path.join(split_output_dir, manifest_name)
    with open(manifest_path, 'w') as o:
        # Interpreter line: the manifest is later made executable and run directly.
        o.write("#!/usr/bin/env SAILOR\n")
        for file_entry in (sample, known_snp, fa):
            yaml.dump(file_entry, o, default_flow_style=False)
        # Scalar settings, written as plain "key: value" lines.
        o.writelines([
            "ct: true\n",
            "min_variant_coverage: 5\n",
            "alpha: {}\n".format(0),
            "beta: {}\n".format(0),
            "edit_fraction: {}\n".format(0.01),
            "skip_duplicate_removal: {}\n".format(skip_rmdup),
            "reverse_stranded_library: false\n",
        ])
    progress.update(1)

HBox(children=(IntProgress(value=0, max=20617), HTML(value=u'')))

# Grab all JSON files created from the above cell.

In [6]:
# Collect the manifests from each of the sixteen per-prefix output directories.
nucleotides = ['A', 'C', 'G', 'T']
jsons = []
for nt in nucleotides:
    for nt2 in nucleotides:
        split_output = output_dir + '_{}{}'.format(nt, nt2)
        jsons.extend(glob.glob(os.path.join(split_output, '*.json')))
print(len(jsons))

20617


# It is worth inspecting one of the JSON manifests to make sure everything looks correct so far.
- It is also worth running one of the commands by hand to confirm the pipeline works before submitting a large batch of jobs.

In [7]:
jsons[:3]

['/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/sailor_outputs_individual_barcodes_AA/possorted_genome_bam_MD-AAGACAATCATAAGGA-1.json',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/sailor_outputs_individual_barcodes_AA/possorted_genome_bam_MD-AAGATAGGTAACGCGA-1.json',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/sailor_outputs_individual_barcodes_AA/possorted_genome_bam_MD-AATTTCCGTCCGGTCA-1.json']

In [8]:
# One shell command string per cluster job; filled in by the loop further down in this cell.
cmds = []

def chunker(seq, size):
    """
    Chunks a long list into groups of (size).

    :param seq: sliceable sequence (e.g. a list) to split up.
    :param size: maximum number of items per group; must be >= 1.
    :return: generator yielding consecutive slices of seq, in order; the
        last slice may hold fewer than (size) items.
    :raises ValueError: if size is smaller than 1.
    """
    # range() with a negative step would be empty, so every item of seq would
    # be dropped without any error; reject such sizes up front (a step of 0
    # already makes range() raise ValueError).
    if size < 1:
        raise ValueError("size must be a positive integer, got {!r}".format(size))
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))

groupsize = 50 # 50 sailor runs per job.
progress = tnrange(len(jsons))
for json_group in chunker(jsons, groupsize):
    # Every job loads the sailor module once, then runs each manifest of its
    # group from inside that manifest's own output directory.
    parts = ['module load sailor/1.1.0;']
    for json_path in json_group:
        prefix = os.path.basename(json_path).split('_MD-')[1][:2]
        split_output_dir = output_dir + "_{}".format(prefix)
        # Make the manifest executable without spawning a shell per file:
        # add the execute bit for every class (user/group/other) that already
        # has the read bit. Unlike a `! chmod` shell escape, a failure here
        # raises OSError instead of passing unnoticed.
        mode = os.stat(json_path).st_mode
        os.chmod(json_path, mode | ((mode & 0o444) >> 2))
        parts.append('cd {};'.format(split_output_dir))
        parts.append('./{};'.format(os.path.basename(json_path)))
        progress.update(1)
    cmd = ''.join(parts)
    cmds.append(cmd)
cmds[:2]

HBox(children=(IntProgress(value=0, max=20617), HTML(value=u'')))

['module load sailor/1.1.0;cd /home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/sailor_outputs_individual_barcodes_AA;./possorted_genome_bam_MD-AAGACAATCATAAGGA-1.json;cd /home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/sailor_outputs_individual_barcodes_AA;./possorted_genome_bam_MD-AAGATAGGTAACGCGA-1.json;cd /home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/sailor_outputs_individual_barcodes_AA;./possorted_genome_bam_MD-AATTTCCGTCCGGTCA-1.json;cd /home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/sailor_outputs_individual_barcodes_AA;./possorted_genome_bam_MD-AAGACAACATGACTGT-1.json;cd /home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/sailor_outputs_individual_barcodes_AA;./possorted_genome_bam_MD-AATAGAGTCTGGTGCG-1.json;cd /home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/sailor_outputs_individual_barcodes_AA;./p

In [9]:
print("total commands: {}".format(len(cmds)))

total commands: 413


In [10]:
# Hand the grouped commands to qtools as a single array job; submit=True
# asks Submitter to queue the generated script as well as write it.
submit_options = dict(
    commands=cmds,
    job_name='scRNA-sailor-individual',
    sh='scRNA-sailor-individual.sh',
    array=True,
    nodes=1,
    ppn=1,
    walltime='72:00:00',
    submit=True,
)
Submitter(**submit_options)

Writing 413 tasks as an array-job.
Wrote commands to scRNA-sailor-individual.sh.
Submitted script to queue home.
 Job ID: 21210617


<qtools.submitter.Submitter at 0x2b2ac9f9a890>

# Check to make sure the results are all there

In [29]:
len(bams)

20617

In [30]:
progress = tnrange(len(bams))

# Gather the '*.fwd...ranked.bed' result file of every SAILOR run, one
# per-prefix output directory at a time.
fwd_bed_pattern = "*/results/*.fwd.sorted.rmdup.readfiltered.formatted.varfiltered.snpfiltered.ranked.bed"
all_fwd_files = []
for nt in 'ACGT':
    for nt2 in 'ACGT':
        split_output = output_dir + '_{}{}'.format(nt, nt2)
        files = glob.glob(os.path.join(split_output, fwd_bed_pattern))
        files.sort()
        all_fwd_files.extend(files)
        progress.update(len(files))
len(all_fwd_files)

HBox(children=(IntProgress(value=0, max=20617), HTML(value=u'')))

20617

In [31]:
def get_barcode_from_bam_file_name(fn):
    """
    Returns the barcode embedded in a bam file name: the text between the
    first and second '-' of the base name.
    """
    file_name = os.path.basename(fn)
    dash_separated = file_name.split('-')
    return dash_separated[1]

# Barcodes for which an input bam file exists.
all_barcodes = set(get_barcode_from_bam_file_name(bam) for bam in bams)

len(all_barcodes)

20617

In [32]:
def get_barcode_from_fwd_bed(fn):
    """
    Returns the barcode embedded in a result bed file name: the text between
    the first and second '-' of the base name.
    """
    bed_name = os.path.basename(fn)
    # Only the field after the first '-' is needed, so stop splitting after it.
    return bed_name.split('-', 2)[1]

# Barcodes for which a result bed file was found.
all_beds = set(get_barcode_from_fwd_bed(fwd_file) for fwd_file in all_fwd_files)

len(all_beds)

20617

In [33]:
# Barcodes that have an input bam file but no result bed file yet.
missing_barcodes = all_barcodes - all_beds

In [34]:
cmds = []

# Build one command per barcode that has no result yet. Iterating in sorted
# order keeps the command list reproducible between runs (set iteration order
# is arbitrary).
progress = tnrange(len(missing_barcodes))
for barcode in sorted(missing_barcodes):
    prefix = barcode[:2]
    split_output_dir = output_dir + "_{}".format(prefix)
    json_path = os.path.join(split_output_dir, 'possorted_genome_bam_MD-{}-1.json'.format(barcode))
    # Make the manifest executable without spawning a shell: add the execute
    # bit for every class (user/group/other) that already has the read bit.
    # Unlike a `! chmod` shell escape, a missing manifest raises OSError here
    # instead of passing unnoticed.
    mode = os.stat(json_path).st_mode
    os.chmod(json_path, mode | ((mode & 0o444) >> 2))
    cmd = ''.join([
        'module load sailor/1.1.0;',
        'cd {};'.format(split_output_dir),
        './{};'.format(os.path.basename(json_path)),
    ])
    progress.update(1)
    cmds.append(cmd)
cmds[:2]

HBox(children=(IntProgress(value=1, bar_style=u'info', max=1), HTML(value=u'')))

[]

In [27]:
len(cmds)

650

In [28]:
# Re-submit only when there is something to re-run: `cmds` is empty when no
# barcode is missing its result, and queueing an array job with no tasks
# would be pointless.
if cmds:
    Submitter(
        commands=cmds,
        job_name='scRNA-sailor-individual',
        sh='scRNA-sailor-individual.sh',
        array=True,
        nodes=1,
        ppn=1,
        walltime='72:00:00',
        submit=True,
    )
else:
    print("No commands to submit.")

Writing 500 tasks as an array-job.
Wrote commands to scRNA-sailor-individual1.sh.
Submitted script to queue home.
 Job ID: 21213834
Writing 150 tasks as an array-job.
Wrote commands to scRNA-sailor-individual2.sh.
Submitted script to queue home.
 Job ID: 21213835


<qtools.submitter.Submitter at 0x2b2acb4026d0>