# Generates the JSON manifests that are used to run the SAILOR pipeline
- Use sailor/1.1.0 so that we can capture C/T edits. The current released version only does A/G

In [1]:
import yaml
import os
import pandas as pd
import glob
from qtools import Submitter
from tqdm import tnrange, tqdm_notebook
import time

In [2]:
input_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/bam_files' # output from 01
output_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/sailor_outputs_individual_barcodes' # since there are so many SAILOR jobs, I'm splitting the output by the first 2NT of the barcode. 
ref_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/inputs' # we need to get genome.fa and the SNP file 

In [3]:
### I'm re-running this notebook a few times on a few different groups of bam files
bams = glob.glob(os.path.join(input_dir, '*-1.bam'))
print(len(bams))
bams[:3]

19869


['/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/bam_files/Apo_Control_possorted_genome_bam_MD-GTTACAGTCAGAGTTC-1.bam',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/bam_files/RPS2_possorted_genome_bam_MD-TCGGATAAGATTGATG-1.bam',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/bam_files/RPS2_possorted_genome_bam_MD-GTGACGCTCGCTACAA-1.bam']

# Generate all the directories needed given the output_dir prefix. 
- For example, if our output_dir looks like: ```03_scRNA/sailor_outputs_individual_barcodes``` then we will be creating 16 directories ```03_scRNA/sailor_outputs_individual_barcodes_AA/```, ```03_scRNA/sailor_outputs_individual_barcodes_AC```...

In [4]:
for nt in ['A', 'C', 'G', 'T']:
    for nt2 in ['A', 'C', 'G', 'T']:
        if not os.path.exists(output_dir + '_{}{}'.format(nt, nt2)):
            split_output = os.path.join(output_dir + '_{}{}'.format(nt, nt2))
            ! mkdir $split_output

# This block generates SAILOR json manifest runner files.

In [5]:
genome_file = os.path.join(input_dir, 'refdata-cellranger-hg19_lenti_common-3.0.0/fasta/genome.fa')

skip_rmdup = "true"
progress = tnrange(len(bams))
for bam in bams:
    prefix = os.path.basename(bam).split('_MD-')[1][:2]
    split_output_dir = output_dir + "_{}".format(prefix)
    
    sample = {"input_bam": {
        'class':'File',
        'path':bam
    }}
    fa = {"reference": {
        'class':'File',
        'path':genome_file
    }}
    known_snp = {"known_snp": {
        'class':'File',
        'path':os.path.join(ref_dir, 'hg19.commonSNPs147.bed3.sc')
    }}
    with open(
        os.path.join(
            split_output_dir, '{}.json'.format(
                os.path.basename(bam).split('.')[0]
            )
        ), 'w'
    ) as o:
        o.write("#!/usr/bin/env SAILOR\n")
        yaml.dump(sample, o, default_flow_style=False)
        yaml.dump(known_snp, o, default_flow_style=False)
        yaml.dump(fa, o, default_flow_style=False)
        o.write("ct: true\n")
        o.write("min_variant_coverage: 5\n")
        o.write("alpha: {}\n".format(0))
        o.write("beta: {}\n".format(0))
        o.write("edit_fraction: {}\n".format(0.01))
        o.write("skip_duplicate_removal: {}\n".format(skip_rmdup))
        o.write("reverse_stranded_library: false\n")
    progress.update(1)

HBox(children=(IntProgress(value=0, max=19869), HTML(value=u'')))

# Grab all JSON files created from the above cell.

In [6]:
jsons = []
for nt in ['A', 'C', 'G', 'T']:
    for nt2 in ['A', 'C', 'G', 'T']:
        split_output = os.path.join(output_dir + '_{}{}'.format(nt, nt2))
        jsons += glob.glob(os.path.join(split_output, '*.json'))
print(len(jsons))

19869


In [7]:
jsons[:3]

['/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/sailor_outputs_individual_barcodes_AA/RPS2_possorted_genome_bam_MD-AAACGCTAGGACGGAG-1.json',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/sailor_outputs_individual_barcodes_AA/RPS2_possorted_genome_bam_MD-AAGCATCGTAACGGTG-1.json',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/sailor_outputs_individual_barcodes_AA/RPS2_possorted_genome_bam_MD-AAGCATCAGTGAGCCA-1.json']

# Use chunker() to run 500 SAILOR jsons per qsub job.
- ### ALWAYS a good idea to run one of these jobs just to make sure the commands and outputs make sense. A big pain if all 20k jobs fail because I didn't check one or two.

In [8]:
cmds = []

def chunker(seq, size):
    """
    Chunks a long list into groups of (size).
    """
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))

groupsize = 50 # 50 sailor runs per job.
progress = tnrange(len(jsons))
for json_group in chunker(jsons, groupsize):
    cmd = 'module load sailor/1.1.0;'
    for json in json_group:
        prefix = os.path.basename(json).split('_MD-')[1][:2]
        split_output_dir = output_dir + "_{}".format(prefix)
        ! chmod +x $json
        cmd += 'cd {};'.format(split_output_dir)
        cmd += './{};'.format(os.path.basename(json))
        progress.update(1)
    cmds.append(cmd)
cmds[:2]

HBox(children=(IntProgress(value=0, max=19869), HTML(value=u'')))

['module load sailor/1.1.0;cd /home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/sailor_outputs_individual_barcodes_AA;./RPS2_possorted_genome_bam_MD-AAACGCTAGGACGGAG-1.json;cd /home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/sailor_outputs_individual_barcodes_AA;./RPS2_possorted_genome_bam_MD-AAGCATCGTAACGGTG-1.json;cd /home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/sailor_outputs_individual_barcodes_AA;./RPS2_possorted_genome_bam_MD-AAGCATCAGTGAGCCA-1.json;cd /home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/sailor_outputs_individual_barcodes_AA;./RPS2_possorted_genome_bam_MD-AACCATGCAGCGTGCT-1.json;cd /home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/sailor_outputs_individual_barcodes_AA;./Apo_Control_possorted_genome_bam_MD-AAGAACATCTCCATAT-1.json;cd /home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/sailor_outputs_individual_barcodes_AA;./Apo_Control_possorted_genome_bam_MD

# MAKE SURE we don't exceed ~500 total qsub commands! (submit=False for this reason, when everything looks good re-run with ```submit=True```)

In [9]:
print("total commands: {}".format(len(cmds)))

total commands: 398


In [10]:
Submitter(
    commands=cmds,
    job_name='scRNA-sailor-individual',
    sh='scRNA-sailor-individual.sh',
    array=True,
    nodes=1,
    ppn=1,
    walltime='72:00:00',
    submit=False,
)

Writing 398 tasks as an array-job.
Wrote commands to scRNA-sailor-individual.sh.
Submitted script to queue home.
 Job ID: 21107152


<qtools.submitter.Submitter at 0x2b73d7f616d0>