# Generates the JSON manifests that are used to run the SAILOR pipeline
- Use sailor/1.1.0 so that we can capture C/T edits. The current released version only does A/G

In [1]:
import yaml
import os
import pandas as pd
import glob
from qtools import Submitter
from tqdm import tnrange, tqdm_notebook
import time

In [2]:
input_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups'
output_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/sailor_outputs_groups_deep'

ref_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/inputs/'

In [3]:
### I'm re-running this notebook a few times on a few different groups of bam files
bams = sorted(glob.glob(os.path.join(input_dir, 'RPS2-STAMP_possorted_genome_bam_MD-RPS2_non_APO_edits_barcodes_RPKM.txt.bam')))
print(len(bams))
bams

1


['/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/RPS2-STAMP_possorted_genome_bam_MD-RPS2_non_APO_edits_barcodes_RPKM.txt.bam']

# This block generates SAILOR json manifest runner files.

In [4]:
skip_rmdup = "true"
progress = tnrange(len(bams))
for bam in bams:
    genome_file = os.path.join(ref_dir, 'refdata-cellranger-hg19_lenti_common-3.0.0/fasta/genome.fa')
    snp_file = os.path.join(ref_dir, 'hg19.commonSNPs147.bed3.sc')
    
    assert os.path.exists(genome_file)
    assert os.path.exists(snp_file)
    
    sample = {"input_bam": {
        'class':'File',
        'path':bam
    }}
    fa = {"reference": {
        'class':'File',
        'path':genome_file
    }}
    known_snp = {"known_snp": {
        'class':'File',
        'path':snp_file
    }}
    json_output = os.path.join(
        output_dir, '{}.json'.format(
            os.path.splitext(os.path.basename(bam))[0]
        )
    )
    
    if not os.path.exists(json_output):
        print(json_output)
        with open(
            json_output, 'w'
        ) as o:
            o.write("#!/usr/bin/env SAILOR-deep\n")  # use SAILOR-deep for really deep sequenced BAMs. SAILOR is OK to use for everything under ~100M reads, but SAILOR-deep increases the mpileup depth to 10M
            yaml.dump(sample, o, default_flow_style=False)
            yaml.dump(known_snp, o, default_flow_style=False)
            yaml.dump(fa, o, default_flow_style=False)
            o.write("ct: true\n")
            o.write("min_variant_coverage: 5\n")
            o.write("alpha: {}\n".format(0))
            o.write("beta: {}\n".format(0))
            o.write("edit_fraction: {}\n".format(0.01))
            o.write("skip_duplicate_removal: {}\n".format(skip_rmdup))
            o.write("reverse_stranded_library: false\n")
    progress.update(1)

HBox(children=(IntProgress(value=0, max=1), HTML(value=u'')))

/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/sailor_outputs_groups_deep/RPS2-STAMP_possorted_genome_bam_MD-RPS2_non_APO_edits_barcodes_RPKM.txt.json


# Grab all JSON files created from the above cell.
- actually just grab the ones we want to run since we're kinda doing this piece by piece. 

In [5]:
jsons = glob.glob(os.path.join(output_dir, '*.json'))

In [6]:
jsons[:3]

['/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/sailor_outputs_groups_deep/RPS2-STAMP_possorted_genome_bam_MD-RPS2_EPKM_barcodes_with_flags_cluster1.txt.json',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/sailor_outputs_groups_deep/RPS2-STAMP_possorted_genome_bam_MD-RPS2_escore_barcodes_with_flags_cluster3.txt.json',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/sailor_outputs_groups_deep/RPS2-STAMP_possorted_genome_bam_MD-38_barcodes.tsv.json']

In [7]:
cmds = []

progress = tnrange(len(jsons))
for json in jsons:
    if not os.path.exists(os.path.splitext(json)[0]):
        cmd = 'module load sailor/1.1.0;'
        ! chmod +x $json
        cmd += 'cd {};'.format(output_dir)
        cmd += './{};'.format(os.path.basename(json))
        cmds.append(cmd)
    progress.update(1)

cmds[:10]

HBox(children=(IntProgress(value=0, max=74), HTML(value=u'')))

['module load sailor/1.1.0;cd /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/sailor_outputs_groups_deep;./RPS2-STAMP_possorted_genome_bam_MD-RPS2_non_APO_edits_barcodes_RPKM.txt.json;']

In [8]:
print("total commands: {}".format(len(cmds)))

total commands: 1


In [9]:
Submitter(
    commands=cmds,
    job_name='scRNA-sailor-groups',
    sh='scRNA-sailor-groups.sh',
    array=True,
    nodes=1,
    ppn=4,
    walltime='72:00:00',
    submit=True,
)

Writing 1 tasks as an array-job.
Wrote commands to scRNA-sailor-groups.sh.
Submitted script to queue home.
 Job ID: 24727464


<qtools.submitter.Submitter at 0x2abbba001790>