# Generates the JSON manifests that are used to run the SAILOR pipeline
- Use the sailor/1.1.0 module that was created (modified from 1.0.5 which did not support C/T editing).

In [1]:
import yaml
import os
import pandas as pd
import glob
from qtools import Submitter

In [2]:
input_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/01_SAILOR_bulk_rnaseq/inputs'
output_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/01_SAILOR_bulk_rnaseq/outputs'

### Here I'm grabbing the BAM files I want to use as inputs to SAILOR

In [4]:
bams = glob.glob(os.path.join(input_dir, '*1000*sorted.bam'))
print(len(bams))
bams

18


['/home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/01_SAILOR_bulk_rnaseq/inputs/APOBEC_only_1000_merged_R1.fastqTr.sorted.STARUnmapped.out.sorted.STARAligned.out.sorted.bam',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/01_SAILOR_bulk_rnaseq/inputs/RPS2-1000_S18_L002_R1_001.fastqTr.sorted.STARUnmapped.out.sorted.STARAligned.out.sorted.bam',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/01_SAILOR_bulk_rnaseq/inputs/SLBP_1000_48hr_B_merged_R1.fastqTr.sorted.STARUnmapped.out.sorted.STARAligned.out.sorted.bam',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/01_SAILOR_bulk_rnaseq/inputs/ETF1-1000_S16_L002_R1_001.fastqTr.sorted.STARUnmapped.out.sorted.STARAligned.out.sorted.bam',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/01_SAILOR_bulk_rnaseq/inputs/SLBP_1000_72hr_B_merged_R1.fastqTr.sorted.STARUnmapped.out.sorted.STARAligned.out.sorted.bam',
 '/home

# Create JSON manifest to run SAILOR on each of the above BAM files
- min_edit_fraction, alpha and beta scores tweak the conf score parameters. min_edit_fraction can discard many low-edit-% calls, alpha/beta can add a pseudocount if coverage isn't sufficient.
- keep min coverage at 5 (default) 
- ct is True since we're looking for C/T edits.
- commonSNP file from dbsnp

In [5]:
for min_edit_fraction in [0.01]:
    for alpha in [0]:
        for beta in [0]:
            for bam in bams:
                renamed_bam = os.path.splitext(bam)[0] + '_a{}_b{}_e{}.noRmDup.bam'.format(
                    alpha, beta, min_edit_fraction
                )
                # let's rename the bam file to make it easier to combine
                if not os.path.exists(renamed_bam):
                    ! ln -s $bam $renamed_bam
                sample = {"input_bam": {
                    'class':'File',
                    'path':renamed_bam
                }}
                fa = {"reference": {
                    'class':'File',
                    'path':'/projects/ps-yeolab3/bay001/annotations/hg19/hg19.fa'
                }}
                known_snp = {"known_snp": {
                    'class':'File',
                    'path':'/projects/ps-yeolab3/bay001/annotations/hg19/hg19.commonSNPs147.bed3'
                }}
                json_file_to_create = os.path.join(
                    output_dir, '{}_a{}_b{}_e{}.noRmDup.json'.format(
                        os.path.basename(bam).split('.')[0], 
                        alpha, 
                        beta,
                        min_edit_fraction
                    )
                )
                print(json_file_to_create)
                if not os.path.exists(json_file_to_create):
                    print("Creating: {}".format(json_file_to_create))
                    with open(json_file_to_create, 'w'
                    ) as o:
                        o.write("#!/usr/bin/env SAILOR\n")
                        yaml.dump(sample, o, default_flow_style=False)
                        yaml.dump(known_snp, o, default_flow_style=False)
                        yaml.dump(fa, o, default_flow_style=False)
                        o.write("ct: true\n")
                        o.write("min_variant_coverage: 5\n")
                        o.write("alpha: {}\n".format(alpha))
                        o.write("beta: {}\n".format(beta))
                        o.write("edit_fraction: {}\n".format(min_edit_fraction))
                        o.write("skip_duplicate_removal: true")

/home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/01_SAILOR_bulk_rnaseq/outputs/APOBEC_only_1000_merged_R1_a0_b0_e0.01.noRmDup.json
Creating: /home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/01_SAILOR_bulk_rnaseq/outputs/APOBEC_only_1000_merged_R1_a0_b0_e0.01.noRmDup.json
/home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/01_SAILOR_bulk_rnaseq/outputs/RPS2-1000_S18_L002_R1_001_a0_b0_e0.01.noRmDup.json
Creating: /home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/01_SAILOR_bulk_rnaseq/outputs/RPS2-1000_S18_L002_R1_001_a0_b0_e0.01.noRmDup.json
/home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/01_SAILOR_bulk_rnaseq/outputs/SLBP_1000_48hr_B_merged_R1_a0_b0_e0.01.noRmDup.json
Creating: /home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/01_SAILOR_bulk_rnaseq/outputs/SLBP_1000_48hr_B_merged_R1_a0_b0_e0.01.noRmDup.json
/home/bay001/projects/kris_apobec_20200121

In [8]:
jsons = glob.glob(os.path.join(output_dir, '*.noRmDup.json'))
len(jsons)

18

In [9]:
cmds = []
for json in jsons:
    if not os.path.exists(os.path.join(output_dir, os.path.splitext(os.path.basename(json))[0])):
        ! chmod +x $json
        cmd = 'module load sailor/1.1.0;'
        cmd += 'cd {};'.format(output_dir)
        cmd += './{}'.format(os.path.basename(json))
        cmds.append(cmd)
        
print(len(cmds))
cmds[:3]

18


['module load sailor/1.1.0;cd /home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/01_SAILOR_bulk_rnaseq/outputs;./GSTP1-1000_S17_L002_R1_001_a0_b0_e0.01.noRmDup.json',
 'module load sailor/1.1.0;cd /home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/01_SAILOR_bulk_rnaseq/outputs;./SLBP_1000_72hr_B_merged_R1_a0_b0_e0.01.noRmDup.json',
 'module load sailor/1.1.0;cd /home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/01_SAILOR_bulk_rnaseq/outputs;./SLBP_1000_48hr_B_merged_R1_a0_b0_e0.01.noRmDup.json']

In [10]:
Submitter(
    commands=cmds,
    job_name='lenti_dox_sailor',
    sh='lenti_dox_sailor.sh',
    array=True,
    nodes=1,
    ppn=4,
    walltime='72:00:00',
    submit=True,
)

Writing 18 tasks as an array-job.
Wrote commands to lenti_dox_sailor.sh.
Submitted script to queue home.
 Job ID: 21764510


<qtools.submitter.Submitter at 0x2b105d081050>