# Generates the JSON manifests that are used to run the SAILOR pipeline
- Uses the `sailor/1.1.0` module so that C/T edits can be captured; the current released version only detects A/G edits.

In [60]:
import yaml
import os
import pandas as pd
import glob
from qtools import Submitter
from tqdm import tnrange, tqdm_notebook
import time

In [61]:
# All inputs and outputs for this analysis live under a single project directory.
project_root = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA'

# BAM files to process are read from here; SAILOR manifests are written to output_dir.
input_dir = '{}/outputs'.format(project_root)
output_dir = '{}/sailor_outputs_groups'.format(project_root)

# Reference data (genome FASTA, known-SNP BED); note the trailing slash is kept.
ref_dir = '{}/inputs/'.format(project_root)

In [62]:
# This notebook gets re-run several times, each time on a different group of
# BAM files; adjust the prefixes below to pick the group for the current run.
bams = []
for bam_prefix in ('RBFOX', 'TIA'):
    bams.extend(glob.glob(os.path.join(input_dir, bam_prefix + '*.bam')))
print(len(bams))
bams

8


['/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/outputs/RBFOX2_ORFS.barcodes_RBFOX2-TIA1_subset_more_noFeatureCells.bam',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/outputs/RBFOX2_ORFS.barcodes_RBFOX2-TIA1_subset_noFeatureCells.bam',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/outputs/RBFOX2_ORFS.barcodes_RBFOX2-TIA1_subset.bam',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/outputs/RBFOX2_ORFS.barcodes_RBFOX2-TIA1_subset_more.bam',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/outputs/TIA1_ORFS.barcodes_RBFOX2-TIA1_subset_noFeatureCells.bam',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/outputs/TIA1_ORFS.barcodes_RBFOX2-TIA1_subset_more.bam',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/outputs/TIA1_ORFS.barcodes_RBFOX2-TIA1_subset.bam',
 

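# This block generates the SAILOR JSON manifest runner files.
Note: the manifest contents are emitted with `yaml.dump`, so they are YAML-formatted despite the `.json` extension.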

In [63]:
# Write one SAILOR manifest per BAM file into output_dir.
#
# Each manifest is named after its BAM (extension swapped for .json), starts
# with a `#!/usr/bin/env SAILOR` shebang so it can be executed directly, and
# lists the input BAM, the known-SNP file, the reference FASTA and the run
# options below.  Manifests that already exist are left untouched, so delete a
# manifest first if it needs to be regenerated with different options.
skip_rmdup = "true"

# The reference files are identical for every BAM: resolve and validate them
# once, and raise a descriptive error instead of relying on a bare `assert`
# (asserts are skipped under `python -O` and carry no message).
genome_file = os.path.join(ref_dir, 'refdata-cellranger-hg19_lenti_common-3.0.0/fasta/genome.fa')
snp_file = os.path.join(ref_dir, 'hg19.commonSNPs147.bed3')
for required_file in (genome_file, snp_file):
    if not os.path.exists(required_file):
        raise IOError("Required reference file not found: {}".format(required_file))

# open(..., 'w') does not create parent directories, so make sure the output
# directory exists before writing the first manifest.
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)

# Scalar options appended after the File entries, in the order they are written.
# `ct` is presumably what switches on C/T edit calling in sailor/1.1.0 -- TODO confirm.
sailor_options = [
    ("ct", "true"),
    ("min_variant_coverage", 5),
    ("alpha", 0),
    ("beta", 0),
    ("edit_fraction", 0.01),
    ("skip_duplicate_removal", skip_rmdup),
    ("reverse_stranded_library", "false"),
]

# File-typed entries that do not depend on the BAM being processed.
fa = {"reference": {
    'class':'File',
    'path':genome_file
}}
known_snp = {"known_snp": {
    'class':'File',
    'path':snp_file
}}

# tqdm_notebook drives (and closes) the progress bar as the loop advances.
for bam in tqdm_notebook(bams):
    sample = {"input_bam": {
        'class':'File',
        'path':bam
    }}
    json_output = os.path.join(
        output_dir, '{}.json'.format(
            os.path.splitext(os.path.basename(bam))[0]
        )
    )

    # Only newly created manifests are written and printed.
    if not os.path.exists(json_output):
        print(json_output)
        with open(json_output, 'w') as o:
            o.write("#!/usr/bin/env SAILOR\n")
            yaml.dump(sample, o, default_flow_style=False)
            yaml.dump(known_snp, o, default_flow_style=False)
            yaml.dump(fa, o, default_flow_style=False)
            for option_name, option_value in sailor_options:
                o.write("{}: {}\n".format(option_name, option_value))

HBox(children=(IntProgress(value=0, max=8), HTML(value=u'')))

/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/sailor_outputs_groups/RBFOX2_ORFS.barcodes_RBFOX2-TIA1_subset_more_noFeatureCells.json
/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/sailor_outputs_groups/TIA1_ORFS.barcodes_RBFOX2-TIA1_subset_more_noFeatureCells.json


# Grab all JSON files created from the above cell.
- Actually, list only the manifests we want to run in this pass, since the notebook is being re-run piece by piece on different groups of BAM files.

In [52]:
# Manifests to submit in this pass, listed explicitly by file name.
manifest_names = (
    'RBFOX2_ORFS.barcodes_RBFOX2-TIA1_subset_noFeatureCells.json',
    'TIA1_ORFS.barcodes_RBFOX2-TIA1_subset_noFeatureCells.json',
)
jsons = [os.path.join(output_dir, manifest_name) for manifest_name in manifest_names]
# Alternative used for other passes: select manifests with a glob pattern instead.
# jsons = glob.glob(os.path.join(output_dir, 'sampled*barcodes_RBFOX2-TIA1_subset*.json'))

In [53]:
# Show up to the first three selected manifest paths as a quick check.
jsons[:3]

['/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/sailor_outputs_groups/RBFOX2_ORFS.barcodes_RBFOX2-TIA1_subset_noFeatureCells.json',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/sailor_outputs_groups/TIA1_ORFS.barcodes_RBFOX2-TIA1_subset_noFeatureCells.json']

In [54]:
# Build one shell command string per manifest: load the sailor/1.1.0 module,
# change into output_dir, then execute the manifest itself (each manifest
# starts with a `#!/usr/bin/env SAILOR` shebang, so it must be executable).
cmds = []

# The loop variable is deliberately not called `json`, which would shadow the
# standard-library module of that name for the rest of the session.
for manifest_path in tqdm_notebook(jsons):
    # Pure-Python replacement for `! chmod +x $json`: no unquoted shell
    # interpolation, and a missing manifest raises OSError here instead of
    # failing silently and only surfacing after the job has been submitted.
    # Adds execute permission for user, group and other (what `chmod +x`
    # grants under the common 022 umask).
    current_mode = os.stat(manifest_path).st_mode
    os.chmod(manifest_path, current_mode | 0o111)

    cmd = 'module load sailor/1.1.0;'
    cmd += 'cd {};'.format(output_dir)
    cmd += './{};'.format(os.path.basename(manifest_path))
    cmds.append(cmd)

cmds

HBox(children=(IntProgress(value=0, max=2), HTML(value=u'')))

['module load sailor/1.1.0;cd /home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/sailor_outputs_groups;./RBFOX2_ORFS.barcodes_RBFOX2-TIA1_subset_noFeatureCells.json;',
 'module load sailor/1.1.0;cd /home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/sailor_outputs_groups;./TIA1_ORFS.barcodes_RBFOX2-TIA1_subset_noFeatureCells.json;']

In [55]:
# Report how many commands will be submitted as tasks.
n_cmds = len(cmds)
print("total commands: {}".format(n_cmds))

total commands: 2


In [56]:
# Hand the command list to qtools for submission to the cluster queue.
# Per the recorded output, array=True writes the commands as tasks of one
# array job, `sh` is the script file the commands are written to, and
# submit=True sends that script to the queue immediately.
submit_kwargs = {
    'commands': cmds,
    'job_name': 'scRNA-sailor-groups5',
    'sh': 'scRNA-sailor-groups5.sh',
    'array': True,
    # Resource requests; presumably nodes / processors-per-node and an
    # HH:MM:SS wall-clock limit -- TODO confirm against qtools.
    'nodes': 1,
    'ppn': 1,
    'walltime': '72:00:00',
    'submit': True,
}
Submitter(**submit_kwargs)

Writing 2 tasks as an array-job.
Wrote commands to scRNA-sailor-groups5.sh.
Submitted script to queue home.
 Job ID: 21369910


<qtools.submitter.Submitter at 0x2b217e46b310>