# Run annotator on all SAILOR results.
- Annotator accepts inputs and outputs as lists, so let's take advantage of that so as not to submit 20k+ jobs at once.

In [1]:
import glob
import os
from tqdm import tnrange, tqdm_notebook
from qtools import Submitter

In [2]:
# Directory that is globbed for the '*-1.bed' inputs and, later, the '*-1.fx.bed' files
# that are written next to them.
input_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/sailor_outputs_individual_barcodes_merged_bedfiles'

In [3]:
# Collect every file in input_dir whose name ends in '-1.bed' and show how many were found.
bed_pattern = os.path.join(input_dir, '*-1.bed')
all_beds = glob.glob(bed_pattern)
len(all_beds)

19611

# 10X uses Ensembl-style annotations, so contig names lack the 'chr' prefix (e.g. 1:10-20 instead of chr1:10-20). We need to add the prefix if we want to annotate using GENCODE.

In [4]:
def fix_chr(infile, outfile):
    """
    Copies a BED file, converting Ensembl-style contig names to 'chr'-prefixed names.

    - Data lines get 'chr' prepended (1 -> chr1, X -> chrX).
    - The mitochondrial contig 'MT' becomes 'chrM' rather than 'chrMT', so that it
      matches chr-style (UCSC/GENCODE) naming.
    - Lines whose contig already starts with 'chr' are copied unchanged, which makes
      the function safe to run on an already-fixed file.
    - Blank lines and header lines ('#', 'track', 'browser') are copied unchanged
      instead of being corrupted with a 'chr' prefix.

    :param infile: path of the BED file to read.
    :param outfile: path of the BED file to write (overwritten if it exists).
    """
    with open(infile, 'r') as i, open(outfile, 'w') as o:
        for line in i:
            tokens = line.split(None, 1)
            contig = tokens[0] if tokens else ''
            if (not contig
                    or contig in ('track', 'browser')
                    or contig.startswith(('#', 'chr'))):
                # Nothing to rename: blank line, header line, or already prefixed.
                o.write(line)
            elif contig == 'MT' and line.startswith('MT'):
                # Ensembl 'MT' corresponds to 'chrM', not 'chrMT'.
                o.write('chrM' + line[2:])
            else:
                o.write('chr{}'.format(line))

In [5]:
# Write a 'chr'-prefixed copy (<name>.fx.bed) next to every BED file.
# Wrapping the list in tqdm_notebook advances the bar per file and closes it when done.
for bed in tqdm_notebook(all_beds, desc='fix_chr'):
    if bed.endswith('.fx.bed'):
        # `all_beds` is re-bound to the *.fx.bed outputs further down the notebook;
        # skip those so that re-running this cell out of order cannot create
        # *.fx.fx.bed files.
        continue
    output_fx_bed = os.path.splitext(bed)[0] + ".fx.bed"
    fix_chr(bed, output_fx_bed)

HBox(children=(IntProgress(value=0, max=19611), HTML(value=u'')))

In [6]:
# Annotation database handed to annotator via --gtfdb when the commands are built.
gtfdb_file = '/projects/ps-yeolab3/bay001/annotations/hg19/gencode_v19/gencode.v19.annotation.gtf.db'
# Value handed to annotator via --species.
species = 'hg19'

In [7]:
# Re-collect the files, this time the 'chr'-fixed copies.
# glob returns paths in arbitrary order; sorting keeps the grouping into commands
# reproducible, so a given array task always covers the same files on a re-run.
all_beds = sorted(glob.glob(os.path.join(input_dir, '*-1.fx.bed')))
len(all_beds)

19611

# Use chunker() to "load up" 500 files at a time and submit them as one command
- (i.e. ```annotator --input GCAA-1.bed GCAC-1.bed GCAG-1.bed --output GCAA-1.bed.annotated GCAC-1.bed.annotated GCAG-1.bed.annotated``` ...)

In [8]:
def chunker(seq, size):
    """
    Lazily splits a sliceable sequence into consecutive pieces of at most
    (size) items; the last piece holds whatever is left over.
    """
    # The start offsets are computed up front, so a bad seq/size fails at call time.
    starts = range(0, len(seq), size)
    return (seq[start:start + size] for start in starts)

groupsize = 500

# Everything before the file lists is the same for every command, so build it once.
cmd_prefix = (
    'module load annotator/0.0.14;annotator '
    + '--gtfdb {} '.format(gtfdb_file)
    + '--species {} '.format(species)
)

# One annotator invocation per group of (groupsize) BED files; every input file
# gets an output path of the same name with '.annotated' appended.
cmds = []
for group in chunker(all_beds, groupsize):
    input_paths = ''.join('{} '.format(path) for path in group)
    output_paths = ''.join('{} '.format(path + '.annotated') for path in group)
    cmd = cmd_prefix
    cmd += '--input {} '.format(input_paths)
    cmd += '--output {} '.format(output_paths)
    cmds.append(cmd)

print("Number of commands: {}".format(len(cmds)))

Number of commands: 40


# Always try running the first command on its own before submitting, to make sure everything is correct.

In [9]:
# Hand every command to the scheduler in one go: array=True writes them as a single
# array job, and submit=True queues the generated script right away.
Submitter(
    commands=cmds,
    job_name='annotate_editing_sites',
    array=True,
    nodes=1,
    ppn=8,
    walltime='24:00:00',
    submit=True,
)

Writing 40 tasks as an array-job.
Wrote commands to annotate_editing_sites.sh.
Submitted script to queue home.
 Job ID: 21118719


<qtools.submitter.Submitter at 0x2b5a977f2c10>