# Generates and submits a qsub script to load and annotate edit BED files.
- Using annotator/0.0.14 (https://github.com/byee4/annotator)

In [1]:
import glob
import os
from qtools import Submitter
from tqdm import tnrange, tqdm_notebook

In [2]:
input_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/sailor_outputs_groups_deep/combined_outputs_w_cov_info'
output_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/sailor_outputs_groups_deep/combined_outputs_w_cov_info'

### Here, I want the joined BED files from 02 outputs
- not a big deal if this number is larger than expected, as this glob will grab ALL bed files, even the "post-fx" ones. We'll check this in the next cell.

In [3]:
all_beds = glob.glob(os.path.join(input_dir, '*.bed'))
print(len(all_beds))
all_beds

80


['/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/sailor_outputs_groups_deep/combined_outputs_w_cov_info/RPS2-STAMP_possorted_genome_bam_MD-RPS2_EPKM_barcodes_with_flags_g2m_top500.txt.bed',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/sailor_outputs_groups_deep/combined_outputs_w_cov_info/RPS2-STAMP_possorted_genome_bam_MD-RPS2_escore_barcodes_with_flags_g1_top500.txt.bed',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/sailor_outputs_groups_deep/combined_outputs_w_cov_info/RPS2-STAMP_possorted_genome_bam_MD-RPS2_EPKM_barcodes_with_flags_g1_all.txt.bed',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/sailor_outputs_groups_deep/combined_outputs_w_cov_info/RPS2-STAMP_possorted_genome_bam_MD.downsampled60M.bed',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/sailor_outputs_groups_deep/combined_outputs_w_cov_info/RPS2-STAMP_possorted_genome_b

## Since the above list of BED files also contain the 'fixed' versions (fx) where the 'chr' prefix is appended to each chromosome, I'll need to check first before trying to make a new fx file. 

In [4]:
def fix_chr(infile, outfile):
    with open(infile, 'r') as i:
        with open(outfile, 'w') as o:
            for line in i:
                o.write('chr{}'.format(line))
                
progress = tnrange(len(all_beds))
for bed in all_beds:
    if bed[-7:] == '.fx.bed':
        pass
    else:
        output_fx_bed = os.path.splitext(bed)[0] + ".fx.bed"
        if not os.path.exists(output_fx_bed):
            print("Fixing {}".format(bed))
            fix_chr(bed, output_fx_bed)
    progress.update(1)

HBox(children=(IntProgress(value=0, max=80), HTML(value=u'')))

In [5]:
all_beds = glob.glob(os.path.join(input_dir, '*fx.bed'))
len(all_beds)

40

In [6]:
gtfdb_file = '/projects/ps-yeolab3/bay001/annotations/hg19/gencode_v19/gencode.v19.annotation.gtf.db'
cmds = []
for bed in all_beds:
    output_file = os.path.join(output_dir, os.path.basename(bed).replace('.fx.bed','.fx.annotated'))
    if not os.path.exists(output_file):
        cmd = 'module load annotator;annotator '
        cmd += '--output {} '.format(output_file)
        cmd += '--input {} '.format(bed)
        cmd += '--gtfdb {} '.format(gtfdb_file)
        cmd += '--species {} '.format('hg19')
        # cmd += '--transcript-priority-file {} '.format(priority)
        # cmd += '--gene-priority-file {}'.format(priority)
        cmds.append(cmd)
        
len(cmds)

0

In [7]:
cmds[:3]

[]

In [8]:
Submitter(commands=cmds, job_name='annotate_editing_sites', array=True, nodes=1, ppn=2, submit=True, walltime='2:00:00')

Writing 0 tasks as an array-job.
Wrote commands to annotate_editing_sites.sh.


CalledProcessError: Command '['qsub', 'annotate_editing_sites.sh']' returned non-zero exit status 235