# Runs the script to generate windows across all exons (CDS, UTRs)

In [1]:
import glob
import os
from qtools import Submitter
from tqdm import tnrange, tqdm_notebook

In [2]:
# Directory that is globbed for the per-barcode '*.annotated' edit files.
annotated_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/05_scRNA_RBFOX2_APO_transient/sailor_outputs_individual_barcodes_merged_bedfiles'
# Directory holding the matching forward/reverse bigwig files (looked up by file-name prefix).
bigwig_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/05_scRNA_RBFOX2_APO_transient/bigwig_files/'
# Directory where the scoring outputs ('*.exons*.txt') are written and checked for existence.
output_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/05_scRNA_RBFOX2_APO_transient/sailor_outputs_individual_barcodes_merged_scores_no_APO_filter'

In [3]:
# Collect every annotated edits file in a stable (sorted) order, then report
# how many were found and preview the first three paths.
annotated_pattern = os.path.join(annotated_dir, '*.annotated')
all_annotated = sorted(glob.glob(annotated_pattern))
print(len(all_annotated))
all_annotated[:3]

18462


['/home/bay001/projects/kris_apobec_20200121/permanent_data2/05_scRNA_RBFOX2_APO_transient/sailor_outputs_individual_barcodes_merged_bedfiles/APOBEC_transient_possorted_genome_bam_MD-AAACCCAAGAGAGGTA-1.fx.bed.annotated',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/05_scRNA_RBFOX2_APO_transient/sailor_outputs_individual_barcodes_merged_bedfiles/APOBEC_transient_possorted_genome_bam_MD-AAACCCAAGTGCTACT-1.fx.bed.annotated',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/05_scRNA_RBFOX2_APO_transient/sailor_outputs_individual_barcodes_merged_bedfiles/APOBEC_transient_possorted_genome_bam_MD-AAACCCACAGGCAATG-1.fx.bed.annotated']

In [4]:
# Background-edits input is currently disabled; the matching '--bg_edits_file'
# flag is also commented out where the commands are assembled.
# bg_edits_file = '/home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/01_SAILOR_bulk_rnaseq/outputs/combined_outputs_w_cov_info/ApoControl-1000_S21_L002_R1_001.fastqTr.sorted.STARUnmapped.out.sorted.STARAligned.out.sorted_a0_b0_e0.01.bed'

# Reference files handed to the scoring script via --chrom_sizes_file, --gtfdb and --genome_fa.
chrom_sizes_file = '/projects/ps-yeolab3/bay001/annotations/hg19/hg19.chrom.sizes'
gtfdb_file = '/projects/ps-yeolab3/bay001/annotations/hg19/gencode_v19/gencode.v19.annotation.gtf.db'
genome_fa = '/projects/ps-yeolab3/bay001/annotations/hg19/hg19.fa'

# Region BED files handed to the scoring script via --cds_file,
# --three_prime_utr_file and --five_prime_utr_file.
cds_file = '/projects/ps-yeolab3/bay001/annotations/hg19/gencode_v19/hg19_v19_cds.bed'
three_prime_utr_file = '/projects/ps-yeolab3/bay001/annotations/hg19/gencode_v19/hg19_v19_three_prime_utrs.bed'
five_prime_utr_file = '/projects/ps-yeolab3/bay001/annotations/hg19/gencode_v19/hg19_v19_five_prime_utrs.bed'

def chunker(seq, size):
    """
    Split a sliceable sequence into consecutive pieces of at most (size) items.

    Returns a generator; the final piece is shorter when len(seq) is not a
    multiple of size.
    """
    # The range is built eagerly so an invalid size fails at call time.
    starts = range(0, len(seq), size)
    return (seq[start:start + size] for start in starts)

# Build one shell command string per batch of annotated files. Each batch
# starts with a module load and then chains one scoring-script call per file
# whose merged output does not exist yet.
groupsize = 500
module_prefix = 'module load python3essential;'
need_to_run = []  # unfinished runs
cmds = []
for batch in chunker(all_annotated, groupsize):
    batch_calls = []
    for annotated_file in batch:
        file_name = os.path.basename(annotated_file)
        sample_prefix = file_name.replace('.fx.bed.annotated', '')

        output_file = os.path.join(output_dir, file_name + '.exons.txt')
        output_file_summed = os.path.join(output_dir, file_name + '.exons.merged.txt')
        pos_bw = os.path.join(bigwig_dir, sample_prefix + '.fwd.sorted.rmdup.readfiltered.sorted.bw')
        neg_bw = os.path.join(bigwig_dir, sample_prefix + '.rev.sorted.rmdup.readfiltered.sorted.bw')

        # Already scored: nothing to schedule for this file.
        if os.path.exists(output_file_summed):
            continue

        # Every input must be present; otherwise report which ones are
        # missing (pos_bw, neg_bw, annotated file) and remember the file.
        inputs_found = [os.path.exists(path) for path in (pos_bw, neg_bw, annotated_file)]
        if not all(inputs_found):
            print(*inputs_found)
            need_to_run.append(annotated_file)
            continue

        call_parts = [
            'python /home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/score_edits_total_exon_coverage.py',
            '--conf 0',
            '--gtfdb {}'.format(gtfdb_file),
            '--chrom_sizes_file {}'.format(chrom_sizes_file),
            '--pos_bw {}'.format(pos_bw),
            '--neg_bw {}'.format(neg_bw),
            '--annotated_edits_file {}'.format(annotated_file),
            # '--bg_edits_file {}'.format(bg_edits_file),
            '--genome_fa {}'.format(genome_fa),
            '--output_file {}'.format(output_file),
            '--output_file_summed {}'.format(output_file_summed),
            '--three_prime_utr_file {}'.format(three_prime_utr_file),
            '--five_prime_utr_file {}'.format(five_prime_utr_file),
            '--cds_file {}'.format(cds_file),
        ]
        batch_calls.append(' '.join(call_parts) + ';')

    # Skip batches in which no file needed scoring.
    if batch_calls:
        cmds.append(module_prefix + ''.join(batch_calls))

print("Number of commands: {}".format(len(cmds)))

Number of commands: 0


In [5]:
# Show the character length of every batched command string.
for cmd_length in map(len, cmds):
    print(cmd_length)

In [None]:
# Only hand work to the qtools Submitter when at least one command was built.
if cmds:
    Submitter(
        commands=cmds,
        job_name='score_exon_edits',
        array=True,
        nodes=1,
        ppn=4,
        submit=True,
        walltime='24:00:00',
    )

# Write the commands to score all exon (minus 3'UTR) edits

In [None]:
# Build one shell command string per batch of annotated files for the
# "exons minus 3'UTR" scoring: same scoring script, but no
# --three_prime_utr_file argument and '.exons_no3utr*' output names.
cmds = []
for group in chunker(all_annotated, groupsize):
    cmd = 'module load python3essential;'
    for g in group:
        file_name = os.path.basename(g)
        sample_prefix = file_name.replace('.fx.bed.annotated', '')

        output_file = os.path.join(output_dir, file_name + '.exons_no3utr.txt')
        output_file_summed = os.path.join(output_dir, file_name + '.exons_no3utr.merged.txt')

        pos_bw = os.path.join(bigwig_dir, sample_prefix + '.fwd.sorted.rmdup.readfiltered.sorted.bw')
        neg_bw = os.path.join(bigwig_dir, sample_prefix + '.rev.sorted.rmdup.readfiltered.sorted.bw')

        # Skip files whose merged output already exists.
        if not os.path.exists(output_file_summed):
            if os.path.exists(pos_bw) and os.path.exists(neg_bw) and os.path.exists(g):
                # Invoke the script through `python`, as the all-exon cell does,
                # so the run does not depend on the script's executable bit or
                # shebang line.
                cmd += 'python /home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/score_edits_total_exon_coverage.py '
                cmd += '--conf 0 '
                cmd += '--gtfdb {} '.format(gtfdb_file)
                cmd += '--chrom_sizes_file {} '.format(chrom_sizes_file)
                cmd += '--pos_bw {} '.format(pos_bw)
                cmd += '--neg_bw {} '.format(neg_bw)
                cmd += '--annotated_edits_file {} '.format(g)
                # cmd += '--bg_edits_file {} '.format(bg_edits_file)
                cmd += '--genome_fa {} '.format(genome_fa)
                cmd += '--output_file {} '.format(output_file)
                cmd += '--output_file_summed {} '.format(output_file_summed)
                cmd += '--five_prime_utr_file {} '.format(five_prime_utr_file)
                cmd += '--cds_file {};'.format(cds_file)
            else:
                # Report which inputs exist: pos_bw, neg_bw, annotated file.
                print(os.path.exists(pos_bw), os.path.exists(neg_bw), os.path.exists(g))
    # Keep only batches that scheduled at least one scoring call.
    if cmd != 'module load python3essential;':
        cmds.append(cmd)

print("Number of commands: {}".format(len(cmds)))

In [None]:
# Only hand work to the qtools Submitter when at least one command was built.
if cmds:
    Submitter(
        commands=cmds,
        job_name='score_exon_no3utr_edits',
        array=True,
        nodes=1,
        ppn=4,
        submit=True,
        walltime='24:00:00',
    )

In [None]:
# Preview the first 5000 characters of the first command. When no commands
# were generated, indexing cmds[0] would raise IndexError, so show nothing.
cmds[0][:5000] if cmds else None