In [1]:
import pandas as pd
import numpy as np
import os
import glob
import sys
from qtools import Submitter

In [2]:
# Scratch locations for this analysis: where inputs are read from, where
# job scripts are meant to live, and where the figures are written.
input_dir, work_dir, output_dir = (
    '/projects/ps-yeolab3/bay001/for_jackie/scratch/m6a/inputs/',
    '/projects/ps-yeolab3/bay001/for_jackie/scratch/m6a/work_dir/',
    '/projects/ps-yeolab3/bay001/for_jackie/scratch/m6a/outputs/',
)

### Gather all the necessary bam files

In [3]:
# Collect every CLIP (IP) bam in the input directory. glob.glob() returns
# matches in arbitrary, filesystem-dependent order, so sort them to keep the
# order of the generated commands (and therefore the array-job task indices)
# the same from run to run.
all_bam_files = sorted(glob.glob(os.path.join(input_dir, '*.CLIP_*.bam')))
len(all_bam_files)

12

### Group them into IP/INPUT pairs
- Given how Jackie named the files, we could probably pair them automatically, but it is safer to list the pairs explicitly and check them by hand.

In [4]:
# Map each CLIP (IP) bam file name to the file name of its INPUT bam.
# Every pair shares the same sample tag and processing suffix; only the
# CLIP/INPUT label differs. The sample tags are still listed explicitly so
# the pairing can be checked by eye.
_bam_suffix = ".umi.r1TrTr.sorted.STARUnmapped.out.sorted.STARAligned.outSo.rmDupSo.bam"
_sample_tags = [
    "LM2_R1", "LM2_R2",
    "MCF7_R1", "MCF7_R2",
    "MDA_R1", "MDA_R2",
    "MERA9_R1", "MERA9_R2",
    "MERA9_TAM_R1", "MERA9_TAM_R2",
    "SKBR3_R1", "SKBR3_R2",
]
pairs = {
    "YTHDF2.CLIP_{}{}".format(tag, _bam_suffix): "YTHDF2.INPUT_{}{}".format(tag, _bam_suffix)
    for tag in _sample_tags
}

### Define the 5'UTR/CDS/3'UTR bed files to use

In [5]:
# Annotation bed files, all resolved relative to input_dir.

# all transcripts
utr5, cds, utr3 = (
    os.path.join(input_dir, bed_name)
    for bed_name in (
        'hg19_v19_five_prime_utrs.transcripts.bed',
        'hg19_v19_cds.transcripts.bed',
        'hg19_v19_three_prime_utrs.transcripts.bed',
    )
)

# only the highest expressed transcripts
# NOTE(review): the cds file name ends in '.transcripts.topENSTbyTPM.bed'
# while the two UTR files end in '.topENSTbyTPM.transcripts.bed' — confirm
# this matches the file that actually exists on disk.
utr5_hi_tpm, cds_hi_tpm, utr3_hi_tpm = (
    os.path.join(input_dir, bed_name)
    for bed_name in (
        'hg19_v19_five_prime_utrs.topENSTbyTPM.transcripts.bed',
        'hg19_v19_cds.transcripts.topENSTbyTPM.bed',
        'hg19_v19_three_prime_utrs.topENSTbyTPM.transcripts.bed',
    )
)

### Write and submit commands for making maps overlapping eCLIP bigwig density files with the highest-expressed transcripts

In [6]:
# Build one plot_map command line per CLIP bam, using the topENSTbyTPM
# annotation bed files (utr5_hi_tpm, cds_hi_tpm, utr3_hi_tpm).
cmds = []
for bam in all_bam_files:
    # INPUT bam paired with this CLIP bam, looked up by file name in `pairs`
    input_bam = os.path.join(input_dir, pairs[os.path.basename(bam)])

    # bigwig paths are the bam paths with '.bam' swapped for '.norm.{pos,neg}.bw'
    ip_stem = os.path.splitext(bam)[0]
    input_stem = os.path.splitext(input_bam)[0]
    ip_pos_bw = ip_stem + ".norm.pos.bw"
    ip_neg_bw = ip_stem + ".norm.neg.bw"
    input_pos_bw = input_stem + ".norm.pos.bw"
    input_neg_bw = input_stem + ".norm.neg.bw"

    # figure is named after the second dot-delimited field of the bam file name
    output_file = os.path.join(output_dir, os.path.basename(bam).split('.')[1] + ".topENSTbyTPM.svg")

    # pieces are joined with single spaces, giving the full shell command
    cmd = ' '.join([
        'source activate rbp-maps;plot_map',
        '--ipbam', bam,
        '--ip_pos_bw', ip_pos_bw,
        '--ip_neg_bw', ip_neg_bw,
        '--inputbam', input_bam,
        '--input_pos_bw', input_pos_bw,
        '--input_neg_bw', input_neg_bw,
        '--output', output_file,
        '--normalization_level', '1',
        '--event', 'metagene',
        '--annotations', utr5_hi_tpm, cds_hi_tpm, utr3_hi_tpm,
        '--annotation_type', "utr5", "cds", "utr3",
    ])
    cmds.append(cmd)

In [7]:
# Submit the topENSTbyTPM plot_map commands as a single array job.
# NOTE(review): bash_script is computed here but never handed to Submitter,
# so the script path under work_dir is not used by this call — confirm
# whether it should be passed in.
bash_script = os.path.join(work_dir, 'run_plot_map_topENSTbyTPM.sh')
submit_options = dict(
    commands=cmds,
    job_name='plot_map_topENSTbyTPM',
    array=True,
    walltime="8:00:00",
    nodes=1,
    ppn=1,
    submit=True,
)
Submitter(**submit_options)

Writing 12 tasks as an array-job.
Wrote commands to plot_map_topENSTbyTPM.sh.
Submitted script to queue home.
 Job ID: 14950720


<qtools.submitter.Submitter at 0x2b47ee0a0b90>

### Write and submit cmds for making maps overlapping eCLIP bigwig density files with all annotated transcripts

In [8]:
# Build one plot_map command line per CLIP bam, using the bed files that
# cover all annotated transcripts (utr5, cds, utr3).
cmds = []
for bam in all_bam_files:
    # INPUT bam paired with this CLIP bam, looked up by file name in `pairs`
    input_bam = os.path.join(input_dir, pairs[os.path.basename(bam)])

    # bigwig paths are the bam paths with '.bam' swapped for '.norm.{pos,neg}.bw'
    ip_stem = os.path.splitext(bam)[0]
    input_stem = os.path.splitext(input_bam)[0]
    ip_pos_bw = ip_stem + ".norm.pos.bw"
    ip_neg_bw = ip_stem + ".norm.neg.bw"
    input_pos_bw = input_stem + ".norm.pos.bw"
    input_neg_bw = input_stem + ".norm.neg.bw"

    # pieces are joined with single spaces, giving the full shell command
    cmd = ' '.join([
        'source activate rbp-maps;plot_map',
        '--ipbam', bam,
        '--ip_pos_bw', ip_pos_bw,
        '--ip_neg_bw', ip_neg_bw,
        '--inputbam', input_bam,
        '--input_pos_bw', input_pos_bw,
        '--input_neg_bw', input_neg_bw,
        # figure is named after the second dot-delimited field of the bam file name
        '--output', os.path.join(output_dir, os.path.basename(bam).split('.')[1] + ".allENST.svg"),
        '--normalization_level', '1',
        '--event', 'metagene',
        '--annotations', utr5, cds, utr3,
        '--annotation_type', "utr5", "cds", "utr3",
    ])
    cmds.append(cmd)

In [9]:
# Submit the allENST plot_map commands as a single array job.
# NOTE(review): bash_script is computed here but never handed to Submitter,
# so the script path under work_dir is not used by this call — confirm
# whether it should be passed in.
bash_script = os.path.join(work_dir, 'run_plot_map_allENST.sh')
submit_options = dict(
    commands=cmds,
    job_name='plot_map_allENST',
    array=True,
    walltime="8:00:00",
    nodes=1,
    ppn=1,
    submit=True,
)
Submitter(**submit_options)

Writing 12 tasks as an array-job.
Wrote commands to plot_map_allENST.sh.
Submitted script to queue home.
 Job ID: 14950721


<qtools.submitter.Submitter at 0x2b47ee0a0c50>