# Creates track hubs of ENCODE shRNA-knockdown RNA-seq data using Gabe's make_bigwig_files.py script

In [1]:
import os
import glob
import sys
import pandas as pd
from qtools import Submitter
from tqdm import tnrange, tqdm_notebook

In [2]:
# Directory that is searched for BAM files; the bigwig commands built further
# down also read their input BAMs from here.
wd = '/projects/ps-yeolab3/encode/rnaseq/shrna_knockdown_graveley_tophat'
bam_pattern = os.path.join(wd, '*.bam')
bams = glob.glob(bam_pattern)

In [3]:
man_dir = '/projects/ps-yeolab3/encode/'


def _read_manifest(file_name, cell_line):
    """Read one manifest table from man_dir and tag every row with its cell line."""
    manifest = pd.read_table(man_dir + file_name)
    manifest['cell'] = cell_line
    return manifest


k562_manifest = _read_manifest(
    'k562_brenton-graveley_ambiguous_bams_for_integrated_analysis.txt',
    'K562',
)
hepg2_manifest = _read_manifest(
    'hepg2_brenton-graveley_ambiguous_bams_for_integrated_analysis.txt',
    'HepG2',
)

# HepG2 rows first, then K562; the original row indices are kept as-is.
all_manifest = pd.concat([hepg2_manifest, k562_manifest])
all_manifest.head()

Unnamed: 0.1,Unnamed: 0,control_rep1,control_rep1_md5sum,control_rep2,control_rep2_md5sum,expt_rep1,expt_rep1_md5sum,expt_rep2,expt_rep2_md5sum,name,cell
0,ENCSR000SKS,ENCFF534AWV.bam,982950ab9c9c2b0639bcbd9c52fcc6fb,ENCFF134QUL.bam,ad84b066ad25b40758edb3c36a86a3f8,ENCFF992YGD.bam,72a623286dc027f63023f6dc4ba9691e,ENCFF126RPY.bam,2935c0ddba008e679dbd7f8db89b155d,RPLP0,HepG2
1,ENCSR003LSA,ENCFF988VWE.bam,63aab8b4b68d96306f21956b86c25d0f,ENCFF893QHC.bam,fa5e2cc66763ec268cd91b072c9eb06f,ENCFF831XLU.bam,4c6811e884dc0cc3038c29ab484e458e,ENCFF194QUZ.bam,f77e3808093ed9e4f5e8030e326285ed,SNRNP200,HepG2
2,ENCSR004OSI,ENCFF460AAE.bam,b691efe23fdd3873e7b5d5e6a9b20ba4,ENCFF058QDT.bam,d8f3f5606b74eccd282f560e20b8c893,ENCFF582HPV.bam,40ce65ef362259a1b15fc6060802ce64,ENCFF601DMY.bam,677705e5a1f0cce1085fe55dbe284faa,DNAJC2,HepG2
3,ENCSR009PPI,ENCFF052HTH.bam,e6d81c60dffc32e98f491f1ee02236b8,ENCFF857QSU.bam,2dd2e67ec047cb195b6da4207d0cc26d,ENCFF865GTN.bam,f6cf64580d7a79c8ffc30890ee670317,ENCFF569MFV.bam,bbef1be538e57ea495a193eb46e464e3,FXR1,HepG2
4,ENCSR010ZMZ,ENCFF499UUZ.bam,b0fdad0ee704046b511099cf04e7eb5e,ENCFF326XKY.bam,ecd206b9d417019fa0c50604afef3094,ENCFF120YPA.bam,9e34244ba7a6beb52932fa3b3688c138,ENCFF791RLG.bam,12265b4e8f2b2bb1bf05c5fd0a4890ed,HLTF,HepG2


In [4]:
# Output directory: the bigwig output prefixes, the generated bash scripts and
# the job error files scanned later in this notebook are all located here.
out_dir = '/projects/ps-yeolab3/bay001/encode_temp2/'

In [5]:
genome_file = '/projects/ps-yeolab3/bay001/annotations/hg19_ercc.chrom.sizes'

# Job name and bash-script path for the submission. Neither depends on the
# manifest rows, so they are set once here instead of on every loop iteration
# (which also left them undefined when the manifest was empty).
jobname = "redo_timedout"
my_bash_file = os.path.join(out_dir, 'redo_timedout' + '.sh')

# Manifest columns that hold BAM file names; the column name is also used as
# the label inside the output file name. Iterating the columns directly
# (rather than a dict keyed by BAM name) keeps the command order deterministic
# and still emits one command per replicate if two columns name the same BAM.
replicate_labels = ('control_rep2', 'expt_rep2', 'control_rep1', 'expt_rep1')

# Alternative script: /projects/ps-yeolab3/bay001/software/make_bigwig_files.py
bigwig_script = 'python /home/bay001/gscripts/gscripts/general/make_bigwig_files_pe.py '

cmds = []
progress = tnrange(all_manifest.shape[0])
for _, row in all_manifest.iterrows():
    for label in replicate_labels:
        f = row[label]

        # Output prefix:
        #   <name>-<cell>-<label>-<'Unnamed: 0' value>-<bam name with .bam -> .norm>
        # NOTE(review): 'Unnamed: 0' presumably holds the ENCODE experiment
        # accession -- confirm against the manifest file.
        bw_prefix = os.path.join(out_dir, "{}-{}-{}-{}-{}".format(
            row['name'],
            row['cell'],
            label,
            row['Unnamed: 0'],
            f.replace('.bam', '.norm'))
        )

        # Only queue BAMs whose .pos.bw output does not exist yet.
        if os.path.exists(bw_prefix + '.pos.bw'):
            continue

        cmd = bigwig_script
        cmd = cmd + '--bam {} '.format(os.path.join(wd, f))
        cmd = cmd + '--genome {} '.format(genome_file)
        cmd = cmd + '--bw_pos {} '.format(bw_prefix + '.pos.bw')
        cmd = cmd + '--bw_neg {} '.format(bw_prefix + '.neg.bw')
        cmds.append(cmd)
    progress.update(1)

# The submission itself is disabled: the call below is wrapped in a string
# literal, so running this cell only echoes the text and submits nothing.
"""Submitter(
    cmds, 
    jobname, 
    array=True,
    nodes=1, 
    ppn=8, 
    walltime='6:00:00', 
    sh=my_bash_file, 
    submit=False, 
    queue='home-yeo'
)
"""

"Submitter(\n    cmds, \n    jobname, \n    array=True,\n    nodes=1, \n    ppn=8, \n    walltime='6:00:00', \n    sh=my_bash_file, \n    submit=False, \n    queue='home-yeo'\n)\n"

In [6]:
# Show the manifest row(s) whose 'name' column is SNRNP70.
is_snrnp70 = all_manifest['name'] == 'SNRNP70'
all_manifest[is_snrnp70]

Unnamed: 0.1,Unnamed: 0,control_rep1,control_rep1_md5sum,control_rep2,control_rep2_md5sum,expt_rep1,expt_rep1_md5sum,expt_rep2,expt_rep2_md5sum,name,cell
135,ENCSR635BOO,ENCFF231NKH.bam,da3325036619c8e20698628adc49575f,ENCFF893QAU.bam,94cf6b7e3be9c6c3828af1ec3b0731d7,ENCFF080EFD.bam,7f28a947b2182b94166423629c3f514b,ENCFF717XEO.bam,e78371e338296a51d0d90d3321aa907c,SNRNP70,HepG2


In [None]:

def make_chrom_sizes(bam_h, output_file):
    """
    Uses the bam header from a bam file to get the chrom.sizes
    And creates the chrom.sizes file

    Every line of bam_h that starts with '@SQ' contributes one
    tab-separated "<SN value> <LN value>" line to output_file; all other
    lines are ignored.

    :param bam_h: path to a text file containing the bam header
    :param output_file: path of the chrom.sizes file to write (overwritten)
    :raises ValueError: if an @SQ line lacks an SN or an LN field
    """
    # Both handles are managed by 'with' so the output file is closed (and
    # flushed) even if reading or parsing the header fails part-way through.
    with open(bam_h, 'r') as header, open(output_file, 'w') as out:
        for line in header:
            if not line.startswith('@SQ'):
                continue

            # Look the fields up by tag rather than by position, and strip the
            # line terminator first. Relying on position and on the newline
            # carried by the LN field glued records together whenever an @SQ
            # line had extra tags after LN (e.g. M5/UR).
            tags = {}
            for field in line.rstrip('\r\n').split('\t')[1:]:
                tag, sep, value = field.partition(':')
                if sep:
                    tags.setdefault(tag, value)

            if 'SN' not in tags or 'LN' not in tags:
                raise ValueError(
                    "@SQ line without SN/LN field in {}: {!r}".format(bam_h, line)
                )

            out.write('{}\t{}\n'.format(tags['SN'], tags['LN']))
# bam_h = '/projects/ps-yeolab3/bay001/annotations/hg19_ercc.bam_header.txt'
# output_file = '/projects/ps-yeolab3/bay001/annotations/hg19_ercc.chrom.sizes'
# make_chrom_sizes(bam_h, output_file)

# Some of the outputs weren't made; let's check the job error files.

In [33]:
def was_killed(error_file):
    """
    Scan a job error file for known failure keywords.

    The file is read line by line. On the first line containing one of the
    keywords below, the file name and a short label are printed and True is
    returned. Keywords are tested in the listed order and matching is
    case-sensitive. Returns False if no line contains any keyword.

    :param error_file: path to the error file to scan
    :return: True if a failure keyword was found, False otherwise
    """
    # (substring searched for, label printed), in the order they are checked.
    failure_markers = (
        ('read', "read"),
        ('Killed', "killed."),
        ('float', "float"),
        ('failed', "failed"),
        ('Expecting', 'expecting'),
    )
    with open(error_file, 'r') as handle:
        for text in handle:
            for keyword, label in failure_markers:
                if keyword in text:
                    print(error_file, label)
                    return True
    return False

def parse(fn):
    """
    Split an error file name into its bash script and command number.

    Error file names have the form '<bash_file>.err-<cmd_num>', e.g.
    'redo_timedout4.sh.err-108'.

    :param fn: error file name or path
    :return: tuple (bash_file, cmd_num), with cmd_num converted to int
    :raises ValueError: if fn contains no '.err-' marker or the text after
        the marker is not an integer
    """
    # Split on the LAST '.err-': the marker is a suffix, so an earlier '.err'
    # somewhere in the path (e.g. a 'my.errors/' directory) must not be used
    # as the cut point for the script name.
    bash_file, marker, cmd_num = fn.rpartition('.err-')
    if not marker:
        # Fail loudly instead of slicing with the -1 that str.find() returns.
        raise ValueError(
            "expected '<bash_file>.err-<cmd_num>', got: {}".format(fn)
        )
    return bash_file, int(cmd_num)

def get_cmd(bash_file, cmd_num):
    """
    Look up one command in a bash script by its index.

    Finds the first line of bash_file that starts with 'cmd[<cmd_num>]=' and
    returns the text between the first double quote on that line and the next
    one. Returns None when no line matches.

    :param bash_file: path to the bash script to search
    :param cmd_num: index of the command to look up
    :return: the command string, or None if it was not found
    """
    wanted_prefix = 'cmd[{}]='.format(cmd_num)
    with open(bash_file, 'r') as script:
        for entry in script:
            if not entry.startswith(wanted_prefix):
                continue
            pieces = entry.split('\"')
            return pieces[1]
    return None

# Collect the command behind every error file that shows a known failure
# keyword, so those commands can be submitted again.
cmds_to_rerun = []

error_files = glob.glob(os.path.join(out_dir, '*err-*'))
print("num of error files produced: {}".format(len(error_files)))

progress = tnrange(len(error_files))
for error_file in error_files:
    if was_killed(error_file):
        bash_file, cmd_num = parse(error_file)
        rerun_cmd = get_cmd(bash_file, cmd_num)
        if rerun_cmd is None:
            # get_cmd found no matching 'cmd[N]=' line; a None entry in the
            # list would not be a runnable command, so report it and move on.
            print("no command found for {}".format(error_file))
        else:
            cmds_to_rerun.append(rerun_cmd)
    progress.update(1)

num of error files produced: 1869


('/projects/ps-yeolab3/bay001/encode_temp2/redo_timedout4.sh.err-108', 'killed.')
('/projects/ps-yeolab3/bay001/encode_temp2/redo_timedout2.sh.err-225', 'failed')
('/projects/ps-yeolab3/bay001/encode_temp2/redo_timedout1.sh.err-240', 'killed.')
('/projects/ps-yeolab3/bay001/encode_temp2/redo_timedout4.sh.err-239', 'read')
('/projects/ps-yeolab3/bay001/encode_temp2/redo_timedout1.sh.err-261', 'float')
('/projects/ps-yeolab3/bay001/encode_temp2/redo_timedout4.sh.err-154', 'failed')
('/projects/ps-yeolab3/bay001/encode_temp2/redo_timedout2.sh.err-408', 'failed')
('/projects/ps-yeolab3/bay001/encode_temp2/redo_timedout1.sh.err-253', 'expecting')
('/projects/ps-yeolab3/bay001/encode_temp2/redo_timedout2.sh.err-95', 'failed')
('/projects/ps-yeolab3/bay001/encode_temp2/redo_timedout1.sh.err-417', 'failed')
('/projects/ps-yeolab3/bay001/encode_temp2/redo_timedout3.sh.err-368', 'killed.')
('/projects/ps-yeolab3/bay001/encode_temp2/redo_timedout1.sh.err-361', 'killed.')
('/projects/ps-yeolab3/ba

In [34]:
# Hand the collected commands to Submitter under a new job name; the bash
# script is placed in out_dir and named after the job. submit=True is passed,
# so this cell is not a dry run.
jobname = "rerun_incomplete2"
my_bash_file = os.path.join(out_dir, jobname + '.sh')

submitter_options = dict(
    array=False,
    nodes=1,
    ppn=8,
    walltime='6:00:00',
    sh=my_bash_file,
    submit=True,
    queue='home-yeo',
)
Submitter(cmds_to_rerun, jobname, **submitter_options)

Wrote commands to /projects/ps-yeolab3/bay001/encode_temp2/rerun_incomplete2.sh.
Submitted script to queue home-yeo.
 Job ID: 8986110


<qtools.submitter.Submitter at 0x2b34ed9b2b10>