# This notebook shows how to add randomers and UMIs back to paired-end eCLIP reads. You will need to do this prior to submitting to GEO
- Yes we need to do this

In [13]:
import pandas as pd
import yaml
import pandas as pd
import os
import glob
from IPython.core.display import HTML
import numpy as np
import shutil
from collections import Counter, defaultdict, OrderedDict
import yaml
from qtools import Submitter

In [2]:
# input_dir holds the eCLIP-0.2.2 run (the manifest and its results folder);
# output_dir is where the re-barcoded / randomer-added fastq files are written.
# NOTE(review): nothing visible in this notebook creates output_dir — confirm it
# exists before the generated scripts are run.
input_dir = '/home/bay001/projects/nazia_clipseq_20170627/permanent_data/eCLIP-0.2.2/'
output_dir = '/home/bay001/projects/nazia_clipseq_20170627/permanent_data/eCLIP-0.2.2/add_randomer_for_GEO/'

# Manifest of the experiment; it is read from input_dir further down.
yaml_file = 'DDX5.yaml'

# below assumes the yaml document is run in the same directory as where the results folder is
# i.e. the results are expected at <input_dir>/<manifest name without extension>/results
results_dir = os.path.join(input_dir,  os.path.splitext(yaml_file)[0], 'results')
assert os.path.exists(results_dir)  # fail early if that layout assumption does not hold
print("results directory: {}".format(results_dir))

results directory: /home/bay001/projects/nazia_clipseq_20170627/permanent_data/eCLIP-0.2.2/DDX5/results


In [3]:
# this script was copied from Gabe on 12/12/2018
# Path of the script that every generated command invokes as
# `python <script> -i <in.fq.gz> -o <out.fq.gz> [-b <barcode>]`.
add_randomer_program = '/home/bay001/projects/codebase/bfx/pyscripts/clipseq/add_randomer.py'

In [4]:
def read_expt_metadata(fn, header_lines=14):
    """Read the YAML portion of an experiment manifest.

    The first ``header_lines`` lines of the file are discarded and the
    remainder is handed to the YAML parser.

    Parameters
    ----------
    fn : str
        Path of the manifest file.
    header_lines : int, optional
        Number of leading lines to skip before the YAML starts. Defaults to
        14, the value this notebook has always used.

    Returns
    -------
    object or None
        The parsed YAML document, or None when the remainder of the file is
        not valid YAML (the parser error is printed in that case).
    """
    with open(fn) as f:
        # bleh, basically skip any line that isn't YAML
        for _ in range(header_lines):
            f.readline()
        try:
            # safe_load only builds plain Python objects, and unlike a bare
            # yaml.load(f) it does not need an explicit Loader argument.
            return yaml.safe_load(f)
        except yaml.YAMLError as e:
            print(e)
            return None

# Parse the manifest that sits in input_dir.
metadata = read_expt_metadata(os.path.join(input_dir, yaml_file))
# read_expt_metadata returns None when the YAML cannot be parsed; stop here
# with a clear message instead of failing later on metadata['dataset'].
assert metadata is not None, "could not parse experiment metadata from {}".format(
    os.path.join(input_dir, yaml_file)
)

In [5]:
def get_demuxed_file_from_filename(results_dir, dataset, name, barcode, readnum):
    """
    Build the path of a demultiplexed fastq file inside ``results_dir``.

    eCLIP 2.2 names its demultiplexed files consistently, so the file name is
    assembled as <dataset>.<name>.<barcode>.r<readnum>.fq.gz and joined onto
    ``results_dir``. The path is only constructed here; nothing checks that
    the file exists.
    """
    demuxed_basename = '{}.{}.{}.r{}.fq.gz'.format(dataset, name, barcode, readnum)
    demuxed_path = os.path.join(results_dir, demuxed_basename)
    return demuxed_path


# We need to:
1. ip read 1 demuxed barcode 1: add barcode 1 (add_randomer.py -i -o -b)
2. ip read 1 demuxed barcode 2: add barcode 2
3. ip read 2 demuxed barcode 1: add randomer (add_randomer.py -i -o)
4. ip read 2 demuxed barcode 2: add randomer (add_randomer.py -i -o)
5. input read 1 (NIL demuxed): do nothing
6. input read 2 (NIL demuxed): add randomer (add_randomer.py -i -o)

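7. merge the two re-barcoded ip read 1 files (one per barcode) into a single file, and likewise the two ip read 2 files that had randomers added
8. do not merge the input reads (both input barcodes are expected to point at the same NIL file, so there is nothing to merge)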

In [6]:
def _existing_demuxed_fastq(sample, barcode, readnum):
    """Return the demultiplexed fastq of `sample` for `barcode`/`readnum`; assert that it exists."""
    fastq = get_demuxed_file_from_filename(
        results_dir, dataset_name, sample['name'], barcode, readnum
    )
    assert os.path.exists(fastq), "missing demultiplexed fastq: {}".format(fastq)
    return fastq


def _new_output_fastq(in_fastq, suffix):
    """Return <output_dir>/<basename of in_fastq><suffix>; assert that it does not exist yet."""
    out_fastq = os.path.join(output_dir, os.path.basename(in_fastq)) + suffix
    assert not os.path.exists(out_fastq), "output already exists: {}".format(out_fastq)
    return out_fastq


def _add_randomer_cmd(in_fastq, out_fastq, barcode=None):
    """Return the add_randomer.py command line; `-b <barcode>` is appended only when a barcode is given."""
    cmd = 'python {} -i {} -o {}'.format(add_randomer_program, in_fastq, out_fastq)
    if barcode is not None:
        cmd += ' -b {}'.format(barcode)
    return cmd


# Build one add_randomer.py command per file that has to be rewritten.
# Per sample pair the commands are appended in this order:
#   ip read1 barcode1, ip read1 barcode2, ip read2 barcode1, ip read2 barcode2, input read2
dataset_name = metadata['dataset']
cmds = []
for paired_sample in metadata['samples']:
    # each entry of metadata['samples'] is indexed as [ip, input]
    ip_sample, input_sample = paired_sample[0], paired_sample[1]
    ip_barcodes = (ip_sample['barcodeids'][0], ip_sample['barcodeids'][1])

    ### read1 of ip (steps 1 and 2): add the barcode back ###
    for ip_barcode in ip_barcodes:
        ip_read1_in = _existing_demuxed_fastq(ip_sample, ip_barcode, 1)
        ip_read1_out = _new_output_fastq(ip_read1_in, '.addbarcode.fq.gz')
        cmds.append(_add_randomer_cmd(
            ip_read1_in,
            ip_read1_out,
            # presumably 'D08fixed' is the name add_randomer.py knows this
            # barcode by — TODO confirm against the script
            barcode=ip_barcode.replace('D8f', 'D08fixed'),
        ))

    ### read2 of ip (steps 3 and 4): add the randomer back ###
    for ip_barcode in ip_barcodes:
        ip_read2_in = _existing_demuxed_fastq(ip_sample, ip_barcode, 2)
        ip_read2_out = _new_output_fastq(ip_read2_in, '.addrandomer.fq.gz')
        cmds.append(_add_randomer_cmd(ip_read2_in, ip_read2_out))

    ### input ###
    input_barcode1 = input_sample['barcodeids'][0]  # should be NIL for input
    input_barcode2 = input_sample['barcodeids'][1]  # should be NIL for input

    # step 5: read1 of input needs no command, but it must exist and both
    # barcode ids must resolve to the same file (should be NIL for both?)
    input_read1_in = _existing_demuxed_fastq(input_sample, input_barcode1, 1)
    assert input_read1_in == _existing_demuxed_fastq(input_sample, input_barcode2, 1)

    # step 6: read2 of input gets its randomer back. Existence is checked on
    # the input files themselves, not on the ip files.
    input_read2_in = _existing_demuxed_fastq(input_sample, input_barcode1, 2)
    assert input_read2_in == _existing_demuxed_fastq(input_sample, input_barcode2, 2)
    input_read2_out = _new_output_fastq(input_read2_in, '.addrandomer.fq.gz')
    cmds.append(_add_randomer_cmd(input_read2_in, input_read2_out))

In [7]:
# Script file the commands are written into. The path is relative, so it is
# resolved against the notebook's working directory rather than output_dir.
bash_script = 'add_randomer.sh'

# Hand all add_randomer commands to qtools as a single array job.
# NOTE(review): submit=False presumably only writes the script without
# queueing it — confirm against the qtools Submitter documentation.
Submitter(
    commands=cmds, 
    array=True, 
    job_name='add_randomer', 
    sh=bash_script,
    walltime="12:00:00",
    nodes=1,
    ppn=1,
    submit=False
)

Writing 30 tasks as an array-job.
Wrote commands to add_randomer.sh.
Submitted script to queue home.
 Job ID: 14061653


<qtools.submitter.Submitter at 0x2abbf559db10>

# Run the randomer script above
- `submit=False` for now, so that the generated script can be double-checked before it is submitted to the queue

# Now merge the fastq files
- (step 7)

In [22]:
# Step 7: write a script that concatenates, per ip sample, the two per-barcode
# files of each read into one fastq. This cell globs for the add_randomer
# outputs, so it only works once those jobs have finished.

# suffixes of the per-barcode ip files produced by the add_randomer commands
r1_suffix = '.r1.fq.gz.addbarcode.fq.gz'
r2_suffix = '.r2.fq.gz.addrandomer.fq.gz'

# Build (and validate) every command before touching the script file, so a
# failed check never leaves a half-written merge_fastq.sh behind.
merge_cmds = []
for paired_sample in metadata['samples']:
    ip_name = paired_sample[0]['name']
    # glob returns files in arbitrary order; sorting gives read1 and read2 the
    # same barcode order so the mates stay in sync after concatenation
    sorted_r1_fq = sorted(glob.glob(
        os.path.join(output_dir, "{}.{}.*{}".format(dataset_name, ip_name, r1_suffix))
    ))
    sorted_r2_fq = sorted(glob.glob(
        os.path.join(output_dir, "{}.{}.*{}".format(dataset_name, ip_name, r2_suffix))
    ))
    assert len(sorted_r1_fq) == 2, "expected 2 read1 files for {}, found {}: {}".format(
        ip_name, len(sorted_r1_fq), sorted_r1_fq
    )
    assert len(sorted_r2_fq) == 2, "expected 2 read2 files for {}, found {}: {}".format(
        ip_name, len(sorted_r2_fq), sorted_r2_fq
    )
    # the part before the suffix (which includes the barcode) must agree
    # pairwise between read1 and read2
    r1_stems = [fq[:-len(r1_suffix)] for fq in sorted_r1_fq]
    r2_stems = [fq[:-len(r2_suffix)] for fq in sorted_r2_fq]
    assert r1_stems == r2_stems, "read1/read2 files of {} do not pair up: {} vs {}".format(
        ip_name, sorted_r1_fq, sorted_r2_fq
    )

    output_r1_file = os.path.join(output_dir, '{}.{}.r1.reformatted.fq.gz'.format(dataset_name, ip_name))
    output_r2_file = os.path.join(output_dir, '{}.{}.r2.reformatted.fq.gz'.format(dataset_name, ip_name))
    merge_cmds.append('cat {} {} > {}'.format(sorted_r1_fq[0], sorted_r1_fq[1], output_r1_file))
    merge_cmds.append('cat {} {} > {}'.format(sorted_r2_fq[0], sorted_r2_fq[1], output_r2_file))

with open(os.path.join(output_dir, 'merge_fastq.sh'), 'w') as f:
    f.write('#!/bin/bash\n')
    for merge_cmd in merge_cmds:
        f.write(merge_cmd + '\n')

# Run the merge script