# This notebook writes qsub commands to split the combined 10X BAM file into one BAM file per barcode. Barcodes come from Alex's tables.
- uses a helper script (split_bams.py) which accepts a list of barcodes to look for (inside the 'CB' tag) and the 10X "MD" bam file.
- python2 notebook because we're using Submitter

In [1]:
import pandas as pd
import numpy as np
import os
import glob
import pysam
import time
from collections import defaultdict
from tqdm import tnrange, tqdm_notebook
import random
from qtools import Submitter

In [2]:
# Directories used throughout this notebook.
# input_dir: read for barcodes.tsv.gz and possorted_genome_bam_MD.bam.
input_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/inputs/'
# tmp_dir: receives the barcode list files (group_<i>.txt and missing.txt).
tmp_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/tmp/'
# output_dir: passed to split_bams.py as --output_dir, then checked for the per-barcode BAM files.
output_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/bam_files/'

# Read in all barcodes

In [3]:
# Load the barcode table with its single column as the index, then keep
# the unique index values as a set.
barcodes_path = os.path.join(input_dir, 'barcodes.tsv.gz')
barcodes_table = pd.read_csv(barcodes_path, index_col=0, names=['barcodes'])
all_barcodes = set(barcodes_table.index)

print("Num barcodes in RBFOX2/TIA: {}".format(len(all_barcodes)))

Num barcodes in RBFOX2/TIA: 20617


# Get indices pertaining to groups that we want to run SAILOR on
- let's get:
    - all barcodes!
    - I mean all 'filtered' barcodes from AlexC

# Split the combined BAM into individual barcodes that we are interested in
- store ALL barcode-of-interest reads into a dictionary reads_dict
- do this in chunks of 250 barcodes (`groupsize`) so we don't run out of memory

In [4]:
def chunker(seq, size):
    """
    Split a sliceable sequence into consecutive pieces of at most (size) items.

    Returns a generator that yields seq[0:size], seq[size:2*size], ... in
    order; the final piece holds whatever remains and may be shorter.
    """
    # Start offset of every piece.
    starts = range(0, len(seq), size)
    return (seq[start:start + size] for start in starts)

groupsize = 250
all_groups = []

# Write every chunk of barcodes to its own text file (one barcode per line)
# and remember the file path. The file index is the number of groups already
# written, so files are named group_0.txt, group_1.txt, ...
for barcode_group in chunker(list(all_barcodes), groupsize):
    group_path = os.path.join(tmp_dir, 'group_{}.txt'.format(len(all_groups)))
    with open(group_path, 'w') as handle:
        handle.writelines("{}\n".format(barcode) for barcode in barcode_group)
    all_groups.append(group_path)

In [5]:
# Pieces that are identical for every command: the module load + helper
# script prefix, and the path to the combined BAM file.
split_script = 'module load python3essential;/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/split_bams.py '
combined_bam = os.path.join(input_dir, 'possorted_genome_bam_MD.bam')

# One split_bams.py command per barcode group file.
cmds = [
    split_script
    + '--possorted_bam_file {} '.format(combined_bam)
    + '--barcodes_file {} '.format(group)
    + '--output_dir {}'.format(output_dir)
    for group in all_groups
]

print("Total commands to submit: {}".format(len(cmds)))

Total commands to submit: 83


In [6]:
# Hand the per-group commands to qtools' Submitter as an array job named 'split-bam'.
# submit=False here: the recorded output shows the script (split-bam.sh) being
# written with no queue submission.
# NOTE(review): presumably the script is then submitted by hand - confirm.
Submitter(cmds, 'split-bam', nodes=1, ppn=8, array=True, submit=False, walltime='8:00:00', queue='condo')

Writing 83 tasks as an array-job.
Wrote commands to split-bam.sh.


<qtools.submitter.Submitter at 0x2ab405756950>

# The next block checks that every barcode has a split BAM file in the output directory. Any barcodes that are missing are written to a file and re-submitted below.

In [11]:
def get_bam_file_name_from_barcode(barcode):
    """
    Build the per-barcode BAM file name (no directory) for a barcode.

    The barcode is inserted between the 'possorted_genome_bam_MD-' prefix
    and the '.bam' extension.
    """
    name_template = "possorted_genome_bam_MD-{}.bam"
    return name_template.format(barcode)

# Collect every barcode whose expected per-barcode BAM file is absent from
# output_dir. An explicit existence test is used instead of
# try/assert/except: assert statements are removed when Python runs with -O,
# which would make this check silently report zero missing barcodes.
missing_barcodes = []

for barcode in all_barcodes:
    bam_file = os.path.join(output_dir, get_bam_file_name_from_barcode(barcode))
    if not os.path.exists(bam_file):
        missing_barcodes.append(barcode)

# 0 means every barcode has a BAM file; anything else needs to be re-run.
print(len(missing_barcodes))

0


In [8]:
# Save the barcodes that still lack a BAM file, one per line, so they can be
# passed to split_bams.py as a barcodes file.
fout = os.path.join(tmp_dir, 'missing.txt')
with open(fout, 'w') as handle:
    handle.writelines("{}\n".format(barcode) for barcode in missing_barcodes)

In [9]:
# A single split_bams.py command that re-processes only the barcodes listed
# in missing.txt.
rerun_parts = [
    'module load python3essential;/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/split_bams.py ',
    '--possorted_bam_file {} '.format(os.path.join(input_dir, 'possorted_genome_bam_MD.bam')),
    '--barcodes_file {} '.format(os.path.join(tmp_dir, 'missing.txt')),
    '--output_dir {}'.format(output_dir),
]
cmds = ["".join(rerun_parts)]

cmds


['module load python3essential;/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/split_bams.py --possorted_bam_file /home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/inputs/possorted_genome_bam_MD.bam --barcodes_file /home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/tmp/missing.txt --output_dir /home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/bam_files/']

In [10]:
# Submit the re-run command for the missing barcodes. submit=True here: the
# recorded output shows the job being submitted to the home-yeo queue.
# The job name 'split-bam' is reused, and the recorded output shows
# split-bam.sh being written again.
Submitter(cmds, 'split-bam', nodes=1, ppn=8, array=True, submit=True, walltime='8:00:00', queue='home-yeo')

Writing 1 tasks as an array-job.
Wrote commands to split-bam.sh.
Submitted script to queue home-yeo.
 Job ID: 21206965


<qtools.submitter.Submitter at 0x2ab3dea2fb90>