# This notebook writes qsub commands to split the whole 10X into each individual barcode. Barcodes come from Alex's tables.
- uses a helper script (split_bams.py) which accepts a list of barcodes to look for (inside the 'CB' tag) and the 10X "MD" bam file.
- python2 notebook because we're using Submitter

In [1]:
import pandas as pd
import numpy as np
import os
import glob
import pysam
import time
from collections import defaultdict
from tqdm import tnrange, tqdm_notebook
import random
from qtools import Submitter

In [2]:
input_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA_APOBEC_RPS2/inputs/'
tmp_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA_APOBEC_RPS2/tmp/'
output_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA_APOBEC_RPS2/bam_files/'

# Read in all barcodes

In [3]:
rps2_barcodes = set(pd.read_csv(os.path.join(input_dir, 'RPS2.barcodes.tsv.gz'), index_col=0, names=['barcodes']).index)
apo_barcodes = set(pd.read_csv(os.path.join(input_dir, 'Apo_Control.barcodes.tsv.gz'), index_col=0, names=['barcodes']).index)

print("Num barcodes in APO: {} and RPS2: {}".format(len(apo_barcodes), len(rps2_barcodes)))

Num barcodes in APO: 8719 and RPS2: 11150


# Get indices pertaining to groups that we want to run SAILOR on
- let's get:
    - all barcodes!
    - I mean all 'filtered' barcodes from AlexC

# Split the combined BAM into individual barcodes that we are interested in
- store ALL barcode-of-interest reads into a dictionary reads_dict
- This block splits this job in chunks of 500 so we don't run out of memory

In [4]:
def chunker(seq, size):
    """
    Chunks a long list into groups of (size).
    """
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))

i = 0
groupsize = 500
all_apo_groups = []
all_rps2_groups = []

for group in chunker(list(apo_barcodes), groupsize):
    fout = os.path.join(tmp_dir, 'apo-group_{}.txt'.format(i)) # groups should be 
    with open(fout, 'w') as o:
        for g in group:
            o.write("{}\n".format(g))
    i += 1
    all_apo_groups.append(fout)

i = 0
for group in chunker(list(rps2_barcodes), groupsize):
    fout = os.path.join(tmp_dir, 'rps2-group_{}.txt'.format(i)) # groups should be 
    with open(fout, 'w') as o:
        for g in group:
            o.write("{}\n".format(g))
    i += 1
    all_rps2_groups.append(fout)

# Runs a 'split bams' script that runs through the possorted.bam file, and pulls out all the barcodes in ```--barcodes_file``` into each cell. Do this twice for our Apo_Control and RPS2 samples.

In [5]:
cmds = []


for group in all_apo_groups:
    cmd = 'module load python3essential;/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/split_bams.py '
    cmd += '--possorted_bam_file {} '.format(os.path.join(input_dir, 'Apo_Control_possorted_genome_bam_MD.bam'))
    cmd += '--barcodes_file {} '.format(group)
    cmd += '--output_dir {}'.format(output_dir)
    cmds.append(cmd)
    
for group in all_rps2_groups:
    cmd = 'module load python3essential;/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/split_bams.py '
    cmd += '--possorted_bam_file {} '.format(os.path.join(input_dir, 'RPS2_possorted_genome_bam_MD.bam'))
    cmd += '--barcodes_file {} '.format(group)
    cmd += '--output_dir {}'.format(output_dir)
    cmds.append(cmd)

print("Total commands to submit: {}".format(len(cmds)))

Total commands to submit: 41


In [6]:
Submitter(cmds, 'split-bam', nodes=1, ppn=8, array=True, submit=False, walltime='8:00:00', queue='condo') # first one run manually

Writing 41 tasks as an array-job.
Wrote commands to split-bam.sh.


<qtools.submitter.Submitter at 0x2b1a1397c910>

# This next block ensures that all barcodes are accounted for. If not, then we'll need to re-run.

In [7]:
def get_apo_bam_file_name_from_barcode(barcode):
    return "Apo_Control_possorted_genome_bam_MD-{}.bam".format(barcode)

def get_rps2_bam_file_name_from_barcode(barcode):
    return "RPS2_possorted_genome_bam_MD-{}.bam".format(barcode)

rps2_missing_barcodes = []
apobec_missing_barcodes = []

for barcode in apo_barcodes:
    bam_file = os.path.join(output_dir, get_apo_bam_file_name_from_barcode(barcode))
    try:
        assert os.path.exists(bam_file)
    except AssertionError:
        apobec_missing_barcodes.append(barcode)
        
for barcode in rps2_barcodes:
    bam_file = os.path.join(output_dir, get_rps2_bam_file_name_from_barcode(barcode))
    try:
        assert os.path.exists(bam_file)
    except AssertionError:
        rps2_missing_barcodes.append(barcode)
print(len(rps2_missing_barcodes), len(apobec_missing_barcodes))

(0, 0)


In [8]:
fout = os.path.join(tmp_dir, 'apo-missing.txt') # groups should be 
with open(fout, 'w') as o:
    for g in list(apobec_missing_barcodes):
        o.write("{}\n".format(g))

fout = os.path.join(tmp_dir, 'rps2-missing.txt') # groups should be 
with open(fout, 'w') as o:
    for g in list(rps2_missing_barcodes):
        o.write("{}\n".format(g))

In [9]:
cmds = []

cmd = 'module load python3essential;/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/split_bams.py '
cmd += '--possorted_bam_file {} '.format(os.path.join(input_dir, 'Apo_Control_possorted_genome_bam_MD.bam'))
cmd += '--barcodes_file {} '.format(os.path.join(tmp_dir, 'apo-missing.txt'))
cmd += '--output_dir {}'.format(output_dir)
cmds.append(cmd)
    
cmd = 'module load python3essential;/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/split_bams.py '
cmd += '--possorted_bam_file {} '.format(os.path.join(input_dir, 'RPS2_possorted_genome_bam.bam'))
cmd += '--barcodes_file {} '.format(os.path.join(tmp_dir, 'rps2-missing.txt'))
cmd += '--output_dir {}'.format(output_dir)
cmds.append(cmd)
    
cmds


['module load python3essential;/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/split_bams.py --possorted_bam_file /home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA_APOBEC_RPS2/inputs/Apo_Control_possorted_genome_bam_MD.bam --barcodes_file /home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA_APOBEC_RPS2/tmp/apo-missing.txt --output_dir /home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA_APOBEC_RPS2/bam_files/',
 'module load python3essential;/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/split_bams.py --possorted_bam_file /home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA_APOBEC_RPS2/inputs/RPS2_possorted_genome_bam.bam --barcodes_file /home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA_APOBEC_RPS2/tmp/rps2-missing.txt --output_dir /home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA_APOBEC_RPS2/bam_files/']

In [10]:
Submitter(cmds, 'split-bam', nodes=1, ppn=8, array=True, submit=False, walltime='8:00:00', queue='condo')

Writing 2 tasks as an array-job.
Wrote commands to split-bam.sh.


<qtools.submitter.Submitter at 0x2b1a14767e90>