# This notebook parses the possorted_genome_bam_MD.bam to pull out only the barcodes specified in Alex's lists.
- functionally equivalent to notebook 11_group_bams.ipynb, but in this case we're just re-parsing the main bam file instead of joining a bunch of smaller bam files.

In [1]:
import pandas as pd
import numpy as np
import os
import glob
import pysam
import time
from collections import defaultdict
from tqdm import tnrange, tqdm_notebook
from qtools import Submitter

pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.width', 10000)

In [2]:
# Directory holding everything this notebook reads: the BAM file and the per-ORF edit tables.
input_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/inputs'
# Directory that receives the barcode lists written below; it is also passed to split_bams.py as --output_dir.
output_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/'

In [3]:
# BAM file that gets re-parsed and grouped by barcode (passed to split_bams.py as --possorted_bam_file).
# Alternative input kept for reference (presumably the BAM used for the 33/34 groups - TODO confirm):
# bam = os.path.join(input_dir, 'RBFOX2-TIA1-STAMP_possorted_genome_bam_MD.bam')
bam = os.path.join(input_dir, 'RBFOX2-TIA1-STAMP_round2E_read1_feature_FB_possorted_genome_bam_MD.bam')

# Read in expression values from Alex to help us decide which barcodes to look for/split.

In [4]:
# Per-cell tables, one per ORF. Each row carries an 'index' column of the form
# 'possorted_genome_bam_MD-<barcode>-1' plus RBFOX2_ORF / TIA1_ORF values used for filtering below.
tia1_barcodes = pd.read_csv(os.path.join(input_dir, 'RBFOX2-TIA1-STAMP_edits_for_TIA1_ORFs.csv'))
rbfox2_barcodes = pd.read_csv(os.path.join(input_dir, 'RBFOX2-TIA1-STAMP_edits_for_RBFOX2_ORFs.csv'))

# Preview the RBFOX2 table to check the columns that were read.
rbfox2_barcodes.head()

Unnamed: 0,index,n_edits,RBFOX2_ORF,TIA1_ORF,feature,louvain_r0.5
0,possorted_genome_bam_MD-AAACGAAAGTCACACT-1,0.648778,0.35806,0.0,RBFOX2_ORF,RBFOX2-cSTAMP
1,possorted_genome_bam_MD-AAACGAACAGATCATC-1,0.117951,0.472917,0.0,RBFOX2_ORF,RBFOX2-cSTAMP
2,possorted_genome_bam_MD-AAACGAACATGACTAC-1,0.459542,0.50886,0.0,RBFOX2_ORF,RBFOX2-cSTAMP
3,possorted_genome_bam_MD-AAAGAACAGACTCCGC-1,0.210795,0.445535,0.0,RBFOX2_ORF,RBFOX2-cSTAMP
4,possorted_genome_bam_MD-AAAGGATAGATGACCG-1,1.028562,0.399376,0.0,RBFOX2_ORF,RBFOX2-cSTAMP


# Filter to keep only cells that express exactly one ORF (TIA1 only, or RBFOX2 only)

In [5]:
# Keep only cells that express exactly one of the two ORFs:
#   TIA1 table   -> TIA1_ORF > 0 and RBFOX2_ORF == 0
#   RBFOX2 table -> RBFOX2_ORF > 0 and TIA1_ORF == 0
# .copy() turns each filtered result into an independent DataFrame, so the 'barcode'
# column assigned to it later does not trigger pandas' SettingWithCopyWarning and
# does not rely on whether the boolean-mask selection returned a view or a copy.
tia1_barcodes = tia1_barcodes[(tia1_barcodes['TIA1_ORF'] > 0) & (tia1_barcodes['RBFOX2_ORF'] == 0)].copy()
rbfox2_barcodes = rbfox2_barcodes[(rbfox2_barcodes['TIA1_ORF'] == 0) & (rbfox2_barcodes['RBFOX2_ORF'] > 0)].copy()

print(tia1_barcodes.shape[0], rbfox2_barcodes.shape[0])

(518, 844)


In [6]:
def index_to_barcode(row, to_replace):
    """
    Turn a row's 'index' value into a bare cell barcode.

    Note that 'index' is an ordinary column that happens to be called
    'index'; it is not the DataFrame's row index.

    :param row: mapping-like row (e.g. a pandas Series) with an 'index' entry
    :param to_replace: iterable of substrings to delete from that entry
    :return: the entry with every listed substring removed
    :raises AssertionError: if the cleaned value does not end with '-1'
    """
    cleaned = row['index']
    for unwanted in to_replace:
        cleaned = cleaned.replace(unwanted, '')

    # Sanity check that what is left still carries the '-1' suffix.
    assert cleaned.endswith('-1')

    return cleaned

# Add a 'barcode' column to both tables by stripping the 'possorted_genome_bam_MD-'
# prefix from each row's 'index' value (row-wise apply, hence axis=1).
tia1_barcodes['barcode'] = tia1_barcodes.apply(
    index_to_barcode,
    axis=1,
    args=(['possorted_genome_bam_MD-'],),
)
rbfox2_barcodes['barcode'] = rbfox2_barcodes.apply(
    index_to_barcode,
    axis=1,
    args=(['possorted_genome_bam_MD-'],),
)
# Preview the result; the new column appears last.
rbfox2_barcodes.head()

Unnamed: 0,index,n_edits,RBFOX2_ORF,TIA1_ORF,feature,louvain_r0.5,barcode
0,possorted_genome_bam_MD-AAACGAAAGTCACACT-1,0.648778,0.35806,0.0,RBFOX2_ORF,RBFOX2-cSTAMP,AAACGAAAGTCACACT-1
1,possorted_genome_bam_MD-AAACGAACAGATCATC-1,0.117951,0.472917,0.0,RBFOX2_ORF,RBFOX2-cSTAMP,AAACGAACAGATCATC-1
2,possorted_genome_bam_MD-AAACGAACATGACTAC-1,0.459542,0.50886,0.0,RBFOX2_ORF,RBFOX2-cSTAMP,AAACGAACATGACTAC-1
3,possorted_genome_bam_MD-AAAGAACAGACTCCGC-1,0.210795,0.445535,0.0,RBFOX2_ORF,RBFOX2-cSTAMP,AAAGAACAGACTCCGC-1
4,possorted_genome_bam_MD-AAAGGATAGATGACCG-1,1.028562,0.399376,0.0,RBFOX2_ORF,RBFOX2-cSTAMP,AAAGGATAGATGACCG-1


### Note to self: barcode lists 33 and 34 are identical to 35 and 36. The only difference is the BAM file that we're grouping on (Round2E vs F).

In [7]:
# Write each table's barcodes to a TSV in output_dir: one barcode per line, no header, no index column.
# The earlier run wrote the same lists under the 33/34 names:
# tia1_barcodes[['barcode']].to_csv(os.path.join(output_dir, '33_barcodes.tsv'), sep='\t', index=False, header=False)
# rbfox2_barcodes[['barcode']].to_csv(os.path.join(output_dir, '34_barcodes.tsv'), sep='\t', index=False, header=False)
for tsv_name, orf_cells in (('35_barcodes.tsv', tia1_barcodes), ('36_barcodes.tsv', rbfox2_barcodes)):
    orf_cells[['barcode']].to_csv(os.path.join(output_dir, tsv_name), sep='\t', index=False, header=False)

In [8]:
def generate_commandline_for_splitting(bam_file, barcodes_file, output_dir):
    """
    Build the shell command that runs split_bams.py in --group mode.

    Every path is shell-quoted, so a value containing spaces or shell
    metacharacters still reaches the script as a single argument. Values made
    only of ordinary path characters are emitted exactly as given.

    :param bam_file: value for --possorted_bam_file
    :param barcodes_file: value for --barcodes_file
    :param output_dir: value for --output_dir
    :return: the complete command line as one string
    """
    # shlex.quote on Python 3; pipes.quote is the equivalent on Python 2.
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote

    cmd = 'module load python3essential;/home/bay001/projects/kris_apobec_20200121/scripts/split_bams.py '
    # '{}'.format(...) first, so non-string arguments are stringified as before, then quoted.
    cmd += '--possorted_bam_file {} '.format(quote('{}'.format(bam_file)))
    cmd += '--barcodes_file {} '.format(quote('{}'.format(barcodes_file)))
    cmd += '--output_dir {} '.format(quote('{}'.format(output_dir)))
    cmd += '--group'
    return cmd

In [9]:
# One split_bams.py command per barcode list; every command reads the same BAM
# and writes into the same output_dir.
# for barcodes in [os.path.join(output_dir, '33_barcodes.tsv'), os.path.join(output_dir, '34_barcodes.tsv')]:
barcode_tsvs = (
    os.path.join(output_dir, '35_barcodes.tsv'),
    os.path.join(output_dir, '36_barcodes.tsv'),
)
cmds = []
for barcodes in barcode_tsvs:
    cmds.append(generate_commandline_for_splitting(bam, barcodes, output_dir))
# Display the commands for a visual check before submitting.
cmds

['module load python3essential;/home/bay001/projects/kris_apobec_20200121/scripts/split_bams.py --possorted_bam_file /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/inputs/RBFOX2-TIA1-STAMP_round2E_read1_feature_FB_possorted_genome_bam_MD.bam --barcodes_file /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/35_barcodes.tsv --output_dir /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/ --group',
 'module load python3essential;/home/bay001/projects/kris_apobec_20200121/scripts/split_bams.py --possorted_bam_file /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/inputs/RBFOX2-TIA1-STAMP_round2E_read1_feature_FB_possorted_genome_bam_MD.bam --barcodes_file /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/36_barcodes.tsv --output_dir /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_gr

In [10]:
# Hand the commands to qtools' Submitter as an array job named 'subset_barcodes2'.
# With submit=True the job is queued when this cell runs (the recorded output shows
# a script being written and a job ID returned), so re-running the cell presumably
# submits the job again.
# NOTE(review): nodes / ppn / walltime look like per-task resource requests - confirm against qtools.
Submitter(cmds, 'subset_barcodes2', array=True, nodes=1, ppn=1, walltime='8:00:00', submit=True)

Writing 2 tasks as an array-job.
Wrote commands to subset_barcodes2.sh.
Submitted script to queue home.
 Job ID: 21950864


<qtools.submitter.Submitter at 0x2b1b867d82d0>