# This notebook parses the possorted_genome_bam_MD.bam to pull out only the barcodes specified in Alex's lists.
- functionally equivalent to notebook 11_group_bams.ipynb, but in this case we're just re-parsing the main bam file instead of joining a bunch of smaller bam files.

In [1]:
import pandas as pd
import numpy as np
import os
import glob
import pysam
import time
from collections import defaultdict
from tqdm import tnrange, tqdm_notebook
from qtools import Submitter

# Widen pandas' display limits so the long 'index' labels and wide tables
# shown below are not truncated or wrapped.
pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.width', 10000)

In [2]:
# input_dir: where the expression CSVs and the possorted BAM files are read from.
# output_dir: where the barcode lists are written; it is also passed to split_bams.py as --output_dir.
# NOTE(review): these are absolute, user-specific paths — edit them before running elsewhere.
input_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/inputs'
output_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/'

# Read in expression values from Alex to help us decide which barcodes to look for/split.

original sources: 
```
RPS2_STAMP_Apo_filtered_lenti_common_expression.csv
APOBEC_STAMP_Apo_filtered_lenti_common_expression.csv
```

In [3]:
# Load the per-cell lenti expression table for each sample. The 'index' column
# holds a cell label that embeds the barcode (see the preview below).
apo_barcodes = pd.read_csv(os.path.join(input_dir, 'APOBEC_STAMP_Apo_filtered_lenti_common_expression.csv'))
rps2_barcodes = pd.read_csv(os.path.join(input_dir, 'RPS2_STAMP_Apo_filtered_lenti_common_expression.csv'))

# Preview the first rows to confirm the columns look as expected.
apo_barcodes.head()

Unnamed: 0,index,batch,lenti_common_normalized,lenti_common,new_clusters
0,Apo_Control_possorted_genome_bam_MD-AAACCCAAGCCAGTAG-1-Apo-Edits,Apo-Edits,-0.809566,0.0,APOBEC_STAMP
1,Apo_Control_possorted_genome_bam_MD-AAACCCAAGGATGCGT-1-Apo-Edits,Apo-Edits,-0.782604,0.0,APOBEC_STAMP
2,Apo_Control_possorted_genome_bam_MD-AAACCCACAATACAGA-1-Apo-Edits,Apo-Edits,-0.434219,1.0,APOBEC_STAMP
3,Apo_Control_possorted_genome_bam_MD-AAACCCACACGCAGTC-1-Apo-Edits,Apo-Edits,-0.545899,1.0,APOBEC_STAMP
4,Apo_Control_possorted_genome_bam_MD-AAACCCACAGAACATA-1-Apo-Edits,Apo-Edits,1.697777,28.0,APOBEC_STAMP


In [4]:
# (number of RPS2 cells with lenti_common_normalized >= 0, total number of RPS2 cells)
len(rps2_barcodes.loc[rps2_barcodes['lenti_common_normalized'] >= 0]), len(rps2_barcodes)

(2541, 3621)

In [5]:
def index_to_barcode(row, to_replace):
    """
    Extract the bare cell barcode from a row's 'index' label.

    'index' is not a true DataFrame index here, just a column named 'index'.
    Every substring listed in ``to_replace`` is removed from the label
    (all occurrences, not only at the ends), and what remains is returned.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row) with an 'index' entry holding a string.
    to_replace : iterable of str
        Substrings to delete from the label, e.g. the sample prefix and suffix.

    Returns
    -------
    str
        The label with all ``to_replace`` substrings removed.

    Raises
    ------
    AssertionError
        If the cleaned label does not end with '-1'. The message reports both
        the cleaned value and the original label so the bad row can be found.
    """
    barcode = row['index']
    for replace in to_replace:
        barcode = barcode.replace(replace, '')

    ### Just to make sure we're grabbing and correctly formatting barcodes.
    assert barcode.endswith('-1'), (
        "Unexpected barcode {!r} derived from index {!r}; expected it to end with '-1'".format(
            barcode, row['index']
        )
    )

    return barcode

# Add a 'barcode' column to each table by removing the sample-specific prefix
# and suffix from the 'index' label, leaving e.g. 'AAACCCACAGGTACGA-1'.
rps2_barcodes['barcode'] = rps2_barcodes.apply(index_to_barcode, args=(['RPS2_possorted_genome_bam_MD-', '-RPS2-Edits'],), axis=1)
apo_barcodes['barcode'] = apo_barcodes.apply(index_to_barcode, args=(['Apo_Control_possorted_genome_bam_MD-', '-Apo-Edits'],), axis=1)
rps2_barcodes.head()

Unnamed: 0,index,batch,lenti_common_normalized,lenti_common,new_clusters,barcode
0,RPS2_possorted_genome_bam_MD-AAACCCACAGGTACGA-1-RPS2-Edits,RPS2-Edits,-0.113331,3.0,RPS2_STAMP,AAACCCACAGGTACGA-1
1,RPS2_possorted_genome_bam_MD-AAACCCATCCTAGCCT-1-RPS2-Edits,RPS2-Edits,2.227205,39.0,RPS2_STAMP,AAACCCATCCTAGCCT-1
2,RPS2_possorted_genome_bam_MD-AAACGAAAGGGATGTC-1-RPS2-Edits,RPS2-Edits,0.539503,7.0,RPS2_STAMP,AAACGAAAGGGATGTC-1
3,RPS2_possorted_genome_bam_MD-AAACGAAAGTGGTCAG-1-RPS2-Edits,RPS2-Edits,0.685321,11.0,RPS2_STAMP,AAACGAAAGTGGTCAG-1
4,RPS2_possorted_genome_bam_MD-AAACGAACACACCTTC-1-RPS2-Edits,RPS2-Edits,-0.383744,0.0,RPS2_STAMP,AAACGAACACACCTTC-1


In [6]:
# Create the destination up front so a fresh run does not fail on the first write.
os.makedirs(output_dir, exist_ok=True)

# Write one barcode per line (no header, no index column); these files are the
# --barcodes_file inputs used when building the split_bams.py commands.
rps2_barcodes[['barcode']].to_csv(os.path.join(output_dir, 'RPS2_STAMP_Apo_filtered_lenti_common_expression_barcodes.txt'), sep='\t', index=False, header=False)
apo_barcodes[['barcode']].to_csv(os.path.join(output_dir, 'APOBEC_STAMP_Apo_filtered_lenti_common_expression_barcodes.txt'), sep='\t', index=False, header=False)

In [7]:
def generate_commandline_for_splitting(bam_file, barcodes_file, output_dir):
    """
    Build the shell command line that runs split_bams.py for one BAM file.

    Parameters
    ----------
    bam_file : str or path-like
        Passed to the script as --possorted_bam_file.
    barcodes_file : str or path-like
        Passed to the script as --barcodes_file.
    output_dir : str or path-like
        Passed to the script as --output_dir.

    Returns
    -------
    str
        'module load python3essential;' followed by the script invocation with
        the three path options and the --group flag.

    Notes
    -----
    Each path is shell-quoted, so paths containing spaces or shell
    metacharacters are passed through as a single literal argument. Paths made
    only of ordinary characters are emitted exactly as given. As a consequence,
    shell expansions such as '~' or '$VAR' inside the arguments are NOT
    expanded; pass already-resolved paths.
    """
    import shlex  # local import: keeps this cell self-contained

    script = '/home/bay001/projects/kris_apobec_20200121/scripts/split_bams.py'
    path_options = (
        ('--possorted_bam_file', bam_file),
        ('--barcodes_file', barcodes_file),
        ('--output_dir', output_dir),
    )

    cmd = 'module load python3essential;{} '.format(script)
    for flag, value in path_options:
        # str() keeps path-like objects working, as they did with str.format.
        cmd += '{} {} '.format(flag, shlex.quote(str(value)))
    cmd += '--group'
    return cmd

In [8]:
# Pair each sample's possorted BAM with the barcode list written for it.
split_jobs = [
    (
        os.path.join(input_dir, 'RPS2-STAMP_possorted_genome_bam_MD.bam'),
        os.path.join(output_dir, 'RPS2_STAMP_Apo_filtered_lenti_common_expression_barcodes.txt'),
    ),
    (
        os.path.join(input_dir, 'APOBEC-STAMP_possorted_genome_bam_MD.bam'),
        os.path.join(output_dir, 'APOBEC_STAMP_Apo_filtered_lenti_common_expression_barcodes.txt'),
    ),
]

# One split_bams.py command per (BAM, barcodes) pair, all writing to output_dir.
cmds = []
for bam, barcodes in split_jobs:
    cmds.append(generate_commandline_for_splitting(bam, barcodes, output_dir))

cmds

['module load python3essential;/home/bay001/projects/kris_apobec_20200121/scripts/split_bams.py --possorted_bam_file /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/inputs/RPS2-STAMP_possorted_genome_bam_MD.bam --barcodes_file /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/RPS2_STAMP_Apo_filtered_lenti_common_expression_barcodes.txt --output_dir /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/ --group',
 'module load python3essential;/home/bay001/projects/kris_apobec_20200121/scripts/split_bams.py --possorted_bam_file /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/inputs/APOBEC-STAMP_possorted_genome_bam_MD.bam --barcodes_file /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/APOBEC_STAMP_Apo_filtered_lenti_common_expression_barcodes.txt --output_dir /home/bay001/projects/kris_apobec_20200121/permanent_dat

In [10]:
# Submit the split commands to the cluster as an array job named 'subset_barcodes'
# with an 8-hour walltime; submit=True sends it to the queue immediately.
# NOTE(review): nodes=1, ppn=1 presumably request one node and one processor per task — confirm against qtools.
Submitter(cmds, 'subset_barcodes', array=True, nodes=1, ppn=1, walltime='8:00:00', submit=True)

Writing 2 tasks as an array-job.
Wrote commands to subset_barcodes.sh.
Submitted script to queue home.
 Job ID: 21985503


<qtools.submitter.Submitter at 0x2b29a88f0b50>