# This notebook parses the possorted_genome_bam_MD.bam to pull out only the barcodes specified in Alex's lists.
- functionally equivalent to notebook 11_group_bams.ipynb, but in this case we're just re-parsing the main bam file instead of joining a bunch of smaller bam files.

In [1]:
import glob
import os
import shlex
import time
from collections import defaultdict

import numpy as np
import pandas as pd
import pysam
from qtools import Submitter
from tqdm import tnrange, tqdm_notebook

# Widen pandas' text rendering so long cell values (e.g. the file paths
# held in the manifest) are shown in full instead of being truncated.
for option_name, option_value in (
    ('display.max_colwidth', 1000),
    ('display.width', 10000),
):
    pd.set_option(option_name, option_value)

In [2]:
# Both directories live under the same scRNA grouping folder.
_groups_root = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups'
# Where the manifest is read from.
input_dir = _groups_root + '/inputs'
# Where the per-row barcode lists are written and where the split BAMs are sent
# (note the trailing slash).
output_dir = _groups_root + '/bam_file_groups/'

# Read in expression values from Alex to help us decide which barcodes to look for/split.

Original sources:
```
/home/iachaim/notebooks/10X/APOBEC/02_02_Subsetting_on_non_APO_overlap/results/For_Motif_ALL_edits_barcodes.csv
/home/iachaim/notebooks/10X/APOBEC/02_02_Subsetting_on_non_APO_overlap/results/For_Motif_APO_edits_barcodes.csv
/home/iachaim/notebooks/10X/APOBEC/02_02_Subsetting_on_non_APO_overlap/results/For_Motif_TIA1-RBFOX2_sample_edits_barcodes.csv
/home/iachaim/notebooks/10X/APOBEC/02_02_Subsetting_on_non_APO_overlap/results/For_Motif_TIA1_no_APO_edits_barcodes.csv
/home/iachaim/notebooks/10X/APOBEC/02_02_Subsetting_on_non_APO_overlap/results/For_Motif_RBFOX2_no_APO_edits_barcodes.csv
```

In [3]:
# The manifest is a tab-separated export of the google doc shared with Alex.
manifest_path = os.path.join(input_dir, '20200609_Datasets_10X.tsv')
manifest = pd.read_csv(manifest_path, sep='\t')
# Keep only the rows whose Batch is something other than the '-' placeholder.
manifest = manifest.loc[manifest['Batch'] != '-']
# Show the three columns the rest of the notebook reads.
manifest[['Barcode_file', 'MD_tagged_BAM', 'Batch']]

Unnamed: 0,Barcode_file,MD_tagged_BAM,Batch
0,/home/iachaim/notebooks/10X/APOBEC/02_02_Subsetting_on_non_APO_overlap/results/For_Motif_ALL_edits_barcodes.csv,/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/inputs/RBFOX2-TIA1-STAMP_round2E_read1_feature_FB_possorted_genome_bam_MD.bam,RBFOX_TIA-Edits
1,/home/iachaim/notebooks/10X/APOBEC/02_02_Subsetting_on_non_APO_overlap/results/For_Motif_ALL_edits_barcodes.csv,/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/inputs/APOBEC-STAMP_possorted_genome_bam_MD.bam,Apo-Edits
2,/home/iachaim/notebooks/10X/APOBEC/02_02_Subsetting_on_non_APO_overlap/results/For_Motif_APO_edits_barcodes.csv,/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/inputs/APOBEC-STAMP_possorted_genome_bam_MD.bam,Apo-Edits
3,/home/iachaim/notebooks/10X/APOBEC/02_02_Subsetting_on_non_APO_overlap/results/For_Motif_TIA1-RBFOX2_sample_edits_barcodes.csv,/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/inputs/RBFOX2-TIA1-STAMP_round2E_read1_feature_FB_possorted_genome_bam_MD.bam,RBFOX_TIA-Edits
4,/home/iachaim/notebooks/10X/APOBEC/02_02_Subsetting_on_non_APO_overlap/results/For_Motif_Background_APO_plus_RBFOX2-TIA1_APO_edits_barcodes.csv,/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/inputs/RBFOX2-TIA1-STAMP_round2E_read1_feature_FB_possorted_genome_bam_MD.bam,RBFOX_TIA-Edits
5,/home/iachaim/notebooks/10X/APOBEC/02_02_Subsetting_on_non_APO_overlap/results/For_Motif_Background_APO_plus_RBFOX2-TIA1_APO_edits_barcodes.csv,/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/inputs/APOBEC-STAMP_possorted_genome_bam_MD.bam,Apo-Edits
6,/home/iachaim/notebooks/10X/APOBEC/02_02_Subsetting_on_non_APO_overlap/results/For_Motif_TIA1_no_APO_edits_barcodes.csv,/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/inputs/RBFOX2-TIA1-STAMP_round2E_read1_feature_FB_possorted_genome_bam_MD.bam,RBFOX_TIA-Edits
7,/home/iachaim/notebooks/10X/APOBEC/02_02_Subsetting_on_non_APO_overlap/results/For_Motif_RBFOX2_no_APO_edits_barcodes.csv,/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/inputs/RBFOX2-TIA1-STAMP_round2E_read1_feature_FB_possorted_genome_bam_MD.bam,RBFOX_TIA-Edits
19,/home/iachaim/notebooks/10X/APOBEC/01_02_Subsetting_on_non_APO_overlap/results/For_Motif_APOBEC_NPC_edits_barcodes.csv,/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/inputs/HEK-NPC-APOBEC-STAMP_possorted_genome_bam_MD.bam,APOBEC
20,/home/iachaim/notebooks/10X/APOBEC/01_02_Subsetting_on_non_APO_overlap/results/For_Motif_APOBEC_HEK_edits_barcodes.csv,/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/inputs/HEK-NPC-APOBEC-STAMP_possorted_genome_bam_MD.bam,APOBEC


In [4]:
def index_to_barcode(row, to_replace):
    """
    Turn the value stored under a row's 'index' key into a bare barcode.

    'index' here is an ordinary column that happens to be named 'index',
    not the DataFrame index. Every substring listed in `to_replace` is
    removed from that value, and the result is sanity-checked before it
    is returned.

    :param row: mapping-like object (e.g. a DataFrame row) with an 'index' entry.
    :param to_replace: iterable of substrings to delete from the value.
    :return: the cleaned barcode string.
    :raises AssertionError: if the cleaned value does not start with A, C, G
        or T, still contains 'Apo', or does not end with '-1'.
    """
    cleaned = row['index']
    for fragment in to_replace:
        cleaned = cleaned.replace(fragment, '')

    # Guard against mis-parsed values: the result must look like a barcode.
    assert cleaned.startswith(('A', 'C', 'G', 'T'))
    assert 'Apo' not in cleaned
    assert cleaned.endswith('-1')

    return cleaned

In [5]:
def read_and_return_barcodes_list(manifest, index, to_replace, output_file, names=None, batchcol='batch'):
    """
    Write the barcodes selected by one manifest row to a file.

    Reads the CSV named in the manifest row's 'Barcode_file' column, keeps
    the rows whose `batchcol` value equals the manifest row's 'Batch',
    converts each kept row's 'index' value into a barcode with
    index_to_barcode(), and writes those barcodes to `output_file`
    (one per line, no header, no index). The first two selected rows are
    printed as a visual check.

    :param manifest: DataFrame with 'Barcode_file', 'Batch' and
        'MD_tagged_BAM' columns.
    :param index: label of the manifest row to process (looked up with .loc).
    :param to_replace: substrings to strip from each 'index' value.
    :param output_file: path the barcode list is written to.
    :param names: optional column names passed to pd.read_csv, for barcode
        files that have no header row.
    :param batchcol: column of the barcode file that is compared against
        the manifest row's 'Batch'.
    :return: tuple of (output_file, value of the row's 'MD_tagged_BAM' column).
    :raises ValueError: if no row of the barcode file matches the batch.
    """
    entry = manifest.loc[index]  # look the manifest row up once

    if names is None:
        barcodes = pd.read_csv(entry['Barcode_file'])
    else:
        barcodes = pd.read_csv(entry['Barcode_file'], names=names)

    # .copy() makes the filtered frame independent of the one that was read,
    # so adding the 'barcode' column below does not write into a slice.
    barcodes = barcodes[barcodes[batchcol] == entry['Batch']].copy()

    # Fail with a clear message instead of an obscure pandas error or an
    # empty barcode list when the batch label matches nothing.
    if barcodes.empty:
        raise ValueError(
            "No rows in {} have {} == {!r} (manifest row {})".format(
                entry['Barcode_file'], batchcol, entry['Batch'], index
            )
        )

    barcodes['barcode'] = barcodes.apply(index_to_barcode, args=(to_replace, ), axis=1)
    barcodes['barcode'].to_csv(output_file, index=False, header=False)
    print(barcodes.head(2))
    return output_file, entry['MD_tagged_BAM']

def generate_commandline_for_splitting(bam_file, barcodes_file, output_dir):
    """
    Build the shell command that runs split_bams.py for one barcode list.

    The three paths are shell-quoted, so a path containing spaces or shell
    metacharacters is passed to the script as a single argument. Paths made
    only of characters that need no quoting appear in the command unchanged.

    :param bam_file: value for --possorted_bam_file.
    :param barcodes_file: value for --barcodes_file.
    :param output_dir: value for --output_dir.
    :return: the full command line as a single string.
    """
    cmd = 'module load python3essential;/home/bay001/projects/kris_apobec_20200121/scripts/split_bams.py '
    # str() keeps non-string path objects working, as they did with plain .format().
    cmd += '--possorted_bam_file {} '.format(shlex.quote(str(bam_file)))
    cmd += '--barcodes_file {} '.format(shlex.quote(str(barcodes_file)))
    cmd += '--output_dir {} '.format(shlex.quote(str(output_dir)))
    cmd += '--group'
    return cmd

# Basically I'm going through the manifest, grabbing the barcodes lists (first reading in the barcodes file, selecting barcodes based on Batch ID, then formatting the indices so that they're proper barcodes), and generating the commands to pull those barcodes from the specified BAM file.
- Hide the ones we've verified to be done/complete.

In [6]:
# Shell commands collected by the cells below, one per barcode group to split.
cmds = []

### Some barcode files lack headers, so we need to add them in.

### Adding more barcode groups to split.

In [7]:
# Manifest rows 30-32: their barcode files are filtered on the 'cell' column
# (not the default 'batch'), and nothing is stripped from the 'index' values.
for index in [30, 31, 32]:
    # Each row gets its own barcode list, named after the manifest row label.
    barcodes_out = os.path.join(output_dir, '{}_barcodes.tsv'.format(index))
    barcodes_file, bam_file = read_and_return_barcodes_list(
        manifest=manifest,
        index=index,
        to_replace=[],
        output_file=barcodes_out,
        batchcol='cell',
    )
    split_cmd = generate_commandline_for_splitting(
        bam_file=bam_file,
        barcodes_file=barcodes_file,
        output_dir=output_dir,
    )
    cmds.append(split_cmd)

                index     cell             barcode
0  AAACCCAGTGTGGTCC-1  NPC_DCX  AAACCCAGTGTGGTCC-1
1  AAACGCTCATATGGCT-1  NPC_DCX  AAACGCTCATATGGCT-1
                index      cell             barcode
0  AAACCCAAGGCTAAAT-1  NPC_SOX2  AAACCCAAGGCTAAAT-1
1  AAACCCAAGGTGCTAG-1  NPC_SOX2  AAACCCAAGGTGCTAG-1
                index cell             barcode
0  AAACCCAAGTGCTCAT-1  HEK  AAACCCAAGTGCTCAT-1
1  AAACCCACACGACGAA-1  HEK  AAACCCACACGACGAA-1


In [8]:
# Hand every queued command to the cluster as one array job named
# 'subset_barcodes'; submit=True sends it to the queue right away.
Submitter(
    cmds,
    'subset_barcodes',
    array=True,
    nodes=1,
    ppn=8,
    walltime='8:00:00',
    submit=True,
)

Writing 3 tasks as an array-job.
Wrote commands to subset_barcodes.sh.
Submitted script to queue home.
 Job ID: 21869192


<qtools.submitter.Submitter at 0x2aab2f8e9350>