# This analysis groups the individually-split-barcoded bams using Alex's APO control-only barcodes.

In [1]:
import pandas as pd
import numpy as np
import os
import glob
import pysam
import time
from collections import defaultdict
from tqdm import tnrange, tqdm_notebook
import random

In [2]:
# Directory holding the barcode table and the full (unsplit) BAM file.
input_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA_APOBEC_RPS2/inputs/'
# Directory holding the per-barcode BAM files.
split_bam_file_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA_APOBEC_RPS2/bam_files/'
# Outputs are written next to the inputs: this is the same path as input_dir.
output_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA_APOBEC_RPS2/inputs/'

# Read in expression values from Alex to help us decide which barcodes to look for/split.

In [3]:
# Load the barcode table; the first CSV column becomes the DataFrame index.
table = pd.read_csv(os.path.join(input_dir, 'For_Motif_APO_edits_barcodes.csv'), index_col=0)

# Preview the first few rows.
table.head()

Unnamed: 0_level_0,batch,new_clusters
index,Unnamed: 1_level_1,Unnamed: 2_level_1
Apo_Control_possorted_genome_bam_MD-AAACCCAAGCCAGTAG-1-Apo-Edits,Apo-Edits,APOBEC_STAMP
Apo_Control_possorted_genome_bam_MD-AAACCCAAGGATGCGT-1-Apo-Edits,Apo-Edits,APOBEC_STAMP
Apo_Control_possorted_genome_bam_MD-AAACCCACAATACAGA-1-Apo-Edits,Apo-Edits,APOBEC_STAMP
Apo_Control_possorted_genome_bam_MD-AAACCCACACGCAGTC-1-Apo-Edits,Apo-Edits,APOBEC_STAMP
Apo_Control_possorted_genome_bam_MD-AAACCCACAGAACATA-1-Apo-Edits,Apo-Edits,APOBEC_STAMP


# Let's reformat this index to match what the barcodes actually are.

In [4]:
def get_bam_file_name_from_index(index_name, split_bam_file_dir=split_bam_file_dir):
    """
    Builds the path of the per-barcode BAM file for a table index entry.

    Every occurrence of '-Apo-Edits' in index_name is swapped for '.bam',
    and the result is joined onto split_bam_file_dir.
    """
    bam_basename = index_name.replace('-Apo-Edits', '.bam')
    return os.path.join(split_bam_file_dir, bam_basename)

# Sanity check: resolve the BAM path for the first row of the table, show it,
# and make sure that file is actually on disk.
test_bam = get_bam_file_name_from_index(table.iloc[0].name)
print(test_bam)
assert os.path.exists(test_bam)

/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA_APOBEC_RPS2/bam_files/Apo_Control_possorted_genome_bam_MD-AAACCCAAGCCAGTAG-1.bam


In [5]:
def get_barcode_from_index(index_name):
    """
    Reduces a table index entry to its bare barcode.

    Removes every occurrence of the '-Apo-Edits' text and of the
    'Apo_Control_possorted_genome_bam_MD-' text, in that order, e.g.
    'Apo_Control_possorted_genome_bam_MD-AAACCCAAGCCAGTAG-1-Apo-Edits'
    becomes 'AAACCCAAGCCAGTAG-1'.
    """
    barcode = index_name
    for fragment in ('-Apo-Edits', 'Apo_Control_possorted_genome_bam_MD-'):
        barcode = barcode.replace(fragment, '')
    return barcode

# Sanity check: extract the barcode from the first row's index and show it.
test_barcode = get_barcode_from_index(table.iloc[0].name)
print(test_barcode)

AAACCCAAGCCAGTAG-1


In [6]:
# Count how many rows carry each `new_clusters` label.
table['new_clusters'].value_counts()

APOBEC_STAMP    8084
TIA1_STAMP        19
RBFOX2_STAMP      14
Name: new_clusters, dtype: int64

# The next code block ensures that the split-bam files from the previous notebook were all made. If not, we might be using the wrong file!
- This is the same code as the previous notebook, just making extra sure that we've got everything.

In [7]:
# Collect the barcode of every table row whose per-barcode BAM file exists on
# disk. Paths of missing BAM files are printed so they can be investigated.
apobec_indices = []
progress = tnrange(len(table.index))
for index in table.index:
    # The helper already returns the full path under split_bam_file_dir, so
    # it must not be joined onto that directory a second time.
    bam_file = get_bam_file_name_from_index(index)
    # Plain conditional instead of assert/except: asserts are stripped under
    # `python -O`, which would silently accept every barcode.
    if os.path.exists(bam_file):
        apobec_indices.append(get_barcode_from_index(index))
    else:
        print(bam_file)
    progress.update(1)
progress.close()

print(len(apobec_indices))
apobec_indices[:3]

HBox(children=(IntProgress(value=0, max=8117), HTML(value='')))

8117


['AAACCCAAGCCAGTAG-1', 'AAACCCAAGGATGCGT-1', 'AAACCCACAATACAGA-1']

# create groups
- we need to group the individual barcodes

In [8]:
# Convert the barcode list to a set (drops any duplicates and makes later
# `barcode in apobec_indices` checks constant-time), then report its size.
apobec_indices = set(apobec_indices)
print(len(apobec_indices))

8117


# We have too many barcodes (~8k) to run a merge command (argument list too long), so we need to cycle through the big BAM file and pull out the reads whose barcodes match the ones we want. 

In [11]:
def get_readcount(bam_file):
    """
    Parses a bam file idxstats to get the number of reads.
    The BAM file MUST have an index.

    Each idxstats line is expected to hold four tab-separated fields
    (name, length, mapped count, unmapped count); the mapped and unmapped
    counts of every line are added together.

    :param bam_file: path to the indexed BAM file.
    :return: total number of reads as an int (0 if idxstats reports no lines).
    """
    total_reads = 0
    for line in pysam.idxstats(bam_file).splitlines():
        if not line.strip():
            # Blank lines (e.g. from the trailing newline) carry no counts.
            continue
        try:
            _chrom, _chrlen, mapped, unmapped = line.split('\t')
            total_reads += int(mapped) + int(unmapped)
        except ValueError:
            # Best effort: show the line we could not parse and keep going.
            print(line)
    return total_reads


# Stream through the full BAM once and copy every read that belongs to one of
# the barcodes in `apobec_indices` into a single merged BAM. The work is
# skipped when the merged BAM already exists.
source_bam = os.path.join(input_dir, 'Apo_Control_possorted_genome_bam_MD.bam')
merged_bam = os.path.join(
    output_dir,
    'Apo_Control_possorted_genome_bam_MD.{}.bam'.format("For_Motif_APO_edits_barcodes")
)

if not os.path.exists(merged_bam):
    # Write to a scratch name and rename at the very end, so an interrupted
    # run never leaves a truncated file that the existence check above would
    # mistake for a finished result.
    partial_bam = merged_bam + '.partial'
    progress = tnrange(get_readcount(source_bam))
    try:
        # Context managers close both BAM handles even if the loop fails.
        with pysam.AlignmentFile(source_bam, "rb") as samfile:
            with pysam.AlignmentFile(partial_bam, 'wb', template=samfile) as s:
                for read in samfile:
                    # Skip unmapped, secondary and duplicate reads.
                    if not (read.is_unmapped or read.is_secondary or read.is_duplicate):
                        try:
                            barcode = read.get_tag('CB')  # read-assigned barcode
                            read.get_tag('GX')  # only checked for presence
                        except KeyError:
                            # no barcode or no GX tag, so 10X doesn't count
                            # this read and we shouldn't either.
                            pass
                        else:
                            if barcode in apobec_indices:
                                s.write(read)
                    progress.update(1)
    finally:
        progress.close()
    os.replace(partial_bam, merged_bam)




HBox(children=(IntProgress(value=0, max=650077059), HTML(value='')))