# This analysis groups the individually-split-barcoded bams from the previous notebook into various groups that might be useful (i.e., top 10 by lenti expression).

In [1]:
import pandas as pd
import numpy as np
import os
import glob
import pysam
import time
from collections import defaultdict
from tqdm import tnrange, tqdm_notebook
import random

In [2]:
# Directory holding the barcode CSV tables that are read in below.
input_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/inputs/'
# Directory in which the per-barcode (individually split) BAM files are looked up.
split_bam_file_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/bam_files/'
# Directory that receives every merged BAM written by this notebook.
output_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/outputs/'

# Read in expression values from Alex to help us decide which barcodes to look for/split.

In [3]:
def _load_barcode_table(csv_name):
    """Read one barcode table from input_dir, using its first column as the index."""
    return pd.read_csv(os.path.join(input_dir, csv_name), index_col=0)


# Four variants of the barcode table; each maps a barcode label to its cluster assignment.
subset_more = _load_barcode_table('barcodes_RBFOX2-TIA1_subset_more.csv')
subset_more_nofeature = _load_barcode_table('barcodes_RBFOX2-TIA1_subset_more_noFeatureCells.csv')
subset = _load_barcode_table('barcodes_RBFOX2-TIA1_subset.csv')
subset_nofeature = _load_barcode_table('barcodes_RBFOX2-TIA1_subset_noFeatureCells.csv')

# Row counts of the four tables, in the same order they were loaded.
row_counts = [
    frame.shape[0]
    for frame in (subset_more, subset_more_nofeature, subset, subset_nofeature)
]
print(*row_counts)

subset_more.head()

3293 2658 7443 6349


Unnamed: 0_level_0,louvain_r0_3
index,Unnamed: 1_level_1
possorted_genome_bam_MD-AAACCCAAGAGCCCAA-1-RBFOX_TIA-Edits,RBFOX2_ORF
possorted_genome_bam_MD-AAACCCACATCTCAAG-1-RBFOX_TIA-Edits,RBFOX2_ORF
possorted_genome_bam_MD-AAACCCAGTCAAGCGA-1-RBFOX_TIA-Edits,RBFOX2_ORF
possorted_genome_bam_MD-AAACCCAGTTCAGGTT-1-RBFOX_TIA-Edits,RBFOX2_ORF
possorted_genome_bam_MD-AAACCCATCGCAGAGA-1-RBFOX_TIA-Edits,TIA1_ORF


# Let's get some basic stats.

In [4]:
# Working table for the rest of the notebook (one of the barcode tables loaded above).
table = subset_more
# Path of the CSV that `table` was read from; its basename is used to tag output BAM names.
# NOTE: keep this in sync with `table` when switching to a different subset.
table_src = os.path.join(input_dir, 'barcodes_RBFOX2-TIA1_subset_more.csv')

In [5]:
# Number of barcodes carrying each label in the 'louvain_r0_3' column.
table['louvain_r0_3'].value_counts()

RBFOX2_ORF    2146
TIA1_ORF      1147
Name: louvain_r0_3, dtype: int64

# The next code block ensures that the split-bam files from the previous notebook were all made. If not, we might need to re-run or re-make some of these.
- This is the same code as the previous notebook, just making extra sure that we've got everything.

In [6]:
def get_bam_file_name_from_index(index):
    """
    Maps a barcode label from the table index to its split BAM file name:
    the '-RBFOX_TIA-Edits' tag is removed and '.bam' is appended.
    """
    edits_tag = '-RBFOX_TIA-Edits'
    return index.replace(edits_tag, '') + ".bam"

# Print the path of every expected per-barcode BAM that is missing on disk.
# A plain existence test is used instead of assert/except so the check still
# runs when Python is started with -O (which strips assert statements).
for index in table.index:
    bam_file = os.path.join(split_bam_file_dir, get_bam_file_name_from_index(index))
    if not os.path.exists(bam_file):
        print(bam_file)

# create groups
- we need to group the individual barcodes from split_bam_file_dir using the barcodes that were assigned according to the above table.
- in the interest of time, let me randomly sample first and start SAILOR on those, then group all associated barcodes in the next cells.

In [7]:
# Split the barcode labels by their cluster assignment in 'louvain_r0_3'.
# One vectorised comparison per label replaces the per-row .loc lookups;
# table order is preserved, and duplicated index labels no longer raise.
cluster_labels = table['louvain_r0_3']

# Barcode labels assigned to the RBFOX2 ORF cluster.
rbfox2_indices = cluster_labels[cluster_labels == 'RBFOX2_ORF'].index.tolist()
# Barcode labels assigned to the TIA1 ORF cluster.
tia1_indices = cluster_labels[cluster_labels == 'TIA1_ORF'].index.tolist()

print(len(rbfox2_indices), len(tia1_indices))

2146 1147


In [8]:
def merge_indices(split_bam_file_dir, merged_output, index_list):
    """
    Builds (but does not run) the `samtools merge -f` command line that
    combines the split BAM file of every barcode in index_list into
    merged_output. Each input path is split_bam_file_dir joined with the
    BAM file name derived from the barcode label.

    The returned string always ends with a single trailing space.
    """
    bam_args = [
        "{} ".format(os.path.join(split_bam_file_dir, get_bam_file_name_from_index(barcode)))
        for barcode in index_list
    ]
    return "samtools merge -f {} ".format(merged_output) + "".join(bam_args)

In [9]:
progress = tnrange(3)
for rand_num in [200, 300, 500]:
    sampled_rbfox2_indices = random.sample(rbfox2_indices, rand_num)
    sampled_tia1_indices = random.sample(tia1_indices, rand_num)
    
    rbfox2_merge_cmd = merge_indices(
        split_bam_file_dir=split_bam_file_dir,
        merged_output=os.path.join(output_dir, 'sampled_{}_RBFOX2_ORFS.{}.bam'.format(rand_num, os.path.splitext(os.path.basename(table_src))[0])),
        index_list=sampled_rbfox2_indices
    )
    
    tia1_merge_cmd = merge_indices(
        split_bam_file_dir=split_bam_file_dir,
        merged_output=os.path.join(output_dir, 'sampled_{}_TIA1_ORFS.{}.bam'.format(rand_num, os.path.splitext(os.path.basename(table_src))[0])),
        index_list=sampled_tia1_indices
    )
    ! $rbfox2_merge_cmd
    ! $tia1_merge_cmd
    
    progress.update(1)

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

# We have too many barcodes (~2k) to run a single merge command (argument list too long), so we first merge the barcodes in smaller groups, and then merge those partial BAMs again into one file.

In [10]:
def chunker(seq, size):
    """
    Lazily yields consecutive slices of seq, each holding up to `size`
    items (the last slice may be shorter).
    """
    # The range is built up front so an invalid size fails at call time.
    starts = range(0, len(seq), size)
    return (seq[start:start + size] for start in starts)


groupsize = 100 # 100 barcodes each merge.
progress = tnrange(int(len(rbfox2_indices)/groupsize)+1)
i = 0
for index_list in chunker(rbfox2_indices, groupsize):
    i += 1
    merged_output = os.path.join(output_dir, 'part_{}_RBFOX2_ORFS.{}.bam'.format(i, os.path.splitext(os.path.basename(table_src))[0]))
    cmd = merge_indices(split_bam_file_dir, merged_output, index_list)
    ! $cmd
    progress.update(1)

progress = tnrange(int(len(tia1_indices)/groupsize)+1)
i = 0
for index_list in chunker(tia1_indices, groupsize):
    i += 1
    merged_output = os.path.join(output_dir, 'part_{}_TIA1_ORFS.{}.bam'.format(i, os.path.splitext(os.path.basename(table_src))[0]))
    cmd = merge_indices(split_bam_file_dir, merged_output, index_list)
    ! $cmd
    progress.update(1)

HBox(children=(IntProgress(value=0, max=22), HTML(value='')))

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

In [11]:
all_rbfox2_bam_parts = sorted(glob.glob(os.path.join(output_dir, 'part_*_RBFOX2_ORFS.{}.bam'.format(os.path.splitext(os.path.basename(table_src))[0]))))
all_tia1_bam_parts = sorted(glob.glob(os.path.join(output_dir, 'part_*_TIA1_ORFS.{}.bam'.format(os.path.splitext(os.path.basename(table_src))[0]))))

print(len(all_rbfox2_bam_parts))
print(len(all_tia1_bam_parts))

cmds = []

final_output_bam = os.path.join(output_dir, 'RBFOX2_ORFS.{}.bam'.format(os.path.splitext(os.path.basename(table_src))[0]))
cmd = 'samtools merge -f {} '.format(final_output_bam)
for rbfox2_bam_part in all_rbfox2_bam_parts:
    cmd += '{} '.format(rbfox2_bam_part)

print(cmd)
! $cmd

final_output_bam = os.path.join(output_dir, 'TIA1_ORFS.{}.bam'.format(os.path.splitext(os.path.basename(table_src))[0]))
cmd = 'samtools merge -f {} '.format(final_output_bam)
for tia1_bam_part in all_tia1_bam_parts:
    cmd += '{} '.format(tia1_bam_part)
print(cmd)
! $cmd

22
12
samtools merge -f /home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/outputs/RBFOX2_ORFS.barcodes_RBFOX2-TIA1_subset_more.bam /home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/outputs/part_10_RBFOX2_ORFS.barcodes_RBFOX2-TIA1_subset_more.bam /home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/outputs/part_11_RBFOX2_ORFS.barcodes_RBFOX2-TIA1_subset_more.bam /home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/outputs/part_12_RBFOX2_ORFS.barcodes_RBFOX2-TIA1_subset_more.bam /home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/outputs/part_13_RBFOX2_ORFS.barcodes_RBFOX2-TIA1_subset_more.bam /home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/outputs/part_14_RBFOX2_ORFS.barcodes_RBFOX2-TIA1_subset_more.bam /home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/outputs/part_15_RBFOX2_ORFS.barcodes_RBFO