# This notebook parses 10X input (possorted_genome.bam WITH MD TAGs) and randomly downsamples to a specified number of barcodes.
- functionally equivalent to all of the other 01_ notebooks, but we're just subsetting on different groups.

In [1]:
import pandas as pd
import numpy as np
import os
import glob
import pysam
import time
from collections import defaultdict
from tqdm import tnrange, tqdm_notebook
from qtools import Submitter

# Widen pandas' display limits so long index strings and file paths are not truncated in cell output.
pd.options.display.max_colwidth = 1000
pd.options.display.width = 10000

In [2]:
# Directory holding the input BAM file for this notebook.
input_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/inputs'
# Directory where barcode lists are written and where the split-BAM commands are told to write
# their output (note the trailing slash, which ends up verbatim in the generated commands).
output_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/'

In [3]:
# 10X BAM (with MD tags) that every split command generated below reads from.
# Join only the file name onto input_dir: os.path.join() throws away all earlier components
# when a later one is absolute, so passing the full absolute path here silently ignored input_dir.
# The resulting path is identical to the previous value.
bam_file = os.path.join(input_dir, 'RBFOX2-TIA1-STAMP_round2E_read1_feature_FB_possorted_genome_bam_MD.bam')

# Read in the per-cell barcode tables from Alex (columns: index, batch, new_clusters) to help us decide which barcodes to look for/split.

original sources: 
```
RPS2_STAMP_Apo_filtered_lenti_common_expression.csv
APOBEC_STAMP_Apo_filtered_lenti_common_expression.csv
```

In [4]:
# Load the TIA1 and RBFOX2 barcode tables (one row per cell barcode) in a single pass.
tia_barcodes, rbfox2_barcodes = map(
    pd.read_csv,
    (
        '/home/iachaim/notebooks/10X/APOBEC/02_02_Subsetting_on_non_APO_overlap/results/For_Motif_TIA1_no_APO_edits_barcodes.csv',
        '/home/iachaim/notebooks/10X/APOBEC/02_02_Subsetting_on_non_APO_overlap/results/For_Motif_RBFOX2_no_APO_edits_barcodes.csv',
    ),
)

In [5]:
# Preview the first five rows of the TIA1 barcode table.
tia_barcodes.head(n=5)

Unnamed: 0,index,batch,new_clusters
0,possorted_genome_bam_MD-AAACCCACAGGAAGTC-1-RBFOX_TIA-Edits,RBFOX_TIA-Edits,TIA1_STAMP
1,possorted_genome_bam_MD-AAACCCAGTATGTCCA-1-RBFOX_TIA-Edits,RBFOX_TIA-Edits,TIA1_STAMP
2,possorted_genome_bam_MD-AAACGAAAGGCTGAAC-1-RBFOX_TIA-Edits,RBFOX_TIA-Edits,TIA1_STAMP
3,possorted_genome_bam_MD-AAACGAACACAGACGA-1-RBFOX_TIA-Edits,RBFOX_TIA-Edits,TIA1_STAMP
4,possorted_genome_bam_MD-AAACGAACACATACTG-1-RBFOX_TIA-Edits,RBFOX_TIA-Edits,TIA1_STAMP


In [6]:
# Preview the first five rows of the RBFOX2 barcode table.
rbfox2_barcodes.head(n=5)

Unnamed: 0,index,batch,new_clusters
0,possorted_genome_bam_MD-AAACCCAAGAGCCCAA-1-RBFOX_TIA-Edits,RBFOX_TIA-Edits,RBFOX2_STAMP
1,possorted_genome_bam_MD-AAACCCAAGTGCTACT-1-RBFOX_TIA-Edits,RBFOX_TIA-Edits,RBFOX2_STAMP
2,possorted_genome_bam_MD-AAACCCACATCTATCT-1-RBFOX_TIA-Edits,RBFOX_TIA-Edits,RBFOX2_STAMP
3,possorted_genome_bam_MD-AAACCCACATCTCAAG-1-RBFOX_TIA-Edits,RBFOX_TIA-Edits,RBFOX2_STAMP
4,possorted_genome_bam_MD-AAACCCAGTAGATGTA-1-RBFOX_TIA-Edits,RBFOX_TIA-Edits,RBFOX2_STAMP


In [7]:
def index_to_barcode(row, to_replace):
    """
    Strip known substrings from a row's 'index' value to recover the bare barcode.

    Note that 'index' here is an ordinary column named 'index', not the DataFrame index.

    :param row: mapping-like object (e.g. a DataFrame row) with an 'index' entry holding a string.
    :param to_replace: iterable of substrings; every occurrence of each one is removed, in order.
    :return: the remaining string.
    :raises AssertionError: if the remaining string does not end with '-1'.
    """
    cell_label = row['index']
    for fragment in to_replace:
        cell_label = cell_label.replace(fragment, '')

    # Sanity check that the stripping left something shaped like a barcode.
    ends_with_dash_one = cell_label.endswith('-1')
    assert ends_with_dash_one

    return cell_label

# Both tables use the same prefix/suffix around the barcode, so strip them with one shared list.
barcode_affixes = ['possorted_genome_bam_MD-', '-RBFOX_TIA-Edits']
for barcode_table in (tia_barcodes, rbfox2_barcodes):
    barcode_table['barcode'] = barcode_table.apply(index_to_barcode, args=(barcode_affixes,), axis=1)

In [8]:
# Preview the TIA1 table again to check the newly added 'barcode' column.
tia_barcodes.head(n=5)

Unnamed: 0,index,batch,new_clusters,barcode
0,possorted_genome_bam_MD-AAACCCACAGGAAGTC-1-RBFOX_TIA-Edits,RBFOX_TIA-Edits,TIA1_STAMP,AAACCCACAGGAAGTC-1
1,possorted_genome_bam_MD-AAACCCAGTATGTCCA-1-RBFOX_TIA-Edits,RBFOX_TIA-Edits,TIA1_STAMP,AAACCCAGTATGTCCA-1
2,possorted_genome_bam_MD-AAACGAAAGGCTGAAC-1-RBFOX_TIA-Edits,RBFOX_TIA-Edits,TIA1_STAMP,AAACGAAAGGCTGAAC-1
3,possorted_genome_bam_MD-AAACGAACACAGACGA-1-RBFOX_TIA-Edits,RBFOX_TIA-Edits,TIA1_STAMP,AAACGAACACAGACGA-1
4,possorted_genome_bam_MD-AAACGAACACATACTG-1-RBFOX_TIA-Edits,RBFOX_TIA-Edits,TIA1_STAMP,AAACGAACACATACTG-1


In [9]:
# Preview the RBFOX2 table again to check the newly added 'barcode' column.
rbfox2_barcodes.head(n=5)

Unnamed: 0,index,batch,new_clusters,barcode
0,possorted_genome_bam_MD-AAACCCAAGAGCCCAA-1-RBFOX_TIA-Edits,RBFOX_TIA-Edits,RBFOX2_STAMP,AAACCCAAGAGCCCAA-1
1,possorted_genome_bam_MD-AAACCCAAGTGCTACT-1-RBFOX_TIA-Edits,RBFOX_TIA-Edits,RBFOX2_STAMP,AAACCCAAGTGCTACT-1
2,possorted_genome_bam_MD-AAACCCACATCTATCT-1-RBFOX_TIA-Edits,RBFOX_TIA-Edits,RBFOX2_STAMP,AAACCCACATCTATCT-1
3,possorted_genome_bam_MD-AAACCCACATCTCAAG-1-RBFOX_TIA-Edits,RBFOX_TIA-Edits,RBFOX2_STAMP,AAACCCACATCTCAAG-1
4,possorted_genome_bam_MD-AAACCCAGTAGATGTA-1-RBFOX_TIA-Edits,RBFOX_TIA-Edits,RBFOX2_STAMP,AAACCCAGTAGATGTA-1


# Downsample RBFOX2 barcodes from 844 down to 1 (10 random trials per size; the size list below also includes 0) and run motif analysis. Let's see how far down we can go and still recover the RBFOX2 motif.

In [10]:
# These come from 01_group_RBFOX2_TIA_barcodes.ipynb
# Both barcode lists live directly under output_dir.
tia_barcodes_file, rbfox2_barcodes_file = (
    os.path.join(output_dir, barcode_list)
    for barcode_list in ('33_barcodes.tsv', '34_barcodes.tsv')
)

In [11]:
# Sample sizes to draw, one inner list per entry in barcodes_files.
# list(range(...)) keeps the concatenation valid on Python 3 as well as Python 2
# (a bare range object cannot be added to a list on Python 3).
ncells = [list(range(50)) + [844, 800, 700, 600, 500, 400, 300, 200, 100]]
# ncells = [list(range(50)) + [844, 800, 700, 600, 500, 400, 300, 200, 100], list(range(50)) + [518, 500, 400, 300, 200, 100]]  # in case we want to do this for TIA1 too.

# barcodes_files = [rbfox2_barcodes_file, tia_barcodes_file]  # in case we want to do this for TIA1 too.
barcodes_files = [rbfox2_barcodes_file]


# Number of independent random draws per sample size.
trials = range(10)
# Counts every (size, trial) file handled, whether it was newly written or already present.
counter = 0
for barcodes_file, ncell in zip(barcodes_files, ncells):
    progress = tnrange(len(ncell)*len(trials))
    df = pd.read_csv(barcodes_file, names=['barcode'])
    # Output files sit next to the source list and share its name minus the extension.
    rand_prefix = os.path.splitext(barcodes_file)[0]
    for n in ncell:
        for t in trials:
            rand_file = rand_prefix + ".rand{}.trial-{}.txt".format(n, t)
            if not os.path.exists(rand_file):  # We don't want to re-randomize files that already exist, harder to debug if they're constantly being overwritten.
                # NOTE(review): sampling is unseeded, so a regenerated file will not match a deleted one.
                dfs = df.sample(n=n)
                dfs.to_csv(rand_file, header=False, index=False)
            else:
                print("rand {} trial {} exists. Skipping.".format(n, t))
            counter += 1
            progress.update(1)
    # Release the progress bar once this barcode file is done.
    progress.close()
print("We have {} files created.".format(counter))

HBox(children=(IntProgress(value=0, max=590), HTML(value=u'')))

rand 0 trial 0 exists. Skipping.
rand 0 trial 1 exists. Skipping.
rand 0 trial 2 exists. Skipping.
rand 0 trial 3 exists. Skipping.
rand 0 trial 4 exists. Skipping.
rand 0 trial 5 exists. Skipping.
rand 0 trial 6 exists. Skipping.
rand 0 trial 7 exists. Skipping.
rand 0 trial 8 exists. Skipping.
rand 0 trial 9 exists. Skipping.
rand 1 trial 0 exists. Skipping.
rand 1 trial 1 exists. Skipping.
rand 1 trial 2 exists. Skipping.
rand 1 trial 3 exists. Skipping.
rand 1 trial 4 exists. Skipping.
rand 1 trial 5 exists. Skipping.
rand 1 trial 6 exists. Skipping.
rand 1 trial 7 exists. Skipping.
rand 1 trial 8 exists. Skipping.
rand 1 trial 9 exists. Skipping.
rand 2 trial 0 exists. Skipping.
rand 2 trial 1 exists. Skipping.
rand 2 trial 2 exists. Skipping.
rand 2 trial 3 exists. Skipping.
rand 2 trial 4 exists. Skipping.
rand 2 trial 5 exists. Skipping.
rand 2 trial 6 exists. Skipping.
rand 2 trial 7 exists. Skipping.
rand 2 trial 8 exists. Skipping.
rand 2 trial 9 exists. Skipping.
rand 3 tri

# Subtract capture sequence barcodes

In [12]:
# TIA1: collect the index labels of every row whose TIA1_ORF value is above zero.
tia_cs_file = '/home/iachaim/notebooks/10X/APOBEC/02_02_Subsetting_on_non_APO_overlap/results/TIA1_ORF_matrix.csv'
tia_cs_matrix = pd.read_csv(tia_cs_file, index_col=0)
tia_cs = set(tia_cs_matrix.loc[tia_cs_matrix['TIA1_ORF'] > 0].index)

# RBFOX2: same selection on the RBFOX2_ORF column.
rbfox2_cs_file = '/home/iachaim/notebooks/10X/APOBEC/02_02_Subsetting_on_non_APO_overlap/results/RBFOX2_ORF_matrix.csv'
rbfox2_cs_matrix = pd.read_csv(rbfox2_cs_file, index_col=0)
rbfox2_cs = set(rbfox2_cs_matrix.loc[rbfox2_cs_matrix['RBFOX2_ORF'] > 0].index)

print(len(tia_cs), len(rbfox2_cs))

(540, 867)


In [13]:
# Keep only the rows whose barcode is NOT in the corresponding capture-sequence set.
tia_not_in_cs = ~tia_barcodes['barcode'].isin(tia_cs)
tia_barcodes = tia_barcodes[tia_not_in_cs]

rbfox2_not_in_cs = ~rbfox2_barcodes['barcode'].isin(rbfox2_cs)
rbfox2_barcodes = rbfox2_barcodes[rbfox2_not_in_cs]

print(tia_barcodes.shape[0], rbfox2_barcodes.shape[0])

(2089, 4627)


In [14]:
# Write the CS-subtracted barcodes, one per line with no header or row labels.
# Note: this rebinds tia_barcodes_file / rbfox2_barcodes_file to the new noCS lists.
tia_barcodes_file = os.path.join(output_dir, '6_barcodes_noCS.txt')
tia_barcodes.to_csv(tia_barcodes_file, columns=['barcode'], sep='\t', index=False, header=False)

rbfox2_barcodes_file = os.path.join(output_dir, '7_barcodes_noCS.txt')
rbfox2_barcodes.to_csv(rbfox2_barcodes_file, columns=['barcode'], sep='\t', index=False, header=False)

# Downsample (noCS)

In [15]:
# Sample sizes to draw from each CS-subtracted barcode list (a single draw per size).
ncells = [10, 50, 100, 200, 300, 500]
for barcodes_file in [tia_barcodes_file, rbfox2_barcodes_file]:
    df = pd.read_csv(barcodes_file, names=['barcode'])
    # Output files share the source list's path minus its extension.
    list_stem = os.path.splitext(barcodes_file)[0]
    for n in ncells:
        rand_file = list_stem + ".rand{}.txt".format(n)
        if os.path.exists(rand_file):
            # Never overwrite an existing random subset.
            print("skipping {}, already made.".format(rand_file))
            continue
        dfs = df.sample(n=n)
        dfs.to_csv(rand_file, header=False, index=False)

skipping /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/6_barcodes_noCS.rand10.txt, already made.
skipping /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/6_barcodes_noCS.rand50.txt, already made.
skipping /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/6_barcodes_noCS.rand100.txt, already made.
skipping /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/6_barcodes_noCS.rand200.txt, already made.
skipping /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/6_barcodes_noCS.rand300.txt, already made.
skipping /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/6_barcodes_noCS.rand500.txt, already made.
skipping /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/7_barcodes_noCS.rand10.txt, already made.
sk

# Collect all the barcode lists from above and generate the split bam commands for each one. 

In [16]:
# Gather every per-trial random barcode list under output_dir, in sorted order.
all_rand_barcode_groups = glob.glob(os.path.join(output_dir, '*.rand*trial*.txt'))
all_rand_barcode_groups.sort()
print(len(all_rand_barcode_groups))
all_rand_barcode_groups[:5]

590


['/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/34_barcodes.rand0.trial-0.txt',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/34_barcodes.rand0.trial-1.txt',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/34_barcodes.rand0.trial-2.txt',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/34_barcodes.rand0.trial-3.txt',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/34_barcodes.rand0.trial-4.txt']

In [17]:
def generate_commandline_for_grouping(bam_file, barcodes_file, output_dir, v=False):
    """
    Build the shell command string that runs split_bams.py in --group mode.

    :param bam_file: value passed to --possorted_bam_file.
    :param barcodes_file: value passed to --barcodes_file.
    :param output_dir: value passed to --output_dir.
    :param v: if truthy, append ' --v' (SUBTRACT the barcodes from the BAM file).
    :return: the complete command as a single string. Arguments are interpolated as-is,
        without shell quoting.
    """
    pieces = [
        'module load python3essential;/home/bay001/projects/kris_apobec_20200121/scripts/split_bams.py ',
        '--possorted_bam_file {} '.format(bam_file),
        '--barcodes_file {} '.format(barcodes_file),
        '--output_dir {} '.format(output_dir),
        '--group',
    ]
    # --v means to SUBTRACT barcodes from a BAM file.
    if v:
        pieces.append(' --v')
    return ''.join(pieces)

In [18]:
# Queue a split command only for barcode lists whose expected output BAM is not already in output_dir.
# NOTE(review): the expected name keeps the list's '.txt' extension ('<bam stem>-<list name>.txt.bam');
# confirm this matches what split_bams.py actually writes.
bam_stem = os.path.splitext(os.path.basename(bam_file))[0]
cmds = [
    generate_commandline_for_grouping(bam_file, group, output_dir)
    for group in all_rand_barcode_groups
    if not os.path.exists(os.path.join(output_dir, bam_stem + '-{}.bam'.format(os.path.basename(group))))
]
len(cmds)

0

In [19]:
# Hand the commands to qtools as an array job named 'subset_barcodes'.
# With submit=False the recorded output only reports writing subset_barcodes.sh.
Submitter(
    cmds,
    'subset_barcodes',
    submit=False,
    array=True,
    nodes=1,
    ppn=1,
    walltime='24:00:00',
)

Writing 0 tasks as an array-job.
Wrote commands to subset_barcodes.sh.


<qtools.submitter.Submitter at 0x2b72c8971b10>

# Subset BAM file (minus CS barcodes)

In [20]:
# One split command per CS-subtracted barcode list (TIA1 first, then RBFOX2).
cmds = [
    generate_commandline_for_grouping(bam_file, barcodes_file, output_dir)
    for barcodes_file in [tia_barcodes_file, rbfox2_barcodes_file]
]
cmds

['module load python3essential;/home/bay001/projects/kris_apobec_20200121/scripts/split_bams.py --possorted_bam_file /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/inputs/RBFOX2-TIA1-STAMP_round2E_read1_feature_FB_possorted_genome_bam_MD.bam --barcodes_file /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/6_barcodes_noCS.txt --output_dir /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/ --group',
 'module load python3essential;/home/bay001/projects/kris_apobec_20200121/scripts/split_bams.py --possorted_bam_file /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/inputs/RBFOX2-TIA1-STAMP_round2E_read1_feature_FB_possorted_genome_bam_MD.bam --barcodes_file /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/7_barcodes_noCS.txt --output_dir /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam

In [21]:
# Hand the noCS commands to qtools as an array job named 'subset_barcodes2'.
# With submit=False the recorded output only reports writing subset_barcodes2.sh.
Submitter(
    cmds,
    'subset_barcodes2',
    submit=False,
    array=True,
    nodes=1,
    ppn=1,
    walltime='8:00:00',
)

Writing 2 tasks as an array-job.
Wrote commands to subset_barcodes2.sh.


<qtools.submitter.Submitter at 0x2b72c8971110>