# This analysis groups the individually-split-barcoded bams from the previous notebook into various groups that might be useful (ie. top 10 by lenti expression). 
- This notebook is a bit tricky, but ultimately what we wanted to do is group barcodes together (based on Alex's clustering/analysis) and re-run as a 'bulk' sample. 
- There are two strategies this notebook employs to do this:
    - perform ```samtools merge``` on the group of indidividually split (from notebook 01) barcodes (this is faster if you have only a few barcodes to group. Otherwise you will hit a limit).
    - cycle through the main possorted.bam file and subset the reads belonging to the barcodes of interest into a new merged file
    

In [1]:
import pandas as pd
import numpy as np
import os
import glob
import pysam
import time
from collections import defaultdict, OrderedDict
from tqdm import tnrange, tqdm_notebook
import random
from qtools import Submitter

In [2]:
input_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/inputs/'
split_bam_file_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/bam_files/'
output_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/bam_file_groups/'

# Read in expression values from Alex to help us decide which barcodes to look for/split.

In [3]:
# I use the flags and the files specified in this cell to run this notebook twice, on 'normed' and 'not_normed' counts.

flag = 'normed' 

apo_df = pd.read_csv(os.path.join(input_dir, 'APO-control_normalized_counts.csv'), index_col=0)
apo_df.columns = ['APOBEC']
apo_df.sort_values(by=['APOBEC'], inplace=True, ascending=False)

rps2_df = pd.read_csv(os.path.join(input_dir, 'RPS2_normalized_counts.csv'), index_col=0)
rps2_df.columns = ['RPS2']
rps2_df.sort_values(by=['RPS2'], inplace=True, ascending=False)

print(apo_df.shape[0], rps2_df.shape[0])
apo_df.head()

(8131, 9990)


Unnamed: 0,APOBEC
GGTGTCGGTCTGCCTT-1,4.268595
ACGTACATCTGAGAAA-1,3.855157
CAGGGCTCAAACACCT-1,3.834341
GGGCCATCAGCCGGTT-1,3.821715
TTCCAATGTCCTACGG-1,3.548833


# The next code block ensure that the split-bam files from the previous notebook were all made. If not, we might need to re-run or re-make some of these.
- This is the same code as the previous notebook, just making extra sure that we've got everything.

In [4]:
def get_bam_file_name_from_barcode(barcode, sample):
    if sample == 'Apo':
        return 'Apo_Control_polyA_H7_MD-{}.bam'.format(barcode)
    elif sample == 'Rps2':
        return 'Rps2_polyA_H6_MD-{}.bam'.format(barcode)
    else:
        print("What?")

progress = tnrange(len(apo_df.index))
for barcode in apo_df.index:
    bam_file = os.path.join(split_bam_file_dir, get_bam_file_name_from_barcode(barcode, 'Apo'))
    try:
        assert os.path.exists(bam_file)
    except AssertionError:
        print(bam_file)
    progress.update(1)

progress = tnrange(len(rps2_df.index))
for barcode in rps2_df.index:
    bam_file = os.path.join(split_bam_file_dir, get_bam_file_name_from_barcode(barcode, 'Rps2'))
    try:
        assert os.path.exists(bam_file)
    except AssertionError:
        print(bam_file)
    progress.update(1)

HBox(children=(IntProgress(value=0, max=8131), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=9990), HTML(value=u'')))

# create groups
- ordered by RPS2 exprs and Apo exprs, split into quartiles. 

In [5]:
rps2_df['rank'] = rps2_df.rank(pct=True)
rps2_df_q1 = rps2_df[(rps2_df['rank'] >= 0.75)]
rps2_df_q2 = rps2_df[(rps2_df['rank'] >= 0.50) & (rps2_df['rank'] < 0.75)]
rps2_df_q3 = rps2_df[(rps2_df['rank'] >= 0.25) & (rps2_df['rank'] < 0.50)]
rps2_df_q4 = rps2_df[rps2_df['rank'] < 0.25]

apo_df['rank'] = apo_df.rank(pct=True)
apo_df_q1 = apo_df[(apo_df['rank'] >= 0.75)]
apo_df_q2 = apo_df[(apo_df['rank'] >= 0.50) & (apo_df['rank'] < 0.75)]
apo_df_q3 = apo_df[(apo_df['rank'] >= 0.25) & (apo_df['rank'] < 0.50)]
apo_df_q4 = apo_df[apo_df['rank'] < 0.25]

print(rps2_df_q1.shape[0], rps2_df_q2.shape[0], rps2_df_q3.shape[0], rps2_df_q4.shape[0])
print(apo_df_q1.shape[0], apo_df_q2.shape[0], apo_df_q3.shape[0], apo_df_q4.shape[0])

(2498, 2498, 2497, 2497)
(2033, 2033, 2033, 2032)


In [6]:
assert sum([rps2_df_q1.shape[0], rps2_df_q2.shape[0], rps2_df_q3.shape[0], rps2_df_q4.shape[0]]) == rps2_df.shape[0]
assert sum([apo_df_q1.shape[0], apo_df_q2.shape[0], apo_df_q3.shape[0], apo_df_q4.shape[0]]) == apo_df.shape[0]

In [7]:
rps2_df_q4.head()

Unnamed: 0,RPS2,rank
CAGATACAGTTTCTTC-1,-0.695487,0.24995
CTCCTCCAGAGTGACC-1,-0.695642,0.24985
TCTGCCATCTGAATGC-1,-0.696504,0.24975
CAGTGCGCAGAATTCC-1,-0.696635,0.24965
TTCCTAACAGATACCT-1,-0.69678,0.24955


In [8]:
def merge_barcodes_cmd(split_bam_file_dir, merged_output, bam_list):
    """
    Returns the samtools merge command that combines all the individually-split BAM files into a single BAM file.
    """
    cmd = "samtools merge -f {} ".format(merged_output)
    for bam in bam_list:
        cmd += "{} ".format(os.path.join(split_bam_file_dir, bam))
    return cmd

# For each Rps2/Apo quartile, build the command to merge and submit if necessary.

In [9]:
quartiles = {
    'Rps2_Q1_{}_barcodes.bam'.format(flag):rps2_df_q1.index,
    'Rps2_Q2_{}_barcodes.bam'.format(flag):rps2_df_q2.index,
    'Rps2_Q3_{}_barcodes.bam'.format(flag):rps2_df_q3.index,
    'Rps2_Q4_{}_barcodes.bam'.format(flag):rps2_df_q4.index,
    'Apo_Q1_{}_barcodes.bam'.format(flag):apo_df_q1.index,
    'Apo_Q2_{}_barcodes.bam'.format(flag):apo_df_q2.index,
    'Apo_Q3_{}_barcodes.bam'.format(flag):apo_df_q3.index,
    'Apo_Q4_{}_barcodes.bam'.format(flag):apo_df_q4.index,
}

cmds = []
for output_file, barcodes in quartiles.iteritems():
    bam_list = []
    sample = ""
    if output_file.startswith('Rps2'):
        sample = 'Rps2'
    elif output_file.startswith('Apo'):
        sample = 'Apo'
    else:
        print("What?")
        
    for barcode in barcodes:
        bam_list.append(get_bam_file_name_from_barcode(barcode, sample))

    cmds.append(merge_barcodes_cmd(split_bam_file_dir, os.path.join(output_dir, output_file), bam_list))
len(cmds)

8

In [10]:
Submitter(
    commands=cmds,
    job_name='merge-{}'.format(flag),
    sh='merge-{}.sh'.format(flag),
    array=True,
    nodes=1,
    ppn=2,
    walltime='48:00:00',
    submit=False,
)

Writing 8 tasks as an array-job.
Wrote commands to merge-normed.sh.


<qtools.submitter.Submitter at 0x2b72072ae050>

# Alex has grouped a few RPS barcodes to merge and call SAILOR on. Group and run SAILOR in the cells below.
- need to rethink this analysis, perhaps we can find what we want in a more unbiased manner.

# Just run all Apo/Rps2

# OK the above doesn't actually work since there are too many files to open... just merge the quartiles

In [28]:
cmd = 'samtools merge -f '
cmd += os.path.join(output_dir, 'Apo_{}_barcodes.bam '.format(flag))
for bam in sorted(glob.glob(os.path.join(output_dir, 'Apo_Q*_{}_barcodes.bam'.format(flag)))):
    cmd += '{} '.format(bam)
    
print(cmd)
if not os.path.exists(os.path.join(output_dir, 'Apo_{}_barcodes.bam'.format(flag))):
    ! $cmd

samtools merge -f /home/bay001/projects/ryan_editing_20190314/permanent_data/49_SAILOR_on_47/bam_file_groups/Apo_normed_barcodes.bam /home/bay001/projects/ryan_editing_20190314/permanent_data/49_SAILOR_on_47/bam_file_groups/Apo_Q1_normed_barcodes.bam /home/bay001/projects/ryan_editing_20190314/permanent_data/49_SAILOR_on_47/bam_file_groups/Apo_Q2_normed_barcodes.bam /home/bay001/projects/ryan_editing_20190314/permanent_data/49_SAILOR_on_47/bam_file_groups/Apo_Q3_normed_barcodes.bam /home/bay001/projects/ryan_editing_20190314/permanent_data/49_SAILOR_on_47/bam_file_groups/Apo_Q4_normed_barcodes.bam 


In [None]:
cmd = 'samtools merge -f '
cmd += os.path.join(output_dir, 'Rps2_{}_barcodes.bam '.format(flag))
for bam in sorted(glob.glob(os.path.join(output_dir, 'Rps2_Q*_{}_barcodes.bam'.format(flag)))):
    cmd += '{} '.format(bam)
print(cmd)
if not os.path.exists(os.path.join(output_dir, 'Rps2_{}_barcodes.bam'.format(flag))):
    ! $cmd

samtools merge -f /home/bay001/projects/ryan_editing_20190314/permanent_data/49_SAILOR_on_47/bam_file_groups/Rps2_normed_barcodes.bam /home/bay001/projects/ryan_editing_20190314/permanent_data/49_SAILOR_on_47/bam_file_groups/Rps2_Q1_normed_barcodes.bam /home/bay001/projects/ryan_editing_20190314/permanent_data/49_SAILOR_on_47/bam_file_groups/Rps2_Q2_normed_barcodes.bam /home/bay001/projects/ryan_editing_20190314/permanent_data/49_SAILOR_on_47/bam_file_groups/Rps2_Q3_normed_barcodes.bam /home/bay001/projects/ryan_editing_20190314/permanent_data/49_SAILOR_on_47/bam_file_groups/Rps2_Q4_normed_barcodes.bam 


# Group all S phase and G2M phase barcodes

In [11]:
phases = pd.read_csv(os.path.join(input_dir, 'RPS2_by_phase_normalized_counts.csv'), index_col=0)

s_phases = phases[phases['phase']=='S']
g2m_phases = phases[phases['phase']=='G2M']
g1_phases = phases[(phases['phase']!='S') & (phases['phase']!='G2M')]
assert s_phases.shape[0] + g2m_phases.shape[0] + g1_phases.shape[0] == phases.shape[0]

In [12]:
g1_phases.shape[0]

2930

In [13]:
s_phase_barcodes = set(s_phases.index)
g2m_phase_barcodes = set(g2m_phases.index)
g1_phase_barcodes = set(g1_phases.index)

In [14]:
def get_readcount(bam_file):
    """
    Parses a bam file idxstats to get the number of reads.
    The BAM file MUST have an index.
    """
    num_reads = pysam.idxstats(
        bam_file
    ).split('\n')
    nums = {}
    for num in num_reads:
        try:
            chrom, chrlen, mapped, unmapped = num.split('\t')
            nums[chrom] = int(mapped) + int(unmapped)
        except ValueError:
            print(num)
    return pd.DataFrame(nums, index=['num']).T.sum().values[0]


# Hiding the below block since it takes awhile to run, but it basically splits the bam file into each G1, G2M, and S-phase groups.

# Next idea, random sample G2M and S phase to the same number of barcodes as G1, make it easier to compare edits.
- this was originally done in 43_new_scRNA_editing (C_group_bams.ipynb)
- actually now that Alex has re-done this analysis to correct, we probably don't need to randomly downsample them. 

In [15]:
len(s_phase_barcodes), len(g2m_phase_barcodes), len(g1_phase_barcodes)

(3760, 3300, 2930)

# While we're at it, let's randomly sample S phase barcodes to scale with G2M 
- can't merge bams the old fashioned way, because bash doesn't like opening 4000+ barcodes (too many files open, or something like that).

# so we might be seeing a skew (in the EPKM scatterplots between G1 and S/G2M) because it seems like the G1 phase group contains barcodes that have higher Rps2 expression overall compared to barcodes in the G2M and S-phase. So randomly sampling either way will most likely pull a higher number of G2M/S-phase barcodes with 0 Rps2.. Alex correct me if i’m wrong about our discussion and Kris let me know if that’s not clear. But maybe we can try grabbing the top 10 barcodes from each group and comparing them to see if we can get similar edit numbers between groups, which may help with our comparisons

In [17]:
num_barcodes = 10

df = pd.read_csv(os.path.join(input_dir, 'RPS2_by_phase_normalized_counts.csv'), index_col=0)
s_phase = df[df['phase']=='S']
s_phase.sort_values(by=['RPS2_ORF'], ascending=False, inplace=True)
g2m_phase = df[df['phase']=='G2M']
g2m_phase.sort_values(by=['RPS2_ORF'], ascending=False, inplace=True)
g1_phase = df[df['phase']=='G1']
g1_phase.sort_values(by=['RPS2_ORF'], ascending=False, inplace=True)

s_top = s_phase.iloc[:num_barcodes,:]
g2m_top = g2m_phase.iloc[:num_barcodes,:]
g1_top = g1_phase.iloc[:num_barcodes,:]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [18]:
g1_top

Unnamed: 0,RPS2_ORF,phase,S_score,G2M_score
TGATGGTGTCGAATTC-1,3.023643,G1,-0.337198,-0.207916
TCATTCACAACTACGT-1,2.982395,G1,-0.630462,-0.127964
TCCACGTTCACGGGAA-1,2.962103,G1,-0.305739,-0.213765
TACCGAAAGCAGGCTA-1,2.837131,G1,-0.036027,-0.06761
GACCCAGCACTTCAAG-1,2.830805,G1,-0.295343,-0.313252
TGTGTGAAGTTCCGGC-1,2.783374,G1,-0.0208,-0.324364
TGATGCAGTGTAGGAC-1,2.733465,G1,-0.44822,-0.090963
CACTGGGTCGAGTGAG-1,2.725585,G1,-0.325397,-0.008296
AGATCGTTCCACTTTA-1,2.720087,G1,-0.455538,-0.340071
TCCATCGCAGACTGCC-1,2.655674,G1,-0.274704,-0.397995


In [19]:
groups = OrderedDict({
    'Rps2_polyA_H6_MD.G2M-phase-top{}.bam'.format(num_barcodes) : list(g2m_top.index),
    'Rps2_polyA_H6_MD.S-phase-top{}.bam'.format(num_barcodes) : list(s_top.index),
    'Rps2_polyA_H6_MD.G1-phase-top{}.bam'.format(num_barcodes) : list(g1_top.index)
})

cmds = []
for output_file, barcodes in groups.iteritems():
    bam_list = []
    for barcode in barcodes:
        bam_list.append(get_bam_file_name_from_barcode(barcode, 'Rps2'))

    cmds.append(merge_barcodes_cmd(split_bam_file_dir, os.path.join(output_dir, output_file), bam_list))
len(cmds)

3

In [20]:
Submitter(
    commands=cmds, 
    job_name='top{}-merge'.format(num_barcodes),  # whoops, named this random-top10-merge. Remove the "random" since it's not...
    sh='top{}-merge.sh'.format(num_barcodes),
    array=True,
    nodes=1,
    ppn=1,
    walltime='12:00:00',
    submit=False,
)

Writing 3 tasks as an array-job.
Wrote commands to random-top10-merge.sh.


<qtools.submitter.Submitter at 0x2b71ddb70b10>

# Let's try getting the top 100 (previously 45) normalized barcodes from a list that Alex gave me (2/27)
- this is a new list from what we had earlier. I could technically re-use the code but for now let's keep the analysis separate.

In [22]:
g1_top100 = pd.read_csv(os.path.join(input_dir, 'RPS2_G1_top100_normalized_counts.csv'), index_col=0)
s_top100 = pd.read_csv(os.path.join(input_dir, 'RPS2_S_top100_normalized_counts.csv'), index_col=0)
g2m_top100 = pd.read_csv(os.path.join(input_dir, 'RPS2_G2M_top100_normalized_counts.csv'), index_col=0)

print(g1_top100.shape[0], s_top100.shape[0], g2m_top100.shape[0])
g1_top100.head()

(100, 100, 100)


Unnamed: 0,RPS2_ORF,phase,S_score,G2M_score,sum
TCACAAGAGTTGTAAG-1,1.914856,G1,-0.954358,-0.526157,-1.480515
GGTTAACAGTTACGAA-1,1.952246,G1,-0.678978,-0.690163,-1.36914
ACTGCAAGTTGGTGTT-1,1.798545,G1,-0.999408,-0.332579,-1.331986
CCGGTGATCGTACACA-1,1.909264,G1,-0.437678,-0.874413,-1.312092
TCATCCGCACCGTGAC-1,1.755865,G1,-0.481436,-0.824819,-1.306255


In [23]:
groups = OrderedDict({
    'Rps2_polyA_H6_MD.G2M-phase-top100.bam' : list(g2m_top100.index),
    'Rps2_polyA_H6_MD.S-phase-top100.bam' : list(s_top100.index),
    'Rps2_polyA_H6_MD.G1-phase-top100.bam' : list(g1_top100.index)
})

cmds = []
for output_file, barcodes in groups.iteritems():
    bam_list = []
    for barcode in barcodes:
        bam_list.append(get_bam_file_name_from_barcode(barcode, 'Rps2'))

    cmds.append(merge_barcodes_cmd(split_bam_file_dir, os.path.join(output_dir, output_file), bam_list))

['samtools merge -f /home/bay001/projects/ryan_editing_20190314/permanent_data/49_SAILOR_on_47/bam_file_groups/Rps2_polyA_H6_MD.S-phase-top45.bam /home/bay001/projects/ryan_editing_20190314/permanent_data/49_SAILOR_on_47/bam_files/Rps2_polyA_H6_MD-ACCAACATCTCGACGG-1.bam /home/bay001/projects/ryan_editing_20190314/permanent_data/49_SAILOR_on_47/bam_files/Rps2_polyA_H6_MD-ACACAGTTCGTCTAAG-1.bam /home/bay001/projects/ryan_editing_20190314/permanent_data/49_SAILOR_on_47/bam_files/Rps2_polyA_H6_MD-AAACGCTGTTGCTCGG-1.bam /home/bay001/projects/ryan_editing_20190314/permanent_data/49_SAILOR_on_47/bam_files/Rps2_polyA_H6_MD-GATTGGTCATAGATCC-1.bam /home/bay001/projects/ryan_editing_20190314/permanent_data/49_SAILOR_on_47/bam_files/Rps2_polyA_H6_MD-CCACGAGGTCGATTCA-1.bam /home/bay001/projects/ryan_editing_20190314/permanent_data/49_SAILOR_on_47/bam_files/Rps2_polyA_H6_MD-GACCCAGCAAATGCGG-1.bam /home/bay001/projects/ryan_editing_20190314/permanent_data/49_SAILOR_on_47/bam_files/Rps2_polyA_H6_MD-AC

In [24]:
Submitter(
    commands=cmds,
    job_name='top100-merge',
    sh='top100-merge.sh',
    array=True,
    nodes=1,
    ppn=1,
    walltime='12:00:00',
    submit=True,
)

Writing 3 tasks as an array-job.
Wrote commands to top100-merge.sh.
Submitted script to queue home.
 Job ID: 20955640


<qtools.submitter.Submitter at 0x2b7207cd9bd0>

# Let's group the top 100 Apo control barcodes as well (negative control)

In [25]:
g1_top100 = pd.read_csv(os.path.join(input_dir, 'APO-control_G1_top100_normalized_counts.csv'), index_col=0)
s_top100 = pd.read_csv(os.path.join(input_dir, 'APO-control_S_top100_normalized_counts.csv'), index_col=0)
g2m_top100 = pd.read_csv(os.path.join(input_dir, 'APO-control_G2M_top100_normalized_counts.csv'), index_col=0)

print(g1_top100.shape[0], s_top100.shape[0], g2m_top100.shape[0])
g1_top100.head()

(100, 100, 100)


Unnamed: 0,APO-control,phase,S_score,G2M_score,sum
CAGTTCCTCACCATAG-1,1.526547,G1,-1.095671,-0.582331,-1.678002
TTCACCGTCTACCACC-1,1.666168,G1,-0.506678,-0.901658,-1.408336
GGATGTTCAAAGGCGT-1,2.113909,G1,-0.910133,-0.431973,-1.342106
CACAACATCCAACACA-1,1.779427,G1,-0.444847,-0.796795,-1.241642
CTCTGGTTCAGTGTTG-1,2.203589,G1,-0.736379,-0.504132,-1.240511


In [26]:
groups = OrderedDict({
    'Apo_polyA_H6_MD.G2M-phase-top100.bam' : list(g2m_top100.index),
    'Apo_polyA_H6_MD.S-phase-top100.bam' : list(s_top100.index),
    'Apo_polyA_H6_MD.G1-phase-top100.bam' : list(g1_top100.index)
})

cmds = []
for output_file, barcodes in groups.iteritems():
    bam_list = []
    for barcode in barcodes:
        bam_list.append(get_bam_file_name_from_barcode(barcode, 'Apo'))

    cmds.append(merge_barcodes_cmd(split_bam_file_dir, os.path.join(output_dir, output_file), bam_list))
cmds

['samtools merge -f /home/bay001/projects/ryan_editing_20190314/permanent_data/49_SAILOR_on_47/bam_file_groups/Apo_polyA_H6_MD.G2M-phase-top100.bam /home/bay001/projects/ryan_editing_20190314/permanent_data/49_SAILOR_on_47/bam_files/Apo_Control_polyA_H7_MD-GAGTGTTGTGGCAACA-1.bam /home/bay001/projects/ryan_editing_20190314/permanent_data/49_SAILOR_on_47/bam_files/Apo_Control_polyA_H7_MD-CCGCAAGAGCTATCCA-1.bam /home/bay001/projects/ryan_editing_20190314/permanent_data/49_SAILOR_on_47/bam_files/Apo_Control_polyA_H7_MD-CCTCTAGGTCACGTGC-1.bam /home/bay001/projects/ryan_editing_20190314/permanent_data/49_SAILOR_on_47/bam_files/Apo_Control_polyA_H7_MD-AGCCAGCGTGTAGCAG-1.bam /home/bay001/projects/ryan_editing_20190314/permanent_data/49_SAILOR_on_47/bam_files/Apo_Control_polyA_H7_MD-ACTTCGCTCTAGAGCT-1.bam /home/bay001/projects/ryan_editing_20190314/permanent_data/49_SAILOR_on_47/bam_files/Apo_Control_polyA_H7_MD-TCATTTGAGCCGCTTG-1.bam /home/bay001/projects/ryan_editing_20190314/permanent_data/4

In [27]:
Submitter(
    commands=cmds,
    job_name='top100-merge-apo',
    sh='top100-merge-apo.sh',
    array=True,
    nodes=1,
    ppn=1,
    walltime='12:00:00',
    submit=True,
)

Writing 3 tasks as an array-job.
Wrote commands to top100-merge-apo.sh.
Submitted script to queue home.
 Job ID: 20955657


<qtools.submitter.Submitter at 0x2b7207a8d110>