# This notebook parses the possorted_genome_bam_MD.bam to pull out only the barcodes specificed in Alex's lists.
- functionally equivalent to notebook 11_group_bams.ipynb, but in this case we're just re-parsing the main bam file instead of joining a bunch of smaller bam files.

In [1]:
import pandas as pd
import numpy as np
import os
import glob
import pysam
import time
from collections import defaultdict
from tqdm import tnrange, tqdm_notebook
from qtools import Submitter

pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.width', 10000)

In [2]:
input_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/inputs'
output_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/'

In [3]:
prefix = 'RPS2_escore_barcodes_with_flags'

# Read in expression values from Alex to help us decide which barcodes to look for/split.

original sources: 
```
/projects/ps-yeolab3/iachaim/10X/APOBEC/to_share/RPS2_EPKM_barcodes_with_flags.csv
/projects/ps-yeolab3/iachaim/10X/APOBEC/to_share/RPS2_escore_barcodes_with_flags.csv
```

In [4]:
# rps2_barcodes = pd.read_csv(os.path.join(input_dir, 'RPS2_EPKM_barcodes_with_flags.csv'))
rps2_barcodes = pd.read_csv(os.path.join(input_dir, '{}.csv'.format(prefix)))

rps2_barcodes.head()

Unnamed: 0.1,Unnamed: 0,n_genes,louvain_edits,n_counts,S_score,G2M_score,phase
0,AAACCCACAGGTACGA-1,4463,0,19455.0,-0.269578,0.010084,G2M
1,AAACGAAAGGGATGTC-1,4587,0,19799.0,-0.152814,0.091604,G2M
2,AAACGAAAGTGGTCAG-1,5123,0,25818.0,0.076701,-0.48185,S
3,AAACGAACACACCTTC-1,6084,0,35347.0,0.06873,0.245169,G2M
4,AAACGCTAGGAGTCTG-1,5418,1,30536.0,0.02263,0.325772,G2M


In [5]:
rps2_barcodes['phase'].value_counts()

S      1249
G2M    1076
G1      830
Name: phase, dtype: int64

In [6]:
g1_barcodes = rps2_barcodes[rps2_barcodes['phase']=='G1']
g2m_barcodes = rps2_barcodes[rps2_barcodes['phase']=='G2M']
s_barcodes = rps2_barcodes[rps2_barcodes['phase']=='S']

print(g1_barcodes.shape[0], g2m_barcodes.shape[0], s_barcodes.shape[0])

(830, 1076, 1249)


# Use the top 500 genes by score
- we don't have a G1 score, so let's use the top 500 genes with the lowest average S/G2M scores.
- for both S and G2M, we'll take the top 500 by score

In [7]:
g1_barcodes['avg_S_G2M'] = (g1_barcodes['S_score'] + g1_barcodes['G2M_score'])/2.
top_g1 = g1_barcodes.sort_values(by=['avg_S_G2M'], ascending=True)[:500]
top_g2m = g2m_barcodes.sort_values(by=['G2M_score'], ascending=False)[:500]
top_s = s_barcodes.sort_values(by=['S_score'], ascending=False)[:500]

print(top_g1.shape[0], top_g2m.shape[0], top_s.shape[0])
top_g1.head()

(500, 500, 500)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0.1,Unnamed: 0,n_genes,louvain_edits,n_counts,S_score,G2M_score,phase,avg_S_G2M
2204,GTTACCCAGGCGAACT-1,1308,3,2851.0,-0.363595,-1.34082,G1,-0.852208
1396,CTCAGAATCCGTTGAA-1,1124,3,2297.0,-1.09943,-0.534187,G1,-0.816809
896,CAGGCCAGTGAGTAAT-1,1824,3,3863.0,-0.895652,-0.719815,G1,-0.807734
443,AGGCTGCTCCCTGTTG-1,9705,1,112988.0,-0.424401,-0.879989,G1,-0.652195
2911,TGTTCCGTCACGAGGA-1,3515,2,13049.0,-0.866054,-0.27103,G1,-0.568542


# Save to a barcodes file

In [8]:
g1_barcodes[['Unnamed: 0']].to_csv(os.path.join(output_dir, '{}_g1_all.txt'.format(prefix)), sep='\t', index=False, header=False)
top_g1[['Unnamed: 0']].to_csv(os.path.join(output_dir,      '{}_g1_top500.txt'.format(prefix)), sep='\t', index=False, header=False)
g2m_barcodes[['Unnamed: 0']].to_csv(os.path.join(output_dir,'{}_g2m_all.txt'.format(prefix)), sep='\t', index=False, header=False)
top_g2m[['Unnamed: 0']].to_csv(os.path.join(output_dir,     '{}_g2m_top500.txt'.format(prefix)), sep='\t', index=False, header=False)
s_barcodes[['Unnamed: 0']].to_csv(os.path.join(output_dir,  '{}_s_all.txt'.format(prefix)), sep='\t', index=False, header=False)
top_s[['Unnamed: 0']].to_csv(os.path.join(output_dir,       '{}_s_top500.txt'.format(prefix)), sep='\t', index=False, header=False)

# Also let's select each louvain cluster and subset too

In [9]:
rps2_barcodes['louvain_edits'].value_counts()

0    1040
1    1028
2     894
3     193
Name: louvain_edits, dtype: int64

In [10]:
cluster_0 = rps2_barcodes[rps2_barcodes['louvain_edits']==0]
cluster_1 = rps2_barcodes[rps2_barcodes['louvain_edits']==1]
cluster_2 = rps2_barcodes[rps2_barcodes['louvain_edits']==2]
cluster_3 = rps2_barcodes[rps2_barcodes['louvain_edits']==3]
cluster_4 = rps2_barcodes[rps2_barcodes['louvain_edits']==4]
print(cluster_0.shape[0], cluster_1.shape[0], cluster_2.shape[0], cluster_3.shape[0], cluster_4.shape[0])
cluster_0.head()

(1040, 1028, 894, 193, 0)


Unnamed: 0.1,Unnamed: 0,n_genes,louvain_edits,n_counts,S_score,G2M_score,phase
0,AAACCCACAGGTACGA-1,4463,0,19455.0,-0.269578,0.010084,G2M
1,AAACGAAAGGGATGTC-1,4587,0,19799.0,-0.152814,0.091604,G2M
2,AAACGAAAGTGGTCAG-1,5123,0,25818.0,0.076701,-0.48185,S
3,AAACGAACACACCTTC-1,6084,0,35347.0,0.06873,0.245169,G2M
6,AAACGCTGTTAGTCGT-1,4831,0,23326.0,-0.005569,-0.174234,G1


In [11]:
cluster_0[['Unnamed: 0']].to_csv(os.path.join(output_dir, '{}_cluster0.txt'.format(prefix)), sep='\t', index=False, header=False)
cluster_1[['Unnamed: 0']].to_csv(os.path.join(output_dir, '{}_cluster1.txt'.format(prefix)), sep='\t', index=False, header=False)
cluster_2[['Unnamed: 0']].to_csv(os.path.join(output_dir, '{}_cluster2.txt'.format(prefix)), sep='\t', index=False, header=False)
cluster_3[['Unnamed: 0']].to_csv(os.path.join(output_dir, '{}_cluster3.txt'.format(prefix)), sep='\t', index=False, header=False)
# cluster_4[['Unnamed: 0']].to_csv(os.path.join(output_dir, '{}_cluster4.txt'.format(prefix)), sep='\t', index=False, header=False)

# Now let's check the files we just made to make sure the formatting is correct, and then build the commandlines to split the main RPS2 bam file into their respective barcodes.

In [12]:
all_barcodes_files = sorted(glob.glob(os.path.join(output_dir, '{}*.txt'.format(prefix))))
all_barcodes_files

['/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/RPS2_escore_barcodes_with_flags_cluster0.txt',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/RPS2_escore_barcodes_with_flags_cluster1.txt',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/RPS2_escore_barcodes_with_flags_cluster2.txt',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/RPS2_escore_barcodes_with_flags_cluster3.txt',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/RPS2_escore_barcodes_with_flags_g1_all.txt',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/RPS2_escore_barcodes_with_flags_g1_top500.txt',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/RPS2_escore_barcodes_with_flags_g2m_all.txt',
 '/home/bay001/project

In [13]:
for barcodes_file in all_barcodes_files:
    ! head $barcodes_file

AAACCCACAGGTACGA-1
AAACGAAAGGGATGTC-1
AAACGAAAGTGGTCAG-1
AAACGAACACACCTTC-1
AAACGCTGTTAGTCGT-1
AAAGAACGTCGACTTA-1
AAAGGTAGTCAAGCGA-1
AAAGGTATCATTCGTT-1
AAAGTGAGTATCGAGG-1
AAAGTGATCGTGCGAC-1
AAACGCTAGGAGTCTG-1
AAACGCTAGTTGGAAT-1
AAAGAACTCGAAGTGG-1
AAAGTCCAGTACAACA-1
AAAGTCCCACGTACAT-1
AAATGGAAGGGTAGCT-1
AACAAAGCAGAAGTGC-1
AACAAAGGTAGTATAG-1
AACAAAGGTCCCTAAA-1
AACAAAGTCCACGTAA-1
AAAGAACGTATCAGGG-1
AAAGAACGTATGGAGC-1
AAAGGATAGTATGAGT-1
AAAGGTACATCATGAC-1
AAAGGTAGTTCTTGTT-1
AAAGGTATCCTTCTTC-1
AAAGTCCAGGCTAACG-1
AAAGTCCCAGAGATGC-1
AAAGTGAAGCTATCTG-1
AAAGTGACATTGAGCT-1
AACCAACCATATCGGT-1
AACGAAAGTAGGAGGG-1
AAGCGAGCAATGAGCG-1
AAGCGAGTCCTCATAT-1
AAGGAATCATCGTGGC-1
AATGACCAGATGACAT-1
AATGGAAAGTAAGAGG-1
ACAAAGATCGCCAGAC-1
ACATCCCTCTGGGTCG-1
ACCCAAAGTGTACAGG-1
AAACGCTGTTAGTCGT-1
AAAGAACTCGAAGTGG-1
AAAGGTAGTCAAGCGA-1
AAATGGACAGATACCT-1
AACAAAGGTAGTATAG-1
AACAACCTCGCGTGCA-1
AACAAGAGTCCGGTGT-1
AACCACAAGCCTGGAA-1
AACCACACACACACTA-1
AACCTTTAGGTTATAG-1
GTTACCCAGGCGAACT-1
CTCAGAATCCGTTGAA-1
CAGGCCAGTGAG

In [14]:
bam_file = os.path.join(input_dir, 'RPS2-STAMP_possorted_genome_bam_MD.bam')

In [15]:
def generate_commandline_for_splitting(bam_file, barcodes_file, output_dir):
    cmd = 'module load python3essential;/home/bay001/projects/kris_apobec_20200121/scripts/split_bams.py '
    cmd += '--possorted_bam_file {} '.format(bam_file)
    cmd += '--barcodes_file {} '.format(barcodes_file)
    cmd += '--output_dir {} '.format(output_dir)
    cmd += '--group'
    return cmd

In [16]:
cmds = []
for barcodes_file in all_barcodes_files:
    cmds.append(generate_commandline_for_splitting(bam_file, barcodes_file, output_dir))
cmds

['module load python3essential;/home/bay001/projects/kris_apobec_20200121/scripts/split_bams.py --possorted_bam_file /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/inputs/RPS2-STAMP_possorted_genome_bam_MD.bam --barcodes_file /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/RPS2_escore_barcodes_with_flags_cluster0.txt --output_dir /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/ --group',
 'module load python3essential;/home/bay001/projects/kris_apobec_20200121/scripts/split_bams.py --possorted_bam_file /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/inputs/RPS2-STAMP_possorted_genome_bam_MD.bam --barcodes_file /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/RPS2_escore_barcodes_with_flags_cluster1.txt --output_dir /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/ 

In [17]:
Submitter(cmds, 'subset_barcodes', array=True, nodes=1, ppn=1, walltime='8:00:00', submit=True)

Writing 10 tasks as an array-job.
Wrote commands to subset_barcodes.sh.
Submitted script to queue home.
 Job ID: 24713067


<qtools.submitter.Submitter at 0x2b0358ecbf50>

In [None]:
# 

In [18]:
rps2_barcodes = pd.read_csv(os.path.join(input_dir, 'RPS2_non_APO_edits_barcodes_RPKM.csv'))
rps2_barcodes.head()


Unnamed: 0,index,batch,new_clusters
0,RPS2_possorted_genome_bam_MD-AAACCCACAGGTACGA-1-RPS2-Edits,RPS2-Edits,RPS2-fSTAMP
1,RPS2_possorted_genome_bam_MD-AAACCCAGTTCCGCAG-1-RPS2-Edits,RPS2-Edits,RPS2-fSTAMP
2,RPS2_possorted_genome_bam_MD-AAACCCATCCTAGCCT-1-RPS2-Edits,RPS2-Edits,RPS2-fSTAMP
3,RPS2_possorted_genome_bam_MD-AAACGAAAGGGATGTC-1-RPS2-Edits,RPS2-Edits,RPS2-fSTAMP
4,RPS2_possorted_genome_bam_MD-AAACGAAAGTGGTCAG-1-RPS2-Edits,RPS2-Edits,RPS2-fSTAMP


In [19]:
rps2_barcodes['barcode'] = rps2_barcodes['index'].apply(
    lambda x: x.replace(
        '-RPS2-Edits',''
    ).replace(
        'RPS2_possorted_genome_bam_MD-',''
    )
)
rps2_barcodes.head()

Unnamed: 0,index,batch,new_clusters,barcode
0,RPS2_possorted_genome_bam_MD-AAACCCACAGGTACGA-1-RPS2-Edits,RPS2-Edits,RPS2-fSTAMP,AAACCCACAGGTACGA-1
1,RPS2_possorted_genome_bam_MD-AAACCCAGTTCCGCAG-1-RPS2-Edits,RPS2-Edits,RPS2-fSTAMP,AAACCCAGTTCCGCAG-1
2,RPS2_possorted_genome_bam_MD-AAACCCATCCTAGCCT-1-RPS2-Edits,RPS2-Edits,RPS2-fSTAMP,AAACCCATCCTAGCCT-1
3,RPS2_possorted_genome_bam_MD-AAACGAAAGGGATGTC-1-RPS2-Edits,RPS2-Edits,RPS2-fSTAMP,AAACGAAAGGGATGTC-1
4,RPS2_possorted_genome_bam_MD-AAACGAAAGTGGTCAG-1-RPS2-Edits,RPS2-Edits,RPS2-fSTAMP,AAACGAAAGTGGTCAG-1


In [20]:
rps2_barcodes[['barcode']].to_csv(
    os.path.join(output_dir, 'RPS2_non_APO_edits_barcodes_RPKM.txt'),
    sep='\t',
    index=False,
    header=False
)

In [21]:
# check real quick

In [22]:
rps2_barcode_formatted_file = os.path.join(output_dir, 'RPS2_non_APO_edits_barcodes_RPKM.txt')
! head $rps2_barcode_formatted_file

AAACCCACAGGTACGA-1
AAACCCAGTTCCGCAG-1
AAACCCATCCTAGCCT-1
AAACGAAAGGGATGTC-1
AAACGAAAGTGGTCAG-1
AAACGAACACACCTTC-1
AAACGAACAGTTAAAG-1
AAACGAAGTATGACAA-1
AAACGAATCGACGCGT-1
AAACGCTAGGAGTCTG-1


In [23]:
cmds = []
for barcodes_file in [rps2_barcode_formatted_file]:
    cmds.append(generate_commandline_for_splitting(bam_file, barcodes_file, output_dir))
cmds

['module load python3essential;/home/bay001/projects/kris_apobec_20200121/scripts/split_bams.py --possorted_bam_file /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/inputs/RPS2-STAMP_possorted_genome_bam_MD.bam --barcodes_file /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/RPS2_non_APO_edits_barcodes_RPKM.txt --output_dir /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/ --group']