# This notebook repeatedly downsamples the RPS2-STAMP BAM file (10 seeded trials) to produce inputs for a batch of SAILOR jobs

In [1]:
import pandas as pd
import numpy as np
import os
import glob
import pysam
import time
from collections import defaultdict
from tqdm import tnrange, tqdm_notebook
from qtools import Submitter

# Widen pandas' display limits so long values (e.g. full file paths or shell
# commands) are shown without truncation or wrapping in notebook output.
for option_name, option_value in (
    ('display.max_colwidth', 1000),
    ('display.width', 10000),
):
    pd.set_option(option_name, option_value)

In [2]:
# Directory holding the source BAM file that gets downsampled.
input_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/inputs'
# Directory the downsampled BAM files are written to (note the trailing slash).
output_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/'

In [3]:
# Full path to the RPS2-STAMP BAM file that every trial is subsampled from.
rps2_10k = os.path.join(input_dir, 'RPS2-STAMP_possorted_genome_bam_MD.bam')
# Fail fast, and report which path is missing, before any commands are generated.
assert os.path.exists(rps2_10k), "Input BAM not found: {}".format(rps2_10k)

# Read in expression values from Alex to help us decide which barcodes to look for/split.

Original sources:
```
RPS2_STAMP_Apo_filtered_lenti_common_expression.csv
APOBEC_STAMP_Apo_filtered_lenti_common_expression.csv
```

In [4]:
# Fractional part of the `samtools view -s` argument, kept as a string with its
# leading dot so that a trial number can be written directly in front of it
# (e.g. trial 3 + ".08" -> "-s 3.08": seed 3, sampling fraction 0.08).
# The same string is also embedded in the output file names.
fraction = ".08"

In [5]:
# Build one shell command per downsampling trial.
# samtools reads `-s <seed>.<fraction>`, so writing the trial number directly in
# front of `fraction` gives each trial its own seed at the same sampling fraction.
command_template = 'samtools view -s {}{} -b {} > {}'
source_bam_name = os.path.basename(rps2_10k)

cmds = []
for trial in range(10):
    # Output name = source file name + "downsampled<fraction>-<trial>.bam".
    downsampled_name = source_bam_name + "downsampled{}-{}.bam".format(fraction, trial)
    rps2_ds = os.path.join(output_dir, downsampled_name)
    cmd = command_template.format(trial, fraction, rps2_10k, rps2_ds)
    cmds.append(cmd)

In [6]:
# Sanity check: show the first generated command (trial 0).
cmds[0]

'samtools view -s 0.08 -b /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/inputs/RPS2-STAMP_possorted_genome_bam_MD.bam > /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/RPS2-STAMP_possorted_genome_bam_MD.bamdownsampled.08-0.bam'

In [7]:
# Hand the downsampling commands to qtools as an array job named 'downsample_bams'.
# NOTE(review): cmds[1:] leaves out trial 0, so only 9 of the 10 commands are
# included — confirm this is intentional (e.g. trial 0 was run separately).
# NOTE(review): submit=False presumably writes the job script without queueing
# it — TODO confirm against qtools.
Submitter(cmds[1:], 'downsample_bams', array=True, nodes=1, ppn=1, walltime='8:00:00', submit=False)

Writing 9 tasks as an array-job.
Wrote commands to downsample_bams.sh.


<qtools.submitter.Submitter at 0x2b1348930910>

In [8]:
# Show every generated command (all 10 trials) for inspection.
cmds

['samtools view -s 0.08 -b /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/inputs/RPS2-STAMP_possorted_genome_bam_MD.bam > /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/RPS2-STAMP_possorted_genome_bam_MD.bamdownsampled.08-0.bam',
 'samtools view -s 1.08 -b /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/inputs/RPS2-STAMP_possorted_genome_bam_MD.bam > /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/RPS2-STAMP_possorted_genome_bam_MD.bamdownsampled.08-1.bam',
 'samtools view -s 2.08 -b /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/inputs/RPS2-STAMP_possorted_genome_bam_MD.bam > /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bam_file_groups/RPS2-STAMP_possorted_genome_bam_MD.bamdownsampled.08-2.bam',
 'samtools view -s 3.08 -b /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA