# For the "new" EPKM measurement, we need the expression (read counts) across selected CDS or CDS+3'UTR regions.

In [1]:
%matplotlib inline

import glob
import os
import pandas as pd
import gffutils
import pysam
import pybedtools
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
from collections import OrderedDict
from tqdm import tnrange, tqdm_notebook
from qtools import Submitter

In [2]:
# Root of this experiment's data; the BAM inputs and the count outputs both live under it.
project_root = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA_APOBEC_RPS2'

# Directory containing the BAM files handed to featureCounts.
input_dir = os.path.join(project_root, 'bam_files')
# Directory that receives the featureCounts tables and the merged count matrix.
output_dir = os.path.join(project_root, 'outputs/expression_counts')

# Run featureCounts to get the number of reads assigned to each gene.
- Run featureCounts 16 times per sample, once for each two-letter "bin" of BAM files (AA through TT), to save resources

In [3]:
# Annotation region to count reads over. Supported values: 'cds_only' or 'cds_and_3utr'.
# The value picks the SAF file in the if/else below and is embedded in the output file names.
# NOTE: the cell that reads the featureCounts output assigns `region` again, so the
# value set here only governs the job-submission cells.
region = 'cds_only'  # can be cds_and_3utr. Controls the if/else below, and sets the output suffixes.

In [4]:
# SAF annotation file for each supported region; the selected path is passed to
# featureCounts through its -a option when the commands are built.
saf_by_region = {
    'cds_and_3utr': '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/inputs/genes.cds_and_3utr.SAF',
    'cds_only': '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/inputs/genes.cds_only.SAF',
}

if region not in saf_by_region:
    # Fail here instead of falling back to saf = None, which would be formatted as
    # '-a None' into every command and only surface as failed jobs on the cluster.
    raise ValueError(
        "Unknown region {!r}; expected one of {}".format(region, sorted(saf_by_region))
    )

saf = saf_by_region[region]

In [5]:
# The 16 two-letter suffixes (AA, AC, ..., TT) that follow each BAM prefix.
# One featureCounts command is built per suffix, in the same order as a nested A/C/G/T loop.
dinucleotides = [first + second for first in 'ACGT' for second in 'ACGT']

for prefix in ('Apo_Control_possorted_genome_bam_MD-', 'RPS2_possorted_genome_bam_MD-'):
    # Rebuilt for every prefix, so after the loop `cmds` holds only the last prefix's commands.
    cmds = []
    for dinuc in dinucleotides:
        # The glob is left unquoted so that the shell expands it to every matching BAM file.
        bam_glob = os.path.join(input_dir, '{}{}*.bam'.format(prefix, dinuc))
        output_file = os.path.join(
            output_dir, '{}{}_counts.{}.txt'.format(prefix, dinuc, region)
        )
        # -a/-F SAF: annotation file and its format; -s 1: strand setting; -o: output table.
        cmds.append(
            'module load subreadfeaturecounts;featureCounts -a {} -F SAF -s 1 -o {} {}'.format(
                saf, output_file, bam_glob
            )
        )

    # Hand this prefix's commands to Submitter as one array job (submit=True queues it immediately).
    script_stem = '{}_{}featureCounts'.format(prefix, region)
    Submitter(
        commands=cmds,
        job_name=script_stem,
        sh='{}.sh'.format(script_stem),
        array=True,
        nodes=1,
        ppn=2,
        walltime='48:00:00',
        submit=True,
    )

Writing 16 tasks as an array-job.
Wrote commands to Apo_Control_possorted_genome_bam_MD-featureCounts.sh.
Submitted script to queue home.
 Job ID: 24376446
Writing 16 tasks as an array-job.
Wrote commands to RPS2_possorted_genome_bam_MD-featureCounts.sh.
Submitted script to queue home.
 Job ID: 24376447


In [6]:
# Sanity check of the generated commands. `cmds` is re-created for each prefix in the
# submission loop, so this shows only the commands built for the last prefix processed.
print(len(cmds))
cmds[:2]

16


['module load subreadfeaturecounts;featureCounts -a /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/inputs/genes.cds_only.SAF -F SAF -s 1 -o /home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA_APOBEC_RPS2/outputs/expression_counts/RPS2_possorted_genome_bam_MD-AA_counts.cds_only.txt /home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA_APOBEC_RPS2/bam_files/RPS2_possorted_genome_bam_MD-AA*.bam',
 'module load subreadfeaturecounts;featureCounts -a /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/inputs/genes.cds_only.SAF -F SAF -s 1 -o /home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA_APOBEC_RPS2/outputs/expression_counts/RPS2_possorted_genome_bam_MD-AC_counts.cds_only.txt /home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA_APOBEC_RPS2/bam_files/RPS2_possorted_genome_bam_MD-AC*.bam']

# Read in featureCounts (CDS + 3'UTR) output

In [20]:
# Select which sample and region to collect. Both names are reassigned here, so they
# override whatever the job-submission cells left behind.
prefix = 'RPS2_possorted_genome_bam_MD-'  # bad, refactor
region = 'cds_and_3utr'

# One featureCounts table per two-letter bin; sorted so the merge order is deterministic.
all_counts = sorted(glob.glob(os.path.join(output_dir, '{}*_counts.{}.txt'.format(prefix, region))))

if not all_counts:
    # The featureCounts jobs are submitted to a queue, so their outputs may not exist yet.
    # Stop here rather than silently building and writing an empty merged table.
    raise IOError(
        "No featureCounts outputs found in {} for prefix {!r} and region {!r}".format(
            output_dir, prefix, region
        )
    )

print(len(all_counts))
all_counts[:3]

16


['/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA_APOBEC_RPS2/outputs/expression_counts/RPS2_possorted_genome_bam_MD-AA_counts.cds_and_3utr.txt',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA_APOBEC_RPS2/outputs/expression_counts/RPS2_possorted_genome_bam_MD-AC_counts.cds_and_3utr.txt',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA_APOBEC_RPS2/outputs/expression_counts/RPS2_possorted_genome_bam_MD-AG_counts.cds_and_3utr.txt']

In [21]:
# Empty accumulator frame; the per-bin count tables are outer-merged into it on the index
# by the loop in the following cell.
merged = pd.DataFrame(index=[])
merged

In [22]:
# Reset the accumulator so that re-running this cell does not merge the same tables
# into an already-populated frame (which would produce suffixed duplicate columns).
merged = pd.DataFrame(index=[])

# Iterating through tqdm_notebook ties the progress total to len(all_counts)
# instead of a hard-coded 16 and removes the manual update() call.
for counts in tqdm_notebook(all_counts):
    # skiprows=1 drops the first line of the file; the first column becomes the index.
    df = pd.read_csv(counts, sep='\t', index_col=0, skiprows=1)
    # Discard the first five remaining columns and keep the rest.
    # NOTE(review): per the featureCounts output format these five should be
    # Chr/Start/End/Strand/Length, leaving one count column per BAM file - confirm.
    df = df.iloc[:, 5:]
    # The remaining headers are the BAM paths given on the command line; keep only the
    # file name rather than stripping a hard-coded copy of the input directory.
    df.columns = [os.path.basename(c) for c in df.columns]
    # Outer merge on the index keeps every row id seen in any table.
    merged = pd.merge(merged, df, how='outer', left_index=True, right_index=True)
    print("dataframe size: {}, NEW merged size: {}".format(df.shape, merged.shape))

HBox(children=(IntProgress(value=0, max=16), HTML(value=u'')))

dataframe size: (20356, 570), NEW merged size: (20356, 570)
dataframe size: (20356, 601), NEW merged size: (20356, 1171)
dataframe size: (20356, 813), NEW merged size: (20356, 1984)
dataframe size: (20356, 700), NEW merged size: (20356, 2684)
dataframe size: (20356, 826), NEW merged size: (20356, 3510)
dataframe size: (20356, 564), NEW merged size: (20356, 4074)
dataframe size: (20356, 494), NEW merged size: (20356, 4568)
dataframe size: (20356, 770), NEW merged size: (20356, 5338)
dataframe size: (20356, 728), NEW merged size: (20356, 6066)
dataframe size: (20356, 440), NEW merged size: (20356, 6506)
dataframe size: (20356, 617), NEW merged size: (20356, 7123)
dataframe size: (20356, 817), NEW merged size: (20356, 7940)
dataframe size: (20356, 648), NEW merged size: (20356, 8588)
dataframe size: (20356, 956), NEW merged size: (20356, 9544)
dataframe size: (20356, 822), NEW merged size: (20356, 10366)
dataframe size: (20356, 784), NEW merged size: (20356, 11150)


In [23]:
# Write the merged count matrix as a tab-separated file in the counts output directory,
# named after the current prefix and region.
merged_counts_path = os.path.join(output_dir, '{}all_counts.{}.txt'.format(prefix, region))
merged.to_csv(merged_counts_path, sep='\t')