# Idea is to take the BED files with edits and run featureCounts on them, but featureCounts needs a BAM file as input.
- This notebook was formerly: La_calculate_epkm_sc.ipynb (/home/bay001/projects/ryan_editing_20190314/permanent_data/43_new_scRNA_editing/La_calculate_epkm_sc.ipynb)
- This notebook takes the edits (merged from both strands) for every barcode, grouped into 16 bins by the first 2 bases of the barcode (there are ~20k cells to account for, so each bin is processed as its own run), and runs featureCounts on each bin. It then joins all outputs together to form one dataframe.
- This notebook also counts edits across just the 3'UTR for comparison to bulk.

In [1]:
%matplotlib inline

import glob
import os
import subprocess
from collections import OrderedDict

import gffutils
import matplotlib.pyplot as plt
import pandas as pd
import pybedtools
import pysam
import scipy.stats as stats
import seaborn as sns
from qtools import Submitter
from tqdm import tnrange, tqdm_notebook

# Important! Filter the edit file for conf score

In [5]:
# Minimum confidence score an edit site must reach to be kept; the value is also
# embedded in the scratch directory name and in the output file names.
conf=0.9

In [2]:
# Every input, intermediate and output of this notebook lives under one project directory.
project_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA'

# The trailing '' component keeps the trailing slash on these two paths.
input_dir = os.path.join(project_dir, 'inputs', '')
output_dir = os.path.join(project_dir, 'outputs', '')

# Per-barcode edit BED files (one file per cell barcode).
sailor_output_dir = os.path.join(project_dir, 'sailor_outputs_individual_barcodes_merged_bedfiles')
# Scratch space; the confidence threshold is part of the directory name.
tmp_dir = os.path.join(project_dir, 'tmp', 'for_featurecounts{}'.format(conf))
rnaseq_output_dir = os.path.join(project_dir, 'bam_files')

In [3]:
# Create one scratch sub-directory per two-base barcode prefix (AA, AC, ..., TT).
# os.makedirs also creates tmp_dir itself when it is missing (a plain `mkdir`
# fails in that case) and does not depend on shell quoting of the path.
for nt in ['A', 'C', 'G', 'T']:
    for nt2 in ['A', 'C', 'G', 'T']:
        split_tmp_dir = os.path.join(tmp_dir, '{}{}'.format(nt, nt2))
        if not os.path.exists(split_tmp_dir):
            os.makedirs(split_tmp_dir)

# Convert BED to BAM
- every 'alignment' is an edit site. 

In [4]:
# Collect every per-barcode BED file; sorting gives a stable processing order.
bed_pattern = os.path.join(sailor_output_dir, '*.bed')
all_bed_files = sorted(glob.glob(bed_pattern))
print(len(all_bed_files))
# Peek at a few filenames: the barcode prefix is parsed out of them in the cells below.
all_bed_files[:3]

16946


['/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/sailor_outputs_individual_barcodes_merged_bedfiles/possorted_genome_bam_MD-AAACCCAAGAGCCCAA-1.bed',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/sailor_outputs_individual_barcodes_merged_bedfiles/possorted_genome_bam_MD-AAACCCAAGAGCTTTC-1.bed',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/sailor_outputs_individual_barcodes_merged_bedfiles/possorted_genome_bam_MD-AAACCCAAGATAGTGT-1.bed']

# Filter each BED file by confidence score, convert it to BAM with bedToBam, then sort it with samtools sort.

In [6]:
def filter_bed(input_bed, output_bed, conf):
    """
    Write the edit sites of ``input_bed`` whose confidence is at least ``conf``.

    ``input_bed`` is read as a headerless, tab-separated table with the columns
    chrom, start, end, conf, frac and strand. Rows with ``conf`` below the
    threshold are dropped; the remaining rows are written to ``output_bed`` in
    the same headerless, tab-separated layout.
    """
    columns = ['chrom','start','end','conf','frac','strand']
    edits = pd.read_table(input_bed, names=columns)
    confident = edits.loc[edits['conf'] >= conf]
    confident.to_csv(output_bed, sep='\t', header=False, index=False)


# Chromosome name/length table that bedToBam needs to build the BAM header.
genome = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/inputs/refdata-cellranger-hg19_lenti_common-3.0.0/star/chrNameLength.txt'


def _run_to_file(argv, out_path):
    """Run ``argv`` with its stdout written to ``out_path``; raises CalledProcessError on a non-zero exit."""
    with open(out_path, 'wb') as handle:
        subprocess.check_call(argv, stdout=handle)


# For every per-barcode BED file: filter by confidence, convert to BAM, sort.
# Each 'alignment' in the resulting BAM is one edit site.
for bed in tqdm_notebook(all_bed_files):
    bed_name = os.path.basename(bed)
    prefix = bed_name.split('possorted_genome_bam_MD-')[1][:2]                               # ie. possorted_genome_bam_MD-AAACCCAAGAGCCCAA-1.bed -> AA
    split_tmp_dir = os.path.join(tmp_dir, "{}".format(prefix))                               # ie. AA -> for_featurecounts0.9/AA
    filter_fn = os.path.join(split_tmp_dir, bed_name + ".{}.bed".format(conf))               # ie. for_featurecounts0.9/AA/possorted_genome_bam_MD-AAACCCAAGAGCCCAA-1.bed.0.9.bed
    output_bam = os.path.join(split_tmp_dir, bed_name.replace('.bed','.bam'))                # ie. for_featurecounts0.9/AA/possorted_genome_bam_MD-AAACCCAAGAGCCCAA-1.bam
    output_sorted_bam = os.path.join(split_tmp_dir, bed_name.replace('.bed','.sorted.bam'))  # ie. for_featurecounts0.9/AA/possorted_genome_bam_MD-AAACCCAAGAGCCCAA-1.sorted.bam

    # A sorted BAM that already exists is treated as finished work, which makes the cell resumable.
    if os.path.exists(output_sorted_bam):
        continue

    filter_bed(bed, filter_fn, conf)
    _run_to_file(['bedToBam', '-i', filter_fn, '-g', genome], output_bam)

    # Sort into a side file and rename only after samtools succeeded, so an
    # interrupted or failed run can never leave a truncated *.sorted.bam that
    # the existence check above would later mistake for a complete one.
    partial_sorted_bam = output_sorted_bam + '.partial'
    _run_to_file(['samtools', 'sort', output_bam], partial_sorted_bam)
    os.rename(partial_sorted_bam, output_sorted_bam)

HBox(children=(IntProgress(value=0, max=16946), HTML(value=u'')))

# Run featureCounts to get the number of edits assigned to each gene.
- Run featureCounts 16 times, once in each "bin" to save resources

In [7]:
# Gene annotation (GTF) from the refdata-cellranger-hg19_lenti_common-3.0.0 reference;
# it is handed to featureCounts through the -a option below.
gtf = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/inputs/refdata-cellranger-hg19_lenti_common-3.0.0/genes/genes.gtf'

In [8]:
# Build one featureCounts command per barcode-prefix bin (AA, AC, ..., TT).
# Each command counts every *.sorted.bam in the bin against the full GTF
# (-s 1: stranded counting, -R CORE: also write the per-read assignment report).
bases = ['A', 'C', 'G', 'T']
cmds = []
for first in bases:
    for second in bases:
        bin_dir = os.path.join(tmp_dir, first + second)
        counts_file = os.path.join(bin_dir, 'counts_at_conf_{}.txt'.format(conf))
        bam_glob = os.path.join(bin_dir, '*.sorted.bam')
        cmds.append(' '.join([
            'module load subreadfeaturecounts;featureCounts',
            '-a {}'.format(gtf),
            '-s 1',
            '-o {}'.format(counts_file),
            '-R CORE',
            bam_glob,
        ]))

In [9]:
# Sanity check: there should be one command per bin; show the first two for inspection.
print(len(cmds))
cmds[:2]

16


['module load subreadfeaturecounts;featureCounts -a /home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/inputs/refdata-cellranger-hg19_lenti_common-3.0.0/genes/genes.gtf -s 1 -o /home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/tmp/for_featurecounts0.9/AA/counts_at_conf_0.9.txt -R CORE /home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/tmp/for_featurecounts0.9/AA/*.sorted.bam',
 'module load subreadfeaturecounts;featureCounts -a /home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/inputs/refdata-cellranger-hg19_lenti_common-3.0.0/genes/genes.gtf -s 1 -o /home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/tmp/for_featurecounts0.9/AC/counts_at_conf_0.9.txt -R CORE /home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/tmp/for_featurecounts0.9/AC/*.sorted.bam']

In [10]:
# Write the gene-level featureCounts commands to featureCounts.sh as an array job.
# NOTE(review): with submit=False the script presumably is only written, not queued,
# so it has to be submitted by hand before the merge cells further down can find
# their inputs. Confirm against qtools.
Submitter(
    commands=cmds,
    job_name='featureCounts',
    sh='featureCounts.sh',
    array=True,
    nodes=1,
    ppn=2,
    walltime='48:00:00',
    submit=False,
)

Writing 16 tasks as an array-job.
Wrote commands to featureCounts.sh.


<qtools.submitter.Submitter at 0x2b209fe8bb10>

# Let's run featureCounts across just 3utrs also, in addition to the whole gene.
- Using the 10X ensembl-style three_prime_utr annotations, generate a SAF file and just call featureCounts on that.
- Run featureCounts 16 times, once in each "bin" to save resources

In [11]:
# Destination for the subset of the annotation that mentions three_prime_utr.
gtf_3utr_file = os.path.join(input_dir, 'genes.three_prime_utr_only.gtf')

# grep keeps every GTF line containing the text 'three_prime_utr' anywhere on the
# line (it is not restricted to the feature-type column) and overwrites the target file.
! grep --color 'three_prime_utr' $gtf > $gtf_3utr_file

In [12]:
# Parse the 3'UTR-only GTF and keep the five columns that are written to the SAF file.
# Column names have to be unique: the original list used '.' twice, which pandas
# warned about and newer pandas versions reject outright.
gtf_columns = ['chrom', 'src', 'region', 'start', 'end', 'score', 'strand', 'frame', 'attr']
gtf_3utr = pd.read_csv(gtf_3utr_file, names=gtf_columns, sep='\t')
# Raw string so \w, \d and \. reach the regex engine untouched; expand=False
# makes the single capture group come back as a Series on every pandas version.
gtf_3utr['geneid'] = gtf_3utr['attr'].str.extract(r'gene_id "([\w\d\.]+)"', expand=False)
gtf_3utr = gtf_3utr[['geneid','chrom','start','end','strand']]
gtf_3utr.head()

  return _read(filepath_or_buffer, kwds)


Unnamed: 0,geneid,chrom,start,end,strand
0,ENSG00000237683,1,137621,138529,-
1,ENSG00000237683,1,134901,135802,-
2,ENSG00000235249,1,368598,368634,+
3,ENSG00000185097,1,621059,621095,-
4,ENSG00000187634,1,879534,879955,+


In [13]:
# Save the 3'UTR intervals (geneid, chrom, start, end, strand) as a tab-separated
# file without index or header row.
# NOTE(review): confirm that the featureCounts version in use accepts a SAF file
# that has no header line.
saf_path = os.path.join(input_dir, 'genes.three_prime_utr_only.SAF')
gtf_3utr.to_csv(saf_path, sep='\t', header=False, index=False)

In [14]:
# Path of the 3'UTR SAF annotation; passed to featureCounts with -a together with -F SAF.
saf = os.path.join(input_dir, 'genes.three_prime_utr_only.SAF')

In [15]:
# Build one featureCounts command per barcode-prefix bin, this time counting
# against the 3'UTR-only SAF annotation (-F SAF) instead of the full GTF.
bases = ['A', 'C', 'G', 'T']
cmds = []
for first in bases:
    for second in bases:
        bin_dir = os.path.join(tmp_dir, first + second)
        counts_file = os.path.join(bin_dir, 'counts_at_conf_{}.three_prime_utr.txt'.format(conf))
        bam_glob = os.path.join(bin_dir, '*.sorted.bam')
        cmds.append(' '.join([
            'module load subreadfeaturecounts;featureCounts',
            '-a {}'.format(saf),
            '-F SAF',
            '-s 1',
            '-o {}'.format(counts_file),
            '-R CORE',
            bam_glob,
        ]))

In [16]:
# Inspect the first 3'UTR command to confirm the SAF options and the paths look right.
cmds[0]

'module load subreadfeaturecounts;featureCounts -a /home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/inputs/genes.three_prime_utr_only.SAF -F SAF -s 1 -o /home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/tmp/for_featurecounts0.9/AA/counts_at_conf_0.9.three_prime_utr.txt -R CORE /home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/tmp/for_featurecounts0.9/AA/*.sorted.bam'

In [17]:
# Write the 3'UTR featureCounts commands to featureCounts-3utr.sh as an array job.
# NOTE(review): with submit=False the script presumably is only written, not queued,
# so it has to be submitted by hand before the 3'UTR merge cells further down can
# find their inputs. Confirm against qtools.
Submitter(
    commands=cmds,
    job_name='featureCounts-3utr',
    sh='featureCounts-3utr.sh',
    array=True,
    nodes=1,
    ppn=2,
    walltime='48:00:00',
    submit=False,
)

Writing 16 tasks as an array-job.
Wrote commands to featureCounts-3utr.sh.


<qtools.submitter.Submitter at 0x2b209fe8b390>

# Read in featureCounts outputs from each bin and merge them now.
- each count column is named after the sorted BAM file it was computed from, e.g. /home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/tmp/for_featurecounts0.9/AA/possorted_genome_bam_MD-AAACCCAAGAGCCCAA-1.sorted.bam
- original 'split-bam' files look like this: /home/bay001/projects/ryan_editing_20190314/permanent_data/43_new_scRNA_editing/bam_files/possorted_genome_bam_MD-AGTGTTGTCGAACCAT-1.sorted.bam

In [18]:
# Look for the gene-level featureCounts table of every bin before merging them.
counts_pattern = os.path.join(tmp_dir, "*", "counts_at_conf_{}.txt".format(conf))
print("checking {}".format(counts_pattern))
all_counts = glob.glob(counts_pattern)
print(len(all_counts))
all_counts[:3]

checking /home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/tmp/for_featurecounts0.9/*/counts_at_conf_0.9.txt
16


['/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/tmp/for_featurecounts0.9/CA/counts_at_conf_0.9.txt',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/tmp/for_featurecounts0.9/TA/counts_at_conf_0.9.txt',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/tmp/for_featurecounts0.9/AT/counts_at_conf_0.9.txt']

In [19]:
# Start from an empty frame; the per-bin count tables are outer-joined onto it one by one.
merged = pd.DataFrame(index=[])
merged

In [20]:
# Join the gene-level count tables of all 16 bins into one genes x barcodes table.
# `merged` is (re)initialised here so this cell can be re-run on its own: without
# the reset a second run would join the same columns again and pandas would
# disambiguate them with _x/_y suffixes.
merged = pd.DataFrame(index=[])
prefixes = [nt + nt2 for nt in ['A', 'C', 'G', 'T'] for nt2 in ['A', 'C', 'G', 'T']]
for prefix in tqdm_notebook(prefixes):
    split_input_dir = os.path.join(tmp_dir, prefix)
    # skiprows=1 skips the first line of the file so that the second line becomes the header.
    df = pd.read_csv(os.path.join(split_input_dir, 'counts_at_conf_{}.txt'.format(conf)), sep='\t', index_col=0, skiprows=1)
    # Drop the first five non-index columns so that only the per-BAM count columns remain.
    df = df.iloc[:,5:]
    # Strip the bin directory from the column names (a leading '/' is left in place).
    df.columns = [
        c.replace(split_input_dir, '') for c in df.columns
    ]
    merged = pd.merge(merged, df, how='outer', left_index=True, right_index=True)
    print("dataframe size: {}, NEW merged size: {}".format(df.shape, merged.shape))
        

HBox(children=(IntProgress(value=0, max=16), HTML(value=u'')))

dataframe size: (32739, 887), NEW merged size: (32739, 887)
dataframe size: (32739, 943), NEW merged size: (32739, 1830)
dataframe size: (32739, 1219), NEW merged size: (32739, 3049)
dataframe size: (32739, 1063), NEW merged size: (32739, 4112)
dataframe size: (32739, 1252), NEW merged size: (32739, 5364)
dataframe size: (32739, 808), NEW merged size: (32739, 6172)
dataframe size: (32739, 738), NEW merged size: (32739, 6910)
dataframe size: (32739, 1205), NEW merged size: (32739, 8115)
dataframe size: (32739, 1131), NEW merged size: (32739, 9246)
dataframe size: (32739, 760), NEW merged size: (32739, 10006)
dataframe size: (32739, 882), NEW merged size: (32739, 10888)
dataframe size: (32739, 1258), NEW merged size: (32739, 12146)
dataframe size: (32739, 836), NEW merged size: (32739, 12982)
dataframe size: (32739, 1558), NEW merged size: (32739, 14540)
dataframe size: (32739, 1249), NEW merged size: (32739, 15789)
dataframe size: (32739, 1157), NEW merged size: (32739, 16946)


In [21]:
# Persist the merged gene-level table as a tab-separated file.
gene_counts_path = os.path.join(output_dir, 'all_scRNA_barcodes_counts.conf{}.txt'.format(conf))
merged.to_csv(gene_counts_path, sep='\t')

# Read in featureCounts (3'utr) output

In [22]:
# Look for the 3'UTR featureCounts table of every bin before merging them.
counts_pattern = os.path.join(tmp_dir, "*", "counts_at_conf_{}.three_prime_utr.txt".format(conf))
print("checking {}".format(counts_pattern))

all_counts = glob.glob(counts_pattern)
print(len(all_counts))
all_counts[:3]

checking /home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/tmp/for_featurecounts0.9/*/counts_at_conf_0.9.three_prime_utr.txt
16


['/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/tmp/for_featurecounts0.9/CA/counts_at_conf_0.9.three_prime_utr.txt',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/tmp/for_featurecounts0.9/TA/counts_at_conf_0.9.three_prime_utr.txt',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/tmp/for_featurecounts0.9/AT/counts_at_conf_0.9.three_prime_utr.txt']

In [23]:
# Reset `merged` to an empty frame; this discards whatever table the name held before.
merged = pd.DataFrame(index=[])
merged

In [24]:
# Join the 3'UTR count tables of all 16 bins into one genes x barcodes table.
# `merged` is (re)initialised here so this cell can be re-run on its own: without
# the reset a second run would join the same columns again and pandas would
# disambiguate them with _x/_y suffixes.
# NOTE: the name `merged` is shared with the gene-level table, so the gene-level
# table must already be written to disk before this cell runs.
merged = pd.DataFrame(index=[])
prefixes = [nt + nt2 for nt in ['A', 'C', 'G', 'T'] for nt2 in ['A', 'C', 'G', 'T']]
for prefix in tqdm_notebook(prefixes):
    split_input_dir = os.path.join(tmp_dir, prefix)
    # skiprows=1 skips the first line of the file so that the second line becomes the header.
    df = pd.read_csv(os.path.join(split_input_dir, 'counts_at_conf_{}.three_prime_utr.txt'.format(conf)), sep='\t', index_col=0, skiprows=1)
    # Drop the first five non-index columns so that only the per-BAM count columns remain.
    df = df.iloc[:,5:]
    # Strip the bin directory from the column names (a leading '/' is left in place).
    df.columns = [
        c.replace(split_input_dir, '') for c in df.columns
    ]
    merged = pd.merge(merged, df, how='outer', left_index=True, right_index=True)
    print("dataframe size: {}, NEW merged size: {}".format(df.shape, merged.shape))
        

HBox(children=(IntProgress(value=0, max=16), HTML(value=u'')))

dataframe size: (19615, 887), NEW merged size: (19615, 887)
dataframe size: (19615, 943), NEW merged size: (19615, 1830)
dataframe size: (19615, 1219), NEW merged size: (19615, 3049)
dataframe size: (19615, 1063), NEW merged size: (19615, 4112)
dataframe size: (19615, 1252), NEW merged size: (19615, 5364)
dataframe size: (19615, 808), NEW merged size: (19615, 6172)
dataframe size: (19615, 738), NEW merged size: (19615, 6910)
dataframe size: (19615, 1205), NEW merged size: (19615, 8115)
dataframe size: (19615, 1131), NEW merged size: (19615, 9246)
dataframe size: (19615, 760), NEW merged size: (19615, 10006)
dataframe size: (19615, 882), NEW merged size: (19615, 10888)
dataframe size: (19615, 1258), NEW merged size: (19615, 12146)
dataframe size: (19615, 836), NEW merged size: (19615, 12982)
dataframe size: (19615, 1558), NEW merged size: (19615, 14540)
dataframe size: (19615, 1249), NEW merged size: (19615, 15789)
dataframe size: (19615, 1157), NEW merged size: (19615, 16946)


In [25]:
# Persist the merged 3'UTR table as a tab-separated file.
three_prime_utr_counts_path = os.path.join(output_dir, 'all_scRNA_barcodes_counts.conf{}.three_prime_utr.txt'.format(conf))
merged.to_csv(three_prime_utr_counts_path, sep='\t')

In [26]:
# Report where the 3'UTR table lives (this cell only prints the path, it writes nothing).
three_prime_utr_report_path = os.path.join(output_dir, 'all_scRNA_barcodes_counts.conf{}.three_prime_utr.txt'.format(conf))
print("wrote to: {}".format(three_prime_utr_report_path))

wrote to: /home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/outputs/all_scRNA_barcodes_counts.conf0.9.three_prime_utr.txt


In [27]:
# Report where the gene-level table lives (this cell only prints the path, it writes nothing).
gene_report_path = os.path.join(output_dir, 'all_scRNA_barcodes_counts.conf{}.txt'.format(conf))
print("wrote to: {}".format(gene_report_path))

wrote to: /home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/outputs/all_scRNA_barcodes_counts.conf0.9.txt
