# Calculates new epkm values for each cell barcode in ApoControl and RPS2.

In [1]:
%matplotlib inline
from matplotlib_venn import venn2
from scipy import stats

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os
import glob
import gffutils
from tqdm import tnrange, tqdm_notebook

In [2]:
# Both directories live directly under the same 03_scRNA results folder.
_scrna_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA'
# Input: the per-barcode '*.annotated' files are globbed from here.
annotated_dir = _scrna_dir + '/sailor_outputs_individual_barcodes_merged_bedfiles'
# Output: one '.epkm.tsv' table per barcode is written here.
output_dir = _scrna_dir + '/sailor_outputs_individual_barcodes_merged_epkm'

In [3]:
# Region the EPKM is computed over: 'cds', 'cds_and_3utr', or 'exons'.
# Controls the if/elif statements below and should match the suffix of the files that get created.
region = 'cds'
# Values of the 'region' column inside the annotated file to keep, e.g. ['CDS'] or ['CDS', '3utr'].
regions = ['CDS']
# This analysis takes counts data from two sources with different formats:
#   1) 10X output for counting all exonic regions
#   2) featureCounts for counting CDS/CDS+3UTR regions
# Mostly helps with formatting, see convert_filename_to_barcode(): True when using all exons,
# False when using a merged featureCounts file.
replace = False
# Sample to process: either 'Apo_Control' or 'RPS2'.
rbp = 'Apo_Control'
# Passed to read_and_sum_edits(): when False, everything after the first '.' in each gene id is
# dropped before the edited reads are summed per gene.
gencode = False

In [4]:
# Pick the per-cell counts file (depends on region and rbp) and the gene-lengths file (depends on region only).
# Note: any rbp other than 'RPS2' falls through to the Apo_Control files.
if region == 'cds':
    if rbp == 'RPS2':
        counts_file = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA_APOBEC_RPS2/outputs/expression_counts/RPS2_possorted_genome_bam_MD-all_counts.cds_only.txt'
    else:
        counts_file = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA_APOBEC_RPS2/outputs/expression_counts/Apo_Control_possorted_genome_bam_MD-all_counts.cds_only.txt'
    lengths_file = '/home/bay001/projects/kris_apobec_20200121/permanent_data/20191003_riboseq/featurecounts/counts.cds_only.txt'  # get the lengths of just CDS regions
elif region == 'cds_and_3utr':
    if rbp == 'RPS2':
        counts_file = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA_APOBEC_RPS2/outputs/expression_counts/RPS2_possorted_genome_bam_MD-all_counts.cds_and_3utr.txt'
    else:
        counts_file = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA_APOBEC_RPS2/outputs/expression_counts/Apo_Control_possorted_genome_bam_MD-all_counts.cds_and_3utr.txt'
    lengths_file = '/home/bay001/projects/kris_apobec_20200121/permanent_data/20191003_riboseq/featurecounts/counts.cds_and_3utr.txt'  # get the lengths of CDS + 3'UTR regions
elif region == 'exons':
    if rbp == 'RPS2':
        counts_file = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/inputs/RPS2_filtered_feature_bc_matrix.csv'
    else:
        counts_file = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/inputs/Apo_Control_filtered_feature_bc_matrix.csv'
    lengths_file = '/home/bay001/projects/kris_apobec_20200121/permanent_data/20191003_riboseq/featurecounts/counts.txt'  # get the lengths of all exons
else:
    # Fail here with a clear message instead of a NameError at the print() below.
    raise ValueError("region must be 'cds', 'cds_and_3utr', or 'exons'; got {!r}".format(region))

print(counts_file)
print(lengths_file)

/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA_APOBEC_RPS2/outputs/expression_counts/Apo_Control_possorted_genome_bam_MD-all_counts.cds_only.txt
/home/bay001/projects/kris_apobec_20200121/permanent_data/20191003_riboseq/featurecounts/counts.cds_only.txt


In [5]:
# Collect this sample's annotated files (names starting with rbp, ending in '.annotated') in sorted order.
annotated_glob = os.path.join(annotated_dir, '{}*.annotated'.format(rbp))
all_annotated = sorted(glob.glob(annotated_glob))
print(len(all_annotated))
all_annotated[:3]  # 19611 for all, 8616 for Apo, 10995 for RPS2

8616


['/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/sailor_outputs_individual_barcodes_merged_bedfiles/Apo_Control_possorted_genome_bam_MD-AAACCCAAGCCAGTAG-1.fx.bed.annotated',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/sailor_outputs_individual_barcodes_merged_bedfiles/Apo_Control_possorted_genome_bam_MD-AAACCCAAGGATGCGT-1.fx.bed.annotated',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/sailor_outputs_individual_barcodes_merged_bedfiles/Apo_Control_possorted_genome_bam_MD-AAACCCACAATACAGA-1.fx.bed.annotated']

# Read gene counts
- Just doing this once since it's a big file. We only need the total summed read counts for each cell, so those are written to their own file (`<counts_file>.readcounts.txt`) and read from there instead of re-reading the full counts every time.

# Compute epkm:
- $\mathrm{EPKM} = \dfrac{\text{number of edited reads (from SAILOR)}}{(\text{total mapped read counts} / 10^6) \times (\text{gene length} / 1000)}$

In [6]:
# Column names of the (header-less, tab-separated) annotated files.
annotated_names = ['chrom','start','end','conf','cov','strand','geneid','genename','region','overlap']

def read_and_sum_edits(f, regions=regions, gencode=True):
    """
    Sum the edited-read counts per gene for one annotated file.

    f: path to a tab-separated file without a header row, with the columns listed in annotated_names.
    regions: values of the 'region' column to keep. The default is bound to the module-level
        `regions` at the moment this cell runs, so re-run this cell after changing that setting.
    gencode: if True, group on 'geneid' unchanged. If False, drop everything after the first '.'
        in each gene id and group on that (the index is then named 'ensembl_geneid').

    Returns a DataFrame with one 'edited_reads' column, indexed by gene id.
    Rows whose 'geneid' is missing or lists more than one gene (comma-separated) are dropped.
    """
    # Read 'cov' and 'geneid' as text so the string operations below are always valid.
    edits = pd.read_csv(f, sep='\t', names=annotated_names, dtype={'cov': str, 'geneid': str})
    in_regions = edits['region'].isin(regions)
    # removing ambiguous editing events (more than one gene); rows without a gene id are dropped too
    one_gene = edits['geneid'].notna() & ~edits['geneid'].str.contains(',', regex=False, na=False)
    edits = edits.loc[in_regions & one_gene]
    # 'cov' is comma-separated; its first field is taken as the edited-read count.
    # Building a separate Series avoids assigning a new column onto a filtered slice.
    edited_reads = edits['cov'].str.split(',').str[0].astype(int).rename('edited_reads')
    if gencode:
        gene_ids = edits['geneid'].rename('geneid')
    else:
        gene_ids = edits['geneid'].str.split('.').str[0].rename('ensembl_geneid')
    return edited_reads.groupby(gene_ids).sum().to_frame()
# Smoke test: per-gene edited-read sums for the first annotated file (first rows only).
read_and_sum_edits(f=all_annotated[0], gencode=gencode).head()

Unnamed: 0_level_0,edited_reads
ensembl_geneid,Unnamed: 1_level_1
ENSG00000087086,1
ENSG00000089009,1
ENSG00000096384,2
ENSG00000101182,2
ENSG00000104529,8


In [7]:
# One more test: spot-check a single gene in a fixed RPS2 barcode file.
# The path is built from annotated_dir so it follows the configured input directory.
# NOTE(review): the lookup uses an id without a '.suffix'; if the annotated gene ids carry one,
# this only works with gencode=False.
test_annotated_file = os.path.join(
    annotated_dir,
    'RPS2_possorted_genome_bam_MD-GTAACCAAGTGAATAC-1.fx.bed.annotated'
)
read_and_sum_edits(f=test_annotated_file, gencode=gencode).loc['ENSG00000144713']

edited_reads    38
Name: ENSG00000144713, dtype: int64

In [8]:
# These should be the read counts across the specified genomic regions (either all exons, CDS only, or CDS + 3'UTR)
# The table is read from the '.readcounts.txt' file that sits next to the counts file; its first column becomes the index.
read_counts_file = '{}.readcounts.txt'.format(counts_file)
read_counts = pd.read_csv(read_counts_file, index_col=0, sep='\t')
read_counts.head()

Unnamed: 0,read_counts
Apo_Control_possorted_genome_bam_MD-AAACCCAAGCCAGTAG-1.bam,16998
Apo_Control_possorted_genome_bam_MD-AAACCCAAGGATGCGT-1.bam,24143
Apo_Control_possorted_genome_bam_MD-AAACCCACAATACAGA-1.bam,14618
Apo_Control_possorted_genome_bam_MD-AAACCCACACGCAGTC-1.bam,23624
Apo_Control_possorted_genome_bam_MD-AAACCCACAGAACATA-1.bam,18494


In [9]:
def convert_filename_to_barcode(f, replace=True):
    """
    Turn the path of an annotated file into the label used to look up its read count.

    f: path to a '<sample>_possorted_genome_bam_MD-<barcode>.fx.bed.annotated' file. Only the
        file name is used, so the file may live in any directory.
    replace: True if we are using all exons (due to the different format of mtx file). False if we're using a merged featureCounts file (which we used when counting CDS and CDS+3'UTR reads)

    Returns the bare barcode when replace is True; otherwise the file name with
    '.fx.bed.annotated' swapped for '.bam'.
    """
    # Work on the file name alone so the result does not depend on a hard-coded directory.
    name = os.path.basename(f).replace('.fx.bed.annotated', '')
    if not replace:
        return name + '.bam'
    # Drop whichever sample prefix is present, leaving only the barcode.
    for prefix in ('Apo_Control_possorted_genome_bam_MD-', 'RPS2_possorted_genome_bam_MD-'):
        name = name.replace(prefix, '')
    return name
# Smoke test: convert the first annotated file's path using the configured `replace` setting.
convert_filename_to_barcode(all_annotated[0], replace=replace)

'Apo_Control_possorted_genome_bam_MD-AAACCCAAGCCAGTAG-1.bam'

In [10]:
def get_read_counts_from_barcode(barcode, read_counts=read_counts):
    """
    Look up the 'read_counts' value for one barcode.

    barcode: index label of the row to read, as produced by convert_filename_to_barcode().
    read_counts: DataFrame with a 'read_counts' column. The default is bound to the module-level
        `read_counts` at the moment this cell runs, so re-run this cell after reloading that table.

    Raises KeyError if the barcode is not in the index.
    """
    # One .loc call with (row, column) instead of chained indexing: no intermediate row is
    # built, so the value keeps the column's own dtype.
    return read_counts.loc[barcode, 'read_counts']

# Smoke test: look up the read count for the first annotated file's barcode.
get_read_counts_from_barcode(convert_filename_to_barcode(all_annotated[0], replace=replace))

16998

# Gene lengths: reuse the lengths from a featureCounts output that we've run before.
- The lengths file is chosen to match `region`, so CDS-only lengths are used when calculating the EPKM across CDS only.

In [11]:
# Gene lengths, indexed by the file's first column; only the 'Length' column is kept.
# The first line of the file is skipped (presumably a leading comment line - confirm).
lengths = pd.read_csv(lengths_file, sep='\t', skiprows=1, index_col=0).loc[:, ['Length']]
lengths.head()

Unnamed: 0_level_0,Length
Geneid,Unnamed: 1_level_1
ENSG00000186092,915
ENSG00000237683,777
ENSG00000235249,936
ENSG00000185097,936
ENSG00000269831,129


In [12]:
def epkm(row, total_mapped, colname):
    """
    Compute edits / ((total mapped reads / 10^6) * (length / 1000)).

    row: mapping-like object providing row[colname] (the edit count) and row['Length'].
    total_mapped: total mapped read count used for the per-million scaling.
    colname: key of the edit count inside row.
    """
    edited = row[colname]
    millions_mapped = total_mapped / 1000000.
    length_kb = row['Length'] / 1000.
    return edited / (millions_mapped * length_kb)

In [13]:
# Write one EPKM table per annotated file. Files that already exist are skipped, so this cell
# can be re-run to resume an interrupted pass.
os.makedirs(output_dir, exist_ok=True)
progress = tqdm_notebook(all_annotated)

for annotated in progress:
    output_file = os.path.join(output_dir, os.path.basename(annotated) + ".{}.epkm.tsv".format(region))
    if os.path.exists(output_file):
        continue
    edit_counts = read_and_sum_edits(f=annotated, gencode=gencode)
    total_mapped = get_read_counts_from_barcode(convert_filename_to_barcode(annotated, replace=replace))

    # Left-join onto every gene that has a length; genes without edits get 0 edited reads.
    read_edit_counts = pd.merge(lengths, edit_counts, how='left', left_index=True, right_index=True).fillna(0)
    # epkm() is plain arithmetic on row[colname] and row['Length'], so it can be given the whole
    # frame and computed column-wise instead of once per row.
    read_edit_counts['epkm'] = epkm(read_edit_counts, total_mapped, 'edited_reads')
    # Write under a temporary name and rename afterwards, so an interrupted run never leaves a
    # partial file that the exists-check above would then skip.
    partial_file = output_file + '.tmp'
    read_edit_counts.to_csv(
        partial_file,
        sep='\t'
    )
    os.replace(partial_file, output_file)

HBox(children=(IntProgress(value=0, max=8616), HTML(value='')))