In [1]:
%matplotlib inline

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import glob
import os
import numpy as np
import pandas as pd
import gffutils
import pysam
import pybedtools
import matplotlib.pyplot as plt
from collections import defaultdict
from tqdm import tnrange, tqdm_notebook
from Bio import SeqIO
pd.set_option('display.max_columns', 150)

In [2]:
# Project directories.
# NOTE(review): output_dir is currently the same path as input_dir — confirm this is intentional.
input_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/01_SAILOR_bulk_rnaseq/inputs/'
output_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/01_SAILOR_bulk_rnaseq/inputs/'
# Reference FASTA; passed as `fi=` to BedTool.sequence() when extracting interval sequences.
genome_fa = '/projects/ps-yeolab3/bay001/annotations/hg19/hg19.fa'
# Tab-separated annotation tables read by get_positions_as_df (presumably GENCODE v19 exon/gene BED files — verify).
exons_file = '/projects/ps-yeolab3/bay001/annotations/hg19/gencode_v19/hg19_v19_exons.bed'
genes_file = '/projects/ps-yeolab3/bay001/annotations/hg19/gencode_v19/hg19_v19_genes.bed'

In [3]:
def get_positions_as_df(fn):
    """
    Read a headerless, tab-separated interval file into a DataFrame.

    The column at zero-based position 3 becomes the index; the supplied
    labels are 'chrom', 'start', 'end', 'score' and 'strand'.
    (Presumably the input is a BED file whose 4th column holds the gene id,
    as the gene-id-indexed output displayed in this notebook suggests — verify.)

    :param fn: path (or file-like object) accepted by pandas.read_csv
    :return: pandas.DataFrame
    """
    column_labels = ['chrom', 'start', 'end', 'score', 'strand']
    table = pd.read_csv(fn, sep='\t', names=column_labels, index_col=3)
    return table


def get_merged_positions_as_bedtool(geneid, df):
    """
    Build a sorted, strand-aware merged BedTool from the rows of ``df``
    labeled ``geneid``.

    :param geneid: index label selecting the rows (one or many) to merge
    :param df: pandas.DataFrame indexed by gene id with at least the columns
        'chrom', 'start', 'end', 'score', 'strand'
    :return: pybedtools.BedTool produced by
        ``.sort().merge(s=True, c="4,5,6", o="distinct,sum,distinct")``
    :raises KeyError: if ``geneid`` is not in ``df.index`` or a required
        column is missing
    """
    bed_columns = ['chrom', 'start', 'end', 'index', 'score', 'strand']
    # Selecting with a list always yields a DataFrame, even when only one row
    # matches, so no Series special case (and no exception-driven retry) is
    # needed and the original column dtypes are preserved.
    # rename_axis makes sure reset_index() emits a column called 'index'
    # regardless of how the incoming index is named.
    rows = df.loc[[geneid]].rename_axis('index').reset_index()[bed_columns]
    return pybedtools.BedTool.from_dataframe(rows).sort().merge(
        s=True, c="4,5,6", o="distinct,sum,distinct"
    )
    

def get_total_editable_sites(bedtool, genome_fa, debug=False):
    """
    Count the C bases (case-insensitive) over all intervals in ``bedtool``.

    Sequences are pulled with ``bedtool.sequence(fi=genome_fa, s=True, name=True)``
    and every FASTA record in the resulting file is upper-cased and scanned
    for 'C'. Per the original note, extraction is strand-aware so minus-strand
    intervals are reverse-complemented and only 'C' needs to be counted.

    :param bedtool: object exposing ``sequence(...)`` whose result has ``seqfn``
    :param genome_fa: path to the genome FASTA handed to ``sequence`` as ``fi``
    :param debug: when True, print each sequence and the running total
    :return: int, total number of C's summed over all records
    """
    extracted = bedtool.sequence(fi=genome_fa, s=True, name=True)
    c_count = 0
    with open(extracted.seqfn) as handle:
        for rec in SeqIO.parse(handle, "fasta"):
            bases = rec.seq.upper()
            if debug:
                print("Sequence: [{}]".format(bases))
            c_count += bases.count('C')
            if debug:
                # Reports the running total so far, not the per-record count.
                print("Number Cs found: {}".format(c_count))
    return c_count

In [None]:
# Load the exon and gene annotation tables (in that order) with the same reader.
exons, genes = (get_positions_as_df(path) for path in (exons_file, genes_file))

In [None]:
# Spot-check: display the rows of `exons` labeled with one gene id.
exons.loc['ENSG00000223972.4']

Unnamed: 0,chrom,start,end,score,strand
ENSG00000223972.4,chr1,11868,12227,0,+
ENSG00000223972.4,chr1,12594,12721,0,+
ENSG00000223972.4,chr1,12974,13052,0,+
ENSG00000223972.4,chr1,13220,14412,0,+


In [None]:
# editable[gene][label] -> number of C's counted over that gene's merged
# intervals, where label is 'genes' or 'exons'.
editable = defaultdict(dict)

for label, region in zip(['genes','exons'], [genes, exons]):
    # Index.unique() keeps first-seen order, so genes are processed (and later
    # written out) in a reproducible order; iterating a set() is arbitrary.
    gene_ids = region.index.unique()
    progress = tnrange(len(gene_ids))
    for gene in gene_ids:
        try:
            bedtool = get_merged_positions_as_bedtool(gene, df=region)
            if isinstance(bedtool, pybedtools.BedTool):
                editable[gene][label] = get_total_editable_sites(
                    bedtool=bedtool,
                    genome_fa=genome_fa,
                )
        except Exception as e:
            # Best-effort: report the failing gene and continue with the rest.
            print(e, gene)
        finally:
            # Remove pybedtools temp files even when this gene failed, and
            # always advance the bar so it ends at 100%.
            pybedtools.cleanup()
            progress.update(1)
    progress.close()

HBox(children=(IntProgress(value=0, max=57820), HTML(value='')))

In [None]:
# One row per gene, one column per label ('genes' / 'exons'), saved as TSV.
# Written through output_dir, the configured destination for results
# (it currently points at the same directory as input_dir).
pd.DataFrame(editable).T.to_csv(os.path.join(output_dir, 'hg19_v19.editable_C.tsv'), sep='\t')