# Counts the number of C's in a gene's exon or gene location. 
- Functionally equivalent to the SUPPLEMENT_get_number_of_c.ipynb in 01_bulk except that we need to parse a few annotations from 10X reference data. 
- We should compare the two just so we know the annotations are more or less the same. 

In [1]:
%matplotlib inline

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import glob
import os
import numpy as np
import pandas as pd
import gffutils
import pysam
import pybedtools
import matplotlib.pyplot as plt
from collections import defaultdict
from tqdm import tnrange, tqdm_notebook
from Bio import SeqIO
pd.set_option('display.max_columns', 150)

In [2]:
input_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/inputs/'
output_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/inputs/'
genome_fa = '/projects/ps-yeolab4/NCRCRG/refs/refdata-cellranger-hg19-3.0.0/fasta/genome.fa'

### Grep for exons and genes from the 10X annotation file.

In [3]:
genes = '/projects/ps-yeolab4/NCRCRG/refs/refdata-cellranger-hg19-3.0.0/genes/genes.gtf'
exons_gtf = os.path.join(output_dir, 'genes.exons.gtf')
genes_gtf = os.path.join(output_dir, 'genes.genes.gtf')

! grep exon $genes > $exons_gtf
! grep -P '\tgene\t' $genes > $genes_gtf

### Now convert the GTF lines into BED6

In [4]:
for fn in [exons_gtf, genes_gtf]:
    gtf = pd.read_csv(fn, names=['chrom','src','region','start','end','.','strand','.','attr'], sep='\t')
    gtf['geneid'] = gtf['attr'].str.extract("gene_id \"([\w\d\.]+)\"")
    gtf['score'] = 0
    gtf = gtf[['chrom','start','end','geneid','score','strand']]
    gtf.to_csv(fn + ".bed", sep='\t', header=False, index=False)

  return _read(filepath_or_buffer, kwds)
  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
exons_file = exons_gtf + ".bed"
genes_file = genes_gtf + ".bed"

In [6]:
def get_positions_as_df(fn):
    """
    Reads in a BED file and returns as a DataFrame.
    """
    return pd.read_csv(fn, sep='\t', index_col=3, names=['chrom','start','end','score','strand'])


def get_merged_positions_as_bedtool(geneid, df):
    """
    Takes a dataframe (df) and gets regions pertaining only to (geneid). 
    Does a strand-specific merge and returns non-overlapping regions as a bedtool.
    """
    try:
        df = df.loc[geneid].reset_index()[['chrom','start','end','index','score','strand']]
        return pybedtools.BedTool.from_dataframe(df).sort().merge(
            s=True, c="4,5,6", o="distinct,sum,distinct"
        )
    except KeyError:
        return pybedtools.BedTool.from_dataframe(pd.DataFrame(df.loc[geneid]).T.reset_index()[['chrom','start','end','index','score','strand']]).sort().merge(
            s=True, c="4,5,6", o="distinct,sum,distinct"
        )
    

def get_total_editable_sites(bedtool, genome_fa, debug=False):
    """
    Gets total number of total C's in each gene, returns dataframe.
    - negative strand returns reverse complement so always look for C
    """
    gene_sequence = bedtool.sequence(fi=genome_fa, s=True, name=True)
    total_editable = 0
    with open(gene_sequence.seqfn) as f:
        for record in SeqIO.parse(f, "fasta"):
            if debug:
                print("Sequence: [{}]".format(record.seq.upper()))
            total_editable += record.seq.upper().count('C')
            if debug:
                print("Number Cs found: {}".format(total_editable))
    return total_editable

In [7]:
exons = get_positions_as_df(exons_file)
genes = get_positions_as_df(genes_file)

  if (await self.run_code(code, result,  async_=asy)):


In [None]:
exons.loc['ENSG00000215611']

Unnamed: 0,chrom,start,end,score,strand
ENSG00000215611,GL000201.1,27522,27637,0,-
ENSG00000215611,GL000201.1,27522,27614,0,-
ENSG00000215611,GL000201.1,27612,27614,0,-
ENSG00000215611,GL000201.1,26348,26820,0,-
ENSG00000215611,GL000201.1,26416,26820,0,-
ENSG00000215611,GL000201.1,26413,26415,0,-


In [None]:
get_merged_positions_as_bedtool('ENSG00000215611', exons).to_dataframe()

Unnamed: 0,chrom,start,end,name,score,strand
0,GL000201.1,26348,26820,ENSG00000215611,0,-
1,GL000201.1,27522,27637,ENSG00000215611,0,-


In [None]:
editable = defaultdict(dict)

for label, region in zip(['genes','exons'], [genes, exons]):
    progress = tnrange(len(set(region.index)))
    for gene in set(region.index):
        try:
            bedtool = get_merged_positions_as_bedtool(gene, df=region)
            if type(bedtool) == pybedtools.BedTool:
                editable[gene][label] = get_total_editable_sites(
                    bedtool=bedtool,
                    genome_fa=genome_fa,
                )
            pybedtools.cleanup()
        except Exception as e:
            print(e, gene)

        progress.update(1)

HBox(children=(IntProgress(value=0, max=32738), HTML(value='')))

HBox(children=(IntProgress(value=0, max=32738), HTML(value='')))


Command was:

	bedtools getfasta -s -name -fo /scratch/bay001/21959923.tscc-mgr7.local/pybedtools.7t0ux2i8.tmp -fi /projects/ps-yeolab4/NCRCRG/refs/refdata-cellranger-hg19-3.0.0/fasta/genome.fa -bed /scratch/bay001/21959923.tscc-mgr7.local/pybedtools.4jgetb2r.tmp

Error message was:
Feature (2:34928447-34928447) has length = 0, Skipping.
 ENSG00000272027


In [16]:
pd.DataFrame(editable).T.to_csv(os.path.join(input_dir, 'genes.editable_C.tsv'), sep='\t')

In [17]:
pd.DataFrame(editable).T

Unnamed: 0,exons,genes
ENSG00000204832,409.0,5864.0
ENSG00000130812,648.0,2710.0
ENSG00000247765,459.0,4940.0
ENSG00000073969,1014.0,31185.0
ENSG00000258947,1853.0,5123.0
ENSG00000164509,991.0,15172.0
ENSG00000261267,103.0,5653.0
ENSG00000237473,112.0,33519.0
ENSG00000168496,583.0,1015.0
ENSG00000229563,1036.0,23194.0
