In [1]:
from eda_import import *
import pysam

In [2]:
print(datetime.datetime.today())

2018-05-09 08:31:55.495110


In [3]:
from extract_targets import extract_info

In [4]:
gtf = './reference_data/Homo_sapiens.GRCh37.75.gtf'

In [5]:
%%time
# http://uswest.ensembl.org/info/website/upload/gff.html
names = ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame',
         'attribute']
# adf: annotation df
print('reading {0}...'.format(gtf))
df = pd.read_csv(gtf, header=None, sep='\t', comment='#', low_memory=False, names=names)

reading ./reference_data/Homo_sapiens.GRCh37.75.gtf...
CPU times: user 14.4 s, sys: 1.29 s, total: 15.7 s
Wall time: 15.7 s


In [6]:
%time extract_info(df)

extracting length...
extracting transcript id...
extracting gene id...
extracting gene name...
CPU times: user 36.8 s, sys: 1.12 s, total: 37.9 s
Wall time: 37.9 s


In [24]:
df.head(2)

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,len,transcript_id,gene_id,gene_name
0,1,pseudogene,gene,11869,14412,.,+,.,2544,,ENSG00000223972,DDX11L1
1,1,processed_transcript,transcript,11869,14409,.,+,.,2541,ENST00000456328,ENSG00000223972,DDX11L1


# Count stop codons per gene

In [26]:
%%time
sc_count = df.query('source == "protein_coding"').groupby('gene_id').apply(
    lambda g: g.query('feature == "stop_codon"').shape[0])

CPU times: user 1min 46s, sys: 573 ms, total: 1min 46s
Wall time: 1min 46s


In [27]:
sc_count.head()

gene_id
ENSG00000000003    1
ENSG00000000005    1
ENSG00000000419    3
ENSG00000000457    3
ENSG00000000460    4
dtype: int64

In [30]:
assert sc_count[sc_count > 1].shape[0] / float(sc_count.shape[0]) == 0.6531738730450782

about 65% of genes have multiple stop codons

# % of protein coding genes over whole genome

In [40]:
# chromosome lengths
chr_lens = df.groupby('seqname').apply(lambda g: g.end.max() - g.start.min())

In [46]:
chr_lens.sort_values(ascending=False).head(24) # after 24th are those patches

seqname
1     249219373
2     243122011
3     197889816
4     190964728
5     180841118
6     170959941
7     158955095
X     155087438
8     146267325
9     141138161
10    135425372
11    134870013
12    133747528
13    96058111 
16    90227533 
14    88177816 
15    82435529 
17    81183612 
18    77994326 
20    62876594 
19    59051063 
Y     56348845 
21    38427966 
22    35177580 
dtype: int64

In [49]:
genome_len = chr_lens.sum()
assert genome_len == 3144910296 

In [50]:
%%time
protein_coding_length = df.query(
    'source == "protein_coding"').query('feature == "gene"')['len'].sum()
assert protein_coding_length == 1395707084

CPU times: user 467 ms, sys: 198 ms, total: 665 ms
Wall time: 660 ms


In [51]:
protein_coding_length / genome_len

0.4437986946003499

~44% of the protein is coding, but this include introns

In [52]:
protein_coding_exon_length = df.query(
    'source == "protein_coding"').query('feature == "exon"')['len'].sum()
assert protein_coding_exon_length == 192167710

In [53]:
protein_coding_exon_length / genome_len

0.06110435335609331

This number still includes repeated count of multiple transcripts per gene

In [54]:
beg_end_pairs = df.query(
    'source == "protein_coding"').query('feature == "exon"')[['start', 'end']].values

In [56]:
%%time
bases = set([])
for pair in beg_end_pairs:
    bases.update(np.arange(pair[0], pair[1]))

CPU times: user 49.9 s, sys: 3.38 s, total: 53.3 s
Wall time: 53.2 s


In [59]:
assert len(bases) / genome_len == 0.020225602008713064

Based on this calculation, it's about 2% of the genome that's protein-coding, NOT including introns

## How many protein coding genes have multiple transcripts

In [99]:
gs = df[(df.source == 'protein_coding') & (df.feature.isin(['transcript']))].groupby('gene_id')

In [106]:
cdf = gs.size()

In [108]:
assert cdf[cdf > 1].shape[0] == 16627

(16627,)

In [109]:
assert cdf.shape[0] == 22642

In [110]:
16627 / 22642

0.7343432558961223

So over 73% of protein coding genes have more than one transcripts

## Show strand uniqueness per gene

In [111]:
res = df.groupby(['gene_id', 'gene_name']).apply(lambda g: g.strand.unique().shape[0])

In [112]:
res.shape

(63677,)

In [113]:
assert (res == 1).all()

That it returns True means all genes have only one unique strand.

In [114]:
df.gene_id.unique().shape

(63677,)

This is larger than 20k normally discussed, there are probably other ids than protein coding genes.

### How many transcripts don't have UTR?

In [115]:
ndf = df.query('source == "protein_coding"').query('feature != "gene"')

In [116]:
ndf.head(2)

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,len,transcript_id,gene_id,gene_name
110,1,protein_coding,transcript,69091,70008,.,+,.,918,ENST00000335137,ENSG00000186092,OR4F5
111,1,protein_coding,exon,69091,70008,.,+,.,918,ENST00000335137,ENSG00000186092,OR4F5


In [118]:
%%time
has_utr_df = ndf.groupby('transcript_id').apply(
    lambda grp: grp.query('feature == "UTR"').shape[0] > 0)

CPU times: user 6min 6s, sys: 1.79 s, total: 6min 8s
Wall time: 6min 7s


In [119]:
has_utr_df.value_counts()

True     85661
False    4612 
dtype: int64

In [120]:
4612 / 85661.

0.05384013728534572

So 5.4% of the annotated transcripts don't have UTR

In [122]:
%%time
has_sc_df = ndf.groupby('transcript_id').apply(
    lambda grp: grp.query('feature == "stop_codon"').shape[0] > 0)

CPU times: user 4min 36s, sys: 1.07 s, total: 4min 37s
Wall time: 4min 37s


In [70]:
has_sc_df.value_counts()

True     62800
False    27473
dtype: int64

In [73]:
27473 / 62800.

0.437468152866242

44% of annotated transcripts don't have stop codons!!! How come?
Ans: there are transcripts that have UTR but not stop codons annotated: e.g. BRCA2-ENST00000470094, BRCA2-ENST00000470094, DRAM1-ENST00000549365, but the are three are all nonsense_mediated_decay (NMD). However, PIK3R1-ENST00000517698 is also a NMD, but it has annotated start and stop codons. Confused :(