Related to https://github.com/bcgsc/tasrkleat-TCGA-analysis-scripts/blob/master/analysis-notebooks/core/map-clv2sc-based-on-annotation-for-114-genes.ipynb, but extends the analysis to all genes in the annotation.

In [1]:
import sys
sys.path.insert(0, '..')
from eda_import import *
import pysam

In [2]:
print(datetime.datetime.today())

2018-05-11 10:15:56.283099


In [3]:
from utils.parse_gtf_utils import extract_info
from utils.calc_clv_sc import gen_sc_clv_per_transcript, get_strand
from utils.cluster import cluster_clv_sites

In [4]:
gtf = '../reference_data/Homo_sapiens.GRCh37.75.gtf'

In [5]:
%%time
# http://uswest.ensembl.org/info/website/upload/gff.html
names = ['seqname', 'source', 'feature', 'start', 'end', 'score',
         'strand', 'frame', 'attribute']
# adf: annotation df
print('reading {0}...'.format(gtf))
adf = pd.read_csv(gtf, header=None, sep='\t', comment='#', low_memory=False, names=names)

reading ../reference_data/Homo_sapiens.GRCh37.75.gtf...
CPU times: user 13.8 s, sys: 2.42 s, total: 16.3 s
Wall time: 16.3 s


In [6]:
%time extract_info(adf)

extracting length...
extracting transcript id...
extracting gene id...
extracting gene name...
extracting gene source...
extracting transcript source...
extracting tag cds_end_NF...
extracting tag cds_start_NF...
CPU times: user 22.6 s, sys: 1.35 s, total: 23.9 s
Wall time: 23.8 s


In [7]:
adf.head(2)

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,len,transcript_id,gene_id,gene_name,gene_source,transcript_source,is_cds_end_NF,is_cds_start_NF
0,1,pseudogene,gene,11869,14412,.,+,.,2544,,ENSG00000223972,DDX11L1,ensembl_havana,,False,False
1,1,processed_transcript,transcript,11869,14409,.,+,.,2541,ENST00000456328,ENSG00000223972,DDX11L1,ensembl_havana,havana,False,False


In [8]:
# bdf: rows of the annotation whose source is one of the two biotypes below
# and whose is_cds_end_NF flag is False.
_sources = ['protein_coding', 'nonsense_mediated_decay']
bdf = (
    adf
    .query('source in {0}'.format(_sources))
    .query('is_cds_end_NF == False')
    .copy()
)

In [9]:
bdf.query('feature == "transcript"').transcript_source.value_counts()

havana            39620
ensembl_havana    29115
ensembl           13434
insdc             13   
Name: transcript_source, dtype: int64

In [10]:
%%time
cdf = bdf\
    .query('feature != "gene"')\
    .groupby(['gene_name', 'gene_id', 'transcript_id', 'seqname'])\
    .apply(gen_sc_clv_per_transcript)

CPU times: user 8min 29s, sys: 1.65 s, total: 8min 30s
Wall time: 8min 29s


In [12]:
# as a cache
# NOTE(review): at this point cdf has not gone through reset_index() yet, so the
# groupby keys (gene_name, gene_id, transcript_id, seqname) presumably still live
# in the index and index=False would drop them from the CSV -- confirm before
# re-enabling this line.
# cdf.to_csv('../__results/gtf-analysis-all-genes.csv', index=False)

In [13]:
# Move the groupby keys from the index back into regular columns
cdf = cdf.reset_index()

In [14]:
cdf.gene_name.unique().shape

(20462,)

In [15]:
cdf.source.value_counts()

protein_coding             68388
nonsense_mediated_decay    13794
Name: source, dtype: int64

In [16]:
cdf.head(2)

Unnamed: 0,gene_name,gene_id,transcript_id,seqname,source,sc,clv,strand,gene_source,transcript_source,is_cds_end_NF,is_cds_start_NF
0,A1BG,ENSG00000121410,ENST00000263100,19,protein_coding,58858388,58858216,-,ensembl_havana,ensembl_havana,False,False
1,A1CF,ENSG00000148584,ENST00000282641,10,protein_coding,52566489,52566327,-,ensembl_havana,ensembl,False,False


#### Side note: Check cases of zero 3'UTR length

In [18]:
def calc_utr_beg(row):
    """Return the coordinate at which the 3'UTR begins for one row.

    The result is ``row.sc`` shifted by one position: +1 when
    ``row.strand`` is '+', -1 for any other strand value.
    """
    step = 1 if row.strand == '+' else -1
    return row.sc + step

cdf['utr_beg'] = cdf.apply(calc_utr_beg, axis=1)

In [19]:
cdf.query('utr_beg == clv').source.value_counts()

protein_coding             3721
nonsense_mediated_decay    1   
Name: source, dtype: int64

In [21]:
# Columns to keep when inspecting transcripts whose 3'UTR has zero length
_cols = [
    'seqname', 'gene_name', 'gene_id', 'strand', 'transcript_id',
    'source', 'sc', 'utr_beg', 'clv', 'transcript_source',
]
# Rows where the 3'UTR start coincides with the cleavage site
_zero_len_rows = cdf.query('utr_beg == clv')
zero_len_df = _zero_len_rows.copy()[_cols]

In [22]:
zero_len_df.sample(10)

Unnamed: 0,seqname,gene_name,gene_id,strand,transcript_id,source,sc,utr_beg,clv,transcript_source
4269,19,AP2S1,ENSG00000042753,-,ENST00000601649,protein_coding,47341704,47341703,47341703,ensembl_havana
666,16,AC009060.1,ENSG00000269746,-,ENST00000601706,protein_coding,70239303,70239302,70239302,ensembl
49942,12,P2RX7,ENSG00000089041,+,ENST00000535250,protein_coding,121622605,121622606,121622606,ensembl
41098,15,MCTP2,ENSG00000140563,+,ENST00000331706,protein_coding,94945786,94945787,94945787,ensembl
2353,HSCHR6_MHC_COX,AGER,ENSG00000234729,-,ENST00000552621,protein_coding,32097397,32097396,32097396,ensembl
74107,HG1497_PATCH,TREX2,ENSG00000269342,-,ENST00000595871,protein_coding,152613192,152613191,152613191,ensembl
70125,15,TBC1D21,ENSG00000167139,+,ENST00000535547,protein_coding,74181442,74181443,74181443,ensembl_havana
71348,2,THNSL2,ENSG00000144115,+,ENST00000402102,protein_coding,88485210,88485211,88485211,ensembl
74138,HSCHR6_MHC_MANN,TRIM10,ENSG00000235025,-,ENST00000551591,protein_coding,30166914,30166913,30166913,ensembl
14663,7,CHN2,ENSG00000106069,+,ENST00000539406,protein_coding,29552351,29552352,29552352,ensembl


In [23]:
# Cast the three coordinate columns to int, then drop fully duplicated rows
cdf = cdf.astype({'sc': int, 'utr_beg': int, 'clv': int}).drop_duplicates()

In [27]:
cdf.shape

(82182, 13)

In [28]:
_ = cdf
for i in range(2):
    print('{0} clustering...'.format(i))
    # cluster twice to final results more stable, see the experiment below
    %time _ = _.groupby('gene_id').apply(cluster_clv_sites, 20).reset_index(drop=True)
    # If there is only one annotated clv sites for that gene, after groupby & reset_index(),
    # the cluster_idx column is NaN and thus testing for convergence would always be False in the next step
    _['cluster_idx'] = _.cluster_idx.fillna(-1)
ddf = _

0 clustering...
CPU times: user 1min 52s, sys: 1.14 s, total: 1min 53s
Wall time: 1min 53s
1 clustering...
CPU times: user 1min 35s, sys: 426 ms, total: 1min 35s
Wall time: 1min 35s


In [31]:
# Make sure it's stable now, (fillna(-1) for cluster_idx)
_tmp = ddf.groupby('gene_id').apply(cluster_clv_sites, 20).reset_index(drop=True).fillna(-1) == ddf
_tmp.all().to_frame().T

Unnamed: 0,gene_name,gene_id,transcript_id,seqname,source,sc,clv,strand,gene_source,transcript_source,is_cds_end_NF,is_cds_start_NF,utr_beg,cluster_idx,mclv
0,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True


In [32]:
ddf.shape

(82182, 15)

In [33]:
ddf.head(2)

Unnamed: 0,gene_name,gene_id,transcript_id,seqname,source,sc,clv,strand,gene_source,transcript_source,is_cds_end_NF,is_cds_start_NF,utr_beg,cluster_idx,mclv
0,TSPAN6,ENSG00000000003,ENST00000373020,X,protein_coding,99885795,99883667,-,ensembl_havana,ensembl_havana,False,False,99885794,1,99883667
1,TNMD,ENSG00000000005,ENST00000373031,X,protein_coding,99854714,99854882,+,ensembl_havana,ensembl_havana,False,False,99854715,1,99854882


Sanity check

In [34]:
def check_strand_with_coords(grp):
    """Check that cleavage sites are indeed all downstream of the stop codon.

    Parameters
    ----------
    grp : DataFrame
        Rows of one group; must provide the ``clv`` and ``sc`` columns plus
        whatever ``get_strand`` needs to determine the strand.

    Returns
    -------
    bool
        True if ``clv >= sc`` holds for every row when the strand is '+',
        or ``clv <= sc`` holds for every row when the strand is '-'.

    Raises
    ------
    ValueError
        If ``get_strand`` returns anything other than '+' or '-'.
    """
    strand = get_strand(grp)
    if strand == '+':
        return (grp.clv >= grp.sc).all()
    if strand == '-':
        return (grp.clv <= grp.sc).all()
    # A bare `raise` has no active exception to re-raise at this point and would
    # only produce an uninformative RuntimeError; report the offending value instead.
    raise ValueError('unexpected strand: {0!r}'.format(strand))

In [35]:
%time ddf.groupby('gene_id').apply(check_strand_with_coords).all()

CPU times: user 10.6 s, sys: 23.1 ms, total: 10.7 s
Wall time: 10.6 s


True

`cdf` is the dataframe with the cleavage sites (clv) extracted, and `ddf` is the same dataframe after the cleavage sites have been clustered.

In [36]:
assert ddf.shape[0] == cdf.shape[0]

In [49]:
%time ddf.to_csv('../__results/annotated-clv-sc-mapping-all-genes.csv', index=False)

CPU times: user 736 ms, sys: 64.1 ms, total: 800 ms
Wall time: 836 ms
