# For this metagene plot, we're using either the top 200/1000 or Q1 genes from the boxplot figures.

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import gffutils
import os
from matplotlib_venn import venn2, venn3
from collections import defaultdict

In [2]:
input_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/figure_3/inputs'
output_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/figure_3/inputs'

db_file = '/projects/ps-yeolab4/genomes/hg19/gencode_v19/gencode.v19.annotation.gtf.db'

# Generate metagene using the top 200 transcripts from the GSE112353 Riboseq study.
- Using the top 

In [3]:
top = 200

In [4]:
hek_counts = pd.read_csv(os.path.join(input_dir, 'GSE112353_HEK_count_table.txt'), sep='\t', index_col=[2])
hek_counts.head()

Unnamed: 0_level_0,external_transcript_name,transcript_biotype,ensembl_gene_id,transcript_length,l_5utr,l_cds,l_3utr,ribolace_HEK_1,ribolace_HEK_2,riboseq_HEK_1,riboseq_HEK_2
ensembl_transcript_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ENST00000614365,5_8S_rRNA.1-201,rRNA,ENSG00000275877,152,0,0,0,1,1,5,1
ENST00000364415,5S_rRNA.1-201,rRNA,ENSG00000201285,116,0,0,0,0,0,0,0
ENST00000603504,7SK.6-201,misc_RNA,ENSG00000271394,247,0,0,0,5,51,9,6
ENST00000263100,A1BG-001,protein_coding,ENSG00000121410,1722,62,1488,172,6,4,4,10
ENST00000595014,A1BG-002,retained_intron,ENSG00000121410,2301,0,0,0,5,3,13,11


In [5]:
hek_counts['riboseq_HEK_1'].sum()

5900875

# Authors recommend to TPM or RPKM normalize these values prior to ranking quartiles:

In [6]:
def counts_to_rpkm(feature_counts_table):
    counts = hek_counts.ix[:,9:]
    lengths = hek_counts['transcript_length']
    mapped_reads = counts.sum()
    return (counts * pow(10,9)).div(mapped_reads, axis=1).div(lengths, axis=0)

def counts_to_tpm(counts_table):
    """
    simple function that converts a featureCounts pandas Dataframe
    into a TPM dataframe.
    
    :param counts_table: pandas.DataFrame() 
        either a featureCounts table (first five cols contain non-count info,
        the rest contain raw counts) or a generic counts table (use skip_col=0
        in this case)
    :return tpm: pandas.DataFrame
    """
    rpkm = counts_to_rpkm(counts_table)
    tpm = rpkm.div(rpkm.sum())*pow(10,6)
    return tpm

tpm = counts_to_tpm(hek_counts)
del hek_counts['riboseq_HEK_1']
del hek_counts['riboseq_HEK_2']
merged_tpm = pd.merge(hek_counts, tpm, how='outer', left_index=True, right_index=True)
assert merged_tpm.shape[0] == merged_tpm.dropna().shape[0]
merged_tpm.head()

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


Unnamed: 0_level_0,external_transcript_name,transcript_biotype,ensembl_gene_id,transcript_length,l_5utr,l_cds,l_3utr,ribolace_HEK_1,ribolace_HEK_2,riboseq_HEK_1,riboseq_HEK_2
ensembl_transcript_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ENST00000614365,5_8S_rRNA.1-201,rRNA,ENSG00000275877,152,0,0,0,1,1,2.69257,1.04415
ENST00000364415,5S_rRNA.1-201,rRNA,ENSG00000201285,116,0,0,0,0,0,0.0,0.0
ENST00000603504,7SK.6-201,misc_RNA,ENSG00000271394,247,0,0,0,5,51,2.982539,3.855323
ENST00000263100,A1BG-001,protein_coding,ENSG00000121410,1722,62,1488,172,6,4,0.190137,0.921665
ENST00000595014,A1BG-002,retained_intron,ENSG00000121410,2301,0,0,0,5,3,0.462453,0.758722


In [7]:
merged_tpm.reset_index(inplace=True)
hek_counts = merged_tpm

In [8]:
# only protein coding genes for riboseq table
print(hek_counts.shape[0]) # all geneids
hek_counts = hek_counts[hek_counts['transcript_biotype']=='protein_coding']
print(hek_counts.shape[0]) # only protein-coding geneids

# Check a few of these to make sure we didn't screw anything up.
hek_counts[hek_counts['ensembl_gene_id'].isin(['ENSG00000096384', 'ENSG00000080824', 'ENSG00000075624'])]

196147
79971


Unnamed: 0,ensembl_transcript_id,external_transcript_name,transcript_biotype,ensembl_gene_id,transcript_length,l_5utr,l_cds,l_3utr,ribolace_HEK_1,ribolace_HEK_2,riboseq_HEK_1,riboseq_HEK_2
6119,ENST00000331789,ACTB-001,protein_coding,ENSG00000075624,1917,192,1128,597,640,450,62.511451,88.338242
6126,ENST00000432588,ACTB-008,protein_coding,ENSG00000075624,568,77,491,0,207,136,73.928117,110.371058
6127,ENST00000443528,ACTB-009,protein_coding,ENSG00000075624,569,261,308,0,120,70,37.978016,67.500897
6128,ENST00000414620,ACTB-010,protein_coding,ENSG00000075624,561,319,242,0,109,63,31.224215,54.601037
6130,ENST00000417101,ACTB-012,protein_coding,ENSG00000075624,472,174,298,0,128,88,48.210697,86.080427
70542,ENST00000334701,HSP90AA1-001,protein_coding,ENSG00000080824,3510,282,2565,663,487,338,29.943222,24.236177
70543,ENST00000216281,HSP90AA1-002,protein_coding,ENSG00000080824,3379,206,2199,974,510,367,35.270677,27.900032
70546,ENST00000553585,HSP90AA1-005,protein_coding,ENSG00000080824,581,59,522,0,93,66,32.544415,24.311979
70551,ENST00000558600,HSP90AA1-010,protein_coding,ENSG00000080824,595,295,300,0,0,1,0.55028,1.066963
70558,ENST00000371646,HSP90AB1-003,protein_coding,ENSG00000096384,2572,109,2175,288,585,418,53.243377,55.41302


# We are ranking based on the average riboseq occupancy value between replicates

In [9]:
def get_avg_riboseq_counts(row):
    """just returns the avg of two reps."""
    return (row['riboseq_HEK_1'] + row['riboseq_HEK_2'])/2.

hek_counts['avg_riboseq'] = hek_counts.apply(get_avg_riboseq_counts, axis=1)
hek_counts = hek_counts[['external_transcript_name', 'ensembl_transcript_id', 'ensembl_gene_id', 'transcript_biotype', 'avg_riboseq']]
hek_counts.sort_values(by=['avg_riboseq'], inplace=True, ascending=False)
hek_counts.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Unnamed: 0,external_transcript_name,ensembl_transcript_id,ensembl_gene_id,transcript_biotype,avg_riboseq
68408,HIST1H3B-001,ENST00000621411,ENSG00000274267,protein_coding,2967.735424
68422,HIST1H4C-001,ENST00000377803,ENSG00000197061,protein_coding,1828.167891
68436,HIST2H2AA3-001,ENST00000369159,ENSG00000203812,protein_coding,1652.179639
68437,HIST2H2AA4-001,ENST00000607355,ENSG00000272196,protein_coding,1625.573037
68433,HIST1H4K-001,ENST00000611927,ENSG00000273542,protein_coding,1561.316934


# Okay so a ton of the top transcripts belong to histones, which lack pA and may not be informative for the RPS stuff. So let's remove them, select the highest avg_riboseq tx and take the top 200 genes.
- top 200 genes should give us the signal we're looking for. If all good, we can use the top Q1.
- Removing anything in the histone_genes.bed file
- Removing any gene that starts with 'HIST'

In [10]:
histones = pd.read_csv('/home/rmarina/projects/apobec/ref_files/histone_genes.bed', sep='\t', names=['chrom','start','end','name','score','strand'])
histones['ensembl_gene_id'] = histones['name'].apply(lambda x: x.split('.')[0])
histones = set(histones['ensembl_gene_id'])
'ENSG00000274267' in histones  # Apparently I'm missing quite a few from the histone_genes.bed file. Probably because these were hg18 or that the list was not complete.

False

In [11]:
hek_counts.head()

Unnamed: 0,external_transcript_name,ensembl_transcript_id,ensembl_gene_id,transcript_biotype,avg_riboseq
68408,HIST1H3B-001,ENST00000621411,ENSG00000274267,protein_coding,2967.735424
68422,HIST1H4C-001,ENST00000377803,ENSG00000197061,protein_coding,1828.167891
68436,HIST2H2AA3-001,ENST00000369159,ENSG00000203812,protein_coding,1652.179639
68437,HIST2H2AA4-001,ENST00000607355,ENSG00000272196,protein_coding,1625.573037
68433,HIST1H4K-001,ENST00000611927,ENSG00000273542,protein_coding,1561.316934


In [12]:
riboseq_hek_counts = hek_counts[(~hek_counts['ensembl_gene_id'].isin(histones)) & (~hek_counts['external_transcript_name'].str.startswith('HIST'))]
print(riboseq_hek_counts.shape[0])
riboseq_hek_counts.head()

79847


Unnamed: 0,external_transcript_name,ensembl_transcript_id,ensembl_gene_id,transcript_biotype,avg_riboseq
188510,WDR74-005,ENST00000538098,ENSG00000133316,protein_coding,1188.642492
97677,MTRNR2L8-001,ENST00000536684,ENSG00000255823,protein_coding,709.997982
97668,MTRNR2L12-001,ENST00000600213,ENSG00000269028,protein_coding,548.488883
188506,WDR74-001,ENST00000525239,ENSG00000133316,protein_coding,374.158075
97675,MTRNR2L6-001,ENST00000604952,ENSG00000270672,protein_coding,353.914001


### Sort by avg riboseq values (highest first) and then drop duplicate geneIDs, keeping just the first one. This effectively keeps only the transcript with the highest riboseq occupancy level (by tpm) per gene, and drops the rest. 

In [13]:
riboseq_hek_counts.sort_values('avg_riboseq', ascending=False, inplace=True)
riboseq_hek_counts.drop_duplicates(['ensembl_gene_id'], inplace=True, keep='first')
riboseq_hek_counts.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,external_transcript_name,ensembl_transcript_id,ensembl_gene_id,transcript_biotype,avg_riboseq
188510,WDR74-005,ENST00000538098,ENSG00000133316,protein_coding,1188.642492
97677,MTRNR2L8-001,ENST00000536684,ENSG00000255823,protein_coding,709.997982
97668,MTRNR2L12-001,ENST00000600213,ENSG00000269028,protein_coding,548.488883
97675,MTRNR2L6-001,ENST00000604952,ENSG00000270672,protein_coding,353.914001
91836,MIF-001,ENST00000215754,ENSG00000240972,protein_coding,319.183043


### Subset the top 200

In [14]:
riboseq_hek_counts.reset_index(drop=True, inplace=True) # reorder the index too just to make sure.
riboseq_hek_counts_top = riboseq_hek_counts.head(top)
print(riboseq_hek_counts_top.shape[0])
riboseq_hek_counts_top.head(10)

200


Unnamed: 0,external_transcript_name,ensembl_transcript_id,ensembl_gene_id,transcript_biotype,avg_riboseq
0,WDR74-005,ENST00000538098,ENSG00000133316,protein_coding,1188.642492
1,MTRNR2L8-001,ENST00000536684,ENSG00000255823,protein_coding,709.997982
2,MTRNR2L12-001,ENST00000600213,ENSG00000269028,protein_coding,548.488883
3,MTRNR2L6-001,ENST00000604952,ENSG00000270672,protein_coding,353.914001
4,MIF-001,ENST00000215754,ENSG00000240972,protein_coding,319.183043
5,MTRNR2L1-001,ENST00000540040,ENSG00000256618,protein_coding,282.344461
6,PTCH2-002,ENST00000438067,ENSG00000117425,protein_coding,224.240603
7,AP000350.10-005,ENST00000433835,ENSG00000251357,protein_coding,211.545748
8,COX8A-001,ENST00000314133,ENSG00000176340,protein_coding,205.089205
9,RPLP1-006,ENST00000560274,ENSG00000137818,protein_coding,202.649509


In [15]:
riboseq_hek_counts_top.to_csv(os.path.join(output_dir, 'top{}_riboseq.txt'.format(top)), sep='\t', index=False, header=True)  # so Kris can check them out.

# Read in genePred downloaded from UCSC table browser. Filter transcript table to include the genes from above. 
- also need to transform from Gencode-like accessions to ENSEMBL-like.
- now our genePred table should only be transcripts intersected with our ribosome occupancy list with the highest ribosome occupancy levels per gene.

In [16]:
genepred = pd.read_csv(os.path.join(input_dir, 'hg19_gencode_v19.genePred'), sep='\t')
genepred['ensembl_transcript_id'] = genepred['name'].apply(lambda x:x.split('.')[0])
genepred = genepred[(genepred['cdsStartStat']!='none') & (genepred['cdsEndStat']!='none')]  # in case we're accidentally grabbing processed transcripts or other non-protein-coding transcripts without CDS

print(genepred.shape[0])
genepred.head()

99523


Unnamed: 0,#bin,name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds,score,name2,cdsStartStat,cdsEndStat,exonFrames,ensembl_transcript_id
0,0,ENST00000237247.6,chr1,+,66999065,67210057,67000041,67208778,27,"66999065,66999928,67091529,67098752,67099762,6...","66999090,67000051,67091593,67098777,67099846,6...",0,SGIP1,cmpl,cmpl,"-1,0,1,2,0,0,0,1,0,0,0,1,2,1,1,1,1,1,0,1,1,2,2...",ENST00000237247
1,0,ENST00000371039.1,chr1,+,66999274,67210768,67000041,67208778,22,"66999274,66999928,67091529,67098752,67105459,6...","66999355,67000051,67091593,67098777,67105516,6...",0,SGIP1,cmpl,cmpl,-1012001012111011220211,ENST00000371039
2,0,ENST00000424320.1,chr1,+,66999297,67145425,67000041,67145425,13,"66999297,66999928,67091529,67098752,67101626,6...","66999355,67000051,67091593,67098777,67101698,6...",0,SGIP1,cmpl,incmpl,-1012000101211,ENST00000424320
3,0,ENST00000371035.3,chr1,+,66999822,67208882,67000041,67208778,22,"66999822,67091529,67098752,67108492,67109226,6...","67000051,67091593,67098777,67108547,67109402,6...",0,SGIP1,cmpl,cmpl,0120100012111011220211,ENST00000371035
5,0,ENST00000371036.3,chr1,+,66999868,67213982,67000044,67208778,21,"66999868,67091529,67098752,67105459,67108492,6...","67000051,67091593,67098777,67105516,67108547,6...",0,SGIP1,cmpl,cmpl,012001012111011220211,ENST00000371036


In [17]:
genepred = genepred[genepred['ensembl_transcript_id'].isin(riboseq_hek_counts_top['ensembl_transcript_id'])]
del genepred['ensembl_transcript_id']
genepred.to_csv(
    os.path.join(output_dir, 'hg19_gencode_v19.riboseq.top{}.genePred'.format(top)),
    sep='\t',
    header=True,
    index=False
)
print("After intersecting hg19 annotations, we're left with {} transcripts.".format(genepred.shape[0]))

After intersecting hg19 annotations, we're left with 194 transcripts.


# Try with Q1 genes too. Essentially running the same annotation filtering, except instead of taking the top 200, split the list into 4 quartiles based on ranked occupancy (tpm).

In [18]:
riboseq_hek_counts['pct'] = riboseq_hek_counts['avg_riboseq'].rank(pct=True)
q1 = riboseq_hek_counts[(riboseq_hek_counts['pct'] <= 1) & (riboseq_hek_counts['pct'] > 0.75)][['ensembl_transcript_id']]
q2 = riboseq_hek_counts[(riboseq_hek_counts['pct'] <= 0.75) & (riboseq_hek_counts['pct'] > 0.5)][['ensembl_transcript_id']]
q3 = riboseq_hek_counts[(riboseq_hek_counts['pct'] <= 0.5) & (riboseq_hek_counts['pct'] > 0.25)][['ensembl_transcript_id']]
q4 = riboseq_hek_counts[(riboseq_hek_counts['pct'] <= 0.25) & (riboseq_hek_counts['pct'] > 0.0)][['ensembl_transcript_id']]

print(q1.shape[0], q2.shape[0], q3.shape[0], q4.shape[0])
q1.head()

4910 4909 4909 4909


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,ensembl_transcript_id
0,ENST00000538098
1,ENST00000536684
2,ENST00000600213
3,ENST00000604952
4,ENST00000215754


In [19]:
genepred = pd.read_csv(os.path.join(input_dir, 'hg19_gencode_v19.genePred'), sep='\t')
genepred['ensembl_transcript_id'] = genepred['name'].apply(lambda x:x.split('.')[0])
genepred.head()

Unnamed: 0,#bin,name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds,score,name2,cdsStartStat,cdsEndStat,exonFrames,ensembl_transcript_id
0,0,ENST00000237247.6,chr1,+,66999065,67210057,67000041,67208778,27,"66999065,66999928,67091529,67098752,67099762,6...","66999090,67000051,67091593,67098777,67099846,6...",0,SGIP1,cmpl,cmpl,"-1,0,1,2,0,0,0,1,0,0,0,1,2,1,1,1,1,1,0,1,1,2,2...",ENST00000237247
1,0,ENST00000371039.1,chr1,+,66999274,67210768,67000041,67208778,22,"66999274,66999928,67091529,67098752,67105459,6...","66999355,67000051,67091593,67098777,67105516,6...",0,SGIP1,cmpl,cmpl,-1012001012111011220211,ENST00000371039
2,0,ENST00000424320.1,chr1,+,66999297,67145425,67000041,67145425,13,"66999297,66999928,67091529,67098752,67101626,6...","66999355,67000051,67091593,67098777,67101698,6...",0,SGIP1,cmpl,incmpl,-1012000101211,ENST00000424320
3,0,ENST00000371035.3,chr1,+,66999822,67208882,67000041,67208778,22,"66999822,67091529,67098752,67108492,67109226,6...","67000051,67091593,67098777,67108547,67109402,6...",0,SGIP1,cmpl,cmpl,0120100012111011220211,ENST00000371035
4,0,ENST00000468286.1,chr1,+,66999838,67142779,66999838,66999838,10,"66999838,67091529,67098752,67105459,67108492,6...","67000051,67091593,67098777,67105516,67108547,6...",0,SGIP1,none,none,"-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,",ENST00000468286


In [20]:
genepred = genepred[genepred['ensembl_transcript_id'].isin(q1['ensembl_transcript_id'])]
del genepred['ensembl_transcript_id']
genepred.to_csv(
    os.path.join(output_dir, 'hg19_gencode_v19.riboseq.Q1.genePred'),
    sep='\t',
    header=True,
    index=False
)
print("After intersecting hg19 annotations, we're left with {} transcripts.".format(genepred.shape[0]))

After intersecting hg19 annotations, we're left with 4677 transcripts.


# Let's use Fred's X3 data
- nice that this is already normalized
- only downside here is that we need to take the longest transcript instead of using the transcript-level measurements from the GSE112353_HEK_count_table

In [21]:
def gene_id_to_protein_coding(db):
    """
    returns whether or not a gene is protein coding or not.
    """
    genes = db.features_of_type('gene')
    gene_name_dict = {}
    for gene in genes:
        gene_id = gene.attributes['gene_id'][0] if type(gene.attributes['gene_id']) == list else gene.attributes['gene_id']
        try:
            gene_name_dict[gene_id] = gene.attributes['gene_type'][0]
        except KeyError:
            print(gene.attributes.keys())
            print("Warning. Key not found for {}".format(gene))
            return 1
    return gene_name_dict
    
db_file = '/projects/ps-yeolab4/genomes/hg19/gencode_v19/gencode.v19.annotation.gtf.db'
DATABASE = gffutils.FeatureDB(db_file)
gene_id_to_pc = gene_id_to_protein_coding(DATABASE)
print(gene_id_to_pc['ENSG00000100320.18'])

def get_gene_is_pc(row, d=gene_id_to_pc):
    """
    Helper function that uses the gene_id_to_pc dictionary to assign whether or not
    a gene is protein coding. Returns 'unannotated' in case the geneid doesn't exist
    in the dictionary.
    """
    try:
        return d[row['Geneid']]
    except KeyError:
        return 'unannotated'

  "method of this object." % self.version)


protein_coding


In [22]:
fred = pd.read_excel(os.path.join(input_dir, '293_Normalized_Expression_Complete.xlsx'))

fred_x3 = fred[['Geneid','(X3poly/X3)']]
fred_x3['gene_type'] = fred_x3.apply(get_gene_is_pc, axis=1)
fred_x3 = fred_x3[fred_x3['gene_type']=='protein_coding']
del fred_x3['gene_type']
fred_x3.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,Geneid,(X3poly/X3)
4,ENSG00000105880.4,0.309637
9,ENSG00000108797.7,0.182967
10,ENSG00000204377.3,0.118694
18,ENSG00000174938.10,0.251648
19,ENSG00000226180.2,0.230794


In [23]:
fred_x3['pct'] = fred_x3['(X3poly/X3)'].rank(pct=True)
q1 = fred_x3[(fred_x3['pct'] <= 1) & (fred_x3['pct'] > 0.75)][['Geneid']]
q2 = fred_x3[(fred_x3['pct'] <= 0.75) & (fred_x3['pct'] > 0.5)][['Geneid']]
q3 = fred_x3[(fred_x3['pct'] <= 0.5) & (fred_x3['pct'] > 0.25)][['Geneid']]
q4 = fred_x3[(fred_x3['pct'] <= 0.25) & (fred_x3['pct'] > 0.0)][['Geneid']]

top_fred = fred_x3.sort_values(by=['(X3poly/X3)'], ascending=False)[:top]

print(q1.shape[0], q2.shape[0], q3.shape[0], q4.shape[0], top_fred.shape[0])
top_fred

2593 2593 2593 2593 200


Unnamed: 0,Geneid,(X3poly/X3),pct
9326,ENSG00000119705.5,6.235998,1.000000
9152,ENSG00000112695.7,6.115120,0.999904
9379,ENSG00000189043.5,5.919634,0.999807
9065,ENSG00000164405.6,5.913501,0.999711
8393,ENSG00000131495.4,5.715314,0.999614
11039,ENSG00000134056.7,5.559692,0.999518
9252,ENSG00000119013.4,5.460142,0.999422
9164,ENSG00000100387.8,5.377217,0.999325
9847,ENSG00000136810.8,5.346234,0.999229
9075,ENSG00000203760.4,5.317707,0.999132


In [24]:
# now the painful part. We need to transform geneids to transcript ids.
def gene_id_to_transcript(db, annotation='ensembl'):
    '''
    given a gene name, returns a list of associated transcript IDs (one-to-many)
    '''
    genes = db.features_of_type('transcript')
    gene_id_dict = defaultdict(list)
    for gene in genes:
        try:
            if annotation == 'ensembl':
                gene_id_dict[gene.attributes['gene_id'][0].split('.')[0]].append(gene.attributes['transcript_id'][0])
            elif annotation == 'gencode':
                gene_id_dict[gene.attributes['gene_id'][0]].append(gene.attributes['transcript_id'][0])
            else:
                print("something wrong here.")
                return 1
        except KeyError as e:
            print("Warning. Key not found for {}".format(gene))
            return 1
    return gene_id_dict

geneid_to_txid_dictionary = gene_id_to_transcript(DATABASE, annotation='gencode')
geneid_to_txid_dictionary['ENSG00000100320.18']

['ENST00000405409.2',
 'ENST00000414461.2',
 'ENST00000449924.2',
 'ENST00000262829.7',
 'ENST00000397303.2',
 'ENST00000359369.4',
 'ENST00000463509.1',
 'ENST00000416721.2',
 'ENST00000495377.2',
 'ENST00000438146.2',
 'ENST00000473487.2',
 'ENST00000408983.2',
 'ENST00000491982.1',
 'ENST00000397305.3']

In [25]:
def compute_length(row):
    starts = row['exonStarts'].split(',')
    ends = row['exonEnds'].split(',')
    length = 0
    for start, end in zip(starts, ends):
        if start != "":
            length += (int(end) - int(start))
    return length

In [26]:
genepred = pd.read_csv(os.path.join(input_dir, 'hg19_gencode_v19.genePred'), sep='\t')
genepred['len'] = genepred.apply(compute_length, axis=1)
genepred.head()

Unnamed: 0,#bin,name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds,score,name2,cdsStartStat,cdsEndStat,exonFrames,len
0,0,ENST00000237247.6,chr1,+,66999065,67210057,67000041,67208778,27,"66999065,66999928,67091529,67098752,67099762,6...","66999090,67000051,67091593,67098777,67099846,6...",0,SGIP1,cmpl,cmpl,"-1,0,1,2,0,0,0,1,0,0,0,1,2,1,1,1,1,1,0,1,1,2,2...",3997
1,0,ENST00000371039.1,chr1,+,66999274,67210768,67000041,67208778,22,"66999274,66999928,67091529,67098752,67105459,6...","66999355,67000051,67091593,67098777,67105516,6...",0,SGIP1,cmpl,cmpl,-1012001012111011220211,4080
2,0,ENST00000424320.1,chr1,+,66999297,67145425,67000041,67145425,13,"66999297,66999928,67091529,67098752,67101626,6...","66999355,67000051,67091593,67098777,67101698,6...",0,SGIP1,cmpl,incmpl,-1012000101211,951
3,0,ENST00000371035.3,chr1,+,66999822,67208882,67000041,67208778,22,"66999822,67091529,67098752,67108492,67109226,6...","67000051,67091593,67098777,67108547,67109402,6...",0,SGIP1,cmpl,cmpl,0120100012111011220211,2180
4,0,ENST00000468286.1,chr1,+,66999838,67142779,66999838,66999838,10,"66999838,67091529,67098752,67105459,67108492,6...","67000051,67091593,67098777,67105516,67108547,6...",0,SGIP1,none,none,"-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,",846


In [27]:
all_q1_tx = []
for geneid in q1['Geneid']:
    all_q1_tx += geneid_to_txid_dictionary[geneid]

top_tx = []
for geneid in top_fred['Geneid']:
    top_tx += geneid_to_txid_dictionary[geneid]
    
len(all_q1_tx), len(top_tx)

(22742, 1290)

In [28]:
q1_genepred = genepred[genepred['name'].isin(all_q1_tx)]

# Sanity check. Make sure we're capturing the longest transcript here (so we don't really have to do it in R)

In [29]:
q1_genepred[q1_genepred['name2']=='SCP2']  # sanity check. 

Unnamed: 0,#bin,name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds,score,name2,cdsStartStat,cdsEndStat,exonFrames,len
210,15,ENST00000371514.3,chr1,+,53392900,53517375,53393068,53516376,16,"53392900,53407467,53413680,53416426,53420411,5...","53393137,53407525,53413752,53416558,53420476,5...",0,SCP2,cmpl,cmpl,0011101220112010,2811
211,15,ENST00000478631.2,chr1,+,53392948,53517073,53393068,53458949,17,"53392948,53407467,53413680,53416426,53420411,5...","53393137,53407525,53413752,53416558,53420476,5...",0,SCP2,cmpl,cmpl,"0,0,1,1,1,0,1,2,2,0,1,1,-1,-1,-1,-1,-1,",5204
212,15,ENST00000528311.1,chr1,+,53392957,53516575,53416470,53516376,15,"53392957,53413680,53416426,53420411,53427174,5...","53393137,53413752,53416558,53420476,53427301,5...",0,SCP2,cmpl,cmpl,"-1,-1,0,1,0,1,2,2,0,1,1,2,0,1,0,",1896
213,15,ENST00000371509.4,chr1,+,53392957,53516642,53393068,53516376,15,"53392957,53407467,53413680,53420411,53427174,5...","53393137,53407525,53413752,53420476,53427301,5...",0,SCP2,cmpl,cmpl,001101220112010,1889
214,15,ENST00000407246.2,chr1,+,53392957,53516932,53393068,53516376,15,"53392957,53407467,53416426,53420411,53427174,5...","53393137,53407525,53416558,53420476,53427301,5...",0,SCP2,cmpl,cmpl,001101220112010,2239
7855,992,ENST00000371513.5,chr1,+,53392989,53459904,53393068,53458949,11,"53392989,53407467,53413680,53420411,53427174,5...","53393137,53407525,53413752,53420476,53427301,5...",0,SCP2,cmpl,cmpl,00110122011,2003
7856,992,ENST00000528809.1,chr1,+,53393051,53442384,53393051,53393051,7,"53393051,53407467,53413680,53416426,53420411,5...","53393137,53407525,53413752,53416558,53420476,5...",0,SCP2,none,none,"-1,-1,-1,-1,-1,-1,-1,",570
7860,992,ENST00000529363.2,chr1,+,53407474,53446125,53407474,53446125,8,"53407474,53413680,53416426,53420411,53427174,5...","53407525,53413752,53416558,53420476,53427301,5...",0,SCP2,incmpl,cmpl,11110120,720
7861,992,ENST00000473584.1,chr1,+,53429312,53453752,53429312,53429312,6,"53429312,53440440,53442354,53443888,53446067,5...","53429392,53440504,53442441,53444039,53446215,5...",0,SCP2,none,none,"-1,-1,-1,-1,-1,-1,",582
7862,993,ENST00000430330.2,chr1,+,53480599,53516798,53480692,53516376,5,5348059953493649535045885351352953516280,5348071553493743535047185351360953516798,0,SCP2,cmpl,cmpl,02010,938


In [30]:
q1_genepred = q1_genepred.sort_values(by=['len'], ascending=False).drop_duplicates(['name2'])
q1_genepred[q1_genepred['name2']=='SCP2']  # sanity check. 

Unnamed: 0,#bin,name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds,score,name2,cdsStartStat,cdsEndStat,exonFrames,len
211,15,ENST00000478631.2,chr1,+,53392948,53517073,53393068,53458949,17,"53392948,53407467,53413680,53416426,53420411,5...","53393137,53407525,53413752,53416558,53420476,5...",0,SCP2,cmpl,cmpl,"0,0,1,1,1,0,1,2,2,0,1,1,-1,-1,-1,-1,-1,",5204


In [31]:
del q1_genepred['len']
q1_genepred.to_csv(
    os.path.join(output_dir, 'hg19_gencode_v19.riboseq.Q1fx3.genePred'.format(top)),
    sep='\t',
    header=True,
    index=False
)

### More sanity checks.

In [32]:
genepred[genepred['name2']=='ALG14']

Unnamed: 0,#bin,name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds,score,name2,cdsStartStat,cdsEndStat,exonFrames,len
9128,1313,ENST00000370205.5,chr1,-,95439962,95538501,95448631,95538454,4,95439962954926849553042195538318,95448862954928169553057395538501,0,ALG14,cmpl,cmpl,0010,9367
9129,1313,ENST00000507727.2,chr1,-,95449105,95492759,95449105,95449105,2,9544910595492684,9544918995492759,0,ALG14,none,none,"-1,-1,",159
9130,1313,ENST00000495856.1,chr1,-,95501376,95538430,95501376,95501376,4,95501376955070489553042195538318,95501438955071689553057395538430,0,ALG14,none,none,"-1,-1,-1,-1,",446


In [33]:
top_genepred = genepred[genepred['name'].isin(top_tx)]
top_genepred = top_genepred.sort_values(by=['len'], ascending=False).drop_duplicates(['name2'])
top_genepred[top_genepred['name2']=='ALG14'] # sanity check. 

Unnamed: 0,#bin,name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds,score,name2,cdsStartStat,cdsEndStat,exonFrames,len
9128,1313,ENST00000370205.5,chr1,-,95439962,95538501,95448631,95538454,4,95439962954926849553042195538318,95448862954928169553057395538501,0,ALG14,cmpl,cmpl,10,9367


In [34]:
del top_genepred['len']
top_genepred.to_csv(
    os.path.join(output_dir, 'hg19_gencode_v19.riboseq.top{}fx3.genePred'.format(top)),
    sep='\t',
    header=True,
    index=False
)

# Check metagene profiles of 20 IRES genes. 
- In theory, certain datasets should show no change (ask Kris, or maybe it's better I not know) in their metagene profile.

In [35]:
goi = [
    'ENSG00000120868.9',
    'ENSG00000107262.12',
    'ENSG00000171791.10',
    'ENSG00000248333.3',
    'ENSG00000111276.6',
    'ENSG00000009307.11',
    'ENSG00000114867.15',
    'ENSG00000110321.11',
    'ENSG00000102081.9',
    'ENSG00000100644.12',
    'ENSG00000044574.7',
    'ENSG00000070444.10',
    'ENSG00000136997.10',
    'ENSG00000186416.8',
    'ENSG00000102317.13',
    'ENSG00000159216.14',
    'ENSG00000139514.8',
    'ENSG00000141510.11',
    'ENSG00000112715.16',
    'ENSG00000101966.8'
]
all_goi_tx = []
for geneid in goi:
    all_goi_tx += geneid_to_txid_dictionary[geneid]

all_goi_tx

['ENST00000551964.1',
 'ENST00000359972.2',
 'ENST00000357310.1',
 'ENST00000339433.3',
 'ENST00000333991.1',
 'ENST00000547743.1',
 'ENST00000552268.1',
 'ENST00000550527.1',
 'ENST00000547045.1',
 'ENST00000549007.1',
 'ENST00000555047.1',
 'ENST00000546491.1',
 'ENST00000552929.1',
 'ENST00000547666.1',
 'ENST00000493917.1',
 'ENST00000472232.3',
 'ENST00000379701.1',
 'ENST00000467389.2',
 'ENST00000379707.2',
 'ENST00000379704.2',
 'ENST00000468274.1',
 'ENST00000473464.1',
 'ENST00000473781.1',
 'ENST00000488499.1',
 'ENST00000398117.1',
 'ENST00000333681.4',
 'ENST00000590515.1',
 'ENST00000589955.1',
 'ENST00000444484.1',
 'ENST00000513088.2',
 'ENST00000407249.3',
 'ENST00000317673.7',
 'ENST00000340677.5',
 'ENST00000341832.6',
 'ENST00000341028.7',
 'ENST00000477087.1',
 'ENST00000228872.4',
 'ENST00000396340.1',
 'ENST00000442489.1',
 'ENST00000339438.6',
 'ENST00000438362.2',
 'ENST00000358528.4',
 'ENST00000261443.5',
 'ENST00000530886.1',
 'ENST00000369530.1',
 'ENST0000

In [36]:
ires_genepred = genepred[genepred['name'].isin(all_goi_tx)]
ires_genepred = ires_genepred.sort_values(by=['len'], ascending=False).drop_duplicates(['name2'])
print(ires_genepred.shape[0])
ires_genepred

20


Unnamed: 0,#bin,name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds,score,name2,cdsStartStat,cdsEndStat,exonFrames,len
60332,918,ENST00000480614.1,chr6,+,43740908,43754221,43740908,43740908,3,437409084374619643746625,437454024374627343754221,0,VEGFA,none,none,"-1,-1,-1,",12167
88335,1523,ENST00000371199.3,chrX,+,122993876,123047829,123019512,123041031,7,"122993876,123019480,123022468,123025087,123026...","122994143,123020389,123022568,123025166,123026...",0,XIAP,cmpl,cmpl,-1012011,8591
155891,16,ENST00000398117.1,chr18,-,60790578,60987361,60795857,60985899,2,6079057860985314,6079599260987361,0,BCL2,cmpl,cmpl,00,7461
118264,101,ENST00000380752.5,chr13,-,30083546,30169825,30088616,30110325,13,"30083546,30090274,30091280,30091709,30093583,3...","30088720,30090383,30091447,30091927,30093686,3...",0,SLC7A1,cmpl,cmpl,"1,0,1,2,1,2,1,2,1,1,0,-1,-1,",7347
175368,107,ENST00000344691.4,chr21,-,36160097,36260987,36164431,36259409,6,"36160097,36171597,36206706,36231770,36252853,3...","36164907,36171759,36206898,36231875,36253010,3...",0,RUNX1,cmpl,cmpl,111100,7274
109081,167,ENST00000357310.1,chr12,+,99039077,99129204,99042137,99126344,26,"99039077,99042096,99042403,99043264,99052937,9...","99039613,99042275,99042593,99043462,99053121,9...",0,APAF1,cmpl,cmpl,"-1,0,0,1,1,2,1,1,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0...",7055
99632,667,ENST00000532383.1,chr11,-,10818600,10829575,10818600,10818600,15,"10818600,10820538,10820759,10821098,10821978,1...","10819428,10820660,10820971,10821895,10822194,1...",0,EIF4G2,none,none,"-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,",6769
39145,1989,ENST00000342981.4,chr3,+,184032953,184053146,184033584,184052696,32,"184032953,184033550,184033919,184035108,184035...","184033333,184033644,184034006,184035285,184035...",0,EIF4G1,cmpl,cmpl,"-1,0,0,0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,1,0,1,2...",5667
145661,602,ENST00000174618.4,chr17,-,2287353,2304412,2290194,2304006,6,228735322911502297336229759822981682303933,229094322913432297448229764022987482304412,0,MNT,cmpl,cmpl,102210,4996
88755,1706,ENST00000218200.8,chrX,+,146993468,147032632,146993697,147030364,16,"146993468,147003450,147007057,147009839,147010...","146993748,147003503,147007151,147009911,147010...",0,FMR1,cmpl,cmpl,0020020001000110,4333


In [37]:
del ires_genepred['len']
ires_genepred.to_csv(
    os.path.join(output_dir, 'hg19_gencode_v19.riboseq.IRES.genePred'),
    sep='\t',
    header=True,
    index=False
)

# Check metagene profiles for TORIN genes
- Between certain datasets, these genes should change (and should not between others).

In [38]:
goi = [
    "ENSG00000156508.13",
    "ENSG00000114942.9",
    "ENSG00000104529.13",
    "ENSG00000124802.7",
    "ENSG00000254772.5",
    "ENSG00000167658.11",
    "ENSG00000104408.5",
    "ENSG00000175390.8",
    "ENSG00000147677.6",
    "ENSG00000149806.6",
    "ENSG00000135486.13",
    "ENSG00000181163.9",
    "ENSG00000070756.9",
    "ENSG00000198755.6",
    "ENSG00000197958.8",
    "ENSG00000142541.12",
    "ENSG00000188846.9",
    "ENSG00000063177.8",
    "ENSG00000105640.8",
    "ENSG00000108298.5",
    "ENSG00000122026.6",
    "ENSG00000116251.5",
    "ENSG00000125691.8",
    "ENSG00000198242.9",
    "ENSG00000114391.8",
    "ENSG00000156482.6",
    "ENSG00000144713.8",
    "ENSG00000109475.12",
    "ENSG00000136942.10",
    "ENSG00000241343.5",
    "ENSG00000145592.9",
    "ENSG00000197756.5",
    "ENSG00000172809.8",
    "ENSG00000198918.7",
    "ENSG00000174444.10",
    "ENSG00000122406.8",
    "ENSG00000089009.11",
    "ENSG00000147604.9",
    "ENSG00000148303.12",
    "ENSG00000161016.11",
    "ENSG00000163682.11",
    "ENSG00000089157.11",
    "ENSG00000137818.7",
    "ENSG00000177600.4",
    "ENSG00000124614.9",
    "ENSG00000142534.2",
    "ENSG00000164587.7",
    "ENSG00000115268.5",
    "ENSG00000134419.11",
    "ENSG00000105193.4",
    "ENSG00000231500.2",
    "ENSG00000105372.2",
    "ENSG00000008988.5",
    "ENSG00000171858.13",
    "ENSG00000138326.14",
    "ENSG00000118181.6",
    "ENSG00000197728.5",
    "ENSG00000143947.8",
    "ENSG00000149273.10",
    "ENSG00000198034.6",
    "ENSG00000083845.4",
    "ENSG00000170889.9",
    "ENSG00000133112.12",
]
all_goi_tx = []
for geneid in goi:
    all_goi_tx += geneid_to_txid_dictionary[geneid]

len(all_goi_tx)

662

In [39]:
torin_genepred = genepred[genepred['name'].isin(all_goi_tx)]
torin_genepred = torin_genepred[(torin_genepred['cdsStartStat']!='none') & (torin_genepred['cdsEndStat']!='none')]
torin_genepred = torin_genepred.sort_values(by=['len'], ascending=False).drop_duplicates(['name2'])
print(torin_genepred.shape[0])
torin_genepred

63


Unnamed: 0,#bin,name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds,score,name2,cdsStartStat,cdsEndStat,exonFrames,len
50152,896,ENST00000274242.5,chr5,-,40825363,40835437,40832605,40835287,4,40825363408342824083457240835284,40832675408343674083470840835437,0,RPL37,cmpl,cmpl,2100,7686
96486,80,ENST00000533626.1,chr11,+,7991797,8023409,8008899,8017569,10,"7991797,8003216,8008444,8013327,8013630,801443...","7991854,8003330,8009263,8013398,8013710,801457...",0,EIF3F,cmpl,cmpl,"-1,-1,0,1,0,2,2,1,0,0,",7540
60881,1151,ENST00000316292.9,chr6,-,74225472,74230741,74227532,74229749,7,"74225472,74227752,74228076,74228420,74228654,7...","74227657,74227987,74228333,74228571,74228951,7...",0,EEF1A1,cmpl,cmpl,1010000,4441
113324,1002,ENST00000546500.1,chr12,+,54673976,54680871,54674591,54678097,10,"54673976,54675169,54675578,54675873,54676177,5...","54674606,54675286,54675725,54676084,54676270,5...",0,HNRNPA1,cmpl,cmpl,"0,0,0,0,1,1,1,1,1,-1,",4121
73060,185,ENST00000521861.1,chr8,-,117654368,117768060,117657244,117768036,8,"117654368,117658709,117661044,117668094,117669...","117657342,117658842,117661165,117668244,117669...",0,EIF3H,cmpl,cmpl,10221100,3959
76907,1361,ENST00000318607.5,chr8,-,101715143,101734940,101716525,101733811,15,"101715143,101716524,101717153,101717816,101718...","101715587,101716618,101717284,101717901,101719...",0,PABPC1,cmpl,cmpl,-101011000012010,3485
161384,615,ENST00000309311.6,chr19,-,3976053,3985467,3976551,3985378,15,"3976053,3977212,3977425,3977816,3979326,397980...","3976745,3977345,3977608,3978170,3979434,398006...",0,EEF2,cmpl,cmpl,100002100201200,3164
27244,2243,ENST00000491306.1,chr2,+,217362911,217368274,217363597,217366127,4,217362911217363992217364671217366063,217363600217364121217364754217368274,0,RPL37A,cmpl,cmpl,0002,3112
149791,867,ENST00000479035.2,chr17,-,37004119,37010096,37006384,37009963,5,3700411937006614370088563700927437009950,3700646737006728370089853700935837010096,0,RPL23,cmpl,cmpl,11110,2821
94060,1193,ENST00000440692.1,chr10,+,79793517,79816570,79793659,79814768,5,7979351779795109797952687979695179814288,7979366279795175797954787979706279816570,0,RPS24,cmpl,cmpl,00000,2814


In [40]:
del torin_genepred['len']
torin_genepred.to_csv(
    os.path.join(output_dir, 'hg19_gencode_v19.riboseq.TORIN.genePred'),
    sep='\t',
    header=True,
    index=False
)

# Let's make some metagenes with another riboseq dataset - genelist from Ryan/Kris

In [54]:
riboseq2 = pd.read_csv(os.path.join(input_dir, 'rps2_for_boxplots_RepsCombined_quartiles.txt'), sep='\t', index_col=0)
genepred = pd.read_csv(os.path.join(input_dir, 'hg19_gencode_v19.genePred'), sep='\t')
genepred['len'] = genepred.apply(compute_length, axis=1)

In [55]:
# we'll just take q1 for now.
riboseq2 = riboseq2[riboseq2['Occupancy Quartile']==1]
print("{} genes in q1.".format(riboseq2.shape[0]))
riboseq2.head()

3598 genes in q1.


Unnamed: 0,Occupancy Quartile
ENSG00000000003,1
ENSG00000135047,1
ENSG00000135046,1
ENSG00000135018,1
ENSG00000135002,1


In [56]:
genepred.head()

Unnamed: 0,#bin,name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds,score,name2,cdsStartStat,cdsEndStat,exonFrames,len
0,0,ENST00000237247.6,chr1,+,66999065,67210057,67000041,67208778,27,"66999065,66999928,67091529,67098752,67099762,6...","66999090,67000051,67091593,67098777,67099846,6...",0,SGIP1,cmpl,cmpl,"-1,0,1,2,0,0,0,1,0,0,0,1,2,1,1,1,1,1,0,1,1,2,2...",3997
1,0,ENST00000371039.1,chr1,+,66999274,67210768,67000041,67208778,22,"66999274,66999928,67091529,67098752,67105459,6...","66999355,67000051,67091593,67098777,67105516,6...",0,SGIP1,cmpl,cmpl,-1012001012111011220211,4080
2,0,ENST00000424320.1,chr1,+,66999297,67145425,67000041,67145425,13,"66999297,66999928,67091529,67098752,67101626,6...","66999355,67000051,67091593,67098777,67101698,6...",0,SGIP1,cmpl,incmpl,-1012000101211,951
3,0,ENST00000371035.3,chr1,+,66999822,67208882,67000041,67208778,22,"66999822,67091529,67098752,67108492,67109226,6...","67000051,67091593,67098777,67108547,67109402,6...",0,SGIP1,cmpl,cmpl,0120100012111011220211,2180
4,0,ENST00000468286.1,chr1,+,66999838,67142779,66999838,66999838,10,"66999838,67091529,67098752,67105459,67108492,6...","67000051,67091593,67098777,67105516,67108547,6...",0,SGIP1,none,none,"-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,",846


In [57]:
geneid_to_txid_dictionary = gene_id_to_transcript(DATABASE, annotation='ensembl')

all_q1_tx = []
for geneid in riboseq2.index:
    all_q1_tx += geneid_to_txid_dictionary[geneid]

len(all_q1_tx)

31911

In [58]:
q1_genepred = genepred[genepred['name'].isin(all_q1_tx)]
print(q1_genepred.shape[0])
q1_genepred.head()

31911


Unnamed: 0,#bin,name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds,score,name2,cdsStartStat,cdsEndStat,exonFrames,len
21,1,ENST00000374379.4,chr1,+,25071847,25170815,25072044,25167428,6,"25071847,25124232,25140584,25153500,25166350,2...","25072116,25124342,25140710,25153607,25166532,2...",0,CLIC4,cmpl,cmpl,002210,4346
22,1,ENST00000497755.1,chr1,+,25071872,25166398,25071872,25071872,6,"25071872,25119667,25124232,25140584,25153500,2...","25072116,25119704,25124342,25140710,25153607,2...",0,CLIC4,none,none,"-1,-1,-1,-1,-1,-1,",672
23,1,ENST00000488683.1,chr1,+,25071872,25170758,25072044,25167428,7,"25071872,25124232,25140584,25153500,25166350,2...","25072116,25124342,25140710,25153607,25166532,2...",0,CLIC4,cmpl,cmpl,"0,0,2,2,1,0,-1,",2723
109,9,ENST00000343813.5,chr1,-,6281252,6296000,6285139,6295971,5,62812526291961629353362949456295776,62853226292179629370362950346296000,0,ICMT,cmpl,cmpl,01200,4771
110,9,ENST00000489498.1,chr1,-,6283550,6296032,6294504,6295971,6,628355062919616293533629446462949456295776,628532262921796293703629457162950346296032,0,ICMT,cmpl,cmpl,"-1,-1,-1,2,0,0,",2612


In [59]:
# q1_genepred = q1_genepred[(q1_genepred['cdsStartStat']!='none') & (q1_genepred['cdsEndStat']!='none')]
print(q1_genepred.shape[0])
q1_genepred = q1_genepred.sort_values(by=['len'], ascending=False).drop_duplicates(['name2'])
print(q1_genepred.shape[0])

31911
3597


In [60]:
q1_genepred.head()

Unnamed: 0,#bin,name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds,score,name2,cdsStartStat,cdsEndStat,exonFrames,len
101979,1060,ENST00000378024.4,chr11,-,62283376,62314307,62284215,62303570,5,6228337662302474623034166230394062314131,6230154662302662623035706230403962314307,0,AHNAK,cmpl,cmpl,"0,1,0,-1,-1,",18787
118231,95,ENST00000382298.3,chr13,-,23902964,24007841,23904274,23985378,10,"23902964,23927923,23928657,23932473,23939304,2...","23915829,23928015,23930146,23932620,23939416,2...",0,SACS,cmpl,cmpl,"1,2,1,1,0,1,0,2,0,-1,",15639
18467,218,ENST00000243326.5,chr2,+,152266454,152338686,152266937,152331585,35,"152266454,152267773,152271336,152273074,152273...","152267041,152267852,152271433,152273202,152273...",0,RIF1,cmpl,cmpl,"0,2,0,1,0,2,0,0,1,0,1,1,1,1,0,0,2,0,0,0,1,2,0,...",15003
85036,124,ENST00000342160.3,chrX,-,53559056,53711114,53560269,53681051,83,"53559056,53560967,53561476,53562344,53563107,5...","53560372,53561158,53561658,53562462,53563213,5...",0,HUWE1,cmpl,cmpl,"2,0,1,0,2,2,2,1,1,1,0,1,2,1,0,1,0,2,1,0,1,2,1,...",14796
42261,251,ENST00000441802.2,chr4,-,187508936,187645009,187509745,187630981,27,"187508936,187516842,187517693,187518835,187519...","187510374,187516980,187518325,187518946,187519...",0,FAT1,cmpl,cmpl,"1,1,2,2,1,0,1,1,0,0,0,0,1,1,1,0,1,1,0,0,1,0,0,...",14786


In [61]:
riboseq2.reset_index(inplace=True)
riboseq2.head()

Unnamed: 0,index,Occupancy Quartile
0,ENSG00000000003,1
1,ENSG00000135047,1
2,ENSG00000135046,1
3,ENSG00000135018,1
4,ENSG00000135002,1


In [62]:
q1_genepred['ensg'] = q1_genepred['name'].apply(lambda x: x.split('.')[0])
merged = pd.merge(q1_genepred, riboseq2, how='outer', left_on=['ensg'], right_on=['index'])

In [63]:
merged[merged.isnull().any(axis=1)]

Unnamed: 0,#bin,name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds,score,name2,cdsStartStat,cdsEndStat,exonFrames,len,ensg,index,Occupancy Quartile
0,1060.0,ENST00000378024.4,chr11,-,62283376.0,62314307.0,62284215.0,62303570.0,5.0,6228337662302474623034166230394062314131,6230154662302662623035706230403962314307,0.0,AHNAK,cmpl,cmpl,"0,1,0,-1,-1,",18787.0,ENST00000378024,,
1,95.0,ENST00000382298.3,chr13,-,23902964.0,24007841.0,23904274.0,23985378.0,10.0,"23902964,23927923,23928657,23932473,23939304,2...","23915829,23928015,23930146,23932620,23939416,2...",0.0,SACS,cmpl,cmpl,"1,2,1,1,0,1,0,2,0,-1,",15639.0,ENST00000382298,,
2,218.0,ENST00000243326.5,chr2,+,152266454.0,152338686.0,152266937.0,152331585.0,35.0,"152266454,152267773,152271336,152273074,152273...","152267041,152267852,152271433,152273202,152273...",0.0,RIF1,cmpl,cmpl,"0,2,0,1,0,2,0,0,1,0,1,1,1,1,0,0,2,0,0,0,1,2,0,...",15003.0,ENST00000243326,,
3,124.0,ENST00000342160.3,chrX,-,53559056.0,53711114.0,53560269.0,53681051.0,83.0,"53559056,53560967,53561476,53562344,53563107,5...","53560372,53561158,53561658,53562462,53563213,5...",0.0,HUWE1,cmpl,cmpl,"2,0,1,0,2,2,2,1,1,1,0,1,2,1,0,1,0,2,1,0,1,2,1,...",14796.0,ENST00000342160,,
4,251.0,ENST00000441802.2,chr4,-,187508936.0,187645009.0,187509745.0,187630981.0,27.0,"187508936,187516842,187517693,187518835,187519...","187510374,187516980,187518325,187518946,187519...",0.0,FAT1,cmpl,cmpl,"1,1,2,2,1,0,1,1,0,0,0,0,1,1,1,0,1,1,0,0,1,0,0,...",14786.0,ENST00000441802,,
5,170.0,ENST00000360184.4,chr14,+,102430864.0,102517128.0,102431028.0,102516900.0,78.0,"102430864,102442048,102445655,102446055,102446...","102431284,102442136,102445829,102446311,102446...",0.0,DYNC1H1,cmpl,cmpl,"0,1,2,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,...",14333.0,ENST00000360184,,
6,819.0,ENST00000394670.4,chr17,+,30677135.0,30708905.0,30677304.0,30696778.0,12.0,"30677135,30678804,30685521,30687616,30687910,3...","30677345,30678931,30685660,30687784,30687986,3...",0.0,ZNF207,cmpl,cmpl,020112210001,13781.0,ENST00000394670,,
7,171.0,ENST00000375735.2,chr11,+,102980159.0,103350591.0,102980303.0,103349981.0,89.0,"102980159,102984265,102984831,102985905,102987...","102980498,102984436,102984967,102986024,102987...",0.0,DYNC2H1,cmpl,cmpl,"0,0,0,1,0,1,0,0,0,1,0,2,0,0,0,0,2,0,2,1,0,0,2,...",13678.0,ENST00000375735,,
8,119.0,ENST00000523565.1,chr8,-,48685668.0,48872743.0,48685668.0,48685668.0,86.0,"48685668,48689404,48690246,48691019,48691288,4...","48686938,48689544,48690435,48691221,48691360,4...",0.0,PRKDC,none,none,"-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-...",13508.0,ENST00000523565,,
9,288.0,ENST00000366844.3,chr1,-,225674536.0,225840844.0,225685476.0,225840392.0,15.0,"225674536,225686048,225688693,225692692,225695...","225685514,225686106,225688772,225692755,225695...",0.0,ENAH,cmpl,cmpl,102210201121020,13168.0,ENST00000366844,,


In [49]:
del q1_genepred['len']
q1_genepred.to_csv(
    os.path.join(output_dir, 'hg19_gencode_v19.riboseq2.Q1.genePred'),
    sep='\t',
    header=True,
    index=False
)