# DaniO5P predictions on all MPRA sequences

Evaluate length and CNN models on all the MPRA sequences.

This notebook generates the supplementary table with all calculated MRL and model predictions,
and serves as an example of how to make model predictions.

In [1]:
import json
import os
import pickle
import sys

import matplotlib
from matplotlib import pyplot

import pandas

utils_dir = '../utils'
sys.path.append(utils_dir)
import custom_plots
import seq_utils
import cnn_vgg

In [2]:
%matplotlib inline
# Figure defaults for this notebook: inline resolution and sans-serif font.
matplotlib.rcParams.update({
    'figure.dpi': 120,
    'font.sans-serif': 'Arial',
})

In [3]:
# Sublibraries: fractions and timepoints that appear in the column names
# of the measurements table.
timepoint_list = [2, 4, 6, 10]
pol_fraction_list = ['80S', 'LMW', 'HMW']
tpm_fraction_list = ['input'] + pol_fraction_list

# Column holding the minimum input TPM of each sequence
min_input_tpm_col = 'min_TPM_input'

# TPM columns: one per (fraction, timepoint) pair, fraction-major order
tpm_cols = [
    f'gmean_TPM_{fraction}_{timepoint}hpf'
    for fraction in tpm_fraction_list
    for timepoint in timepoint_list
]

# Input-only TPM columns, raw and log2
input_tpm_cols = [f'gmean_TPM_input_{timepoint}hpf' for timepoint in timepoint_list]
input_log2_tpm_cols = [f'log2_TPM_input_{timepoint}hpf' for timepoint in timepoint_list]

# MRL columns: raw, log2, and "res_"-prefixed variants, one per timepoint
mrl_cols = [f'MRL_{timepoint}hpf' for timepoint in timepoint_list]
log2_mrl_cols = [f'log2_{mrl_col}' for mrl_col in mrl_cols]
res_log2_mrl_cols = [f'res_{log2_mrl_col}' for log2_mrl_col in log2_mrl_cols]

# Differences in log2 input TPM are taken against the first timepoint (2hpf),
# so that timepoint is skipped.
diff_log2_tpm_input_cols = [
    f'diff_log2_TPM_input_{timepoint}-2hpf' for timepoint in timepoint_list[1:]
]
res_diff_log2_tpm_input_cols = [f'res_{diff_col}' for diff_col in diff_log2_tpm_input_cols]

In [4]:
# Load measurements: tab-separated table whose first column becomes the index
data = pandas.read_csv('../00_data/Zb_5UTR_MPRA_TPM_MRL.tsv.gz', sep='\t', index_col=0)
data

Unnamed: 0,chr,strand,external_gene_name,utr_length,insert_length,n_uORFs,GC_content,mxfold,index,index_base,...,diff_log2_TPM_input_6-2hpf,diff_log2_TPM_input_10-2hpf,MRL_2hpf,log2_MRL_2hpf,MRL_4hpf,log2_MRL_4hpf,MRL_6hpf,log2_MRL_6hpf,MRL_10hpf,log2_MRL_10hpf
ENSDARG00000000001_ENSDART00000000004_19058_slc35a5_20318,chr9,-,slc35a5,103,103,2,52.427184,23.9,20318.0,20318,...,-0.012463,-0.558270,5.026628,2.329591,6.713248,2.747011,7.002960,2.807965,6.480393,2.696081
ENSDARG00000000018_ENSDART00000181044_14421_nrf1_72681,chr4,-,nrf1,134,134,0,61.940299,35.3,72681.0,72681,...,-0.055230,-0.243785,5.450300,2.446336,6.196041,2.631347,8.359783,3.063465,4.441558,2.151066
ENSDARG00000000019_ENSDART00000124452_14118_ube2h_27446,chr4,+,ube2h,178,178,1,46.629213,30.1,27446.0,27446,...,-0.058991,-0.058277,5.911159,2.563441,10.441205,3.384216,7.626433,2.931009,6.260806,2.646348
ENSDARG00000000068_ENSDART00000000069_2438_slc9a3r1a_113092,chr12,+,slc9a3r1a,152,152,0,46.052632,26.1,113092.0,113092,...,0.102668,0.830415,14.368484,3.844836,12.294140,3.619899,11.405933,3.511713,9.108052,3.187143
ENSDARG00000000069_ENSDART00000000070_12170_dap_20320,chr24,-,dap,153,153,1,47.058824,31.8,20320.0,20320,...,-0.081402,0.238507,7.103448,2.828519,6.990700,2.805437,8.646954,3.112192,4.903643,2.293854
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSDARG00000025554_ENSDART00000103273_1746_wdr83os_27060,chr11,+,wdr83os,126,126,1,37.301587,22.8,27060.0,27060,...,0.390930,0.082304,8.956233,3.162892,13.598507,3.765376,8.083156,3.014919,20.048542,4.325425
ENSDARG00000103318_ENSDART00000161570_7325_mrpl3_86762,chr19,+,mrpl3,111,111,2,34.234234,12.5,86762.0,86762,...,0.514384,-0.326691,11.157909,3.479995,11.045525,3.465390,8.266639,3.047301,12.727341,3.669859
ENSDARG00000036698_ENSDART00000053300_7697_znf865_21263.6,chr19,-,znf865,1305,197,4,31.979695,25.9,21263.6,21263,...,-0.001837,0.051171,13.771432,3.783607,19.339298,4.273464,17.089539,4.095042,7.074025,2.822531
ENSDARG00000056892_ENSDART00000148517_5556_mpp6a_23746.2,chr16,-,mpp6a,311,161,1,39.130435,37.6,23746.2,23746,...,-0.215189,0.088363,13.484335,3.753213,14.144042,3.822123,19.909238,4.315366,16.071784,4.006458


In [5]:
# Max sequence length used for one-hot encoding:
# the longest insert present in the measurements table.
insert_lengths = data['insert_length']
max_seq_len = insert_lengths.max()

In [6]:
# Evaluate length model

# Predictor function
# Degree of the polynomial used by the length model
poly_n = 2


def predict_from_seq_len(x, a):
    """Evaluate a polynomial of degree ``poly_n`` at ``x``.

    Computes ``a[0] + a[1]*x + ... + a[poly_n]*x**poly_n``.

    Parameters
    ----------
    x : number or array-like supporting ``**`` and ``*``
        Value(s) at which to evaluate the polynomial (sequence lengths here).
    a : indexable
        Coefficients, indexed by power; entries past ``poly_n`` are ignored.

    Returns
    -------
    Polynomial value(s), with the same shape as ``x``.
    """
    total = 0
    for power in range(poly_n + 1):
        coefficient = a[power]
        total += coefficient * (x ** power)
    return total

# Load model parameters
# The pickle is indexed by target column name further down; the file name
# encodes the polynomial degree.
# NOTE(review): pickle.load executes arbitrary code; only load trusted files.
params_path = f'../01_length_model/poly_{poly_n}_params.pickle'
with open(params_path, 'rb') as params_file:
    length_model_params = pickle.load(params_file)

# Evaluate model on every sequence
# The insert lengths are the same for every target, so extract them once.
insert_lengths = data['insert_length'].values
for ycol in log2_mrl_cols + diff_log2_tpm_input_cols:
    # Each target column has its own set of polynomial coefficients;
    # the prediction is stored next to the measurement as 'pred_len_<target>'.
    data['pred_len_' + ycol] = predict_from_seq_len(insert_lengths, length_model_params[ycol])

data

Unnamed: 0,chr,strand,external_gene_name,utr_length,insert_length,n_uORFs,GC_content,mxfold,index,index_base,...,log2_MRL_6hpf,MRL_10hpf,log2_MRL_10hpf,pred_len_log2_MRL_2hpf,pred_len_log2_MRL_4hpf,pred_len_log2_MRL_6hpf,pred_len_log2_MRL_10hpf,pred_len_diff_log2_TPM_input_4-2hpf,pred_len_diff_log2_TPM_input_6-2hpf,pred_len_diff_log2_TPM_input_10-2hpf
ENSDARG00000000001_ENSDART00000000004_19058_slc35a5_20318,chr9,-,slc35a5,103,103,2,52.427184,23.9,20318.0,20318,...,2.807965,6.480393,2.696081,3.729522,3.835332,3.563218,3.637126,-0.024803,0.239230,-0.090263
ENSDARG00000000018_ENSDART00000181044_14421_nrf1_72681,chr4,-,nrf1,134,134,0,61.940299,35.3,72681.0,72681,...,3.063465,4.441558,2.151066,3.129446,3.157052,3.153084,2.675878,0.105875,0.036108,0.057175
ENSDARG00000000019_ENSDART00000124452_14118_ube2h_27446,chr4,+,ube2h,178,178,1,46.629213,30.1,27446.0,27446,...,2.931009,6.260806,2.646348,2.582127,2.609266,2.810702,1.759990,0.121639,-0.243822,0.069192
ENSDARG00000000068_ENSDART00000000069_2438_slc9a3r1a_113092,chr12,+,slc9a3r1a,152,152,0,46.052632,26.1,113092.0,113092,...,3.511713,9.108052,3.187143,2.862373,2.874112,2.979019,2.237596,0.136392,-0.079596,0.090065
ENSDARG00000000069_ENSDART00000000070_12170_dap_20320,chr24,-,dap,153,153,1,47.058824,31.8,20320.0,20320,...,3.112192,4.903643,2.293854,2.849288,2.860783,2.970729,2.215829,0.137110,-0.085976,0.090756
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSDARG00000025554_ENSDART00000103273_1746_wdr83os_27060,chr11,+,wdr83os,126,126,1,37.301587,22.8,27060.0,27060,...,3.014919,20.048542,4.325425,3.267331,3.308956,3.245558,2.898937,0.081614,0.088060,0.030125
ENSDARG00000103318_ENSDART00000161570_7325_mrpl3_86762,chr19,+,mrpl3,111,111,2,34.234234,12.5,86762.0,86762,...,3.047301,12.727341,3.669859,3.557691,3.637156,3.444010,3.364057,0.018383,0.186344,-0.041217
ENSDARG00000036698_ENSDART00000053300_7697_znf865_21263.6,chr19,-,znf865,1305,197,4,31.979695,25.9,21263.6,21263,...,4.095042,7.074025,2.822531,2.456200,2.523231,2.749815,1.527164,0.066886,-0.361664,0.002834
ENSDARG00000056892_ENSDART00000148517_5556_mpp6a_23746.2,chr16,-,mpp6a,311,161,1,39.130435,37.6,23746.2,23746,...,4.315366,16.071784,4.006458,2.751251,2.763197,2.909640,2.051478,0.139154,-0.136831,0.091985


In [7]:
# Chromosome splits
with open('../02_cnn/chr_splits.json') as splits_file:
    chr_splits_info = json.load(splits_file)

In [8]:
# Load average ensemble model
# (plain string literal: the file name has no placeholders, so no f-string is needed)
model = cnn_vgg.load_model(os.path.join('../02_cnn/models_cnn_vgg/', 'model_avg.h5'))
model



<keras.engine.functional.Functional at 0x18d8a5ff130>

In [10]:
# Generate CNN predictions
cnn_target_cols = log2_mrl_cols + diff_log2_tpm_input_cols

# One-hot encode every insert, right-padded up to max_seq_len
seqs_onehot = seq_utils.one_hot_encode(
    data['insert_seq'],
    max_seq_len=max_seq_len,
    padding='right',
    mask_val=0,
)
y_pred = model.predict(seqs_onehot)

# Output column i of the model is stored as the prediction for target i
for output_idx, target_col in enumerate(cnn_target_cols):
    data['pred_cnn_ens_' + target_col] = y_pred[:, output_idx]
data

Unnamed: 0,chr,strand,external_gene_name,utr_length,insert_length,n_uORFs,GC_content,mxfold,index,index_base,...,pred_len_diff_log2_TPM_input_4-2hpf,pred_len_diff_log2_TPM_input_6-2hpf,pred_len_diff_log2_TPM_input_10-2hpf,pred_cnn_ens_log2_MRL_2hpf,pred_cnn_ens_log2_MRL_4hpf,pred_cnn_ens_log2_MRL_6hpf,pred_cnn_ens_log2_MRL_10hpf,pred_cnn_ens_diff_log2_TPM_input_4-2hpf,pred_cnn_ens_diff_log2_TPM_input_6-2hpf,pred_cnn_ens_diff_log2_TPM_input_10-2hpf
ENSDARG00000000001_ENSDART00000000004_19058_slc35a5_20318,chr9,-,slc35a5,103,103,2,52.427184,23.9,20318.0,20318,...,-0.024803,0.239230,-0.090263,-1.179693,-0.951195,-0.919687,-1.033110,-0.078608,-0.275502,-0.817384
ENSDARG00000000018_ENSDART00000181044_14421_nrf1_72681,chr4,-,nrf1,134,134,0,61.940299,35.3,72681.0,72681,...,0.105875,0.036108,0.057175,-0.610542,-0.316138,-0.217837,-0.220324,-0.129356,-0.111522,-0.078279
ENSDARG00000000019_ENSDART00000124452_14118_ube2h_27446,chr4,+,ube2h,178,178,1,46.629213,30.1,27446.0,27446,...,0.121639,-0.243822,0.069192,0.510013,0.515244,0.521851,0.586490,-0.009150,0.081602,0.160492
ENSDARG00000000068_ENSDART00000000069_2438_slc9a3r1a_113092,chr12,+,slc9a3r1a,152,152,0,46.052632,26.1,113092.0,113092,...,0.136392,-0.079596,0.090065,0.677019,0.406474,0.444826,0.528360,0.009686,0.142486,0.467795
ENSDARG00000000069_ENSDART00000000070_12170_dap_20320,chr24,-,dap,153,153,1,47.058824,31.8,20320.0,20320,...,0.137110,-0.085976,0.090756,-0.170920,0.064585,0.077172,0.120527,-0.028025,0.053350,0.120577
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSDARG00000025554_ENSDART00000103273_1746_wdr83os_27060,chr11,+,wdr83os,126,126,1,37.301587,22.8,27060.0,27060,...,0.081614,0.088060,0.030125,0.323986,0.124513,0.142785,0.049408,0.108482,0.176995,0.310650
ENSDARG00000103318_ENSDART00000161570_7325_mrpl3_86762,chr19,+,mrpl3,111,111,2,34.234234,12.5,86762.0,86762,...,0.018383,0.186344,-0.041217,0.176307,0.060197,0.127280,0.020198,0.193845,0.165420,0.060420
ENSDARG00000036698_ENSDART00000053300_7697_znf865_21263.6,chr19,-,znf865,1305,197,4,31.979695,25.9,21263.6,21263,...,0.066886,-0.361664,0.002834,0.123057,0.005172,-0.122078,-0.352123,0.022883,-0.049307,-0.125676
ENSDARG00000056892_ENSDART00000148517_5556_mpp6a_23746.2,chr16,-,mpp6a,311,161,1,39.130435,37.6,23746.2,23746,...,0.139154,-0.136831,0.091985,0.747183,0.519835,0.511643,0.598310,0.077057,0.324958,0.694782


In [11]:
# Generate full predictions: length model + cnn ensemble
for ycol in log2_mrl_cols + diff_log2_tpm_input_cols:
    # The full prediction is the sum of the two per-target predictions
    # computed in the earlier cells.
    data['pred_full_' + ycol] = data['pred_len_' + ycol] + data['pred_cnn_ens_' + ycol]

data

Unnamed: 0,chr,strand,external_gene_name,utr_length,insert_length,n_uORFs,GC_content,mxfold,index,index_base,...,pred_cnn_ens_diff_log2_TPM_input_4-2hpf,pred_cnn_ens_diff_log2_TPM_input_6-2hpf,pred_cnn_ens_diff_log2_TPM_input_10-2hpf,pred_full_log2_MRL_2hpf,pred_full_log2_MRL_4hpf,pred_full_log2_MRL_6hpf,pred_full_log2_MRL_10hpf,pred_full_diff_log2_TPM_input_4-2hpf,pred_full_diff_log2_TPM_input_6-2hpf,pred_full_diff_log2_TPM_input_10-2hpf
ENSDARG00000000001_ENSDART00000000004_19058_slc35a5_20318,chr9,-,slc35a5,103,103,2,52.427184,23.9,20318.0,20318,...,-0.078608,-0.275502,-0.817384,2.549829,2.884137,2.643531,2.604016,-0.103411,-0.036273,-0.907648
ENSDARG00000000018_ENSDART00000181044_14421_nrf1_72681,chr4,-,nrf1,134,134,0,61.940299,35.3,72681.0,72681,...,-0.129356,-0.111522,-0.078279,2.518904,2.840914,2.935248,2.455554,-0.023481,-0.075414,-0.021104
ENSDARG00000000019_ENSDART00000124452_14118_ube2h_27446,chr4,+,ube2h,178,178,1,46.629213,30.1,27446.0,27446,...,-0.009150,0.081602,0.160492,3.092140,3.124510,3.332553,2.346480,0.112489,-0.162220,0.229684
ENSDARG00000000068_ENSDART00000000069_2438_slc9a3r1a_113092,chr12,+,slc9a3r1a,152,152,0,46.052632,26.1,113092.0,113092,...,0.009686,0.142486,0.467795,3.539392,3.280586,3.423845,2.765956,0.146078,0.062890,0.557860
ENSDARG00000000069_ENSDART00000000070_12170_dap_20320,chr24,-,dap,153,153,1,47.058824,31.8,20320.0,20320,...,-0.028025,0.053350,0.120577,2.678368,2.925368,3.047901,2.336356,0.109086,-0.032626,0.211333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSDARG00000025554_ENSDART00000103273_1746_wdr83os_27060,chr11,+,wdr83os,126,126,1,37.301587,22.8,27060.0,27060,...,0.108482,0.176995,0.310650,3.591318,3.433469,3.388342,2.948345,0.190096,0.265055,0.340774
ENSDARG00000103318_ENSDART00000161570_7325_mrpl3_86762,chr19,+,mrpl3,111,111,2,34.234234,12.5,86762.0,86762,...,0.193845,0.165420,0.060420,3.733998,3.697353,3.571290,3.384255,0.212228,0.351765,0.019203
ENSDARG00000036698_ENSDART00000053300_7697_znf865_21263.6,chr19,-,znf865,1305,197,4,31.979695,25.9,21263.6,21263,...,0.022883,-0.049307,-0.125676,2.579257,2.528403,2.627737,1.175040,0.089769,-0.410971,-0.122842
ENSDARG00000056892_ENSDART00000148517_5556_mpp6a_23746.2,chr16,-,mpp6a,311,161,1,39.130435,37.6,23746.2,23746,...,0.077057,0.324958,0.694782,3.498435,3.283032,3.421283,2.649789,0.216211,0.188127,0.786767


In [15]:
# Save only sequence, measurements, and predictions
measured_cols = log2_mrl_cols + diff_log2_tpm_input_cols

# Column order: sequence, min input TPM, measurements, then one group of
# predictions per model (length, CNN ensemble, full).
cols_to_save = ['insert_seq', min_input_tpm_col] + measured_cols
for pred_prefix in ('pred_len_', 'pred_cnn_ens_', 'pred_full_'):
    cols_to_save.extend(pred_prefix + ycol for ycol in measured_cols)

data.to_csv('Zb_5UTR_MPRA_TPM_MRL_full_predictions.tsv.gz', sep='\t', columns=cols_to_save)