# Data preprocessing and MRL calculation

This notebook combines the sequence metadata in `Zb_5UTR_MPRA.tsv` with TPM counts in `tpm_data_all_samples.tsv` and saves the following into `Zb_5UTR_MPRA_TPM_MRL.tsv.gz`:
- 5'UTR insert ID and metadata
- Average (geometric mean) TPM counts across replicates for input (total RNA), 80S, LMW, and HMW fractions, for all timepoints, and their logarithms.
- Minimum input TPM across replicates and timepoints, which will later be used as a measure of sequence data quality.
- Differences in mean log2 input TPM with respect to the first timepoint (2hpf) as a measure of RNA stability.
- MRL and log2(MRL) calculated from TPMs for every timepoint, as indicated in the paper.

In [1]:
import numpy
import pandas

In [2]:
# Layout of the sublibraries: sequenced fractions, timepoints (hpf) and replicates
fraction_list = ['input', '80S', 'LMW', 'HMW']
timepoint_list = [2, 4, 6, 10]
replicate_list = ['A', 'B', 'C']

# Every fraction except the total-RNA input
pol_fraction_list = [fraction for fraction in fraction_list if fraction != 'input']

# Load metadata

In [3]:
# Read the per-insert metadata table. The first column becomes the row index,
# and "index" is read as text so values such as "21263.6" keep their suffix.
metadata_df = pandas.read_csv(
    'Zb_5UTR_MPRA.tsv',
    sep='\t',
    index_col=0,
    dtype={'index': 'str'},
)

# Discard every column whose name begins with 'mean_log2_RRS'
rrs_cols = [col for col in metadata_df.columns if col.startswith('mean_log2_RRS')]
metadata_df = metadata_df.drop(columns=rrs_cols)

metadata_df

Unnamed: 0_level_0,chr,strand,external_gene_name,utr_length,insert_length,n_uORFs,GC_content,mxfold,index,insert_seq
insert_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ENSDARG00000000001_ENSDART00000000004_19058_slc35a5_20318,chr9,-,slc35a5,103,103,2,52.427184,23.9,20318,AACGTCAACAAACGATGACGTAGCTTTTGTACGCATGCGTAAGGAT...
ENSDARG00000000018_ENSDART00000181044_14421_nrf1_72681,chr4,-,nrf1,134,134,0,61.940299,35.3,72681,GAGCCCAGGACGCGCGCACTCTGCTGCTCGTTCTTTGTCCTCCATT...
ENSDARG00000000019_ENSDART00000124452_14118_ube2h_27446,chr4,+,ube2h,178,178,1,46.629213,30.1,27446,AGAGGACTACCAGTACTATCCGCTGCGGACAGAAAGTGGAGGGCGA...
ENSDARG00000000068_ENSDART00000000069_2438_slc9a3r1a_113092,chr12,+,slc9a3r1a,152,152,0,46.052632,26.1,113092,GATTTGACTTACGGAGTTGCAACACAGTGAAGAGGCGGAGTGCGTG...
ENSDARG00000000069_ENSDART00000000070_12170_dap_20320,chr24,-,dap,153,153,1,47.058824,31.8,20320,ATCACGCACCGTTGCCAAACAATGGGCAGGAAAAAGCCACGTTGAC...
...,...,...,...,...,...,...,...,...,...,...
ENSDARG00000025554_ENSDART00000103273_1746_wdr83os_27060,chr11,+,wdr83os,126,126,1,37.301587,22.8,27060,AAGAATCCACACATCTTTGAACGCAGCGGAATTGAGCGTAGTAGAG...
ENSDARG00000103318_ENSDART00000161570_7325_mrpl3_86762,chr19,+,mrpl3,111,111,2,34.234234,12.5,86762,AATGACATTCTGCTACAGGACAAACACGAGACTGACATGATCACTA...
ENSDARG00000036698_ENSDART00000053300_7697_znf865_21263.6,chr19,-,znf865,1305,197,4,31.979695,25.9,21263.6,ACGTGTTAATTGCATGATATTGTAAACCAACTTTTACTTTTTTCTT...
ENSDARG00000056892_ENSDART00000148517_5556_mpp6a_23746.2,chr16,-,mpp6a,311,161,1,39.130435,37.6,23746.2,ATTTTCGTCTACTTGCAGGTTTTTCTGTCCATGTGTTCCAGAATAG...


In [4]:
# Column "index" indicates whether sequences come from the same utr.
# Its values look like "<base>" or "<base>.<suffix>" (e.g. "21263.6").
# Split this column to have this information more easily accessible.
index_split_cols = (
    metadata_df['index']
    .str.split('.', expand=True)
    # Guarantee that both parts exist even when no value carries a ".suffix"
    .reindex(columns=[0, 1])
)

# Remove columns left over from a previous execution so that this cell can be
# re-run without `insert` raising "already exists"
metadata_df = metadata_df.drop(columns=['index_base', 'index_suffix'], errors='ignore')

# Place both parts right before the last column of the table
metadata_df.insert(loc=metadata_df.shape[1]-1, column='index_base', value=index_split_cols[0].values)
metadata_df.insert(loc=metadata_df.shape[1]-1, column='index_suffix', value=index_split_cols[1].values)
metadata_df

Unnamed: 0_level_0,chr,strand,external_gene_name,utr_length,insert_length,n_uORFs,GC_content,mxfold,index,index_base,index_suffix,insert_seq
insert_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ENSDARG00000000001_ENSDART00000000004_19058_slc35a5_20318,chr9,-,slc35a5,103,103,2,52.427184,23.9,20318,20318,,AACGTCAACAAACGATGACGTAGCTTTTGTACGCATGCGTAAGGAT...
ENSDARG00000000018_ENSDART00000181044_14421_nrf1_72681,chr4,-,nrf1,134,134,0,61.940299,35.3,72681,72681,,GAGCCCAGGACGCGCGCACTCTGCTGCTCGTTCTTTGTCCTCCATT...
ENSDARG00000000019_ENSDART00000124452_14118_ube2h_27446,chr4,+,ube2h,178,178,1,46.629213,30.1,27446,27446,,AGAGGACTACCAGTACTATCCGCTGCGGACAGAAAGTGGAGGGCGA...
ENSDARG00000000068_ENSDART00000000069_2438_slc9a3r1a_113092,chr12,+,slc9a3r1a,152,152,0,46.052632,26.1,113092,113092,,GATTTGACTTACGGAGTTGCAACACAGTGAAGAGGCGGAGTGCGTG...
ENSDARG00000000069_ENSDART00000000070_12170_dap_20320,chr24,-,dap,153,153,1,47.058824,31.8,20320,20320,,ATCACGCACCGTTGCCAAACAATGGGCAGGAAAAAGCCACGTTGAC...
...,...,...,...,...,...,...,...,...,...,...,...,...
ENSDARG00000025554_ENSDART00000103273_1746_wdr83os_27060,chr11,+,wdr83os,126,126,1,37.301587,22.8,27060,27060,,AAGAATCCACACATCTTTGAACGCAGCGGAATTGAGCGTAGTAGAG...
ENSDARG00000103318_ENSDART00000161570_7325_mrpl3_86762,chr19,+,mrpl3,111,111,2,34.234234,12.5,86762,86762,,AATGACATTCTGCTACAGGACAAACACGAGACTGACATGATCACTA...
ENSDARG00000036698_ENSDART00000053300_7697_znf865_21263.6,chr19,-,znf865,1305,197,4,31.979695,25.9,21263.6,21263,6,ACGTGTTAATTGCATGATATTGTAAACCAACTTTTACTTTTTTCTT...
ENSDARG00000056892_ENSDART00000148517_5556_mpp6a_23746.2,chr16,-,mpp6a,311,161,1,39.130435,37.6,23746.2,23746,2,ATTTTCGTCTACTTGCAGGTTTTTCTGTCCATGTGTTCCAGAATAG...


# Load and preprocess TPM data

In [5]:
# Read the TPM table; the first column becomes the row index
tpm_df = pandas.read_csv('tpm_data_all_samples.tsv', sep='\t', index_col=0)

# Only retain UMI-deduplicated values, i.e. the columns named 'TPM_...'
tpm_cols = [col for col in tpm_df.columns if col.startswith('TPM_')]
tpm_df = tpm_df[tpm_cols]

tpm_df

Unnamed: 0_level_0,TPM_mapped_input_2hpf_repA,TPM_mapped_input_2hpf_repB,TPM_mapped_input_2hpf_repC,TPM_mapped_frac1_2hpf_repA,TPM_mapped_frac1_2hpf_repB,TPM_mapped_frac1_2hpf_repC,TPM_mapped_frac2_2hpf_repA,TPM_mapped_frac2_2hpf_repB,TPM_mapped_frac2_2hpf_repC,TPM_mapped_frac3_2hpf_repA,...,TPM_mapped_input_10hpf_repC,TPM_mapped_frac1_10hpf_repA,TPM_mapped_frac1_10hpf_repB,TPM_mapped_frac1_10hpf_repC,TPM_mapped_frac2_10hpf_repA,TPM_mapped_frac2_10hpf_repB,TPM_mapped_frac2_10hpf_repC,TPM_mapped_frac3_10hpf_repA,TPM_mapped_frac3_10hpf_repB,TPM_mapped_frac3_10hpf_repC
insert_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSDARG00000000001_ENSDART00000000004_19058_slc35a5_20318,171.146495,201.114752,210.049338,221.606150,212.378930,214.481189,92.888872,96.090673,99.731641,61.664448,...,147.312122,191.441468,250.770023,184.583434,56.032806,116.952629,82.933503,52.747412,40.851123,41.737711
ENSDARG00000000018_ENSDART00000181044_14421_nrf1_72681,97.959820,90.685908,112.749877,55.329010,56.114138,44.249196,35.854400,34.452093,47.921968,59.400849,...,94.316457,69.627556,76.402271,72.712279,35.067928,29.542186,50.479323,31.334690,15.944171,19.739347
ENSDARG00000000019_ENSDART00000124452_14118_ube2h_27446,64.030714,48.272061,43.130988,38.119199,19.933327,21.143461,15.032291,20.513721,23.605636,29.974065,...,66.394997,33.598089,11.367812,17.254783,19.344269,7.243603,22.813465,24.197116,40.851123,26.257381
ENSDARG00000000068_ENSDART00000000069_2438_slc9a3r1a_113092,67.674108,80.805719,83.688486,50.773472,42.757065,44.841650,64.597963,68.833412,91.666927,109.200022,...,132.341076,97.936423,79.229856,78.449261,112.563104,89.302387,93.574218,94.680660,143.799857,79.216405
ENSDARG00000000069_ENSDART00000000070_12170_dap_20320,81.974429,74.801185,86.610681,55.666457,44.313229,39.509558,65.276945,80.758463,62.829468,45.366536,...,93.214294,83.353067,105.620651,148.249213,63.145890,58.976314,50.479323,24.197116,14.283708,22.590987
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSDARG00000005482_ENSDART00000169300_4321_rapgef2_17462.3,63.028781,57.005929,63.618465,37.275581,23.823737,29.042857,22.501091,26.398812,33.014468,27.257746,...,61.618958,64.480489,40.586192,47.852022,44.052876,37.569675,53.139502,12.598558,24.246488,17.702462
ENSDARG00000037178_ENSDART00000054066_201_zic2b_19629.1,86.893011,83.753400,86.482233,74.226057,48.203638,49.778773,77.724945,88.966616,98.020944,63.928047,...,104.052229,58.475578,81.114913,89.923226,42.181012,45.597165,62.716145,32.226887,22.586025,35.423366
ENSDARG00000037178_ENSDART00000054066_201_zic2b_19629.2,28.780878,20.214510,30.543071,16.016403,11.633787,11.071730,24.538036,23.611137,29.837459,32.237663,...,37.371375,24.161800,23.620681,24.904093,26.082980,21.514696,29.197894,17.951738,7.641854,22.998364
ENSDARG00000101903_ENSDART00000168627_8580_zfyve9a_20081.1,99.553805,93.033135,106.070574,122.312292,73.880343,78.019115,98.999708,81.223076,99.976026,78.867799,...,153.649558,124.529601,138.609145,172.153306,91.223852,70.125606,105.279004,66.130363,70.739465,63.736075


In [6]:
# Make multiindex version of TPM dataframe
tpm_mi_df = tpm_df.copy()

# Parse (fraction, timepoint, replicate) out of every column name.
# The pattern is a raw string: "\d" inside a regular string literal is an
# invalid escape sequence.
tpm_cols = tpm_mi_df.columns.str.extract(r"^TPM_mapped_([a-zA-Z0-9]+)_(\d+)hpf_rep([ABC])$")

# Fail with a clear message if a column name does not follow the expected
# pattern (otherwise the integer conversion below fails obscurely on NaN)
unparsed_cols = tpm_mi_df.columns[tpm_cols.isna().any(axis=1).values]
if len(unparsed_cols) > 0:
    raise ValueError(f"Unexpected TPM column name(s): {list(unparsed_cols)}")

# Timepoints become integers so they can be looked up with `timepoint_list`
tpm_cols[1] = tpm_cols[1].astype(int)
# Translate the numbered fractions into the names used in `fraction_list`
tpm_cols[0] = tpm_cols[0].replace({'frac1': '80S', 'frac2': 'LMW', 'frac3': 'HMW'})

tpm_mi_df.columns = pandas.MultiIndex.from_frame(
    tpm_cols,
    names=['fraction', 'timepoint', 'replicate'],
)

# Keep the rows listed in the metadata table, in the same order
tpm_mi_df = tpm_mi_df.loc[metadata_df.index]
tpm_mi_df

fraction,input,input,input,80S,80S,80S,LMW,LMW,LMW,HMW,...,input,80S,80S,80S,LMW,LMW,LMW,HMW,HMW,HMW
timepoint,2,2,2,2,2,2,2,2,2,2,...,10,10,10,10,10,10,10,10,10,10
replicate,A,B,C,A,B,C,A,B,C,A,...,C,A,B,C,A,B,C,A,B,C
insert_id,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
ENSDARG00000000001_ENSDART00000000004_19058_slc35a5_20318,171.146495,201.114752,210.049338,221.606150,212.378930,214.481189,92.888872,96.090673,99.731641,61.664448,...,147.312122,191.441468,250.770023,184.583434,56.032806,116.952629,82.933503,52.747412,40.851123,41.737711
ENSDARG00000000018_ENSDART00000181044_14421_nrf1_72681,97.959820,90.685908,112.749877,55.329010,56.114138,44.249196,35.854400,34.452093,47.921968,59.400849,...,94.316457,69.627556,76.402271,72.712279,35.067928,29.542186,50.479323,31.334690,15.944171,19.739347
ENSDARG00000000019_ENSDART00000124452_14118_ube2h_27446,64.030714,48.272061,43.130988,38.119199,19.933327,21.143461,15.032291,20.513721,23.605636,29.974065,...,66.394997,33.598089,11.367812,17.254783,19.344269,7.243603,22.813465,24.197116,40.851123,26.257381
ENSDARG00000000068_ENSDART00000000069_2438_slc9a3r1a_113092,67.674108,80.805719,83.688486,50.773472,42.757065,44.841650,64.597963,68.833412,91.666927,109.200022,...,132.341076,97.936423,79.229856,78.449261,112.563104,89.302387,93.574218,94.680660,143.799857,79.216405
ENSDARG00000000069_ENSDART00000000070_12170_dap_20320,81.974429,74.801185,86.610681,55.666457,44.313229,39.509558,65.276945,80.758463,62.829468,45.366536,...,93.214294,83.353067,105.620651,148.249213,63.145890,58.976314,50.479323,24.197116,14.283708,22.590987
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSDARG00000025554_ENSDART00000103273_1746_wdr83os_27060,1.910848,1.545867,2.220257,1.506171,1.000000,1.789940,2.584291,1.309742,1.733156,1.452720,...,1.367388,2.715689,1.000000,2.912327,1.374373,1.000000,1.000000,5.460984,7.641854,1.814754
ENSDARG00000103318_ENSDART00000161570_7325_mrpl3_86762,1.728679,1.764213,1.834913,1.674895,1.518721,1.987425,1.905309,1.309742,1.733156,2.358159,...,2.836938,2.715689,1.000000,1.000000,2.123118,1.000000,2.596107,1.000000,1.000000,2.222131
ENSDARG00000036698_ENSDART00000053300_7697_znf865_21263.6,2.821697,1.600453,2.798274,1.674895,1.518721,1.987425,3.489600,2.858450,3.199467,2.810879,...,2.469551,5.289222,1.000000,4.824655,1.374373,1.891943,2.064071,1.000000,1.000000,1.000000
ENSDARG00000056892_ENSDART00000148517_5556_mpp6a_23746.2,4.552309,1.545867,3.247842,2.855960,2.556164,1.394970,3.036945,3.787674,7.109631,3.263599,...,3.388020,2.715689,6.655170,5.780819,3.620610,13.487206,7.384429,1.000000,4.320927,3.036886


In [7]:
# Lowest input TPM of every insert over all timepoints and replicates,
# kept as a measure of data quality
min_input_tpm_df = (
    tpm_mi_df['input']
    .min(axis=1)
    .to_frame(name='min_TPM_input')
)
min_input_tpm_df

Unnamed: 0_level_0,min_TPM_input
insert_id,Unnamed: 1_level_1
ENSDARG00000000001_ENSDART00000000004_19058_slc35a5_20318,112.333151
ENSDARG00000000018_ENSDART00000181044_14421_nrf1_72681,79.508099
ENSDARG00000000019_ENSDART00000124452_14118_ube2h_27446,38.674480
ENSDARG00000000068_ENSDART00000000069_2438_slc9a3r1a_113092,67.674108
ENSDARG00000000069_ENSDART00000000070_12170_dap_20320,65.137289
...,...
ENSDARG00000025554_ENSDART00000103273_1746_wdr83os_27060,1.367388
ENSDARG00000103318_ENSDART00000161570_7325_mrpl3_86762,1.000000
ENSDARG00000036698_ENSDART00000053300_7697_znf865_21263.6,1.275200
ENSDARG00000056892_ENSDART00000148517_5556_mpp6a_23746.2,1.545867


In [8]:
# Get geom mean TPM to append to summary table:
# exp(mean(log(TPM))) over the replicates of each (fraction, timepoint) pair.
# The grouping is done on the transposed frame because
# `DataFrame.groupby(..., axis=1)` is deprecated in pandas >= 2.1.
# Note that groupby sorts the resulting (fraction, timepoint) columns.
tpm_gmean_df = numpy.exp(
    numpy.log(tpm_mi_df).T.groupby(level=[0, 1]).mean().T
)
tpm_gmean_df

fraction,80S,80S,80S,80S,HMW,HMW,HMW,HMW,LMW,LMW,LMW,LMW,input,input,input,input
timepoint,2,4,6,10,2,4,6,10,2,4,6,10,2,4,6,10
insert_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
ENSDARG00000000001_ENSDART00000000004_19058_slc35a5_20318,216.119597,300.098846,344.106715,206.935398,52.395790,55.702172,59.340141,44.803430,96.196505,132.609072,149.614766,81.607008,193.364948,180.217975,191.701721,131.317241
ENSDARG00000000018_ENSDART00000181044_14421_nrf1_72681,51.599266,72.615184,66.370199,72.861548,44.664956,54.740053,68.632506,21.444703,38.973036,45.577612,54.166972,37.395736,100.054038,108.142798,96.296120,84.498332
ENSDARG00000000019_ENSDART00000124452_14118_ube2h_27446,25.232864,28.564950,28.856285,18.748520,26.113239,40.141316,34.204028,29.607813,19.380357,32.142153,20.427659,14.731017,51.084741,44.266252,49.038036,49.062308
ENSDARG00000000068_ENSDART00000000069_2438_slc9a3r1a_113092,46.001865,50.122090,51.030345,84.750185,100.220723,99.442134,83.137348,102.552193,74.144157,70.502780,65.049109,97.980228,77.062564,88.856876,82.746495,137.032432
ENSDARG00000000069_ENSDART00000000070_12170_dap_20320,46.019622,63.623896,82.644983,109.283419,35.883390,42.745489,42.463992,19.838702,69.189005,57.990429,68.420218,57.285593,80.981555,87.051978,76.538821,95.539704
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSDARG00000025554_ENSDART00000103273_1746_wdr83os_27060,1.391781,2.147890,1.283261,1.992386,1.132558,2.145362,1.527603,4.230865,1.803522,1.765713,1.810106,1.111821,1.871833,1.874528,2.454419,1.981724
ENSDARG00000103318_ENSDART00000161570_7325_mrpl3_86762,1.716269,1.405241,2.204592,1.395169,1.548860,2.081752,1.522542,1.304938,1.629282,1.478309,1.879646,1.766440,1.775389,2.103417,2.535939,1.415631
ENSDARG00000036698_ENSDART00000053300_7697_znf865_21263.6,1.716269,1.811403,2.652752,2.944101,2.407357,2.832935,3.156488,1.000000,3.171962,4.031745,3.385818,1.750837,2.329242,1.995212,2.326278,2.413340
ENSDARG00000056892_ENSDART00000148517_5556_mpp6a_23746.2,2.167547,2.470124,5.410523,4.709876,2.613460,3.085654,3.186368,2.358677,4.340625,3.281056,5.077191,7.117709,2.837916,2.731825,2.444676,3.017168


In [9]:
# Get mean log2 TPM to append to summary table:
# mean(log2(TPM)) over the replicates of each (fraction, timepoint) pair.
# The grouping is done on the transposed frame because
# `DataFrame.groupby(..., axis=1)` is deprecated in pandas >= 2.1.
# Note that groupby sorts the resulting (fraction, timepoint) columns.
log2_tpm_df = numpy.log2(tpm_mi_df).T.groupby(level=[0, 1]).mean().T
log2_tpm_df

fraction,80S,80S,80S,80S,HMW,HMW,HMW,HMW,LMW,LMW,LMW,LMW,input,input,input,input
timepoint,2,4,6,10,2,4,6,10,2,4,6,10,2,4,6,10
insert_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
ENSDARG00000000001_ENSDART00000000004_19058_slc35a5_20318,7.755686,8.229294,8.426712,7.693037,5.711379,5.799662,5.890936,5.485537,6.587913,7.051036,7.225109,6.350621,7.595182,7.493599,7.582719,7.036913
ENSDARG00000000018_ENSDART00000181044_14421_nrf1_72681,5.689279,6.182199,6.052464,6.187086,5.481071,5.774525,6.100820,4.422549,5.284404,5.510253,5.759342,5.224802,6.644636,6.756794,6.589406,6.400851
ENSDARG00000000019_ENSDART00000124452_14118_ube2h_27446,4.657232,4.836174,4.850814,4.228705,4.706710,5.327016,5.096094,4.887906,4.276523,5.006395,4.352452,3.880785,5.674821,5.468135,5.615829,5.616543
ENSDARG00000000068_ENSDART00000000069_2438_slc9a3r1a_113092,5.523620,5.647375,5.673283,6.405145,6.647037,6.635785,6.377425,6.680215,6.212261,6.139608,6.023457,6.614419,6.267958,6.473412,6.370626,7.098374
ENSDARG00000000069_ENSDART00000000070_12170_dap_20320,5.524177,5.991497,6.368855,6.771931,5.165244,5.417700,5.408168,4.310246,6.112471,5.857743,6.096351,5.840100,6.339521,6.443805,6.258120,6.578029
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSDARG00000025554_ENSDART00000103273_1746_wdr83os_27060,0.476932,1.102920,0.359815,0.994497,0.179585,1.101221,0.611269,2.080953,0.850817,0.820251,0.856074,0.152924,0.904452,0.906528,1.295382,0.986756
ENSDARG00000103318_ENSDART00000161570_7325_mrpl3_86762,0.779276,0.490817,1.140512,0.480439,0.631207,1.057798,0.606482,0.383981,0.704236,0.563948,0.910461,0.820845,0.828135,1.072735,1.342520,0.501445
ENSDARG00000036698_ENSDART00000053300_7697_znf865_21263.6,0.779276,0.857108,1.407490,1.557827,1.267450,1.502298,1.658320,0.000000,1.665375,2.011404,1.759504,0.808045,1.219860,0.996542,1.218023,1.271031
ENSDARG00000056892_ENSDART00000148517_5556_mpp6a_23746.2,1.116063,1.304583,2.435768,2.235689,1.385961,1.625576,1.671913,1.237978,2.117903,1.714160,2.344031,2.831413,1.504832,1.449865,1.289643,1.593195


In [10]:
# Get difference in input log2 TPM with respect to the first timepoint
input_log2_tpm_df = log2_tpm_df['input']
reference_timepoint = timepoint_list[0]
later_timepoints = timepoint_list[1:]

# Subtract the reference column from all later timepoints in one vectorized
# step. Unlike growing a frame by repeated `concat` from an empty DataFrame,
# this keeps the row index (including its name) of `log2_tpm_df`.
input_diff_log2_tpm_df = input_log2_tpm_df[later_timepoints].sub(
    input_log2_tpm_df[reference_timepoint],
    axis=0,
)

colnames = [
    f"diff_log2_TPM_input_{timepoint}-{reference_timepoint}hpf"
    for timepoint in later_timepoints
]
input_diff_log2_tpm_df.columns = colnames
input_diff_log2_tpm_df

Unnamed: 0,diff_log2_TPM_input_4-2hpf,diff_log2_TPM_input_6-2hpf,diff_log2_TPM_input_10-2hpf
ENSDARG00000000001_ENSDART00000000004_19058_slc35a5_20318,-0.101583,-0.012463,-0.558270
ENSDARG00000000018_ENSDART00000181044_14421_nrf1_72681,0.112158,-0.055230,-0.243785
ENSDARG00000000019_ENSDART00000124452_14118_ube2h_27446,-0.206685,-0.058991,-0.058277
ENSDARG00000000068_ENSDART00000000069_2438_slc9a3r1a_113092,0.205453,0.102668,0.830415
ENSDARG00000000069_ENSDART00000000070_12170_dap_20320,0.104284,-0.081402,0.238507
...,...,...,...
ENSDARG00000025554_ENSDART00000103273_1746_wdr83os_27060,0.002076,0.390930,0.082304
ENSDARG00000103318_ENSDART00000161570_7325_mrpl3_86762,0.244599,0.514384,-0.326691
ENSDARG00000036698_ENSDART00000053300_7697_znf865_21263.6,-0.223319,-0.001837,0.051171
ENSDARG00000056892_ENSDART00000148517_5556_mpp6a_23746.2,-0.054967,-0.215189,0.088363


# Estimate MRL

In [11]:
# Weights used for the fractions in the MRL sum, listed in the same order as
# pol_fraction_list. They are "estimated" (by eye) from the slide Madalena sent.
pol_value_list = [
    1,    # 80S
    3.5,  # LMW
    8,    # HMW
]

In [12]:
# Calculate MRL
# For every timepoint:
#   MRL = sum over the non-input fractions of
#         weight(fraction) * gmean_TPM(fraction) / gmean_TPM(input)

# Pair each weight with its fraction by name. The fraction columns of
# `tpm_gmean_df` are sorted by groupby (80S, HMW, LMW, input), so the weights
# must not be matched by position. This also raises if the two lists differ
# in length.
pol_weight_ser = pandas.Series(pol_value_list, index=pol_fraction_list)

mrl_columns = {}
for timepoint in timepoint_list:
    mrl_timepoint_col = f'MRL_{timepoint}hpf'
    log2_mrl_timepoint_col = f'log2_MRL_{timepoint}hpf'

    # Slice of this timepoint; its columns are the fraction names
    gmean_timepoint_df = tpm_gmean_df.xs(timepoint, axis=1, level=1)

    mrl_timepoint_df = (
        gmean_timepoint_df[pol_fraction_list]
        .div(gmean_timepoint_df['input'], axis=0)  # normalize by input, row-wise
        .mul(pol_weight_ser, axis=1)               # weights aligned on fraction name
        .sum(axis=1)
    )

    mrl_columns[mrl_timepoint_col] = mrl_timepoint_df
    mrl_columns[log2_mrl_timepoint_col] = numpy.log2(mrl_timepoint_df)

# Columns keep their insertion order: MRL and log2_MRL for each timepoint
mrl_df = pandas.DataFrame(mrl_columns)

mrl_df

Unnamed: 0_level_0,MRL_2hpf,log2_MRL_2hpf,MRL_4hpf,log2_MRL_4hpf,MRL_6hpf,log2_MRL_6hpf,MRL_10hpf,log2_MRL_10hpf
insert_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ENSDARG00000000001_ENSDART00000000004_19058_slc35a5_20318,5.026628,2.329591,6.713248,2.747011,7.002960,2.807965,6.480393,2.696081
ENSDARG00000000018_ENSDART00000181044_14421_nrf1_72681,5.450300,2.446336,6.196041,2.631347,8.359783,3.063465,4.441558,2.151066
ENSDARG00000000019_ENSDART00000124452_14118_ube2h_27446,5.911159,2.563441,10.441205,3.384216,7.626433,2.931009,6.260806,2.646348
ENSDARG00000000068_ENSDART00000000069_2438_slc9a3r1a_113092,14.368484,3.844836,12.294140,3.619899,11.405933,3.511713,9.108052,3.187143
ENSDARG00000000069_ENSDART00000000070_12170_dap_20320,7.103448,2.828519,6.990700,2.805437,8.646954,3.112192,4.903643,2.293854
...,...,...,...,...,...,...,...,...
ENSDARG00000025554_ENSDART00000103273_1746_wdr83os_27060,8.956233,3.162892,13.598507,3.765376,8.083156,3.014919,20.048542,4.325425
ENSDARG00000103318_ENSDART00000161570_7325_mrpl3_86762,11.157909,3.479995,11.045525,3.465390,8.266639,3.047301,12.727341,3.669859
ENSDARG00000036698_ENSDART00000053300_7697_znf865_21263.6,13.771432,3.783607,19.339298,4.273464,17.089539,4.095042,7.074025,2.822531
ENSDARG00000056892_ENSDART00000148517_5556_mpp6a_23746.2,13.484335,3.753213,14.144042,3.822123,19.909238,4.315366,16.071784,4.006458


# Compile data and save

In [13]:
def _with_flat_columns(df, prefix):
    """Return a copy of `df` with flattened (fraction, timepoint) column names.

    Each column pair becomes the single string '<prefix>_<fraction>_<timepoint>hpf'.
    """
    flat_df = df.copy()
    flat_df.columns = [
        f'{prefix}_{fraction}_{timepoint}hpf'
        for fraction, timepoint in flat_df.columns
    ]
    return flat_df


# Geometric-mean TPM of every fraction and timepoint, with flat column names
tpm_gmean_df_to_append = _with_flat_columns(tpm_gmean_df, 'gmean_TPM')
display(tpm_gmean_df_to_append)

# Mean log2 TPM of every fraction and timepoint, with flat column names
log2_tpm_df_to_append = _with_flat_columns(log2_tpm_df, 'log2_TPM')
display(log2_tpm_df_to_append)

# Difference in log2 input TPM (its columns are already flat)
input_diff_log2_tpm_df_to_append = input_diff_log2_tpm_df.copy()
display(input_diff_log2_tpm_df_to_append)

Unnamed: 0_level_0,gmean_TPM_80S_2hpf,gmean_TPM_80S_4hpf,gmean_TPM_80S_6hpf,gmean_TPM_80S_10hpf,gmean_TPM_HMW_2hpf,gmean_TPM_HMW_4hpf,gmean_TPM_HMW_6hpf,gmean_TPM_HMW_10hpf,gmean_TPM_LMW_2hpf,gmean_TPM_LMW_4hpf,gmean_TPM_LMW_6hpf,gmean_TPM_LMW_10hpf,gmean_TPM_input_2hpf,gmean_TPM_input_4hpf,gmean_TPM_input_6hpf,gmean_TPM_input_10hpf
insert_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
ENSDARG00000000001_ENSDART00000000004_19058_slc35a5_20318,216.119597,300.098846,344.106715,206.935398,52.395790,55.702172,59.340141,44.803430,96.196505,132.609072,149.614766,81.607008,193.364948,180.217975,191.701721,131.317241
ENSDARG00000000018_ENSDART00000181044_14421_nrf1_72681,51.599266,72.615184,66.370199,72.861548,44.664956,54.740053,68.632506,21.444703,38.973036,45.577612,54.166972,37.395736,100.054038,108.142798,96.296120,84.498332
ENSDARG00000000019_ENSDART00000124452_14118_ube2h_27446,25.232864,28.564950,28.856285,18.748520,26.113239,40.141316,34.204028,29.607813,19.380357,32.142153,20.427659,14.731017,51.084741,44.266252,49.038036,49.062308
ENSDARG00000000068_ENSDART00000000069_2438_slc9a3r1a_113092,46.001865,50.122090,51.030345,84.750185,100.220723,99.442134,83.137348,102.552193,74.144157,70.502780,65.049109,97.980228,77.062564,88.856876,82.746495,137.032432
ENSDARG00000000069_ENSDART00000000070_12170_dap_20320,46.019622,63.623896,82.644983,109.283419,35.883390,42.745489,42.463992,19.838702,69.189005,57.990429,68.420218,57.285593,80.981555,87.051978,76.538821,95.539704
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSDARG00000025554_ENSDART00000103273_1746_wdr83os_27060,1.391781,2.147890,1.283261,1.992386,1.132558,2.145362,1.527603,4.230865,1.803522,1.765713,1.810106,1.111821,1.871833,1.874528,2.454419,1.981724
ENSDARG00000103318_ENSDART00000161570_7325_mrpl3_86762,1.716269,1.405241,2.204592,1.395169,1.548860,2.081752,1.522542,1.304938,1.629282,1.478309,1.879646,1.766440,1.775389,2.103417,2.535939,1.415631
ENSDARG00000036698_ENSDART00000053300_7697_znf865_21263.6,1.716269,1.811403,2.652752,2.944101,2.407357,2.832935,3.156488,1.000000,3.171962,4.031745,3.385818,1.750837,2.329242,1.995212,2.326278,2.413340
ENSDARG00000056892_ENSDART00000148517_5556_mpp6a_23746.2,2.167547,2.470124,5.410523,4.709876,2.613460,3.085654,3.186368,2.358677,4.340625,3.281056,5.077191,7.117709,2.837916,2.731825,2.444676,3.017168


Unnamed: 0_level_0,log2_TPM_80S_2hpf,log2_TPM_80S_4hpf,log2_TPM_80S_6hpf,log2_TPM_80S_10hpf,log2_TPM_HMW_2hpf,log2_TPM_HMW_4hpf,log2_TPM_HMW_6hpf,log2_TPM_HMW_10hpf,log2_TPM_LMW_2hpf,log2_TPM_LMW_4hpf,log2_TPM_LMW_6hpf,log2_TPM_LMW_10hpf,log2_TPM_input_2hpf,log2_TPM_input_4hpf,log2_TPM_input_6hpf,log2_TPM_input_10hpf
insert_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
ENSDARG00000000001_ENSDART00000000004_19058_slc35a5_20318,7.755686,8.229294,8.426712,7.693037,5.711379,5.799662,5.890936,5.485537,6.587913,7.051036,7.225109,6.350621,7.595182,7.493599,7.582719,7.036913
ENSDARG00000000018_ENSDART00000181044_14421_nrf1_72681,5.689279,6.182199,6.052464,6.187086,5.481071,5.774525,6.100820,4.422549,5.284404,5.510253,5.759342,5.224802,6.644636,6.756794,6.589406,6.400851
ENSDARG00000000019_ENSDART00000124452_14118_ube2h_27446,4.657232,4.836174,4.850814,4.228705,4.706710,5.327016,5.096094,4.887906,4.276523,5.006395,4.352452,3.880785,5.674821,5.468135,5.615829,5.616543
ENSDARG00000000068_ENSDART00000000069_2438_slc9a3r1a_113092,5.523620,5.647375,5.673283,6.405145,6.647037,6.635785,6.377425,6.680215,6.212261,6.139608,6.023457,6.614419,6.267958,6.473412,6.370626,7.098374
ENSDARG00000000069_ENSDART00000000070_12170_dap_20320,5.524177,5.991497,6.368855,6.771931,5.165244,5.417700,5.408168,4.310246,6.112471,5.857743,6.096351,5.840100,6.339521,6.443805,6.258120,6.578029
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSDARG00000025554_ENSDART00000103273_1746_wdr83os_27060,0.476932,1.102920,0.359815,0.994497,0.179585,1.101221,0.611269,2.080953,0.850817,0.820251,0.856074,0.152924,0.904452,0.906528,1.295382,0.986756
ENSDARG00000103318_ENSDART00000161570_7325_mrpl3_86762,0.779276,0.490817,1.140512,0.480439,0.631207,1.057798,0.606482,0.383981,0.704236,0.563948,0.910461,0.820845,0.828135,1.072735,1.342520,0.501445
ENSDARG00000036698_ENSDART00000053300_7697_znf865_21263.6,0.779276,0.857108,1.407490,1.557827,1.267450,1.502298,1.658320,0.000000,1.665375,2.011404,1.759504,0.808045,1.219860,0.996542,1.218023,1.271031
ENSDARG00000056892_ENSDART00000148517_5556_mpp6a_23746.2,1.116063,1.304583,2.435768,2.235689,1.385961,1.625576,1.671913,1.237978,2.117903,1.714160,2.344031,2.831413,1.504832,1.449865,1.289643,1.593195


Unnamed: 0,diff_log2_TPM_input_4-2hpf,diff_log2_TPM_input_6-2hpf,diff_log2_TPM_input_10-2hpf
ENSDARG00000000001_ENSDART00000000004_19058_slc35a5_20318,-0.101583,-0.012463,-0.558270
ENSDARG00000000018_ENSDART00000181044_14421_nrf1_72681,0.112158,-0.055230,-0.243785
ENSDARG00000000019_ENSDART00000124452_14118_ube2h_27446,-0.206685,-0.058991,-0.058277
ENSDARG00000000068_ENSDART00000000069_2438_slc9a3r1a_113092,0.205453,0.102668,0.830415
ENSDARG00000000069_ENSDART00000000070_12170_dap_20320,0.104284,-0.081402,0.238507
...,...,...,...
ENSDARG00000025554_ENSDART00000103273_1746_wdr83os_27060,0.002076,0.390930,0.082304
ENSDARG00000103318_ENSDART00000161570_7325_mrpl3_86762,0.244599,0.514384,-0.326691
ENSDARG00000036698_ENSDART00000053300_7697_znf865_21263.6,-0.223319,-0.001837,0.051171
ENSDARG00000056892_ENSDART00000148517_5556_mpp6a_23746.2,-0.054967,-0.215189,0.088363


In [14]:
# Compile data to save

# Start from the metadata table and add, column-wise:
# min input TPM, gmean TPM, log2_TPM, input_diff_log2_TPM, MRL, log2_MRL
df_to_save = pandas.concat(
    (
        metadata_df,
        min_input_tpm_df,
        tpm_gmean_df_to_append,
        log2_tpm_df_to_append,
        input_diff_log2_tpm_df_to_append,
        mrl_df,
    ),
    axis=1,
)

# Every table must describe exactly the inserts of the metadata table;
# otherwise the column-wise concat would silently add or reorder rows
assert df_to_save.index.equals(metadata_df.index), "row index mismatch between tables"

# concat drops the index name when any input has an unnamed index; restore it
# so the saved file keeps a header for its first column
df_to_save.index.name = metadata_df.index.name

# Save
df_to_save.to_csv('Zb_5UTR_MPRA_TPM_MRL.tsv.gz', sep='\t')
df_to_save

Unnamed: 0,chr,strand,external_gene_name,utr_length,insert_length,n_uORFs,GC_content,mxfold,index,index_base,...,diff_log2_TPM_input_6-2hpf,diff_log2_TPM_input_10-2hpf,MRL_2hpf,log2_MRL_2hpf,MRL_4hpf,log2_MRL_4hpf,MRL_6hpf,log2_MRL_6hpf,MRL_10hpf,log2_MRL_10hpf
ENSDARG00000000001_ENSDART00000000004_19058_slc35a5_20318,chr9,-,slc35a5,103,103,2,52.427184,23.9,20318,20318,...,-0.012463,-0.558270,5.026628,2.329591,6.713248,2.747011,7.002960,2.807965,6.480393,2.696081
ENSDARG00000000018_ENSDART00000181044_14421_nrf1_72681,chr4,-,nrf1,134,134,0,61.940299,35.3,72681,72681,...,-0.055230,-0.243785,5.450300,2.446336,6.196041,2.631347,8.359783,3.063465,4.441558,2.151066
ENSDARG00000000019_ENSDART00000124452_14118_ube2h_27446,chr4,+,ube2h,178,178,1,46.629213,30.1,27446,27446,...,-0.058991,-0.058277,5.911159,2.563441,10.441205,3.384216,7.626433,2.931009,6.260806,2.646348
ENSDARG00000000068_ENSDART00000000069_2438_slc9a3r1a_113092,chr12,+,slc9a3r1a,152,152,0,46.052632,26.1,113092,113092,...,0.102668,0.830415,14.368484,3.844836,12.294140,3.619899,11.405933,3.511713,9.108052,3.187143
ENSDARG00000000069_ENSDART00000000070_12170_dap_20320,chr24,-,dap,153,153,1,47.058824,31.8,20320,20320,...,-0.081402,0.238507,7.103448,2.828519,6.990700,2.805437,8.646954,3.112192,4.903643,2.293854
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSDARG00000025554_ENSDART00000103273_1746_wdr83os_27060,chr11,+,wdr83os,126,126,1,37.301587,22.8,27060,27060,...,0.390930,0.082304,8.956233,3.162892,13.598507,3.765376,8.083156,3.014919,20.048542,4.325425
ENSDARG00000103318_ENSDART00000161570_7325_mrpl3_86762,chr19,+,mrpl3,111,111,2,34.234234,12.5,86762,86762,...,0.514384,-0.326691,11.157909,3.479995,11.045525,3.465390,8.266639,3.047301,12.727341,3.669859
ENSDARG00000036698_ENSDART00000053300_7697_znf865_21263.6,chr19,-,znf865,1305,197,4,31.979695,25.9,21263.6,21263,...,-0.001837,0.051171,13.771432,3.783607,19.339298,4.273464,17.089539,4.095042,7.074025,2.822531
ENSDARG00000056892_ENSDART00000148517_5556_mpp6a_23746.2,chr16,-,mpp6a,311,161,1,39.130435,37.6,23746.2,23746,...,-0.215189,0.088363,13.484335,3.753213,14.144042,3.822123,19.909238,4.315366,16.071784,4.006458
