In [1]:
import pandas as pd
import os
import scipy.stats as stats
from scipy.stats import spearmanr, pearsonr

def r2(x,y):
    slope, intercept, r_value, p_value, std_err = stats.linregress(x,y)
    return r_value**2

def performances(label, pred):
    
    r = r2(label, pred)
    try:
        pearson_r = pearsonr(label, pred)[0]
    except:
        pearson_r = 1e-9
    try:
        sp_cor = spearmanr(label, pred)[0]
    except:
        sp_cor = 1e-9
    
    print(f'r-squared = {r:.4f} | pearson r = {pearson_r:.4f} | spearman R = {sp_cor:.4f}')
        
    return [r, pearson_r, sp_cor]



# RNAFM

In [63]:
pwd = '/home/ubuntu/RNA-FM/tutorials/cao_results/'
rnafm = {}
for f in sorted(os.listdir(pwd)):
    if 'results' in f:
        res_df = pd.read_csv(pwd + f)
        metrics = performances(list(res_df.y_pred), list(res_df.y_true))
        rnafm[f.replace('results.csv', 'alldata')] = metrics
    elif 'metrics' in f:
        cv_metrics = pd.read_csv(pwd + f, index_col = 0)
        rnafm[f.replace('metrics.csv', 'meanfold')] = cv_metrics.loc['mean']
rnafm = pd.DataFrame(rnafm).T
rnafm

r-squared = 0.0296 | pearson r = 0.1722 | spearman R = 0.1617
r-squared = 0.0092 | pearson r = 0.0957 | spearman R = 0.0930
r-squared = 0.0036 | pearson r = 0.0600 | spearman R = 0.0542
r-squared = 0.0022 | pearson r = -0.0467 | spearman R = -0.0456
r-squared = 0.0240 | pearson r = 0.1549 | spearman R = 0.1429
r-squared = 0.1022 | pearson r = 0.3197 | spearman R = 0.2808
r-squared = 0.0564 | pearson r = 0.2375 | spearman R = 0.2005
r-squared = 0.2338 | pearson r = 0.4835 | spearman R = 0.4061
r-squared = 0.4970 | pearson r = 0.7050 | spearman R = 0.6631
r-squared = 0.5384 | pearson r = 0.7338 | spearman R = 0.7304
r-squared = 0.2975 | pearson r = 0.5455 | spearman R = 0.5848
r-squared = 0.0271 | pearson r = 0.1647 | spearman R = 0.1434
r-squared = 0.4967 | pearson r = 0.7048 | spearman R = 0.6335


Unnamed: 0,R2,Pearson R,Spearman R
RNAFM_MLP_HEK_rnaseq_log_meanfold,0.022954,0.147983,0.137854
RNAFM_MLP_HEK_rnaseq_log_alldata,0.029647,0.172183,0.161692
RNAFM_MLP_HEK_te_log_meanfold,0.001172,0.01706,0.018339
RNAFM_MLP_HEK_te_log_alldata,0.009163,0.095721,0.092985
RNAFM_MLP_Muscle_rnaseq_log_meanfold,0.013441,0.053237,0.052203
RNAFM_MLP_Muscle_rnaseq_log_alldata,0.003597,0.059971,0.054188
RNAFM_MLP_Muscle_te_log_meanfold,0.013665,0.012394,-0.014167
RNAFM_MLP_Muscle_te_log_alldata,0.002185,-0.04674,-0.045573
RNAFM_MLP_pc3_rnaseq_log_meanfold,0.013995,0.108213,0.096017
RNAFM_MLP_pc3_rnaseq_log_alldata,0.02399,0.154886,0.142922


In [64]:
pwd = '/home/ubuntu/RNA-FM/tutorials/cao_results/'
rnafm_mlp = {}
for f in sorted(os.listdir(pwd)):
    if 'metrics' in f and 'MLP' in f:
        cv_metrics = pd.read_csv(pwd + f, index_col = 0)
        rnafm_mlp[f.replace('metrics.csv', 'meanfold')] = cv_metrics.loc['mean']
rnafm_mlp = pd.DataFrame(rnafm_mlp).T
rnafm_mlp = rnafm_mlp.add_prefix('RNAFM_MLP_Test_')
rnafm_mlp['data'] = ['_'.join(s.split('_')[2:-2]) for s in rnafm_mlp.index]
rnafm_mlp

Unnamed: 0,RNAFM_MLP_Test_R2,RNAFM_MLP_Test_Pearson R,RNAFM_MLP_Test_Spearman R,data
RNAFM_MLP_HEK_rnaseq_log_meanfold,0.022954,0.147983,0.137854,HEK_rnaseq
RNAFM_MLP_HEK_te_log_meanfold,0.001172,0.01706,0.018339,HEK_te
RNAFM_MLP_Muscle_rnaseq_log_meanfold,0.013441,0.053237,0.052203,Muscle_rnaseq
RNAFM_MLP_Muscle_te_log_meanfold,0.013665,0.012394,-0.014167,Muscle_te
RNAFM_MLP_pc3_rnaseq_log_meanfold,0.013995,0.108213,0.096017,pc3_rnaseq
RNAFM_MLP_pc3_te_log_meanfold,0.003728,0.034268,0.03544,pc3_te


In [4]:
pwd = '/home/ubuntu/RNA-FM/tutorials/cao_results/'
rnafm_resnet = {}
for f in sorted(os.listdir(pwd)):
    if 'metrics' in f and 'ResNet' in f and 'MLP' not in f:
        cv_metrics = pd.read_csv(pwd + f, index_col = 0)
        rnafm_resnet[f.replace('metrics.csv', 'meanfold')] = cv_metrics.loc['mean']
rnafm_resnet = pd.DataFrame(rnafm_resnet).T
rnafm_resnet = rnafm_resnet.add_prefix('RNAFM_ResNet_Test_')
rnafm_resnet['data'] = ['_'.join(s.split('_')[2:-2]) for s in rnafm_resnet.index]
rnafm_resnet

Unnamed: 0,RNAFM_ResNet_Test_R2,RNAFM_ResNet_Test_Pearson R,RNAFM_ResNet_Test_Spearman R,data
RNAFM_ResNet_HEK_rnaseq_log_meanfold,0.043393,0.202691,0.17178,HEK_rnaseq
RNAFM_ResNet_HEK_te_log_meanfold,0.016635,0.076708,0.063899,HEK_te
RNAFM_ResNet_Muscle_rnaseq_log_meanfold,0.54527,0.704663,0.631998,Muscle_rnaseq
RNAFM_ResNet_Muscle_te_log_meanfold,0.074334,0.192222,0.177788,Muscle_te
RNAFM_ResNet_Muscle_te_log_metrics_reloadCV.csv,0.008939,0.049202,0.038811,Muscle_te_log
RNAFM_ResNet_pc3_rnaseq_log_meanfold,0.016225,0.115396,0.097705,pc3_rnaseq
RNAFM_ResNet_pc3_te_log_meanfold,0.040445,0.130057,0.113203,pc3_te


# RNABERT

In [5]:
pwd = '/home/ubuntu/RNABERT/'
files = [f for f in os.listdir(pwd) if '_CNN.csv' in f]
rnabert_mlp = pd.DataFrame()
for f in files:
    rnabert_mlp = rnabert_mlp.append(pd.read_csv(pwd + f, index_col = 0).loc['mean'])
rnabert_mlp.index = files
rnabert_mlp = rnabert_mlp.sort_values('Test_SpearmanR', ascending = False)[['Test_R2', 'Test_PearsonR', 'Test_SpearmanR', 'Train_R2', 'Train_PearsonR',
       'Train_SpearmanR', 'best_epoch']]

rnabert_mlp = rnabert_mlp.add_prefix('RNABERT_')
rnabert_mlp['data'] = ['_'.join(s.split('_')[1:3]) for s in rnabert_mlp.index]
rnabert_mlp = rnabert_mlp[['RNABERT_Test_R2', 'RNABERT_Test_PearsonR', 'RNABERT_Test_SpearmanR', 'data']].drop_duplicates('data', keep = 'first')
rnabert_mlp

Unnamed: 0,RNABERT_Test_R2,RNABERT_Test_PearsonR,RNABERT_Test_SpearmanR,data
RNABERT_Muscle_rnaseq_log_CNNlayer0_epoch300_finetunedFalse_lr0.01_dropout30.5_CNN.csv,0.4472,0.6646,0.5993,Muscle_rnaseq
RNABERT_Muscle_te_log_CNNlayer0_epoch300_finetunedFalse_lr0.001_dropout30.5_CNN.csv,0.3521,0.5889,0.5606,Muscle_te
RNABERT_pc3_te_log_CNNlayer0_epoch300_finetunedFalse_lr0.0001_dropout30.5_CNN.csv,0.2771,0.5258,0.4788,pc3_te
RNABERT_HEK_rnaseq_log_CNNlayer0_epoch300_finetunedFalse_lr0.001_dropout30.5_CNN.csv,0.2478,0.4971,0.468,HEK_rnaseq
RNABERT_pc3_rnaseq_log_CNNlayer0_epoch300_finetunedFalse_lr0.001_dropout30.5_CNN.csv,0.2239,0.4725,0.4466,pc3_rnaseq
RNABERT_HEK_te_log_CNNlayer0_epoch300_finetunedFalse_lr0.0001_dropout30.5_CNN.csv,0.2106,0.4588,0.4118,HEK_te


# Kipoi

In [6]:
kipoi = {}
for cell_line in ['Muscle', 'pc3', 'HEK']:
    for label_type in ['te_log', 'rnaseq_log']:
        fn = f'kipoi_{cell_line}_{label_type}_10foldcv'
        metrics_cv = pd.read_csv(f'/home/ubuntu/5UTR/Modelling/{fn}_metrics.csv', index_col = 0)
        kipoi[fn + '_meanfold'] = metrics_cv.loc['mean'][['Test_R2', 'Test_PearsonR', 'Test_SpearmanR']]
        
        res_df = pd.read_csv(f'/home/ubuntu/5UTR/Modelling/{fn}_results.csv')
        metrics = performances(list(res_df.y_pred), list(res_df[label_type]))
        kipoi[fn + '_alldata'] = metrics
kipoi = pd.DataFrame(kipoi).T
kipoi

r-squared = 0.7275 | pearson r = -0.8529 | spearman R = -0.8767
r-squared = 0.0004 | pearson r = -0.0198 | spearman R = -0.0083
r-squared = 0.0431 | pearson r = -0.2076 | spearman R = -0.2407
r-squared = 0.0128 | pearson r = 0.1130 | spearman R = 0.0910
r-squared = 0.1223 | pearson r = -0.3498 | spearman R = -0.3711
r-squared = 0.0372 | pearson r = 0.1930 | spearman R = 0.1665


Unnamed: 0,Test_R2,Test_PearsonR,Test_SpearmanR
kipoi_Muscle_te_log_10foldcv_meanfold,0.012941,0.058718,0.057258
kipoi_Muscle_te_log_10foldcv_alldata,0.727478,-0.852923,-0.876706
kipoi_Muscle_rnaseq_log_10foldcv_meanfold,0.012774,0.00745,0.009575
kipoi_Muscle_rnaseq_log_10foldcv_alldata,0.000391,-0.019766,-0.008311
kipoi_pc3_te_log_10foldcv_meanfold,0.001694,0.03101,0.026349
kipoi_pc3_te_log_10foldcv_alldata,0.043102,-0.20761,-0.240668
kipoi_pc3_rnaseq_log_10foldcv_meanfold,0.033205,0.169635,0.140317
kipoi_pc3_rnaseq_log_10foldcv_alldata,0.012758,0.112951,0.091016
kipoi_HEK_te_log_10foldcv_meanfold,0.001353,0.001639,0.005659
kipoi_HEK_te_log_10foldcv_alldata,0.122334,-0.349763,-0.37107


In [7]:
kipoi = {}
for cell_line in ['Muscle', 'pc3', 'HEK']:
    for label_type in ['te_log', 'rnaseq_log']:
        fn = f'kipoi_{cell_line}_{label_type}_10foldcv'
        metrics_cv = pd.read_csv(f'/home/ubuntu/5UTR/Modelling/{fn}_metrics.csv', index_col = 0)
        kipoi[fn + '_meanfold'] = metrics_cv.loc['mean'][['Test_R2', 'Test_PearsonR', 'Test_SpearmanR']]
        
kipoi = pd.DataFrame(kipoi).T
kipoi = kipoi.add_prefix('Kipoi_')
kipoi['data'] = ['_'.join(s.split('_')[1:-3]) for s in kipoi.index]
kipoi

Unnamed: 0,Kipoi_Test_R2,Kipoi_Test_PearsonR,Kipoi_Test_SpearmanR,data
kipoi_Muscle_te_log_10foldcv_meanfold,0.012941,0.058718,0.057258,Muscle_te
kipoi_Muscle_rnaseq_log_10foldcv_meanfold,0.012774,0.00745,0.009575,Muscle_rnaseq
kipoi_pc3_te_log_10foldcv_meanfold,0.001694,0.03101,0.026349,pc3_te
kipoi_pc3_rnaseq_log_10foldcv_meanfold,0.033205,0.169635,0.140317,pc3_rnaseq
kipoi_HEK_te_log_10foldcv_meanfold,0.001353,0.001639,0.005659,HEK_te
kipoi_HEK_rnaseq_log_10foldcv_meanfold,0.045568,0.209863,0.178606,HEK_rnaseq


# ESM_MLP

In [8]:
muscle_esm_df, muscle_metrics = pd.DataFrame(), []
for i in range(10):
    temp = pd.read_csv(f'/home/ubuntu/esm2/Cao/y_pred/FeatCVESM2lr1e-5_DDP_M3.1.1e-4_Muscle_te_log_27CaoFeats_utr_seqlen100_AvgEmbFalse_BosEmbTrue_CNNlayer0_epoch300_patiences0_nodes40_dropout30.5_finetuneTrue_huberlossTrue_magicFalse_lr0.0001_val_fold{i}.csv')
    muscle_metrics.append(performances(list(temp.y_true), list(temp.y_pred)))
    muscle_esm_df = muscle_esm_df.append(temp)
muscle_metrics = pd.DataFrame(muscle_metrics, index = list(range(10)), columns = ['Test_R2', 'Test_PearsonR', 'Test_SpearmanR'])
muscle_metrics.loc['mean'] = muscle_metrics.mean(axis = 0)
muscle_metrics.loc['std'] = muscle_metrics.std(axis = 0)
muscle_metrics.loc['alldata_mean'] = performances(list(muscle_esm_df.y_true), list(muscle_esm_df.y_pred))
muscle_metrics

r-squared = 0.5712 | pearson r = 0.7558 | spearman R = 0.7485
r-squared = 0.3787 | pearson r = 0.6154 | spearman R = 0.6058
r-squared = 0.5075 | pearson r = 0.7124 | spearman R = 0.6865
r-squared = 0.4809 | pearson r = 0.6935 | spearman R = 0.6583
r-squared = 0.4417 | pearson r = 0.6646 | spearman R = 0.5996
r-squared = 0.4687 | pearson r = 0.6846 | spearman R = 0.7103
r-squared = 0.3999 | pearson r = 0.6324 | spearman R = 0.6320
r-squared = 0.4767 | pearson r = 0.6904 | spearman R = 0.6385
r-squared = 0.5301 | pearson r = 0.7281 | spearman R = 0.6694
r-squared = 0.5921 | pearson r = 0.7695 | spearman R = 0.7104
r-squared = 0.4758 | pearson r = 0.6898 | spearman R = 0.6687


Unnamed: 0,Test_R2,Test_PearsonR,Test_SpearmanR
0,0.571171,0.755759,0.748486
1,0.378724,0.615405,0.605777
2,0.507527,0.71241,0.686513
3,0.480945,0.693502,0.658256
4,0.44171,0.664613,0.599576
5,0.468726,0.684636,0.710342
6,0.399895,0.632372,0.632033
7,0.476652,0.6904,0.638506
8,0.530134,0.728103,0.669422
9,0.592059,0.769454,0.710427


In [9]:
pc3_esm_df, pc3_metrics = pd.DataFrame(), []
for i in range(10):
    temp = pd.read_csv(f'/home/ubuntu/esm2/Cao/y_pred/FeatCVESM2lr1e-5_ESM2SI_3.1_P.1e-4.dr5_pc3_te_log_27CaoFeats_utr_seqlen100_AvgEmbFalse_BosEmbTrue_CNNlayer0_epoch300_patiences0_nodes40_dropout30.5_finetuneTrue_huberlossTrue_magicFalse_lr0.0001_val_fold{i}.csv')
    pc3_metrics.append(performances(list(temp.y_true), list(temp.y_pred)))
    pc3_esm_df = pc3_esm_df.append(temp)
pc3_metrics = pd.DataFrame(pc3_metrics, index = list(range(10)), columns = ['Test_R2', 'Test_PearsonR', 'Test_SpearmanR'])
pc3_metrics.loc['mean'] = pc3_metrics.mean(axis = 0)
pc3_metrics.loc['std'] = pc3_metrics.std(axis = 0)
pc3_metrics.loc['alldata_mean'] = performances(list(pc3_esm_df.y_true), list(pc3_esm_df.y_pred))
pc3_metrics

r-squared = 0.4775 | pearson r = 0.6910 | spearman R = 0.6400
r-squared = 0.5019 | pearson r = 0.7084 | spearman R = 0.6527
r-squared = 0.4636 | pearson r = 0.6809 | spearman R = 0.6038
r-squared = 0.4280 | pearson r = 0.6542 | spearman R = 0.6150
r-squared = 0.4591 | pearson r = 0.6775 | spearman R = 0.6420
r-squared = 0.4891 | pearson r = 0.6994 | spearman R = 0.6189
r-squared = 0.4933 | pearson r = 0.7023 | spearman R = 0.6278
r-squared = 0.4780 | pearson r = 0.6914 | spearman R = 0.6468
r-squared = 0.4485 | pearson r = 0.6697 | spearman R = 0.6444
r-squared = 0.4162 | pearson r = 0.6452 | spearman R = 0.6141
r-squared = 0.4649 | pearson r = 0.6819 | spearman R = 0.6305


Unnamed: 0,Test_R2,Test_PearsonR,Test_SpearmanR
0,0.477469,0.690991,0.639953
1,0.501882,0.708436,0.652653
2,0.463572,0.680861,0.603829
3,0.428006,0.654221,0.615013
4,0.459063,0.677542,0.642042
5,0.48914,0.699386,0.618881
6,0.49326,0.702324,0.627768
7,0.47804,0.691405,0.646819
8,0.448521,0.669717,0.644363
9,0.41623,0.645159,0.614094


In [10]:
HEK_esm_df, HEK_metrics = pd.DataFrame(), []
for i in range(10):
    temp = pd.read_csv(f'/home/ubuntu/esm2/Cao/y_pred/CVESM2lr1e-5_ESM2SI_3.1.1e-2.H.dropout5_HEK_te_log_utr_seqlen100_AvgEmbFalse_BosEmbTrue_CNNlayer0_epoch300_patiences0_nodes40_dropout30.5_finetuneTrue_huberlossTrue_magicFalse_lr0.01_val_fold{i}.csv')
    HEK_metrics.append(performances(list(temp.y_true), list(temp.y_pred)))
    HEK_esm_df = HEK_esm_df.append(temp)
HEK_metrics = pd.DataFrame(HEK_metrics, index = list(range(10)), columns = ['Test_R2', 'Test_PearsonR', 'Test_SpearmanR'])
HEK_metrics.loc['mean'] = HEK_metrics.mean(axis = 0)
HEK_metrics.loc['std'] = HEK_metrics.std(axis = 0)
HEK_metrics.loc['alldata_mean'] = performances(list(HEK_esm_df.y_true), list(HEK_esm_df.y_pred))
HEK_metrics

r-squared = 0.3851 | pearson r = 0.6206 | spearman R = 0.6035
r-squared = 0.4374 | pearson r = 0.6613 | spearman R = 0.6178
r-squared = 0.3486 | pearson r = 0.5904 | spearman R = 0.5635
r-squared = 0.4328 | pearson r = 0.6579 | spearman R = 0.6064
r-squared = 0.4362 | pearson r = 0.6604 | spearman R = 0.6162
r-squared = 0.4074 | pearson r = 0.6382 | spearman R = 0.6153
r-squared = 0.4202 | pearson r = 0.6482 | spearman R = 0.5879
r-squared = 0.4199 | pearson r = 0.6480 | spearman R = 0.5988
r-squared = 0.4185 | pearson r = 0.6469 | spearman R = 0.6195
r-squared = 0.4106 | pearson r = 0.6408 | spearman R = 0.6080
r-squared = 0.4051 | pearson r = 0.6365 | spearman R = 0.5985


Unnamed: 0,Test_R2,Test_PearsonR,Test_SpearmanR
0,0.385098,0.620563,0.60354
1,0.437369,0.661339,0.617768
2,0.348569,0.590397,0.563545
3,0.432835,0.657902,0.606385
4,0.436162,0.660426,0.616173
5,0.407352,0.638241,0.615326
6,0.420173,0.648207,0.587895
7,0.419867,0.647972,0.598786
8,0.418491,0.646909,0.619503
9,0.410646,0.640817,0.607958


In [11]:
esm_mlp = pd.DataFrame()
esm_mlp = esm_mlp.append(muscle_metrics.loc['mean', :])
esm_mlp = esm_mlp.append(pc3_metrics.loc['mean', :])
esm_mlp = esm_mlp.append(HEK_metrics.loc['mean', :])
for f in ['CVESM2lr1e-5_RNAlog_ESM2SI_3.1.1e-2.H.dropout5_HEK_rnaseq_log_utr_seqlen100_AvgEmbFalse_BosEmbTrue_CNNlayer0_epoch300_patiences0_nodes40_dropout30.5_finetuneTrue_huberlossTrue_magicFalse_lr0.01_metrics.csv',
'FeatCVESM2lr1e-5_RNAlog_ESM2SI_3.1_P.1e-4.dr5_pc3_rnaseq_log_27CaoFeats_utr_seqlen100_AvgEmbFalse_BosEmbTrue_CNNlayer0_epoch300_patiences0_nodes40_dropout30.5_finetuneTrue_huberlossTrue_magicFalse_lr0.0001_metrics.csv',
'FeatCVESM2lr1e-5_RNAlog_DDP_M3.1.1e-4_Muscle_rnaseq_log_27CaoFeats_utr_seqlen100_AvgEmbFalse_BosEmbTrue_CNNlayer0_epoch300_patiences0_nodes40_dropout30.5_finetuneTrue_huberlossTrue_magicFalse_lr0.0001_metrics.csv']:
    esm_mlp = esm_mlp.append(pd.read_csv(f'/home/ubuntu/esm2/Cao/results/{f}', index_col = 0).loc['mean', :])

esm_mlp = esm_mlp.add_prefix('ESM_MLP_')
esm_mlp['data'] = ['Muscle_te',
'pc3_te',
'HEK_te',
'Muscle_rnaseq',
'pc3_rnaseq',
'HEK_rnaseq']
esm_mlp = esm_mlp[['ESM_MLP_Test_R2', 'ESM_MLP_Test_PearsonR', 'ESM_MLP_Test_SpearmanR', 'data']]
esm_mlp

Unnamed: 0,ESM_MLP_Test_R2,ESM_MLP_Test_PearsonR,ESM_MLP_Test_SpearmanR,data
mean,0.484754,0.694665,0.665934,Muscle_te
mean,0.465518,0.682004,0.630541,pc3_te
mean,0.411656,0.641277,0.603688,HEK_te
mean,0.424789,0.651494,0.615477,Muscle_rnaseq
mean,0.354287,0.594308,0.562893,pc3_rnaseq
mean,0.485679,0.694588,0.652459,HEK_rnaseq


# ESM_ResNet

# Optimus

In [28]:
from tqdm import tqdm

In [69]:
optimus, optimus_f = pd.DataFrame(), []
for f in tqdm(os.listdir('/home/ubuntu/CNN/')):
    if 'Optimus_Keras' in f and '_metrics.csv' in f:
        optimus = optimus.append(pd.read_csv(f'/home/ubuntu/CNN/{f}', index_col = 0).loc['mean'])
        optimus_f.append(f)
optimus['file'] = optimus_f
optimus = optimus.set_index('file')
optimus = optimus.add_prefix('Optimus_')
optimus['data'] = ['_'.join(s.split('_')[2:4]) for s in optimus.index]
optimus = optimus.sort_values(by = ['data', 'Optimus_Test_SpearmanR'], ascending = False)
optimus = optimus.drop_duplicates('data', keep = 'first')
optimus

100%|██████████| 233/233 [00:00<00:00, 2536.41it/s]



Unnamed: 0_level_0,Optimus_Test_R2,Optimus_Test_PearsonR,Optimus_Test_SpearmanR,data
file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Optimus_Keras_pc3_te_log_10foldcv_epochs200_metrics.csv,0.526478,0.725482,0.688503,pc3_te
Optimus_Keras_pc3_rnaseq_log_10foldcv_epochs200_metrics.csv,0.504787,0.710177,0.676105,pc3_rnaseq
Optimus_Keras_Muscle_te_log_10foldcv_epochs200_metrics.csv,0.476667,0.688124,0.670826,Muscle_te
Optimus_Keras_Muscle_rnaseq_log_10foldcv_epochs200_metrics.csv,0.56452,0.750097,0.691968,Muscle_rnaseq
Optimus_Keras_HEK_te_log_10foldcv_epochs200_metrics.csv,0.448824,0.669613,0.635757,HEK_te
Optimus_Keras_HEK_rnaseq_log_10foldcv_epochs200_metrics.csv,0.53623,0.732016,0.692626,HEK_rnaseq


Unnamed: 0_level_0,Optimus_Test_R2,Optimus_Test_PearsonR,Optimus_Test_SpearmanR,data
file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Optimus_Keras_pc3_te_log_10foldcv_epochs200_metrics.csv,0.526478,0.725482,0.688503,pc3_te
Optimus_Keras_pc3_rnaseq_log_10foldcv_epochs200_metrics.csv,0.504787,0.710177,0.676105,pc3_rnaseq
Optimus_Keras_Muscle_te_log_10foldcv_epochs200_metrics.csv,0.476667,0.688124,0.670826,Muscle_te
Optimus_Keras_Muscle_rnaseq_log_10foldcv_epochs200_metrics.csv,0.56452,0.750097,0.691968,Muscle_rnaseq
Optimus_Keras_HEK_te_log_10foldcv_epochs200_metrics.csv,0.448824,0.669613,0.635757,HEK_te
Optimus_Keras_HEK_rnaseq_log_10foldcv_epochs200_metrics.csv,0.53623,0.732016,0.692626,HEK_rnaseq


In [71]:
optimus, optimus_f = pd.DataFrame(), []
for f in tqdm(os.listdir('/home/ubuntu/CNN/')):
    if 'Optimus_Keras' in f and '_metrics.csv' in f and 'epochs' not in f:
        optimus = optimus.append(pd.read_csv(f'/home/ubuntu/CNN/{f}', index_col = 0).loc['mean'])
        optimus_f.append(f)
optimus['file'] = optimus_f
optimus = optimus.set_index('file')
optimus = optimus.add_prefix('Optimus_')
optimus['data'] = ['_'.join(s.split('_')[2:4]) for s in optimus.index]
optimus = optimus.sort_values(by = ['data', 'Optimus_Test_SpearmanR'], ascending = False)
optimus = optimus.drop_duplicates('data', keep = 'first')
optimus

100%|██████████| 233/233 [00:00<00:00, 12040.57it/s]



Unnamed: 0_level_0,Optimus_Test_R2,Optimus_Test_PearsonR,Optimus_Test_SpearmanR,data
file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Optimus_Keras_pc3_te_log_10foldcv_metrics.csv,0.189364,0.43265,0.380428,pc3_te
Optimus_Keras_pc3_rnaseq_log_10foldcv_metrics.csv,0.054976,0.22759,0.189629,pc3_rnaseq
Optimus_Keras_Muscle_te_log_10foldcv_metrics.csv,0.192694,0.433315,0.414386,Muscle_te
Optimus_Keras_Muscle_rnaseq_log_10foldcv_metrics.csv,0.033868,0.112316,0.152015,Muscle_rnaseq
Optimus_Keras_HEK_te_log_10foldcv_metrics.csv,0.168875,0.405266,0.360436,HEK_te
Optimus_Keras_HEK_rnaseq_log_10foldcv_metrics.csv,0.051391,0.214049,0.184561,HEK_rnaseq


Unnamed: 0_level_0,Optimus_Test_R2,Optimus_Test_PearsonR,Optimus_Test_SpearmanR,data
file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Optimus_Keras_pc3_te_log_10foldcv_metrics.csv,0.189364,0.43265,0.380428,pc3_te
Optimus_Keras_pc3_rnaseq_log_10foldcv_metrics.csv,0.054976,0.22759,0.189629,pc3_rnaseq
Optimus_Keras_Muscle_te_log_10foldcv_metrics.csv,0.192694,0.433315,0.414386,Muscle_te
Optimus_Keras_Muscle_rnaseq_log_10foldcv_metrics.csv,0.033868,0.112316,0.152015,Muscle_rnaseq
Optimus_Keras_HEK_te_log_10foldcv_metrics.csv,0.168875,0.405266,0.360436,HEK_te
Optimus_Keras_HEK_rnaseq_log_10foldcv_metrics.csv,0.051391,0.214049,0.184561,HEK_rnaseq


# rnabert_ResNet

# 整合

In [72]:
metrics_df = pd.merge(rnabert_mlp, esm_mlp, on = 'data', how = 'outer')
metrics_df = pd.merge(metrics_df, rnafm_resnet, on = 'data', how = 'outer')
metrics_df = pd.merge(metrics_df, kipoi, on = 'data', how = 'outer')
metrics_df = pd.merge(metrics_df, optimus, on = 'data', how = 'outer')
metrics_df = pd.merge(metrics_df, rnafm_mlp, on = 'data', how = 'outer')
metrics_df

Unnamed: 0,RNABERT_Test_R2,RNABERT_Test_PearsonR,RNABERT_Test_SpearmanR,data,ESM_MLP_Test_R2,ESM_MLP_Test_PearsonR,ESM_MLP_Test_SpearmanR,RNAFM_ResNet_Test_R2,RNAFM_ResNet_Test_Pearson R,RNAFM_ResNet_Test_Spearman R,Kipoi_Test_R2,Kipoi_Test_PearsonR,Kipoi_Test_SpearmanR,Optimus_Test_R2,Optimus_Test_PearsonR,Optimus_Test_SpearmanR,RNAFM_MLP_Test_R2,RNAFM_MLP_Test_Pearson R,RNAFM_MLP_Test_Spearman R
0,0.4472,0.6646,0.5993,Muscle_rnaseq,0.424789,0.651494,0.615477,0.54527,0.704663,0.631998,0.012774,0.00745,0.009575,0.033868,0.112316,0.152015,0.013441,0.053237,0.052203
1,0.3521,0.5889,0.5606,Muscle_te,0.484754,0.694665,0.665934,0.074334,0.192222,0.177788,0.012941,0.058718,0.057258,0.192694,0.433315,0.414386,0.013665,0.012394,-0.014167
2,0.2771,0.5258,0.4788,pc3_te,0.465518,0.682004,0.630541,0.040445,0.130057,0.113203,0.001694,0.03101,0.026349,0.189364,0.43265,0.380428,0.003728,0.034268,0.03544
3,0.2478,0.4971,0.468,HEK_rnaseq,0.485679,0.694588,0.652459,0.043393,0.202691,0.17178,0.045568,0.209863,0.178606,0.051391,0.214049,0.184561,0.022954,0.147983,0.137854
4,0.2239,0.4725,0.4466,pc3_rnaseq,0.354287,0.594308,0.562893,0.016225,0.115396,0.097705,0.033205,0.169635,0.140317,0.054976,0.22759,0.189629,0.013995,0.108213,0.096017
5,0.2106,0.4588,0.4118,HEK_te,0.411656,0.641277,0.603688,0.016635,0.076708,0.063899,0.001353,0.001639,0.005659,0.168875,0.405266,0.360436,0.001172,0.01706,0.018339
6,,,,Muscle_te_log,,,,0.008939,0.049202,0.038811,,,,,,,,,


Unnamed: 0,RNABERT_Test_R2,RNABERT_Test_PearsonR,RNABERT_Test_SpearmanR,data,ESM_MLP_Test_R2,ESM_MLP_Test_PearsonR,ESM_MLP_Test_SpearmanR,RNAFM_ResNet_Test_R2,RNAFM_ResNet_Test_Pearson R,RNAFM_ResNet_Test_Spearman R,Kipoi_Test_R2,Kipoi_Test_PearsonR,Kipoi_Test_SpearmanR,Optimus_Test_R2,Optimus_Test_PearsonR,Optimus_Test_SpearmanR,RNAFM_MLP_Test_R2,RNAFM_MLP_Test_Pearson R,RNAFM_MLP_Test_Spearman R
0,0.4472,0.6646,0.5993,Muscle_rnaseq,0.424789,0.651494,0.615477,0.54527,0.704663,0.631998,0.012774,0.00745,0.009575,0.033868,0.112316,0.152015,0.013441,0.053237,0.052203
1,0.3521,0.5889,0.5606,Muscle_te,0.484754,0.694665,0.665934,0.074334,0.192222,0.177788,0.012941,0.058718,0.057258,0.192694,0.433315,0.414386,0.013665,0.012394,-0.014167
2,0.2771,0.5258,0.4788,pc3_te,0.465518,0.682004,0.630541,0.040445,0.130057,0.113203,0.001694,0.03101,0.026349,0.189364,0.43265,0.380428,0.003728,0.034268,0.03544
3,0.2478,0.4971,0.468,HEK_rnaseq,0.485679,0.694588,0.652459,0.043393,0.202691,0.17178,0.045568,0.209863,0.178606,0.051391,0.214049,0.184561,0.022954,0.147983,0.137854
4,0.2239,0.4725,0.4466,pc3_rnaseq,0.354287,0.594308,0.562893,0.016225,0.115396,0.097705,0.033205,0.169635,0.140317,0.054976,0.22759,0.189629,0.013995,0.108213,0.096017
5,0.2106,0.4588,0.4118,HEK_te,0.411656,0.641277,0.603688,0.016635,0.076708,0.063899,0.001353,0.001639,0.005659,0.168875,0.405266,0.360436,0.001172,0.01706,0.018339
6,,,,Muscle_te_log,,,,0.008939,0.049202,0.038811,,,,,,,,,


In [73]:
metrics_df[[i for i in metrics_df.columns if 'Spearman' in i or i == 'data']]

Unnamed: 0,RNABERT_Test_SpearmanR,data,ESM_MLP_Test_SpearmanR,RNAFM_ResNet_Test_Spearman R,Kipoi_Test_SpearmanR,Optimus_Test_SpearmanR,RNAFM_MLP_Test_Spearman R
0,0.5993,Muscle_rnaseq,0.615477,0.631998,0.009575,0.152015,0.052203
1,0.5606,Muscle_te,0.665934,0.177788,0.057258,0.414386,-0.014167
2,0.4788,pc3_te,0.630541,0.113203,0.026349,0.380428,0.03544
3,0.468,HEK_rnaseq,0.652459,0.17178,0.178606,0.184561,0.137854
4,0.4466,pc3_rnaseq,0.562893,0.097705,0.140317,0.189629,0.096017
5,0.4118,HEK_te,0.603688,0.063899,0.005659,0.360436,0.018339
6,,Muscle_te_log,,0.038811,,,


Unnamed: 0,RNABERT_Test_SpearmanR,data,ESM_MLP_Test_SpearmanR,RNAFM_ResNet_Test_Spearman R,Kipoi_Test_SpearmanR,Optimus_Test_SpearmanR,RNAFM_MLP_Test_Spearman R
0,0.5993,Muscle_rnaseq,0.615477,0.631998,0.009575,0.152015,0.052203
1,0.5606,Muscle_te,0.665934,0.177788,0.057258,0.414386,-0.014167
2,0.4788,pc3_te,0.630541,0.113203,0.026349,0.380428,0.03544
3,0.468,HEK_rnaseq,0.652459,0.17178,0.178606,0.184561,0.137854
4,0.4466,pc3_rnaseq,0.562893,0.097705,0.140317,0.189629,0.096017
5,0.4118,HEK_te,0.603688,0.063899,0.005659,0.360436,0.018339
6,,Muscle_te_log,,0.038811,,,
