In [36]:
import numpy as np
from scipy import stats
import pandas as pd
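'''
Significance markers:
If p-value < 0.001, return four asterisks (****): extremely significant;
If p-value < 0.01, return three asterisks (***): highly significant;
If p-value < 0.05, return two asterisks (**): significant;
If p-value < 0.1, return one asterisk (*): approaching significance;
If p-value >= 0.1, return no asterisk: not significant.
'''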
def pairedT(before, after):
    """Run a paired-samples t-test, print a one-line summary and return it.

    Args:
        before: First sample, passed as the first argument to ``stats.ttest_rel``.
        after: Second sample, passed as the second argument to ``stats.ttest_rel``.

    Returns:
        list: ``[t_statistic, p_value, stars]`` where ``stars`` is the marker
        that ``p_value_star`` gives for the p-value.
    """
    outcome = stats.ttest_rel(before, after)
    t_statistic = outcome.statistic
    p_value = outcome.pvalue
    # Look the marker up once and reuse it for both the printout and the result.
    stars = p_value_star(p_value)
    summary = f"t = {t_statistic:.4f} | p-value = {p_value:.4f} ｜ {stars}"
    print(summary)
    return [t_statistic, p_value, stars]
def p_value_star(p_value):
    """Translate a p-value into an asterisk significance marker.

    The cut-offs are strict upper bounds checked from the smallest upwards:
    below 0.001 gives '****', below 0.01 gives '***', below 0.05 gives '**'
    and below 0.1 gives '*'. Any value that is not below 0.1 (NaN included,
    because every comparison with NaN is False) gives an empty string.
    """
    cutoffs = (
        (0.001, '****'),
        (0.01, '***'),
        (0.05, '**'),
        (0.1, '*'),
    )
    for upper_bound, stars in cutoffs:
        if p_value < upper_bound:
            return stars
    return ''

# Figure 2

In [70]:
# Lookup table from method name to that method's list of scores.
# NOTE(review): in the visible part of this notebook the score lists named here
# are assigned only in a later cell, so on a fresh kernel that cell has to be
# run before this one or the lookup below raises NameError.
d = dict(
    UTRsemiLM=UTRsemiLM,
    RNAFM_ResNet=RNAFM_ResNet,
    FramePool=FramePool,
    Optimus=Optimus,
    RNAFM_MLP=RNAFM_MLP,
    RNABERT_MLP=RNABERT_MLP,
    Cao_RF=Cao_RF,
)
# Method names in the insertion order of `d`.
methods = list(d)

In [130]:
# Score lists for the methods compared in this cell. Each of the five lists
# below holds 28 values; the paired tests match them up position by position.
# NOTE(review): presumably one score per evaluation dataset, in a fixed dataset
# order shared by all methods -- confirm against the source of these numbers.
UTRsemiLM = [0.96,0.92,0.91,0.9,0.88,0.89,0.78,0.84,0.93,0.88,0.9,0.9,0.88,0.89,0.83,0.74,0.88,0.72,0.9,0.89,0.59,0.65,0.67,0.63,0.6,0.62,0.56,0.65]
RNAFM_ResNet = [0.96,0.92,0.9,0.9,0.87,0.89,0.78,0.77,0.92,0.88,0.9,0.9,0.87,0.89,0.83,0.7,0.86,0.69,0.87,0.87,0.54,0.62,0.18,0.11,0.06,0.63,0.1,0.17]
FramePool = [0.94,0.9,0.88,0.88,0.86,0.86,0.75,0.78,0.9,0.85,0.88,0.88,0.86,0.87,0.81,0.69,0.85,0.69,0.86,0.85,0.53,0.63,0.06,0.03,0.01,0.01,0.14,0.18]
Optimus = [0.95,0.9,0.84,0.82,0.82,0.84,0.69,0.78,0.92,0.86,0.84,0.82,0.82,0.84,0.76,0.7,0.87,0.71,0.84,0.82,0.56,0.64,0.41,0.38,0.36,0.15,0.19,0.18]
RNAFM_MLP = [0.52,0.5,0.47,0.51,0.46,0.5,0.39,0.42,0.49,0.46,0.48,0.51,0.47,0.5,0.44,0.38,0.45,0.38,0.46,0.49,0.28,0.35,-0.01,0.04,0.02,0.05,0.1,0.14]

# Shorter lists: Cao_RF has 6 values and RNABERT_MLP has 7. The pairwise loop
# later in this cell pairs Cao_RF with the last six entries of a 28-value list
# and RNABERT_MLP with the first entry followed by the last six.
Cao_RF = [0.63,0.63,0.55,0.64,0.59,0.57] # cao te/rnaseq
RNABERT_MLP = [0.18,0.56,0.48,0.41,0.6,0.45,0.47] # unmod1; cao te/rnaseq

# Test UTRsemiLM against each of the four 28-value baselines in turn;
# pairedT prints one summary line per comparison.
for baseline in (RNAFM_ResNet, FramePool, Optimus, RNAFM_MLP):
    pairedT(UTRsemiLM, baseline)

# Rebuild the name -> scores mapping from the lists assigned above in this cell,
# so the comparisons never run on a mapping captured by an earlier cell before
# the lists were (re)assigned.
d = {
    'UTRsemiLM': UTRsemiLM,
    'RNAFM_ResNet': RNAFM_ResNet,
    'FramePool': FramePool,
    'Optimus': Optimus,
    'RNAFM_MLP': RNAFM_MLP,
    'RNABERT_MLP': RNABERT_MLP,
    'Cao_RF': Cao_RF,
}
methods = list(d)

# Methods whose score list is shorter than the others, mapped to a selector that
# picks the matching entries out of a full-length list.
subset_of = {
    'RNABERT_MLP': lambda scores: [scores[0]] + scores[-6:],  # first entry + last six
    'Cao_RF': lambda scores: scores[-6:],                     # last six entries
}
# Only full-length methods can be the second method of a pair, because the
# selectors above assume a full-length list to slice.
full_methods = [name for name in methods if name not in subset_of]

rows = []
for p1 in methods:
    for p2 in full_methods:
        if p1 == p2:
            continue
        print(p1, p2, end = ':')
        reference = subset_of[p1](d[p2]) if p1 in subset_of else d[p2]
        rows.append([p1, p2] + pairedT(d[p1], reference))

df = pd.DataFrame(rows, columns = ['Method_1', 'Method_2', 't_statistic', 'p_value', 'significance'])
df['p'] = df['p_value'].round(4)

# Every pair of full-length methods is tested in both directions (A vs B and
# B vs A). Keep only the first row of each unordered pair. Matching on the
# method names, rather than on equal p-values, cannot merge two different pairs
# that happen to share a p-value (or both have a NaN p-value).
seen_pairs = set()
keep_row = []
for pair in zip(df['Method_1'], df['Method_2']):
    key = frozenset(pair)
    keep_row.append(key not in seen_pairs)
    seen_pairs.add(key)
df = df[keep_row]
df

t = 2.7944 | p-value = 0.0095 ｜ ***
t = 3.3940 | p-value = 0.0021 ｜ ***
t = 4.3207 | p-value = 0.0002 ｜ ****
t = 27.1188 | p-value = 0.0000 ｜ ****
UTRsemiLM RNAFM_ResNet:t = 2.7944 | p-value = 0.0095 ｜ ***
UTRsemiLM FramePool:t = 3.3940 | p-value = 0.0021 ｜ ***
UTRsemiLM Optimus:t = 4.3207 | p-value = 0.0002 ｜ ****
UTRsemiLM RNAFM_MLP:t = 27.1188 | p-value = 0.0000 ｜ ****
RNAFM_ResNet UTRsemiLM:t = -2.7944 | p-value = 0.0095 ｜ ***
RNAFM_ResNet FramePool:t = 1.8574 | p-value = 0.0742 ｜ *
RNAFM_ResNet Optimus:t = 0.3748 | p-value = 0.7108 ｜ 
RNAFM_ResNet RNAFM_MLP:t = 12.3944 | p-value = 0.0000 ｜ ****
FramePool UTRsemiLM:t = -3.3940 | p-value = 0.0021 ｜ ***
FramePool RNAFM_ResNet:t = -1.8574 | p-value = 0.0742 ｜ *
FramePool Optimus:t = -1.3896 | p-value = 0.1760 ｜ 
FramePool RNAFM_MLP:t = 10.0717 | p-value = 0.0000 ｜ ****
Optimus UTRsemiLM:t = -4.3207 | p-value = 0.0002 ｜ ****
Optimus RNAFM_ResNet:t = -0.3748 | p-value = 0.7108 ｜ 
Optimus FramePool:t = 1.3896 | p-value = 0.1760 ｜ 
Optimu

Unnamed: 0,Method_1,Method_2,t_statistic,p_value,significance,p
0,UTRsemiLM,RNAFM_ResNet,2.79438,0.009450846,***,0.0095
1,UTRsemiLM,FramePool,3.393972,0.002143048,***,0.0021
2,UTRsemiLM,Optimus,4.320714,0.0001885747,****,0.0002
3,UTRsemiLM,RNAFM_MLP,27.11876,4.010995e-21,****,0.0
5,RNAFM_ResNet,FramePool,1.857355,0.07420035,*,0.0742
6,RNAFM_ResNet,Optimus,0.374771,0.7107583,,0.7108
7,RNAFM_ResNet,RNAFM_MLP,12.394383,1.17991e-12,****,0.0
10,FramePool,Optimus,-1.389551,0.1760185,,0.176
11,FramePool,RNAFM_MLP,10.07171,1.21777e-10,****,0.0
15,Optimus,RNAFM_MLP,17.686179,2.242907e-16,****,0.0


In [131]:
# Reshape the pairwise results into a grid with Method_1 as rows and Method_2 as
# columns, filled with the rounded p-values from column 'p'. Combinations that
# have no row in `df` appear as NaN.
df_pivot = df.pivot(index='Method_1', columns='Method_2', values='p')
df_pivot

Method_2,FramePool,Optimus,RNAFM_MLP,RNAFM_ResNet,UTRsemiLM
Method_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Cao_RF,0.0,0.0012,0.0,0.0041,0.3098
FramePool,,0.176,0.0,,
Optimus,,,0.0,,
RNABERT_MLP,0.1933,0.6312,0.0309,0.4383,0.0613
RNAFM_ResNet,0.0742,0.7108,0.0,,
UTRsemiLM,0.0021,0.0002,0.0,0.0095,


In [132]:
# Grid with Method_1 as rows and Method_2 as columns, filled with the asterisk
# markers from column 'significance'; combinations with no row in `df` are NaN.
df.pivot(index='Method_1', columns='Method_2', values='significance')

Method_2,FramePool,Optimus,RNAFM_MLP,RNAFM_ResNet,UTRsemiLM
Method_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Cao_RF,****,***,****,***,
FramePool,,,****,,
Optimus,,,****,,
RNABERT_MLP,,,**,,*
RNAFM_ResNet,*,,****,,
UTRsemiLM,***,****,****,***,
