# Generation of tables and figures

In [1]:
import pickle

v_acc_d = {}

f = open('results/RfamNovel_constant.pckl', 'rb')
v_acc_d.update({'Constant' : pickle.load(f)})
f.close()

f = open('results/RfamNovel_random.pckl', 'rb')
v_acc_d.update({'Random' : pickle.load(f)})
f.close()

f = open('results/RfamNovel_new.pckl', 'rb')
v_acc_d.update({'New' : pickle.load(f)})
f.close()

import numpy as np
from tensorflow import keras
from sklearn import preprocessing


train_labels=np.load("train_labels.npy")
val_labels=np.load("val_labels.npy")
test_labels=np.load("test_labels.npy")

num_classes = len(np.unique(train_labels))

le = preprocessing.LabelEncoder()
le.fit(train_labels)
train_labels_num = le.transform(train_labels)
val_labels_num = le.transform(val_labels)
test_labels_num = le.transform(test_labels)

train_labels_bin = keras.utils.to_categorical(train_labels_num, num_classes)
val_labels_bin = keras.utils.to_categorical(val_labels_num, num_classes)
test_labels_bin = keras.utils.to_categorical(test_labels_num, num_classes)





### Plots the accuracy/MCC vs boundary noise

In [2]:
# plots accuracy or MCC VS bnoise
from sklearn.metrics import *
from ExpConfiguration import *
from math import sqrt

import matplotlib.pyplot as plt
y_true = np.argmax(test_labels_bin, axis=1)

# CHANGE HERE plot parameters
nl=3  # cnn layer to plot
#padd = 'Constant'
#padd = 'Random'
padd = 'New' 
#pmetric = 'MCC'
#pmetricf = matthews_corrcoef
pmetric = 'ACC'
pmetricf = accuracy_score

v_acc = v_acc_d[padd]
fig, ax = plt.subplots()
plt.xlabel('Boundary noise')
plt.ylabel(pmetric)
#plt.title('CNN n. layers = '+str(nl))
plt.ylim(0.5, 1) 
for en in seqEncoders:
    mtr=[]
    mtrErr=[]
    for bn in bnoise:
        y_pred = v_acc[str(nl)][en['filename']][str(bn)]
        #print('%.3f' % interval)
        mtr.append(pmetricf(y_true,y_pred))
        mtrErr.append(1.96 * sqrt( (pmetricf(y_true,y_pred) * (1 - pmetricf(y_true,y_pred))) / len(y_pred)))
        #print(en['filename'],bn,matthews_corrcoef(y_true,y_pred),accuracy_score(y_true,y_pred))
            
    ax.plot(bnoise, mtr, label=en['filename'],marker='o',markersize=3)
    ax.errorbar(bnoise, mtr, yerr=mtrErr)

# add Eden results
mtr=[]
for bn in bnoise:
    y_pred = np.loadtxt('eden/test_pred_eden_'+str(bn)+'.txt',dtype='str')
    y_true = np.loadtxt('eden/test_labels_eden_'+str(bn)+'.txt',dtype='str')
    mtr.append(pmetricf(y_true,y_pred))
    #mtr.append(accuracy_score(y_true,y_pred))
    #print('EdeN',bn,matthews_corrcoef(y_true,y_pred),accuracy_score(y_true,y_pred))

ax.plot(bnoise, mtr, label='EdeN',marker='o',markersize=3)
# add nRC results
mtr=[]
for bn in bnoise:
    y_pred = np.loadtxt('nrc/test_pred_nrc_'+str(bn)+'.txt',dtype='str')
    y_true = np.loadtxt('nrc/test_labels_nrc_'+str(bn)+'.txt',dtype='str')
    mtr.append(pmetricf(y_true,y_pred))
    #print('nRC',bn,matthews_corrcoef(y_true,y_pred),accuracy_score(y_true,y_pred))


ax.plot(bnoise, mtr, label='nRC',marker='o',markersize=3)
ax.legend()
ax.grid(linestyle='--')
plt.grid(True)
plt.savefig('figs/plot_bnoise-m'+pmetric+'_nl'+str(nl)+'_p'+padd+'.pdf')




### Plots accuracy with different padding schemas

In [None]:
from sklearn.metrics import *
from ExpConfiguration import *
import matplotlib.pyplot as plt
import pandas as pd

# CHANGE HERE plot parameters
nl=3  # cnn layer to plot
bn = 0 # boundary noise
#pmetric = 'MCC'
#pmetricf = matthews_corrcoef
pmetric = 'ACC'
pmetricf = accuracy_score


y_true = np.argmax(test_labels_bin, axis=1)
index = ['New', 'Constant', 'Random']
cols = {}
colsErr = {}
for en in seqEncoders:
    y_pred = v_acc_d['New'][str(nl)][en['filename']][str(bn)]
    pnew = pmetricf(y_true,y_pred)
    pnewInt = 1.96 * sqrt( (pnew * (1 - pnew)) / len(y_pred))

    y_pred = v_acc_d['Constant'][str(nl)][en['filename']][str(bn)]
    pcns = pmetricf(y_true,y_pred)
    pcnsInt = 1.96 * sqrt( (pcns * (1 - pcns)) / len(y_pred))

    y_pred = v_acc_d['Random'][str(nl)][en['filename']][str(bn)]
    prnd = pmetricf(y_true,y_pred)
    prndInt = 1.96 * sqrt( (prnd * (1 - prnd)) / len(y_pred))

    errors = [pnewInt,pcnsInt,prndInt]
    performance = [pnew,pcns,prnd]
    cols.update({en['filename'] : performance})
    colsErr.update({en['filename'] : errors})



df = pd.DataFrame(cols, index=index)
dfErr = pd.DataFrame(colsErr, index=index)
ax = df.plot.bar(rot=0,ylim=(0.5,1),yerr=dfErr)
ax.grid(linestyle='--')
plt.grid(True)
ax.legend(loc='lower left')
#plt.title('Input padding symbol')
plt.ylabel('ACC')
plt.savefig('figs/plot-padding.pdf')




### Plots accuracy/MCC with different CNN n. of layers

In [None]:
from sklearn.metrics import *
from ExpConfiguration import *
import matplotlib.pyplot as plt
import pandas as pd

# CHANGE HERE plot parameters
bn = 0 # boundary noise
padd = 'New'  # padding to plot
#pmetric = 'MCC'
#pmetricf = matthews_corrcoef
pmetric = 'ACC'
pmetricf = accuracy_score


y_true = np.argmax(test_labels_bin, axis=1)
index = ['0', '1', '2','3']
cols = {}
for en in seqEncoders:
    y_pred = v_acc[str(0)][en['filename']][str(bn)]
    p0 = pmetricf(y_true,y_pred)
    p0E = 1.96 * sqrt( (p0 * (1 - p0)) / len(y_pred))
    y_pred = v_acc[str(1)][en['filename']][str(bn)]
    p1 = pmetricf(y_true,y_pred)
    p1E = 1.96 * sqrt( (p1 * (1 - p1)) / len(y_pred))

    y_pred = v_acc[str(2)][en['filename']][str(bn)]
    p2 = pmetricf(y_true,y_pred)
    p2E = 1.96 * sqrt( (p2 * (1 - p2)) / len(y_pred))

    y_pred = v_acc[str(3)][en['filename']][str(bn)]
    p3 = pmetricf(y_true,y_pred)
    p3E = 1.96 * sqrt( (p3 * (1 - p3)) / len(y_pred))

    
    performance = [p0,p1,p2,p3]
    errors = [p0E,p1E,p2E,p3E]
    cols.update({en['filename'] : performance})
    colsErr.update({en['filename'] : errors})



df = pd.DataFrame(cols, index=index)
dfErr = pd.DataFrame(colsErr, index=index)
ax = df.plot.bar(rot=0,ylim=(0.5,1),yerr=dfErr)
ax.grid(linestyle='--')
plt.grid(True)
ax.legend(loc='upper left')

#plt.title('CNN number of layers')
plt.ylabel('ACC')
plt.xlabel('CNN n. of layers')
plt.savefig('figs/plot-cnnlayers.pdf')



## Generates tables with precisions, recalls, and F1-measures

In [7]:
# tables precision recall f1 and macro/weighted averages 
# at certain bnoise and n CNN layers 

from sklearn.metrics import *
import pandas as pd
import numpy as np
from ExpConfiguration import *

# CHANGE HERE plot parameters
nl=3  # cnn layer 
bn = 0 # boundary noise
padd = 'New'  # padding to plot


v_acc = v_acc_d[padd]
y_true = np.argmax(test_labels_bin, axis=1)
y_true = le.inverse_transform(y_true)

dfs = {}
for en in seqEncoders:
    y_pred = v_acc[str(nl)][en['filename']][str(bn)]
    y_pred = le.inverse_transform(y_pred)
    cr = classification_report(y_true,y_pred,output_dict=True,digits=2)
    df1 = pd.DataFrame(cr).drop(index=['support'])
    df1.drop(columns=['accuracy'])
    df1.index = ['P','R','F1']
    df1=df1.drop(index='P')
    df1=df1.drop(index='R')
    dfs[en['filename']] = df1

y_pred = np.loadtxt('eden/test_pred_eden_'+str(bn)+'.txt',dtype='str')
y_true = np.loadtxt('eden/test_labels_eden_'+str(bn)+'.txt',dtype='str')
cr = classification_report(y_true,y_pred,output_dict=True,digits=2)
df1 = pd.DataFrame(cr).drop(index=['support'])
df1.drop(columns=['accuracy'])
df1.index = ['P','R','F1']
df1=df1.drop(index='P')
df1=df1.drop(index='R')
dfs['EdeN'] = df1

y_pred = np.loadtxt('nrc/test_pred_nrc_'+str(bn)+'.txt',dtype='str')
y_true = np.loadtxt('nrc/test_labels_nrc_'+str(bn)+'.txt',dtype='str')
cr = classification_report(y_true,y_pred,output_dict=True,digits=2)
df1 = pd.DataFrame(cr) #.drop(index=['support'])
df1.drop(columns=['accuracy'])
df1.index = ['P','R','F1','Class size']
df1=df1.drop(index='P')
df1=df1.drop(index='R')
dfs['nRC'] = df1

df = pd.concat(dfs)
df = df.transpose()
df.astype({('nRC','Class size'): 'int32'})

with open('tables/prf-table_bn'+str(bn)+'_nl'+str(nl)+'_p'+padd+'.tex','w') as tf:
    tf.write(df.to_latex(float_format="{:0.2f}".format))

df


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Unnamed: 0_level_0,1mer,2mer,3mer,EdeN,Hilbert,Morton,Snake,nRC,nRC
Unnamed: 0_level_1,F1,F1,F1,F1,F1,F1,F1,F1,Class size
RF00001,0.897066,0.787957,0.899382,0.841225,0.764539,0.772002,0.878281,0.846629,3886.0
RF00005,0.911015,0.94514,0.942117,0.928107,0.892441,0.940821,0.899352,0.942399,2309.0
RF00015,0.864253,0.909502,0.832579,0.914027,0.841629,0.710407,0.687783,0.936652,221.0
RF00016,0.80597,0.977612,0.955224,0.58209,0.395522,0.350746,0.858209,0.738806,134.0
RF00019,0.982456,0.980263,0.991228,0.997807,0.97807,0.984649,0.991228,0.991228,456.0
RF00020,0.313953,0.348837,0.453488,0.963415,0.267442,0.22093,0.186047,0.987805,82.0
RF00026,0.984004,0.995587,0.98952,0.972421,0.957529,0.928847,0.971318,0.968009,1813.0
RF00029,0.4,0.6,0.4,1.0,0.2,0.0,0.0,1.0,5.0
RF00050,0.997354,1.0,0.994709,0.997354,0.986772,0.989418,0.989418,0.989418,378.0
RF00059,0.983985,0.975016,0.975016,0.955798,0.969891,0.973094,0.966047,0.976938,1561.0


In [5]:
dfs

{'3mer':      RF00001   RF00005   RF00015   RF00016   RF00019   RF00020   RF00026  \
 P   0.942557  0.941100  0.904177  0.941176  0.976242  0.428571  0.967638   
 R   0.990085  0.940086  0.989247  0.927536  0.961702  0.406250  0.946702   
 F1  0.899382  0.942117  0.832579  0.955224  0.991228  0.453488  0.989520   
 
      RF00029   RF00050   RF00059  ...   RF00906   RF01055   RF01059   RF01705  \
 P   0.137931  0.986877  0.968193  ...  0.791367  0.862903  0.947368  0.724191   
 R   0.083333  0.979167  0.961466  ...  0.859375  0.781022  0.900000  0.955285   
 F1  0.400000  0.994709  0.975016  ...  0.733333  0.963964  1.000000  0.583127   
 
      RF01725   RF01739   RF01942  accuracy  macro avg  weighted avg  
 P   0.840764  0.456522  0.963253  0.929663   0.805788      0.931862  
 R   0.725275  0.295775  0.937595  0.929663   0.776302      0.940760  
 F1  1.000000  1.000000  0.990354  0.929663   0.875073      0.929663  
 
 [3 rows x 32 columns],
 '2mer':      RF00001   RF00005   RF00015 

# Results with RNAGCN/nRC dataset and improved architecture

In [None]:
import pickle

v_acc_d = {}

f = open('results/RNAGCN_nRC_ModelImproved_new.pckl', 'rb')
v_acc_d.update({'Improved' : pickle.load(f)})
f.close()

f = open('results/RNAGCN_nRC_new.pckl', 'rb')
v_acc_d.update({'Standard' : pickle.load(f)})
f.close()

import numpy as np
from tensorflow import keras
from sklearn import preprocessing

train_labels=np.load("dataset_nRC_train_labels.npy")
test_labels=np.load("dataset_nRC_test_labels.npy")

num_classes = len(np.unique(train_labels))
print('Total classes: ',num_classes)

le = preprocessing.LabelEncoder()
le.fit(train_labels)
train_labels_num = le.transform(train_labels)
test_labels_num = le.transform(test_labels)

train_labels_bin = keras.utils.to_categorical(train_labels_num, num_classes)
test_labels_bin = keras.utils.to_categorical(test_labels_num, num_classes)


In [None]:
from sklearn.metrics import *
from ExpConfiguration import *
import matplotlib.pyplot as plt
import pandas as pd

# CHANGE HERE plot parameters
bn = 0 # boundary noise
padd = 'New'  # padding to plot
#pmetric = 'MCC'
#pmetricf = matthews_corrcoef
pmetric = 'ACC'
pmetricf = accuracy_score


y_true = np.argmax(test_labels_bin, axis=1)
index = ['Standard', 'Improved']
cols = {}
for en in seqEncoders:
    y_pred = v_acc_d['Standard'][en['filename']][str(bn)]
    p0 = pmetricf(y_true,y_pred)
    y_pred = v_acc_d['Improved'][en['filename']][str(bn)]
    p1 = pmetricf(y_true,y_pred)
    
    performance = [p0,p1]
    cols.update({en['filename'] : performance})



df = pd.DataFrame(cols, index=index)



In [None]:
df