## TP Dataset Split

In [None]:
import os
import pandas as pd
from Bio import motifs
from Bio import SeqIO
import matplotlib.pyplot as plt
import re
# Set matplotlib's global font size to 14 for every figure drawn afterwards.
plt.rcParams.update({'font.size': 14})

In [None]:
def _count_motif_hits(fasta_file, patterns):
    """Return ``(total, hits)`` for one FASTA file.

    ``total`` is the number of records in *fasta_file*; ``hits`` is how many
    of them are matched somewhere by at least one compiled pattern.
    """
    total = 0
    hits = 0
    for rec in SeqIO.parse(fasta_file, "fasta"):
        total += 1
        seq = str(rec.seq)
        # any() stops at the first matching motif, like the original `break`.
        if any(pattern.search(seq) for pattern in patterns):
            hits += 1
    return total, hits


def get_statistics(p_file, n_file, motif_list):
    """Score a motif set as a presence/absence classifier on two FASTA files.

    A sequence is predicted positive when at least one motif in
    ``motif_list`` matches anywhere in it (case-insensitive).  Every motif
    character is looked up in ``iupac`` and turned into a regex character
    class, so a motif becomes a sequence of ``[...]`` classes.

    NOTE(review): ``iupac`` is not defined in the visible part of this
    notebook; it has to exist as a global mapping (motif character -> string
    of allowed characters) before this function is called -- confirm where
    it comes from.

    Args:
        p_file: Path of the FASTA file whose records count as positives.
        n_file: Path of the FASTA file whose records count as negatives.
        motif_list: Iterable of motif strings.

    Returns:
        Tuple ``(ACC, TPR, TNR)``.  A ratio whose denominator is zero
        (empty positive file, empty negative file, or both) is reported as
        1.0, the convention the original code already used for TNR.

    Raises:
        KeyError: If a motif contains a character missing from ``iupac``.

    Side effects:
        Prints the raw counts ``P``, ``TP``, ``N`` and ``FP``.
    """
    # Compile each motif once instead of once per (record, motif) pair.
    patterns = [
        re.compile("".join(f"[{iupac[cc]}]" for cc in motif), re.IGNORECASE)
        for motif in motif_list
    ]

    P, TP = _count_motif_hits(p_file, patterns)
    N, FP = _count_motif_hits(n_file, patterns)

    # ACC, TPR, TNR
    TN = N - FP
    total = P + N
    # Guard every denominator: an empty FASTA file must not raise
    # ZeroDivisionError (previously only N was guarded).
    ACC = (TP + TN) / total if total > 0 else 1.0
    TPR = TP / P if P > 0 else 1.0
    TNR = TN / N if N > 0 else 1.0
    print(f'P={P}, TP={TP}, N={N}, FP={FP}')
    return ACC, TPR, TNR

In [None]:
# FASTA files passed to get_statistics() as the positive (p_file) and
# negative (n_file) evaluation sets.
p_file, n_file = (
    'artificial_Riv19/Riv19_TP_test.fasta',
    'artificial_Riv19/Riv19_FP_test.fasta',
)

In [None]:
# Evaluate the cisFinder motifs belonging to each FASTA file in
# 'artificial_Riv19/' on the test pair (p_file / n_file) and collect the
# scores in df_cisFinder.
filename_list = list()
pwm_postprocess = list()  # not used by any visible cell; kept so the notebook globals stay the same
acc_list = list()
tpr_list = list()
tnr_list = list()

# os.listdir() yields bare file names, whereas p_file / n_file include the
# directory prefix.  Compare by basename, otherwise the test files are never
# skipped.
test_basenames = {os.path.basename(p_file), os.path.basename(n_file)}

# os.listdir() returns entries in arbitrary order; iterate in sorted order so
# the rows pair up deterministically with the fixed 'TPR in Dataset' column.
# NOTE(review): assumes sorted file names follow increasing TPR (0.1 ... 1.0)
# -- confirm against the naming scheme.
for filename in sorted(os.listdir('artificial_Riv19/')):
    if filename in test_basenames:
        continue
    filename_list.append(filename)
    # filename[:-6] drops the last six characters (the '.fasta' extension).
    motif_list = pd.read_csv('../../artificial_Riv19_motifs/cisFinder/cisFinder_' + filename[:-6] + '.csv').motif

    acc, tpr, tnr = get_statistics(p_file, n_file, motif_list)
    acc_list.append(acc)
    tpr_list.append(tpr)
    tnr_list.append(tnr)

# The TPR column is hard-coded, so exactly ten files must have been processed.
df_cisFinder = pd.DataFrame({'filename' : filename_list,
                             'TPR in Dataset' : [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0],
                             'Accuracy' : acc_list,
                             'TPR' : tpr_list,
                             'TNR' : tnr_list})

In [None]:
# Evaluate the MEME-ChIP motifs belonging to each FASTA file in
# 'artificial_Riv19/' on the test pair (p_file / n_file) and collect the
# scores in df_MEME_ChIP.
filename_list = list()
pwm_postprocess = list()  # not used by any visible cell; kept so the notebook globals stay the same
acc_list = list()
tpr_list = list()
tnr_list = list()

# os.listdir() yields bare file names, whereas p_file / n_file include the
# directory prefix.  Compare by basename, otherwise the test files are never
# skipped.
test_basenames = {os.path.basename(p_file), os.path.basename(n_file)}

# os.listdir() returns entries in arbitrary order; iterate in sorted order so
# the rows pair up deterministically with the fixed 'TPR in Dataset' column.
# NOTE(review): assumes sorted file names follow increasing TPR (0.1 ... 1.0)
# -- confirm against the naming scheme.
for filename in sorted(os.listdir('artificial_Riv19/')):
    if filename in test_basenames:
        continue
    filename_list.append(filename)
    # filename[:-6] drops the last six characters (the '.fasta' extension).
    motif_list = pd.read_csv('../../artificial_Riv19_motifs/MEME_ChIP/MEME_ChIP_' + filename[:-6] + '.csv').motif

    acc, tpr, tnr = get_statistics(p_file, n_file, motif_list)
    acc_list.append(acc)
    tpr_list.append(tpr)
    tnr_list.append(tnr)

# The TPR column is hard-coded, so exactly ten files must have been processed.
df_MEME_ChIP = pd.DataFrame({'filename' : filename_list,
                             'TPR in Dataset' : [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0],
                             'Accuracy' : acc_list,
                             'TPR' : tpr_list,
                             'TNR' : tnr_list})

In [None]:
# Evaluate the vCNN motifs belonging to each FASTA file in
# 'artificial_Riv19/' on the test pair (p_file / n_file) and collect the
# scores in df_vCNN.
filename_list = list()
pwm_postprocess = list()  # not used by any visible cell; kept so the notebook globals stay the same
acc_list = list()
tpr_list = list()
tnr_list = list()

# os.listdir() yields bare file names, whereas p_file / n_file include the
# directory prefix.  Compare by basename, otherwise the test files are never
# skipped.
test_basenames = {os.path.basename(p_file), os.path.basename(n_file)}

# os.listdir() returns entries in arbitrary order; iterate in sorted order so
# the rows pair up deterministically with the fixed 'TPR in Dataset' column.
# NOTE(review): assumes sorted file names follow increasing TPR (0.1 ... 1.0)
# -- confirm against the naming scheme.
for filename in sorted(os.listdir('artificial_Riv19/')):
    if filename in test_basenames:
        continue
    filename_list.append(filename)
    # filename[:-6] drops the last six characters (the '.fasta' extension).
    motif_list = pd.read_csv('../../artificial_Riv19_motifs/vCNN/vCNN_' + filename[:-6] + '.csv').motif

    acc, tpr, tnr = get_statistics(p_file, n_file, motif_list)
    acc_list.append(acc)
    tpr_list.append(tpr)
    tnr_list.append(tnr)

# The TPR column is hard-coded, so exactly ten files must have been processed.
df_vCNN = pd.DataFrame({'filename' : filename_list,
                        'TPR in Dataset' : [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0],
                        'Accuracy' : acc_list,
                        'TPR' : tpr_list,
                        'TNR' : tnr_list})

In [None]:
# Line plot of Accuracy / TPR / TNR from df_cisFinder over the
# 'TPR in Dataset' column.
ax = df_cisFinder.plot(
    x='TPR in Dataset',
    y=['Accuracy', 'TPR', 'TNR'],
    zorder=10,
    color=['C0', 'C2', 'C1'],
)
# Small margin around the [0, 1] range on both axes.
ax.set(
    xlim=(-0.02, 1.02),
    ylim=(-0.02, 1.02),
    xlabel='TPR in Training Dataset',
    title='cisFinder',
)
# This panel is shown without a legend: build it, then remove it.
legend = ax.legend()
legend.remove()
plt.show()

In [None]:
# Line plot of Accuracy / TPR / TNR from df_MEME_ChIP over the
# 'TPR in Dataset' column.
ax = df_MEME_ChIP.plot(
    x='TPR in Dataset',
    y=['Accuracy', 'TPR', 'TNR'],
    zorder=10,
    color=['C0', 'C2', 'C1'],
)
# Small margin around the [0, 1] range on both axes.
ax.set(
    xlim=(-0.02, 1.02),
    ylim=(-0.02, 1.02),
    xlabel='TPR in Training Dataset',
    title='MEME-ChIP',
)
# This panel is shown without a legend: build it, then remove it.
legend = ax.legend()
legend.remove()
plt.show()

In [None]:
# Line plot of Accuracy / TPR / TNR from df_vCNN over the
# 'TPR in Dataset' column.
ax = df_vCNN.plot(
    x='TPR in Dataset',
    y=['Accuracy', 'TPR', 'TNR'],
    zorder=10,
    color=['C0', 'C2', 'C1'],
)
# Small margin around the [0, 1] range on both axes.
ax.set(
    xlim=(-0.02, 1.02),
    ylim=(-0.02, 1.02),
    xlabel='TPR in Training Dataset',
    title='vCNN',
)
# This panel keeps its legend, anchored just outside the right edge of the axes.
ax.legend(
    loc='center left',
    bbox_to_anchor=(1, 0.5),
    labels=['Accuracy', 'TPR', 'TNR'],
    title="Performance on\n Test Dataset:",
)
plt.show()