In [1]:
import os, pickle, sys, json
import numpy as np 
import pandas as pd
import altair as alt
import json 
import datetime
# Add parent dir to sys.path
sys.path.append(os.path.abspath(".."))
from utils.util import evaluate_model
from utils.Parameters import Parameters

In [3]:
# Train one classifier per training data set, evaluate each of them on a
# single test set, and keep the resulting scores/models in memory.

# parameters for features encoding
parameters = Parameters()
desc_default_para = parameters.DESC_DEFAULT_PARA

# list of training data sets
data_names = ['benbow','islandpick','gicluster','rvm','mergedworvmgicluster']

# test data set ("literature_data.fasta", "benbow_test_data.fasta")
test_data = "literature_data.fasta"

# store evaluation results and models, keyed by training data set name
final_scores = {}
models = {}

# path to training, testing data set, model destination
train_data_path = "../dataset/train_data/"
test_data_path = "../dataset/test_data/"
model_path = "../utils/models/"

# load_model: pass a stored model file to evaluate_model
# save_model: pickle every model returned by evaluate_model
load_model = False
save_model = False

# Neither of these depends on the training set, so compute them once.
test_file = os.path.join(test_data_path, test_data)
# First token of the test file name ("literature" / "benbow"); it tags the
# pickled model file with the test set it was evaluated on.
model_flag = test_data.split('_')[0]

for data_name in data_names:
    print(data_name)

    train_file = os.path.join(train_data_path, "{}_data.fasta".format(data_name))

    # Default configuration: SVM on reverse-complement 7-mers.
    train_params = {
        'representation': 'RCKmer',
        'representation_params': desc_default_para,
        'model': 'SVM',
        'k': 7
    }

    # The rvm data set is the only one using a different model and k.
    if data_name == 'rvm':
        train_params.update({'k': 5, 'model': 'RandomForest'})

    key = '{}_{}_{}'.format(train_params['model'], train_params['representation'], train_params['k'])

    # One model file per (data set, configuration, test set). The same path
    # is used for loading and saving, so a model stored with save_model=True
    # is the one picked up again with load_model=True. Previously every data
    # set loaded one hard-coded benbow pickle from a path that did not match
    # model_path.
    model_file = os.path.join(
        model_path, '{}_{}_fine_tuned_{}.pkl'.format(data_name, key, model_flag)
    )

    # NOTE(review): evaluate_model presumably loads the model found at
    # 'model_path' instead of training from scratch -- confirm in utils.util.
    kwargs = {'model_path': model_file} if load_model else {}

    model, eval_scores = evaluate_model(train_file, test_file, train_params, **kwargs)

    final_scores[data_name] = eval_scores
    models[data_name] = model

    # save model (overwrites model_file if it already exists)
    if save_model:
        with open(model_file, 'wb') as f:
            pickle.dump(model, f)

benbow
representation Train Data with RCKmer-7
representation Test Data with RCKmer-7
Training in progress
Training is done
Testing the model
islandpick
representation Train Data with RCKmer-7
representation Test Data with RCKmer-7
Training in progress
Training is done
Testing the model
gicluster
representation Train Data with RCKmer-7
representation Test Data with RCKmer-7
Training in progress
Training is done
Testing the model
rvm
representation Train Data with RCKmer-5
representation Test Data with RCKmer-5
Training in progress
Training is done
Testing the model
mergedworvmgicluster
representation Train Data with RCKmer-7
representation Test Data with RCKmer-7
Training in progress
Training is done
Testing the model


In [4]:
# Display the evaluation scores gathered by the training loop, keyed by training data set name
final_scores    

{'benbow': {'Accuracy': 0.9923664122137404,
  'Precision': 0.9875,
  'Recall': 0.9875,
  'F_1': 0.9875,
  'F_beta_0.5': 0.9875,
  'F_beta_2': 0.9875,
  'MCC': 0.9820054945054945,
  'TP': 79,
  'TN': 181,
  'FP': 1,
  'FN': 1},
 'islandpick': {'Accuracy': 0.9847328244274809,
  'Precision': 1.0,
  'Recall': 0.95,
  'F_1': 0.9743589743589743,
  'F_beta_0.5': 0.9895833333333334,
  'F_beta_2': 0.9595959595959596,
  'MCC': 0.9641420499455038,
  'TP': 76,
  'TN': 182,
  'FP': 0,
  'FN': 4},
 'gicluster': {'Accuracy': 0.916030534351145,
  'Precision': 0.881578947368421,
  'Recall': 0.8375,
  'F_1': 0.8589743589743589,
  'F_beta_0.5': 0.8723958333333334,
  'F_beta_2': 0.8459595959595959,
  'MCC': 0.7997806449591318,
  'TP': 67,
  'TN': 173,
  'FP': 9,
  'FN': 13},
 'rvm': {'Accuracy': 0.9656488549618321,
  'Precision': 0.9733333333333334,
  'Recall': 0.9125,
  'F_1': 0.9419354838709677,
  'F_beta_0.5': 0.9605263157894737,
  'F_beta_2': 0.9240506329113924,
  'MCC': 0.9185446615775036,
  'TP': 73

In [5]:
# save evaluation results
   
def myconverter(obj):
    """Turn values the ``json`` encoder rejects into plain Python values.

    Meant to be passed as ``default=`` to ``json.dumps`` / ``json.dump``.

    Parameters
    ----------
    obj : object
        A value the JSON encoder could not serialise by itself.

    Returns
    -------
    bool, int, float, list or str
        ``bool``/``int``/``float`` for NumPy boolean, integer and floating
        scalars, a (nested) list for ``numpy.ndarray`` and the ``str()``
        form for ``datetime.datetime``.

    Raises
    ------
    TypeError
        If ``obj`` is of any other type. Falling through and returning
        ``None`` would make the encoder write ``null`` and silently drop
        the value.
    """
    # np.bool_ is not a subclass of np.integer, so it needs its own branch.
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, datetime.datetime):
        return str(obj)
    raise TypeError(
        "Object of type {} is not JSON serializable".format(type(obj).__name__)
    )
    
# Destination of the evaluation report for the literature test set.
json_file = "../outputs/evaluation/classification_literature.json"

# Serialise the scores to a JSON string; NumPy scalars/arrays and datetimes
# are converted by myconverter.
json_obj = json.dumps(final_scores, default=myconverter, indent=1)

# json_obj is already a string, so encoding it again stores a JSON *string*
# that wraps the real document. create_confusion_matrices relies on this
# layout (json.load followed by json.loads), so it is kept as is.
with open(json_file, mode='w') as out_file:
    out_file.write(json.dumps(json_obj, indent=4))


In [6]:
# Show the scores as the JSON text produced by json.dumps in the save cell
print(json_obj)

{
 "benbow": {
  "Accuracy": 0.9923664122137404,
  "Precision": 0.9875,
  "Recall": 0.9875,
  "F_1": 0.9875,
  "F_beta_0.5": 0.9875,
  "F_beta_2": 0.9875,
  "MCC": 0.9820054945054945,
  "TP": 79,
  "TN": 181,
  "FP": 1,
  "FN": 1
 },
 "islandpick": {
  "Accuracy": 0.9847328244274809,
  "Precision": 1.0,
  "Recall": 0.95,
  "F_1": 0.9743589743589743,
  "F_beta_0.5": 0.9895833333333334,
  "F_beta_2": 0.9595959595959596,
  "MCC": 0.9641420499455038,
  "TP": 76,
  "TN": 182,
  "FP": 0,
  "FN": 4
 },
 "gicluster": {
  "Accuracy": 0.916030534351145,
  "Precision": 0.881578947368421,
  "Recall": 0.8375,
  "F_1": 0.8589743589743589,
  "F_beta_0.5": 0.8723958333333334,
  "F_beta_2": 0.8459595959595959,
  "MCC": 0.7997806449591318,
  "TP": 67,
  "TN": 173,
  "FP": 9,
  "FN": 13
 },
 "rvm": {
  "Accuracy": 0.9656488549618321,
  "Precision": 0.9733333333333334,
  "Recall": 0.9125,
  "F_1": 0.9419354838709677,
  "F_beta_0.5": 0.9605263157894737,
  "F_beta_2": 0.9240506329113924,
  "MCC": 0.91854466

In [9]:
# Create confusion matrices of the evaluation results
def _load_eval_scores(json_file):
    """Read the evaluation scores stored in ``json_file`` and return them as a dict."""
    with open(json_file, 'r') as file:
        payload = json.load(file)

    # The save cell of this notebook dumps an already serialised JSON string,
    # so its files contain a JSON string wrapping the real document and need
    # a second decode. Files holding the JSON object directly work as well.
    if isinstance(payload, str):
        payload = json.loads(payload)

    return payload


def _confusion_records(eval_score):
    """Flatten the TP/FN/FP/TN counts into long-form rows, one per matrix cell."""
    # Only these training data sets are plotted; the value is the display name.
    display_names = {
        'benbow': 'Benbow',
        'islandpick': 'IslandPick',
        'gicluster': 'GI-Cluster',
        'rvm': 'RVM',
    }
    # Index 0 is the positive class (GI), index 1 the negative class.
    class_labels = ('GI', 'Non GI')

    records = []
    for name, scores in eval_score.items():
        if name not in display_names:
            continue

        # Rows are the true class, columns the predicted class.
        matrix = [[scores['TP'], scores['FN']],
                  [scores['FP'], scores['TN']]]

        for i, row in enumerate(matrix):
            for j, value in enumerate(row):
                records.append({
                    "Data set": display_names[name],
                    "True Label": class_labels[i],
                    "Predicted Label": class_labels[j],
                    "Value": value
                })

    return records


def create_confusion_matrices(json_file):
    """Build a faceted Altair chart of confusion matrices from a score file.

    Parameters
    ----------
    json_file : str
        Path to a JSON file mapping a data set name to a dict that holds at
        least the keys ``'TP'``, ``'FN'``, ``'TN'`` and ``'FP'``. Both the
        double-encoded layout written by this notebook and a plain JSON
        object are accepted.

    Returns
    -------
    altair chart
        A heatmap with the counts overlaid as text, faceted by data set.
        Only 'benbow', 'islandpick', 'gicluster' and 'rvm' are included.

    Raises
    ------
    KeyError
        If one of the plotted data sets lacks a TP/FN/FP/TN entry.
    """
    eval_score = _load_eval_scores(json_file)

    # Explicit columns keep the frame well-formed even if no data set matched.
    df = pd.DataFrame(
        _confusion_records(eval_score),
        columns=["Data set", "True Label", "Predicted Label", "Value"]
    )

    # Base heatmap
    heatmap = alt.Chart(df).mark_rect().encode(
        x=alt.X("Predicted Label:N", title="Predicted Label", axis=alt.Axis(labelAngle=0 )),
        y=alt.Y("True Label:N", title="True Label"), 
        color=alt.Color("Value:Q", scale=alt.Scale(scheme="greys"), legend=None),#title="Count"),
        tooltip=["Data set", "True Label", "Predicted Label", "Value"]
    ).properties(
        width=150,
        height=150
    )

    # Text overlay
    text = alt.Chart(df).mark_text(baseline="middle", fontSize=12).encode(
        x=alt.X("Predicted Label:N"),
        y=alt.Y("True Label:N"),
        text=alt.Text("Value:Q", format=".0f"),  # Format as integer
        color=alt.condition(
            alt.datum.Value > 20,  # Counts above 20 get white text, the rest black
            alt.value("white"),
            alt.value("black")
        )
    )

    # Combine heatmap and text
    # NOTE(review): Vega-Lite documents `columns` as applying to wrapped
    # facets only; with a `column=` mapping it may be ignored, putting all
    # data sets in a single row -- confirm against the rendered chart.
    combined = (heatmap + text).facet(
        column=alt.Column("Data set:N", sort=['Benbow','IslandPick','RVM','GI-Cluster'], title="Data set"),
        columns=2,
        title="Confusion Matrices"
    )

    return combined

# Score files: Benbow test set first, literature test set second.
json_file_benbow_test = "../outputs/evaluation/classification_benbow_test.json"
json_file_benbow_lit = "../outputs/evaluation/classification_literature.json"

# Build one faceted confusion-matrix chart per score file, in that order.
conf_mat_benbow_test, conf_mat_benbow_lit = (
    create_confusion_matrices(score_file)
    for score_file in (json_file_benbow_test, json_file_benbow_lit)
)

# Stack both charts vertically; as the last expression it is what the cell displays.
alt.vconcat(conf_mat_benbow_test, conf_mat_benbow_lit)
