## Read Cross Validation Results

In [1]:
import numpy as np 
import pandas as pd
import altair as alt
import pickle
from os import listdir
from os.path import isfile, join
from utils.util import read_results, read_eval_result

alt.data_transformers.disable_max_rows()
#alt.renderers.enable('mimetype')
pd.set_option('display.max_rows', 500)

In [2]:
#read the cross validation results
mypath = "outputs/crossval"
files = [f for f in listdir(mypath) if isfile(join(mypath, f))]

final_df = pd.DataFrame()

for file in files:
    if file.endswith('.xlsx'):
        filename = join(mypath,file)
        df = read_results(filename)
        final_df = pd.concat([final_df,df])

#numbering the evaluation metrics as each metric has as many values as the number of n_fold cross validation (n=5)
eval_metrics = ['Accuracy','MCC','F_1','F_beta_0.5','F_beta_2','Precision','Recall']

for metric in eval_metrics:
    score_columns = final_df.filter(regex=metric).columns
    final_df['mean_{}'.format(metric)] = final_df[score_columns].mean(axis=1)
    final_df['std_{}'.format(metric)] = final_df[score_columns].std(axis=1)
    final_df['{}'.format(metric)] = final_df.apply(lambda x: '{:.3f}±{:.3f}'.format(x['mean_{}'.format(metric)],x['std_{}'.format(metric)]), axis=1)

#final_df = final_df[final_df['fold']=='StratifiedGroupKFold']
final_df.rename(columns={'mean_F_1':'mean_F1','F_1':'F1'}, inplace=True)

In [3]:
#rank the RCKmer and Kmer based on F1 score per dataset
kmer_family = final_df[(final_df.representation.str.startswith('Kmer'))].copy()
kmer_family['group_rank'] = kmer_family.groupby(['dataset','fold'])['mean_F1'].rank(method="first", ascending=False)

rckmer_family = final_df[(final_df.representation.str.startswith('RCKmer'))].copy()
rckmer_family['group_rank'] = rckmer_family.groupby(['dataset','fold'])['mean_F1'].rank(method="first", ascending=False)

kmer_table = pd.concat([kmer_family,rckmer_family])

In [4]:
#assign the rank of RCKmer and Kmer to the original table
df_merged = pd.merge(final_df, kmer_table[['dataset','representation','fold','group_rank']], on=['dataset','representation','fold'], how='left')
df_merged = df_merged.fillna(0.0)

In [5]:
#select the best RCKmer and Kmer only
sub_final_df = df_merged[df_merged['group_rank']<=1.0].copy()
sub_final_df['rank'] = sub_final_df.groupby(['dataset','representation','fold'])['mean_F1'].rank(method="first", ascending=False)
sub_final_df = sub_final_df[sub_final_df['rank']==1]

#show the top 5 performing representations along with the corresponding model sorted by F_1 score in descending order
sub_df_stratified_group_top_n = sub_final_df.groupby('dataset').apply(pd.DataFrame.nlargest, n=5, columns=['mean_F1'])
sub_df_stratified_group_top_n[['model','representation','F1','F_beta_0.5','F_beta_2','MCC','Accuracy','Precision','Recall']]#,'data_split_1', 'data_split_2','data_split_3','data_split_4','data_split_5' ]]

Unnamed: 0_level_0,Unnamed: 1_level_0,model,representation,F1,F_beta_0.5,F_beta_2,MCC,Accuracy,Precision,Recall
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
benbow,459,SVM,RCKmer-7,0.927±0.024,0.928±0.027,0.927±0.024,0.837±0.056,0.919±0.028,0.928±0.031,0.927±0.025
benbow,419,SVM,Kmer-6,0.912±0.028,0.910±0.030,0.914±0.031,0.801±0.062,0.901±0.031,0.909±0.034,0.915±0.035
benbow,73,RandomForest,PCPseTNC,0.880±0.042,0.860±0.055,0.901±0.031,0.716±0.107,0.860±0.053,0.848±0.064,0.916±0.027
benbow,638,SVM,Z_curve_48bit,0.879±0.028,0.881±0.030,0.877±0.031,0.730±0.063,0.866±0.031,0.883±0.034,0.876±0.035
benbow,1073,RandomForest,PseEIIP,0.878±0.045,0.862±0.058,0.896±0.033,0.714±0.111,0.859±0.055,0.851±0.067,0.908±0.030
gicluster,1024,SVM,RCKmer-7,0.678±0.149,0.660±0.185,0.708±0.133,0.565±0.212,0.815±0.091,0.652±0.212,0.738±0.152
gicluster,984,SVM,Kmer-6,0.667±0.135,0.647±0.174,0.700±0.122,0.550±0.194,0.806±0.085,0.639±0.205,0.733±0.148
gicluster,1984,LogisticRegression,Subsequence,0.616±0.122,0.596±0.186,0.657±0.059,0.457±0.219,0.743±0.173,0.590±0.226,0.705±0.105
gicluster,1982,LogisticRegression,Mismatch,0.607±0.131,0.595±0.194,0.637±0.067,0.439±0.244,0.738±0.189,0.594±0.233,0.678±0.105
gicluster,1996,SVM,CKSNAP,0.596±0.145,0.607±0.164,0.595±0.158,0.471±0.191,0.794±0.065,0.620±0.189,0.600±0.184


## Heatmap of Cross Validation Results across all data sets, models, and data representations

In [6]:
metric = 'F1'
data_eval = final_df[final_df['dataset'].isin({'benbow','islandpick','rvm','gicluster'})][['dataset','representation','model','mean_{}'.format(metric)]].copy()
data_eval = data_eval.replace({'DecisionTree': 3, 'NaiveBayes': 2, 'LogisticRegression': 1, 'RandomForest': 4, 'SVM': 5})
data_eval = data_eval.replace({'benbow': 'Benbow#(n_species = 167)', 'islandpick': 'IslandPick#(n_species = 104)', 
                    'rvm': 'RVM#(n_species = 32)', 'gicluster': 'GI-Cluster#(n_species = 9)'})
data_eval.rename(columns={'dataset':'Data_set','representation': 'Representation','model':'Model', 'mean_{}'.format(metric):'eval_metric'}, inplace=True)

heatmap = alt.Chart(data_eval).mark_rect(
).transform_calculate(
    Data_set="split(datum.Data_set, '#')"
).encode(
    x=alt.X('Representation:O', title='Data Representation',axis=alt.Axis(labelAngle=45, )),
    y=alt.Y('Model:O', axis=alt.Axis(labels=True, title=None, orient='right')), 
    color=alt.Color(
            'eval_metric:Q', 
            scale=alt.Scale(domain=[0, 0.5, 1.0], range=['white', 'lightgrey', 'black']), 
            legend=alt.Legend(title=metric, tickCount=5)
                            ),
    #tooltip=['Model', 'representation', 'eval_metric']
).facet(
    row=alt.Row('Data_set:N', sort=['Benbow#(n_species = 167)', 'IslandPick#(n_species = 104)', 'RVM#(n_species = 32)', 'GI-Cluster#(n_species = 9)'],
                header=alt.Header(title='Model', titleOrient="right"))
).properties(
    title=alt.TitleParams('Data_set', anchor='middle', orient='left')  # Add Left-side title
)

#boxplot
boxplot = alt.Chart(data_eval).mark_boxplot(
    color='gray'
).encode(
    x=alt.X('Model:O', title='Model', axis=alt.Axis(labelAngle=0,)), #sort=[1,'NaiveBayes','DecisionTree','RandomForest','SVM']),
    y=alt.Y('eval_metric:Q', title=metric),
    #color='Data_set:O',
    #column='Data_set:O'
    facet=alt.Facet(
        'Data_set:N', columns=4,
        sort=['Benbow#(n_species = 167)', 'IslandPick#(n_species = 104)', 'RVM#(n_species = 32)', 'GI-Cluster#(n_species = 9)'],
        header=alt.Header(labelFontSize=13),
    )
).properties(
    width=203,
)

fontsize=13
alt.vconcat(heatmap, boxplot).configure_axis(
    labelFontSize=fontsize,
    titleFontSize=fontsize,
).configure_header(
    labelFontSize=fontsize,
    titleFontSize=fontsize,
)

## Statistical Analysis on the difference between models and data representations

In [7]:
#checking for normality

from scipy.stats import shapiro
import pandas as pd

raw_data_ = final_df[final_df['dataset'].isin({'benbow','islandpick','rvm','gicluster'})][['dataset','representation','model','mean_F1','mean_MCC']].copy()

datasets = ['benbow','islandpick','gicluster','rvm']

# Load your DataFrame (replace 'df' with your actual DataFrame)
# Example: df = pd.read_csv("your_data.csv")

for metric in ['mean_F1', 'mean_MCC']:  
    stat, p_value = shapiro(raw_data_[metric])  
    print(f"Shapiro-Wilk Test for {metric}: p-value = {p_value}")

    if p_value < 0.05:
        print(f"{metric} is NOT normally distributed.\n")
    else:
        print(f"{metric} is normally distributed.\n")

for dataset in datasets:
    print(dataset)
    raw_data = raw_data_[raw_data_['dataset']==dataset]

    for metric in ['mean_F1', 'mean_MCC']:  
        stat, p_value = shapiro(raw_data_[metric])  
        print(f"Shapiro-Wilk Test for {metric}: p-value = {p_value}")

        if p_value < 0.05:
            print(f"{metric} is NOT normally distributed.\n")
        else:
            print(f"{metric} is normally distributed.\n")
    


Shapiro-Wilk Test for mean_F1: p-value = 7.545476116429796e-21
mean_F1 is NOT normally distributed.

Shapiro-Wilk Test for mean_MCC: p-value = 7.082714804395088e-17
mean_MCC is NOT normally distributed.

benbow
Shapiro-Wilk Test for mean_F1: p-value = 7.545476116429796e-21
mean_F1 is NOT normally distributed.

Shapiro-Wilk Test for mean_MCC: p-value = 7.082714804395088e-17
mean_MCC is NOT normally distributed.

islandpick
Shapiro-Wilk Test for mean_F1: p-value = 7.545476116429796e-21
mean_F1 is NOT normally distributed.

Shapiro-Wilk Test for mean_MCC: p-value = 7.082714804395088e-17
mean_MCC is NOT normally distributed.

gicluster
Shapiro-Wilk Test for mean_F1: p-value = 7.545476116429796e-21
mean_F1 is NOT normally distributed.

Shapiro-Wilk Test for mean_MCC: p-value = 7.082714804395088e-17
mean_MCC is NOT normally distributed.

rvm
Shapiro-Wilk Test for mean_F1: p-value = 7.545476116429796e-21
mean_F1 is NOT normally distributed.

Shapiro-Wilk Test for mean_MCC: p-value = 7.0827148

In [8]:
#Friedman test impact of models on representations

from scipy.stats import friedmanchisquare
import scikit_posthocs as sp 

model_scores_per_dataset = []

for dataset in datasets:
    print(dataset)
    raw_data = raw_data_[raw_data_['dataset']==dataset]

    # List of unique models and representations
    models = raw_data["model"].unique()
    representations = raw_data["representation"].unique()

    # Create a list of performance scores for each model across all representations
    # The list will have a sublist for each model, containing the performance scores for each representation
    model_scores = []

    for model in models:
        model_performance = []
        for rep in representations:
            # Get the F1-score for the model and representation
            performance = raw_data[(raw_data["model"] == model) & (raw_data["representation"] == rep)]["mean_F1"].values
            model_performance.extend(performance)
        model_scores.append(model_performance)
    
    model_scores_flatten = np.array(model_scores).flatten()
    model_scores_per_dataset.append(model_scores_flatten.tolist())

    # Perform Friedman Test
    stat, p_value = friedmanchisquare(*model_scores)

    print(f"Friedman Test Result: p-value = {p_value}")

    # Interpretation
    if p_value < 0.05:
        print("There is a significant difference in performance across models and representations.")
    else:
        print("No significant difference in performance across models and representations.")

# # Perform Friedman Test
# stat, p_value = friedmanchisquare(*model_scores_per_dataset)

# print(f"Friedman Test Result: p-value = {p_value}")

# # Interpretation
# if p_value < 0.05:
#     print("There is a significant difference in performance across datasets")
# else:
#     print("No significant difference in performance across datasets")


benbow
Friedman Test Result: p-value = 5.832701627182584e-21
There is a significant difference in performance across models and representations.
islandpick
Friedman Test Result: p-value = 3.326826907230399e-28
There is a significant difference in performance across models and representations.
gicluster
Friedman Test Result: p-value = 1.340388575325093e-18
There is a significant difference in performance across models and representations.
rvm
Friedman Test Result: p-value = 1.5796172953496593e-24
There is a significant difference in performance across models and representations.


In [9]:
#Friedman test impact of representations on models
model_scores_per_dataset = []

for dataset in datasets:
    print(dataset)
    raw_data = raw_data_[raw_data_['dataset']==dataset]

    # List of unique models and representations
    models = raw_data["model"].unique()
    representations = raw_data["representation"].unique()

    # Create a list of performance scores for each model across all representations
    # The list will have a sublist for each model, containing the performance scores for each representation
    model_scores = []

    for rep in representations:
        model_performance = []
        for model in models:
            # Get the F1-score for the model and representation
            performance = raw_data[(raw_data["model"] == model) & (raw_data["representation"] == rep)]["mean_F1"].values
            model_performance.extend(performance)
        model_scores.append(model_performance)

    model_scores_flatten = np.array(model_scores).flatten()
    model_scores_per_dataset.append(model_scores_flatten.tolist())

    # Perform Friedman Test
    stat, p_value = friedmanchisquare(*model_scores)

    print(f"Friedman Test Result: p-value = {p_value}")

    # Interpretation
    if p_value < 0.05:
        print("There is a significant difference in performance across models and representations.")
    else:
        print("No significant difference in performance across models and representations.")

# # Perform Friedman Test
# stat, p_value = friedmanchisquare(*model_scores_per_dataset)

# print(f"Friedman Test Result: p-value = {p_value}")

# # Interpretation
# if p_value < 0.05:
#     print("There is a significant difference in performance across datasets")
# else:
#     print("No significant difference in performance across datasets")


benbow
Friedman Test Result: p-value = 3.485826489011358e-07
There is a significant difference in performance across models and representations.
islandpick
Friedman Test Result: p-value = 4.3733175104158254e-05
There is a significant difference in performance across models and representations.
gicluster
Friedman Test Result: p-value = 0.001038405055847403
There is a significant difference in performance across models and representations.
rvm
Friedman Test Result: p-value = 0.0861953672086341
No significant difference in performance across models and representations.


In [10]:
#Kruskal-Wallis test

from scipy.stats import kruskal

# Compare F1-score across datasets
dataset_scores = [raw_data_[raw_data_["dataset"] == ds]["mean_F1"] for ds in raw_data_["dataset"].unique()]

stat, p_value = kruskal(*dataset_scores)

print(f"Kruskal-Wallis Test for Datasets: p-value = {p_value}")

if p_value < 0.05:
    print("Dataset significantly affects model performance.")
else:
    print("No significant dataset effect.")


Kruskal-Wallis Test for Datasets: p-value = 1.6150231079064596e-77
Dataset significantly affects model performance.


## Effect of k on RCKmer and Kmer representations

In [11]:
mypath = "outputs/k_effect"
files = [f for f in listdir(mypath) if isfile(join(mypath, f))]

kmer_df = pd.DataFrame()

for file in files:
    
    if file.endswith('.xlsx'):
        filename = join(mypath,file)
        df = read_results(filename, header=['dataset','model','fold','n_fold','representation'])
        kmer_df = pd.concat([kmer_df,df])

#numbering the evaluation metrics as each metric has as many values as the number of n_fold cross validation (n=5)
eval_metrics = ['Accuracy','MCC','F_1','F_beta_2','Precision','Recall']

for metric in eval_metrics:
    score_columns = kmer_df.filter(regex=metric).columns
    kmer_df['mean_{}'.format(metric)] = kmer_df[score_columns].mean(axis=1)
    kmer_df['std_{}'.format(metric)] = kmer_df[score_columns].std(axis=1)
    kmer_df['{}'.format(metric)] = kmer_df.apply(lambda x: '{:.3f}±{:.3f}'.format(x['mean_{}'.format(metric)],x['std_{}'.format(metric)]), axis=1)

In [12]:
sub_kmer_df = kmer_df[(kmer_df['fold']=='StratifiedGroupKFold')&((kmer_df['model']=='RandomForest')|(kmer_df['model']=='SVM'))&\
                      (kmer_df['dataset']!='pick108')]
sub_kmer_df = sub_kmer_df.replace('rgp104','islandpick')

#rank the RCKmer and Kmer
kmer_family = sub_kmer_df[(sub_kmer_df.representation.str.startswith('Kmer'))].copy()
kmer_family['k'] = kmer_family.apply(lambda x: int(x['representation'].split('-')[-1]), axis=1)

rckmer_family = sub_kmer_df[(sub_kmer_df.representation.str.startswith('RCKmer'))].copy()
rckmer_family['k'] = rckmer_family.apply(lambda x: int(x['representation'].split('-')[-1]), axis=1)

In [13]:
data = rckmer_family
data.rename(columns={'dataset':'Data_set'}, inplace=True)

# Altair line plot with points
chart_svm = alt.Chart(data[data['model']=='SVM']).mark_line().encode(
    x=alt.X('k:O', axis=alt.Axis(labelAngle=0)),
    y=alt.Y('mean_F_1:Q', scale=alt.Scale(bins=np.arange(0,1.1,0.1), domain=[0, 1])),
    #color=alt.Color('dataset:N').scale(scheme="greys")
    color=alt.Color('Data_set:N').scale(domain=['benbow','islandpick','gicluster','rvm'], range=['#a6cee3','#1f78b4','#b2df8a','#33a02c'])
).properties(
    width=400,
    height=300,
    title="A"
)

chart_rf = alt.Chart(data[data['model']=='RandomForest']).mark_line().encode(
    x=alt.X('k:O', axis=alt.Axis(labelAngle=0)),
    y=alt.Y('mean_F_1:Q', scale=alt.Scale(bins=np.arange(0,1.1,0.1), domain=[0, 1])),
    #color=alt.Color('dataset:N').scale(scheme="greys")
    color=alt.Color('Data_set:N').scale(domain=['benbow','islandpick','gicluster','rvm'], range=['#a6cee3','#1f78b4','#b2df8a','#33a02c'])
).properties(
    width=400,
    height=300,
    title="B"
)


In [14]:
data = kmer_family
data.rename(columns={'dataset':'Data_set'}, inplace=True)

# Altair line plot with points
chart_svm_kmer = alt.Chart(data[data['model']=='SVM']).mark_line().encode(
    x=alt.X('k:O', axis=alt.Axis(labelAngle=0)),
    y=alt.Y('mean_F_1:Q', scale=alt.Scale(bins=np.arange(0,1.1,0.1), domain=[0, 1])),
    color=alt.Color('Data_set:N').scale(domain=['benbow','islandpick','gicluster','rvm'], range=['#a6cee3','#1f78b4','#b2df8a','#33a02c'])
).properties(
    width=400,
    height=300,
    title="C"
)

chart_rf_kmer = alt.Chart(data[data['model']=='RandomForest']).mark_line().encode(
    x=alt.X('k:O', axis=alt.Axis(labelAngle=0)),
    y=alt.Y('mean_F_1:Q', scale=alt.Scale(bins=np.arange(0,1.1,0.1), domain=[0, 1])),
    color=alt.Color('Data_set:N').scale(domain=['benbow','islandpick','gicluster','rvm'], range=['#a6cee3','#1f78b4','#b2df8a','#33a02c'])
).properties(
    width=400,
    height=300,
    title="D"
)


In [15]:
upper = alt.hconcat(chart_svm,chart_rf)
lower = alt.hconcat(chart_svm_kmer,chart_rf_kmer)
alt.vconcat(upper,lower).configure_title(
    anchor='start'
).configure_axis(
    labelFontSize=15,
    titleFontSize=15,
).configure_legend(
    titleFontSize=15,
    labelFontSize=15
)  

## Read Boundary Prediction Results

In [16]:
#"outputs/baselines_literature_fine_tuned_model.json" or "outputs/baselines_test_fine_tuned_model.json"
json_file = "outputs/evaluation/baselines_literature_fine_tuned_model.json"

eval_df = read_eval_result(json_file)
eval_df = eval_df[['Predictor','F-Score', 'F-2-Score', 'MCC', 'Precision', 'Recall', 'Accuracy']]
eval_df.groupby('Predictor').mean().sort_values('F-Score', ascending=False)

Unnamed: 0_level_0,F-Score,F-2-Score,MCC,Precision,Recall,Accuracy
Predictor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
fine_tuned_model_literature,0.953346,0.933228,0.906695,0.996163,0.9214,0.94731
treasure_island_literature,0.938388,0.927899,0.860836,0.959541,0.921764,0.930182
islandviewer,0.791256,0.711831,0.683736,0.998099,0.66912,0.816549
alien_hunter,0.642011,0.596354,0.39793,0.75306,0.570324,0.704716
islandpath_dimob,0.636124,0.530466,0.527399,0.997641,0.478883,0.699898
islander,0.353588,0.263905,0.320506,1.0,0.226429,0.560047
sigi_hmm,0.313323,0.237263,0.271646,1.0,0.204859,0.553996
islandpick,0.25666,0.188161,0.230364,1.0,0.159914,0.524498


In [17]:
json_file = "outputs/evaluation/baselines_test_fine_tuned_model.json"

eval_df = read_eval_result(json_file)
eval_df = eval_df[['Predictor','F-Score', 'F-2-Score', 'MCC', 'Precision', 'Recall', 'Accuracy']]
eval_df.groupby('Predictor').mean().sort_values('F-Score', ascending=False)

Unnamed: 0_level_0,F-Score,F-2-Score,MCC,Precision,Recall,Accuracy
Predictor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
fine_tuned_model_test,0.877946,0.865248,0.787977,0.924411,0.865387,0.90989
islandviewer,0.79931,0.764452,0.68365,0.916063,0.747487,0.861298
treasure_island_test,0.702114,0.659615,0.576227,0.825132,0.643673,0.847562
alien_hunter,0.601781,0.597632,0.34948,0.689202,0.611727,0.690282
islandpath_dimob,0.541732,0.470485,0.437365,0.910845,0.43867,0.688004
islandpick,0.429831,0.379038,0.360537,1.0,0.353067,0.739534
sigi_hmm,0.387802,0.309323,0.336574,0.958518,0.275289,0.644538
islander,0.26032,0.203444,0.227576,1.0,0.178152,0.622461


## Read Hyperparameter Tuning results

In [18]:
#read the cross validation results
mypath = "outputs/hpo"
files = [f for f in listdir(mypath) if isfile(join(mypath, f))]

hpo_res = {}

for file in files:
    key = file.split('.')[0]

    if file.endswith('.pkl'):
        with open(join(mypath,file),'rb') as f:
            r = pickle.load(f)

        hpo_res.update({key:r})

eval_metrics = ['F_1','F_beta_2','F_beta_0.5','MCC','Accuracy','Precision','Recall']

results_df = pd.DataFrame()


for key in hpo_res:
    data = key.split('_')[0]
    model = key.split('_')[1]
    encoding = '_'.join(key.split('_')[2:])

    best_index = hpo_res[key]['best_index']
    best_params = hpo_res[key]['best_params']
    cv_results = hpo_res[key]['cv_results']

    df_columns = ['Dataset','Model','Encoding']
    df_content = [data,model,encoding]

    for metric in eval_metrics:
        df_content.extend([cv_results['mean_test_{}'.format(metric)][best_index]])
        df_content.extend([cv_results['std_test_{}'.format(metric)][best_index]])
        df_columns.extend(['mean_{}'.format(metric), 'std_{}'.format(metric)])

    results_df = pd.concat([results_df,pd.DataFrame([df_content], columns=df_columns)])
    
for metric in eval_metrics:
    results_df['{}'.format(metric)] = results_df.apply(lambda x: '{:.3f}±{:.3f}'.format(x['mean_{}'.format(metric)],x['std_{}'.format(metric)]), axis=1)

results_df = results_df.rename(columns={'Encoding':'Representation'})
results_df.groupby('Dataset').apply(pd.DataFrame.nlargest, n=5, columns=['mean_F_1'])[['Model','Representation','F_1','F_beta_0.5','F_beta_2','MCC','Accuracy','Precision','Recall']]


Unnamed: 0_level_0,Unnamed: 1_level_0,Model,Representation,F_1,F_beta_0.5,F_beta_2,MCC,Accuracy,Precision,Recall
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
benbow,0,svm,RCKmer-7,0.937±0.020,0.940±0.023,0.934±0.019,0.859±0.045,0.930±0.022,0.942±0.027,0.933±0.021
benbow,0,svm,Kmer-6,0.920±0.023,0.920±0.027,0.920±0.025,0.820±0.054,0.911±0.027,0.920±0.032,0.920±0.029
benbow,0,svm,Z_curve_48bit,0.898±0.025,0.903±0.033,0.893±0.021,0.773±0.062,0.887±0.030,0.907±0.039,0.890±0.022
benbow,0,rf,PCPseTNC,0.886±0.039,0.866±0.049,0.906±0.031,0.730±0.097,0.867±0.048,0.853±0.054,0.921±0.026
benbow,0,rf,PseEIIP,0.884±0.038,0.862±0.048,0.907±0.031,0.726±0.093,0.864±0.046,0.849±0.054,0.923±0.030
gicluster,0,svm,RCKmer-7,0.689±0.126,0.641±0.167,0.759±0.088,0.578±0.166,0.796±0.094,0.618±0.191,0.829±0.102
gicluster,0,svm,Kmer-6,0.683±0.102,0.629±0.139,0.761±0.073,0.571±0.132,0.792±0.080,0.602±0.161,0.836±0.101
gicluster,0,svm,CKSNAP,0.635±0.101,0.579±0.143,0.719±0.059,0.494±0.145,0.749±0.097,0.550±0.167,0.802±0.077
gicluster,0,lr,Subsequence,0.633±0.115,0.606±0.169,0.681±0.053,0.474±0.213,0.746±0.165,0.594±0.202,0.735±0.077
gicluster,0,lr,Mismatch,0.631±0.118,0.602±0.170,0.680±0.058,0.472±0.211,0.745±0.163,0.589±0.201,0.736±0.080
