In [54]:
import argparse
import pandas as pd
from aux_msa_functions import *
import time
from scipy.spatial.distance import cdist
import os

# Score directories, one per tool. Every regular file inside a directory is
# read as a tab-separated table and summarised into one row.
scores_folders = ["./scores/msa-seed-simulations/MSA-1b/static-context/10/PF00004",
                  "./scores/msa-seed-simulations/MSA-1b/dynamic-context/10/PF00004",
                  "./scores/msa-seed-simulations/ESM2/PF00004",
                  "./scores/msa-seed-simulations/Potts/PF00004"]

# Tool label for the directory at the same position in `scores_folders`.
tool_types = ["MSA-static","MSA-dynamic","ESM2","Potts"]

# Maps tool label -> DataFrame holding one row of column means per score file.
tool_scores_dict = {}

for tool, folder in zip(tool_types, scores_folders):

    # os.listdir() yields entries in arbitrary order, so the names are sorted
    # (lexicographically) to make `sim_ind` reproducible between runs and
    # machines. Anything that is not a regular file (e.g. a checkpoint
    # directory created by Jupyter) is skipped because read_csv cannot open it.
    score_files = sorted(
        name for name in os.listdir(folder)
        if os.path.isfile(os.path.join(folder, name))
    )

    rows = []
    for sim_ind, file_name in enumerate(score_files, start=1):
        scores = pd.read_csv(os.path.join(folder, file_name), delimiter="\t")

        # Ignore the first two columns and average each remaining column over
        # all rows of the file.
        # NOTE(review): presumably the first two columns are identifiers
        # rather than scores - confirm against the score file layout.
        mean_scores = scores.iloc[:, 2:].mean(axis=0)

        # One record per file: tool label, 1-based file index, and every
        # averaged column renamed to "<column>_mean".
        row = {"tool": tool, "sim_ind": sim_ind}
        row.update({f"{column}_mean": value for column, value in mean_scores.items()})
        rows.append(row)

    # Build the frame once from the collected records.
    tool_scores_dict[tool] = pd.DataFrame(rows)



In [55]:
# Stack the per-tool frames, in the dict's insertion order, into one table.
# The original row index of each frame is kept as-is.
all_metrics_dataframe = pd.concat(list(tool_scores_dict.values()))

# Columns shared by both subsets: the tool label and the per-tool file index.
id_columns = ["tool", "sim_ind"]

# Two column subsets of the combined table.
# NOTE(review): "large"/"small" presumably refers to the value scale of the
# metrics, which would explain keeping them in separate frames - confirm.
large_metrics_dataframe = all_metrics_dataframe[
    id_columns + ["hmmer_seq_score_mean", "stat_energy_scores_mean"]
]
small_metrics_dataframe = all_metrics_dataframe[
    id_columns + ["min_natural_ham_dist_mean"]
]

In [56]:
# Reshape both subsets from wide to long format: "tool" and "sim_ind" stay as
# identifier columns, each metric column name goes into "score_type", and the
# metric values land in the default "value" column.
melted_large_metrics_dataframe = large_metrics_dataframe.melt(
    id_vars=["tool", "sim_ind"],
    var_name="score_type",
)
melted_small_metrics_dataframe = small_metrics_dataframe.melt(
    id_vars=["tool", "sim_ind"],
    var_name="score_type",
)


In [60]:
import plotly.express as px
import plotly.io as pio

# Make "iframe" the default plotly renderer for every figure shown from here on.
pio.renderers.default = "iframe"

# Violin plot of the small-metric frame: one violin per tool on the x axis,
# coloured by score type, with every individual observation drawn as a point.
fig = px.violin(
    data_frame=melted_small_metrics_dataframe,
    x="tool",
    y="value",
    color="score_type",
    points="all",
)
fig.show()

In [58]:
# Violin plot of the large-metric frame: one violin per tool on the x axis,
# coloured by score type, with every individual observation drawn as a point.
fig = px.violin(
    data_frame=melted_large_metrics_dataframe,
    x="tool",
    y="value",
    color="score_type",
    points="all",
)
fig.show()