In [24]:
import argparse
import pandas as pd
from scripts.aux_msa_functions import *
import time
from scipy.spatial.distance import cdist
import os
import numpy as np
import torch

# One score folder per sequence source for family PF00004; tool_types[k] is the
# label used for the files found in scores_folders[k].
scores_folders = ["./scores/msa-seed-simulations/MSA-1b/random-proposal/static-context/10/PF00004",
                  "./scores/msa-seed-simulations/MSA-1b/random-proposal/dynamic-context/10/greedy/PF00004",
                  "./scores/msa-seed-simulations/MSA-1b/random-proposal/dynamic-context/10/random/PF00004",
                  "./scores/msa-seed-simulations/ESM2/PF00004",
                  "./scores/msa-seed-simulations/Potts/PF00004",
                  "./scores/protein-families-msa-seed",
                  "./scores/msa-seed-simulations/MSA-1b/msa_prob_dist-proposal/dynamic-context/10/greedy/PF00004",
                  "./scores/msa-seed-simulations/MSA-1b/msa_prob_dist-proposal/dynamic-context/10/random/PF00004",
                  "./scores/msa-seed-simulations/MSA-1b/msa_prob_dist-proposal/static-context/10/PF00004",
                 ]

tool_types = ["MSA-randomprop-static","MSA-randomprop-dynamic-greedy","MSA-randomprop-dynamic-random",
              "ESM2","Potts","nat","MSA-msaprop-dynamic-greedy","MSA-msaprop-dynamic-random",
             "MSA-msaprop-static"]

# zip() below would silently drop entries if the two lists ever got out of sync.
assert len(scores_folders) == len(tool_types), "scores_folders and tool_types must have the same length"

# Maps tool label -> DataFrame with one row per score file, holding the
# column-wise mean of every score column of that file.
tool_scores_dict = {}

for tool, folder in zip(tool_types, scores_folders):

    per_file_records = []

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # file -> sim_ind assignment reproducible across runs and machines.
    for sim_ind, file_name in enumerate(sorted(os.listdir(folder)), start=1):

        scores = pd.read_csv(os.path.join(folder, file_name), delimiter="\t")

        # Average every column from position 4 onwards over all rows of the file.
        # NOTE(review): presumably the first four columns are non-score metadata - confirm.
        metric_means = scores.iloc[:, 4:].mean(axis=0)
        metric_means.index = [f"{name}_mean" for name in metric_means.index]

        per_file_records.append({"tool": tool, "sim_ind": sim_ind, **metric_means.to_dict()})

    tool_scores_dict[tool] = pd.DataFrame(per_file_records)

# Blank out both min-natural-distance columns for the "nat" frame.
# A scalar NaN broadcasts to however many rows the frame has; the previous
# one-element list only worked when the frame had exactly one row.
tool_scores_dict["nat"]["min_natural_ham_dist_seed_mean"] = np.nan
tool_scores_dict["nat"]["min_natural_ham_dist_full_mean"] = np.nan

In [33]:
# Indices 4..23 followed by 30 are the ones kept; each is mapped to its
# position (0..20) in that list.
relevant_char_indices = [*range(4, 24), 30]
relevant_indices_mapping = {
    char_index: position
    for position, char_index in enumerate(relevant_char_indices)
}

In [35]:
# Echo the mapping so the original-index -> compact-position pairs can be checked by eye.
relevant_indices_mapping

{4: 0,
 5: 1,
 6: 2,
 7: 3,
 8: 4,
 9: 5,
 10: 6,
 11: 7,
 12: 8,
 13: 9,
 14: 10,
 15: 11,
 16: 12,
 17: 13,
 18: 14,
 19: 15,
 20: 16,
 21: 17,
 22: 18,
 23: 19,
 30: 20}

In [25]:
# Stack the per-tool mean-score frames into a single long table.
all_metrics_dataframe = pd.concat(list(tool_scores_dict.values()))

# Split the metrics into two views that are plotted separately below.
large_metrics_dataframe = all_metrics_dataframe.loc[
    :, ["tool", "sim_ind", "hmmer_seq_score_mean", "stat_energy_scores_mean"]
]
small_metrics_dataframe = all_metrics_dataframe.loc[
    :,
    [
        "tool",
        "sim_ind",
        "min_natural_ham_dist_seed_mean",
        "min_natural_ham_dist_full_mean",
        "max_self_ham_distance_mean",
    ],
]

In [26]:
# Reshape both views to long format: one row per (tool, sim_ind, score_type)
# with the score in the default "value" column.
melted_large_metrics_dataframe = pd.melt(
    large_metrics_dataframe,
    id_vars=["tool", "sim_ind"],
    var_name="score_type",
)
melted_small_metrics_dataframe = pd.melt(
    small_metrics_dataframe,
    id_vars=["tool", "sim_ind"],
    var_name="score_type",
)


In [27]:
import plotly.express as px
import plotly.io as pio

# Use the "iframe" renderer for every figure shown from here on.
pio.renderers.default = "iframe"

# Violin per tool, one colour per score type, with box overlay and all points drawn.
fig = px.violin(
    melted_small_metrics_dataframe,
    x="tool",
    y="value",
    color="score_type",
    box=True,
    points="all",
)
fig.show()

In [28]:
# Violin per tool for the melted large-metrics frame, one colour per score type
# (no box overlay here).
fig = px.violin(
    melted_large_metrics_dataframe,
    x="tool",
    y="value",
    color="score_type",
    points="all",
)
fig.show()

In [29]:
# Distribution of the per-file mean HMMER sequence score for each tool.
fig = px.violin(
    all_metrics_dataframe,
    x="tool",
    y="hmmer_seq_score_mean",
    box=True,
    points="all",
)
fig.show()

In [30]:
# Distribution of the per-file mean statistical-energy score for each tool.
fig = px.violin(
    all_metrics_dataframe,
    x="tool",
    y="stat_energy_scores_mean",
    box=True,
    points="all",
)
fig.show()

In [31]:
# Distribution of the per-file mean of min_natural_ham_dist_seed for each tool.
fig = px.violin(
    all_metrics_dataframe,
    x="tool",
    y="min_natural_ham_dist_seed_mean",
    box=True,
    points="all",
)
fig.show()

In [32]:
# Distribution of the per-file mean of max_self_ham_distance for each tool.
fig = px.violin(
    all_metrics_dataframe,
    x="tool",
    y="max_self_ham_distance_mean",
    box=True,
    points="all",
)
fig.show()

In [37]:
import argparse
import pandas as pd
from scripts.aux_msa_functions import *
import time
from scipy.spatial.distance import cdist
import os
from scipy.spatial.distance import cdist

# pdist yields each unordered pair exactly once, so the full square matrix from
# cdist (and the triu / non-zero filtering that went with it) is not needed.
from scipy.spatial.distance import pdist

# One score folder per sequence source for family PF00004; tool_types[k] is the
# label used for the files found in scores_folders[k].
scores_folders = ["./scores/msa-seed-simulations/MSA-1b/random-proposal/static-context/10/PF00004",
                  "./scores/msa-seed-simulations/MSA-1b/random-proposal/dynamic-context/10/greedy/PF00004",
                  "./scores/msa-seed-simulations/MSA-1b/random-proposal/dynamic-context/10/random/PF00004",
                  "./scores/msa-seed-simulations/ESM2/PF00004",
                  "./scores/msa-seed-simulations/Potts/PF00004",
                  "./scores/protein-families-msa-seed",
                  "./scores/msa-seed-simulations/MSA-1b/msa_prob_dist-proposal/dynamic-context/10/greedy/PF00004",
                  "./scores/msa-seed-simulations/MSA-1b/msa_prob_dist-proposal/dynamic-context/10/random/PF00004",
                  "./scores/msa-seed-simulations/MSA-1b/msa_prob_dist-proposal/static-context/10/PF00004",
                 ]

tool_types = ["MSA-randomprop-static","MSA-randomprop-dynamic-greedy","MSA-randomprop-dynamic-random",
              "ESM2","Potts","nat","MSA-msaprop-dynamic-greedy","MSA-msaprop-dynamic-random",
             "MSA-msaprop-static"]

# zip() below would silently drop entries if the two lists ever got out of sync.
assert len(scores_folders) == len(tool_types), "scores_folders and tool_types must have the same length"

# One frame of pairwise distances per (tool, file); concatenated once at the end
# instead of re-copying a growing frame on every iteration.
distance_frames = []

for tool, folder in zip(tool_types, scores_folders):

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # file -> sim_ind assignment reproducible (a later cell selects sim_ind == 1).
    for sim_ind, file_name in enumerate(sorted(os.listdir(folder)), start=1):

        sequences = pd.read_csv(os.path.join(folder, file_name), delimiter="\t")["sequence"]

        # Fewer than two sequences means there is no pair to measure.
        if len(sequences) < 2:
            continue

        # Encode each sequence as a row of byte codes, one column per position.
        # Assumes single-byte characters and equal-length (aligned) sequences - TODO confirm.
        num_seq_array = np.array([list(seq) for seq in sequences], dtype=np.bytes_).view(np.uint8)

        # Normalised Hamming distance (fraction of differing positions) for every
        # unordered pair i < j. Unlike filtering a triu matrix on "!= 0", this keeps
        # genuine zero distances between identical sequences, so duplicates are
        # represented in the distribution.
        pairwise_distances = pdist(num_seq_array, "hamming")

        distance_frames.append(
            pd.DataFrame({"tool": tool, "sim_ind": sim_ind, "ham_dist": pairwise_distances})
        )

if distance_frames:
    main_dataframe = pd.concat(distance_frames, ignore_index=True)
else:
    # Nothing was read: keep the expected columns so downstream cells still work.
    main_dataframe = pd.DataFrame(columns=["tool", "sim_ind", "ham_dist"])

In [38]:
# Simulation index to keep for the single-simulation comparison plot.
sim_number = 1

# Relabel every "nat" row with the chosen index (in place) so that the natural
# sequences always pass the filter below.
is_natural = main_dataframe["tool"] == "nat"
main_dataframe.loc[is_natural, "sim_ind"] = sim_number

# Keep only the rows belonging to the chosen simulation index.
main_dataframe_one_sim = main_dataframe[main_dataframe["sim_ind"] == sim_number]

In [39]:
# Pairwise Hamming-distance distribution per tool for the selected simulation;
# individual points are hidden, only the violin and box are drawn.
fig = px.violin(
    main_dataframe_one_sim,
    x="tool",
    y="ham_dist",
    color="sim_ind",
    points=False,
    box=True,
)
fig.show()