In [None]:
# analysis paper
# import libraries
import sys
# sys.path.insert(1, "/home/anna/Documents/code/python/pipeline")

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from scipy.stats import sem as sem
import glob
import networkx as nx

import logging

logging.getLogger().setLevel(logging.ERROR)

from pipeline import *
from pipeline.utils import validate
from pipeline.analysis import *

print(BSS.__file__)

logging.getLogger().setLevel(logging.INFO)


from functools import reduce
import scipy.stats as stats
import numpy as np
import seaborn as sns

In [2]:
# define the analysis method to use
# Options common to every analysis variant. The variants below differ only in
# the "statistical inefficiency" flag.
_shared_analysis_options = {
    "estimator": "MBAR",
    "method": "alchemlyb",
    "check overlap": True,
    "try pickle": True,
    "save pickle": True,
    "auto equilibration": False,
    "statistical inefficiency": False,
    "truncate lower": 0,
    "truncate upper": 100,
    "name": None,
}

# variant name -> independent copy of the options with the flag set
ana_dicts = {
    variant: {**_shared_analysis_options, "statistical inefficiency": subsample}
    for variant, subsample in (("plain", False), ("subsampling", True))
}

In [None]:
# set the variables

# Root folder of the benchmark inputs; the same for every protein.
bench_folder = "/home/anna/Documents/benchmark"

# all the options: ana_obj_dict[protein][analysis name] -> analysis_network
ana_obj_dict = {}

for protein in ["tyk2", "mcl1", "p38", "syk", "hif2a", "cmet"]:
    ana_obj_dict[protein] = {}

    # Everything down to the inner loop depends only on the protein, so it is
    # done once per protein instead of once per analysis variant.
    if protein in ("syk", "cmet"):
        main_dir = f"/backup/{protein}/neutral"
    else:
        main_dir = f"/backup/{protein}"

    inputs_dir = f"{bench_folder}/inputs/{protein}"

    # if need size of protein: try the .gro/.top pair first and fall back to
    # the .prm7/.rst7 pair if that read fails
    try:
        prot = BSS.IO.readMolecules(
            [
                f"{inputs_dir}/{protein}_prep/{protein}.gro",
                f"{inputs_dir}/{protein}_prep/{protein}.top",
            ]
        )[0]
    except Exception:
        # `Exception` (not a bare except) so that Ctrl-C still interrupts
        prot = BSS.IO.readMolecules(
            [
                f"{inputs_dir}/{protein}_parameterised.prm7",
                f"{inputs_dir}/{protein}_parameterised.rst7",
            ]
        )[0]

    print(f"no of residues in the protein: {prot.nResidues()}")

    # choose location for the files
    if protein in ("syk", "cmet", "hif2a"):
        net_file = f"{main_dir}/execution_model/network_all.dat"  # the lomap network
    else:
        net_file = f"{main_dir}/execution_model/network_lomap.dat"

    exp_file = f"{bench_folder}/inputs/experimental/{protein}.yml"
    output_folder = f"{main_dir}/outputs_extracted"

    # ligands folder: fall back to the "ligands_neutral" folder if there is no
    # plain "ligands" folder for this protein
    if os.path.isdir(f"{inputs_dir}/ligands"):
        ligands_folder = f"{inputs_dir}/ligands"
    else:
        ligands_folder = f"{inputs_dir}/ligands_neutral"

    for ana_name, ana_options in ana_dicts.items():
        ana_prot = analysis_protocol(ana_options)

        # prot_file = f"{main_dir}/execution_model/protocol.dat" # no protocol used , name added after if needed
        pipeline_prot = pipeline_protocol(auto_validate=True)
        # pipeline_prot.name("")

        # initialise the network object
        all_analysis_object = analysis_network(
            output_folder,
            exp_file=exp_file,
            net_file=net_file,
            analysis_prot=ana_prot,
            method=pipeline_prot.name(),  # if the protocol had a name
            engines=pipeline_prot.engines(),
        )

        # compute
        all_analysis_object.compute_results()

        # add ligands folder
        all_analysis_object.add_ligands_folder(ligands_folder)

        ana_obj_dict[protein][ana_name] = all_analysis_object

print(ana_obj_dict)

In [None]:
import edgembar
# Build an edgembar graph from the generated *.py edge files of one engine.
# NOTE(review): this cell reads `ana_obj`, which is first assigned in a later
# cell of this notebook - confirm the intended execution order.
engine = "SOMD"
xml_folder = (
    f"{ana_obj.results_folder}/edgembar/{engine}/"
    f"xml_py_files_{analyse.file_ext(ana_obj.analysis_options)}"
)
print(xml_folder)

# one glob result list per matched file
regular_list = []
for py_file in glob.glob(f"{xml_folder}/*.py"):
    regular_list.append(glob.glob(py_file))
print(regular_list)

# flatten and de-duplicate
efiles = list({path for matches in regular_list for path in matches})

g = edgembar.Graph(
    efiles,
    exclude=None,
    refnode=None,
    ana_obj=ana_obj,
    engine=engine,
)
g.Read()
g.topology

In [None]:

# MBARNet analysis for the p38 "subsampling" analysis object, with XML writing
# and running switched off.
ana_obj = ana_obj_dict["p38"]["subsampling"]

mbarnet_options = dict(
    compute_missing=False,
    write_xml=False,
    run_xml_py=False,
    use_experimental=True,
    overwrite=True,
    solver="linear",
    engines=["SOMD"],
)
ana_obj.analyse_mbarnet(**mbarnet_options)

In [None]:
# considering the mbarnet analysis
# For every protein: run MBARNet once writing and running the XML files, then
# re-run it with each ligand in turn as the reference node and collect the
# computed value of every ligand.
# prot_res[protein][ligand] -> list with one value per reference node
prot_res = {}
for prot in ana_obj_dict:
    ana_obj = ana_obj_dict[prot]["subsampling"]
    try:
        ana_obj.analyse_mbarnet(compute_missing=False,
                                write_xml=True, run_xml_py=True,
                                use_experimental=True, overwrite=True,
                                solver="linear", engines=["SOMD"],
                                )
    except Exception as err:
        # best effort: report why this protein failed and carry on with the
        # next one (`Exception`, so Ctrl-C still interrupts the loop)
        print(f"failed for {prot}: {err!r}")
        continue

    ligands = ana_obj._ligands_dict["SOMD"]
    res_dict = {lig: [] for lig in ligands}
    for lig in ligands:
        ana_obj.analyse_mbarnet(compute_missing=False,
                                write_xml=False, run_xml_py=False,
                                use_experimental=True, overwrite=True,
                                solver="linear", engines=["SOMD"],
                                refnode=lig
                                )
        # res_dict
        # res = ana_obj._get_stats_mbarnet("SOMD","MUE")
        # res_dict[lig] = (res[0]["SOMD"]["experimental"], res[1]["SOMD"]["experimental"])
        for ligres, computed in ana_obj._mbarnet_computed_DGs["SOMD"].items():
            # element 0 of each entry is the value that is kept
            res_dict[ligres].append(computed[0])

    prot_res[prot] = res_dict

# print(res_dict)


In [None]:
def _mbarnet_without_refnode(analysis):
    """Run the MBARNet analysis of `analysis` for SOMD with refnode=None."""
    return analysis.analyse_mbarnet(
        compute_missing=False,
        write_xml=False,
        run_xml_py=False,
        use_experimental=True,
        overwrite=True,
        solver="linear",
        engines=["SOMD"],
        refnode=None,
    )


# first pass: run the analysis for every protein
for prot, variants in ana_obj_dict.items():
    ana_obj = variants["subsampling"]
    _mbarnet_without_refnode(ana_obj)

# second pass: print the MUE statistic of every protein
for prot, variants in ana_obj_dict.items():
    ana_obj = variants["subsampling"]
    res = ana_obj._get_stats_mbarnet(engines=["SOMD"], statistic="MUE")
    print(prot, res)

In [None]:
# MUE of the mean (over all reference nodes) MBARNet value of each ligand.
for prot in ana_obj_dict:
    # proteins whose MBARNet run failed have no entry in prot_res
    if prot not in prot_res:
        print(f"no mbarnet results for {prot}, skipping")
        continue

    # use the analysis object of *this* protein, not whatever a previous cell
    # left behind in `ana_obj`
    ana_obj = ana_obj_dict[prot]["subsampling"]

    # rows: ligands, columns: one value per reference node
    per_refnode = pd.DataFrame(prot_res[prot]).T
    # both statistics are taken from the raw values only, so the SEM does not
    # include the freshly added "mean" column
    df = per_refnode.assign(
        mean=per_refnode.mean(numeric_only=True, axis=1),
        err=per_refnode.sem(numeric_only=True, axis=1),
    )

    # NOTE(review): the means are paired with ana_obj.ligands, and below with
    # normalised_exper_val_dict, purely by position - confirm that all three
    # are in the same ligand order.
    df_dict = dict(zip(ana_obj.ligands, df["mean"]))

    res = stats_engines.compute_stats(
        x=list(df_dict.values()),
        y=[val[0] for val in ana_obj.normalised_exper_val_dict.values()],
        statistic="MUE",
    )
    print(prot, res)

In [None]:
import edgembar
# Build an edgembar graph from the *.py edge files in the plain
# "xml_py_files" folder of the current `ana_obj`.
engine = "SOMD"
xml_folder = f"{ana_obj.results_folder}/edgembar/{engine}/xml_py_files" #_{analyse.file_ext(ana_obj.analysis_options)}"

# one glob result list per matched file, then flattened and de-duplicated
regular_list = list(map(glob.glob, glob.glob(f"{xml_folder}/*.py")))
efiles = list({path for matches in regular_list for path in matches})

g = edgembar.Graph(
    efiles,
    exclude=None,
    refnode=None,
    ana_obj=ana_obj,
    engine=engine,
)
g.Read()
g.topology.nodes[0]

In [None]:
# Path free energy of every entry of the graph, looked up via its forward name.
edgedata = []
for entry in g.entries:
    entry_path = g.topology.StrToPath(entry.fwdname)
    edgedata.append(g.GetPathFreeEnergy(entry_path))

# print each one next to the stored perturbation result of the same name
for e in edgedata:
    stored = ana_obj.calc_pert_dict[engine][e.name]
    print(e.name, e.value, e.error, stored)

In [None]:
# Show the path data of one hard-coded path on the graph `g`.
# NOTE(review): presumably the string is "<ligand>~<ligand>" and must exist in
# the graph that is currently loaded - confirm it matches the system in `g`.
g.GetPathData("lig_ejm31~lig_ejm42")

In [None]:

# Compare SOMD with the extra "SOMD_oldsc" / "SOMD_newsc" result sets for p38.
ana_obj = ana_obj_dict["p38"]["plain"]

# both extra result sets use the same summary file name
summary_csv = "final_summary_SOMD_MBAR_alchemlyb_None_eqfalse_statsfalse_truncate0_100.csv"

# function for single dicts
ana_obj.compute_other_results(
    f"{ana_obj.output_folder}/SOMD_oldsc/results_oldsc/{summary_csv}",
    name="SOMD_oldsc",
)
ana_obj.compute_other_results(
    f"{ana_obj.output_folder}/SOMD_newsc/{summary_csv}",
    name="SOMD_newsc",
)

sc_engines = ["SOMD", "SOMD_oldsc", "SOMD_newsc"]

# each statistic gets its own name instead of all being stored in `mae`
mae = ana_obj.calc_mae_engines("pert", engines=sc_engines, recalculate=True)
print(mae)
spearman = ana_obj.calc_spearmans_rank_engines("pert", engines=sc_engines, recalculate=True)
print(spearman)
kendall = ana_obj.calc_kendalls_rank_engines("pert", engines=sc_engines, recalculate=True)
print(kendall)

In [None]:
# For every perturbation, report which of SOMD_oldsc / SOMD_newsc is closer to
# experiment; when oldsc is chosen, also print the three underlying values.
for pert, oldsc_result in ana_obj.calc_pert_dict["SOMD_oldsc"].items():
    try:
        oldsc_val = oldsc_result[0]
        newsc_val = ana_obj.calc_pert_dict["SOMD_newsc"][pert][0]
        exper_val = ana_obj.exper_pert_dict[pert][0]
        oldsc = abs(oldsc_val - exper_val)
        newsc = abs(newsc_val - exper_val)
    except (KeyError, IndexError, TypeError) as err:
        # still best effort, but say which perturbation was skipped and why
        # instead of silently passing
        print(f"skipping {pert}: {err!r}")
        continue

    val = "newsc" if oldsc > newsc else "oldsc"
    print(pert, val)
    if val == "oldsc":
        print(oldsc_val, newsc_val, exper_val)

In [None]:
# Bar plot of the r2 values.
df = pd.DataFrame(r2_dict).T
df_err = pd.DataFrame(r2_error_dict)
# element 0 / element 1 of every error entry
df_low = df_err.map(lambda x: x[0])
df_high = df_err.map(lambda x: x[1])

pal = pipeline.utils.set_colours()
pal["maximum"] = "darkblue"

# sns.barplot returns a matplotlib Axes (not a FacetGrid), so the styling goes
# through the Axes / seaborn functions. Named `ax` so the edgembar graph `g`
# from the earlier cells is not overwritten.
ax = sns.barplot(
    data=df,
    # errorbar="sd",
    palette=pal,
)
sns.despine(ax=ax, left=True)
ax.set(xlabel="protein system", ylabel="r2")

# a legend is only present if seaborn drew one
legend = ax.get_legend()
if legend is not None:
    legend.set_title("")

In [None]:
# MUE for every analysis variant defined in ana_dicts
# calculate average MUE for each engine and consensus

# silence all warnings from here on (the `warnings` module is imported in the
# first cell)
warnings.filterwarnings("ignore")

# results keyed by "<protein>_<analysis name>", then by engine
mue_dict = {}
mue_err_dict = {}
mue_ci_dict = {}

# the MAD containers are created empty here as well
mad_dict = {}
mad_err_dict = {}
mad_ci_dict = {}

# TODO joblib or dask (no)
# look at embarrassingly parallel loops
# check how often bootstrapping
for prot in ana_obj_dict.keys():

    # iterate the variants that ana_obj_dict was built from, so the lookup
    # below cannot hit an analysis name that does not exist
    for name in ana_dicts:
        ana_obj = ana_obj_dict[prot][name]
        key = f"{prot}_{name}"

        for results in (mue_dict, mue_err_dict, mue_ci_dict,
                        mad_dict, mad_err_dict, mad_ci_dict):
            results[key] = {}

        print(prot, name)
        try:
            df, df_err, df_ci = ana_obj.calc_mae_engines(pert_val="pert", recalculate=True) # From bootstrapping
            for eng in df.columns.values:
                mue_dict[key][eng] = df[eng]["experimental"]
                mue_err_dict[key][eng] = df_err[eng]["experimental"]
                mue_ci_dict[key][eng] = df_ci[eng]["experimental"]
        except Exception as e:
            print(e)
            print(f"did not calc mue for {key}")
        # try:
        #     df, df_err, df_ci = ana_obj.calc_mad_engines(pert_val="pert", recalculate=False) # from bootstrapping
        #     mad_dict[f"{prot}_{name}"] = df
        #     mad_err_dict[f"{prot}_{name}"] = df_err # SEM
        #     mad_ci_dict[f"{prot}_{name}"] = df_ci
        # except Exception as e:
        #     print(e)
        #     print(f"did not calc mad for {prot}_{name}")

In [None]:
# print("MUE")
# value, error and confidence-interval dictionaries, one per line
print(mue_dict, mue_err_dict, mue_ci_dict, sep="\n")

#  print("MAD")
# print(mad_dict)
# print(mad_err_dict)
# print(mad_ci_dict)

In [None]:
# mad consensus

# all systems
import itertools as it

# engine x engine tables; every cell of `df` holds a list that collects the
# values found in mad_dict for that pair of engines
df = pd.DataFrame(columns=ana_obj.engines, index=ana_obj.engines)
df_err = pd.DataFrame(columns=ana_obj.engines, index=ana_obj.engines)
df_ci = pd.DataFrame(columns=ana_obj.engines, index=ana_obj.engines)

for eng1, eng2 in it.product(ana_obj.engines, repeat=2):
    # .at addresses exactly one cell (index, column), which is the scalar
    # accessor to use for storing a list object in a single cell
    df.at[eng2, eng1] = []
    # df_err.at[eng2, eng1] = []
    # df_ci.at[eng2, eng1] = []

for name in ana_dicts:
    print(name)
    for prot in ana_obj_dict.keys():
        for eng1, eng2 in it.product(ana_obj.engines, repeat=2):
            try:
                value = mad_dict[f"{prot}_{name}"][eng1][eng2]
            except KeyError:
                # nothing stored for this system / engine pair
                continue
            # at index, column
            df.at[eng2, eng1].append(value)
            # df_err.at[eng2, eng1].append(mad_err_dict[f"{prot}_{name}"][eng1][eng2])
            # df_ci.at[eng2, eng1].append(mad_ci_dict[f"{prot}_{name}"][eng1][eng2])
# df_mean = df.mean()
# df_err = df.std()

# print(df_mean)
# print(df_err)
df
# df_ci = 
# def ci_func
# lower_ci,upper_ci = stats.t.interval(0.95, df=self.no_of_repeats-1, 
#                                             loc=self.freenrg_val, # mean
#                                             scale=self.freenrg_err, # SEM
#                                             ) 

In [None]:
# consensus
# cycle through each individual engine - view df
# one pooled MUE per analysis variant, over all proteins and engines

mae_dict = {}
# iterate the variants that ana_obj_dict was built from, so the lookup below
# cannot hit an analysis name that does not exist
for name in ana_dicts:
    print(name)
    me_x_list = []
    me_y_list = []
    me_x_err_list = []
    me_y_err_list = []
    for prot in ana_obj_dict.keys():
        ana_obj = ana_obj_dict[prot][name]
        ana_obj._initialise_stats_object(check=True)
        for eng in ["AMBER", "SOMD", "GROMACS"]:  # ana_obj.engines
            x, y, xerr, yerr = ana_obj._stats_object._get_x_y("pert", "experimental", eng)
            me_x_list.append(x)
            me_y_list.append(y)
            me_x_err_list.append(xerr)
            me_y_err_list.append(yerr)
    # print(me_y_list)

    # pool everything; the concatenated data get their own names so the
    # *_list variables stay lists
    me_x = pd.concat(me_x_list)
    me_y = pd.concat(me_y_list)
    me_x_err = pd.concat(me_x_err_list)
    me_y_err = pd.concat(me_y_err_list)

    values = pipeline.analysis.stats_engines.compute_stats(
        x=me_x,
        y=me_y,
        xerr=me_x_err,
        yerr=me_y_err,
        statistic="MUE",
    )
    # (s["mle"], s["stderr"], [s['low'], s['high']])

    mae_dict[name] = values
    print(values)

# mae_dict = {}
# for name in ana_dicts:
#     mae_list = []
#     print(name)
#     for prot in ana_obj_dict.keys(): # 
#         for eng in ana_obj.engines:
#             mae_list.append(mue_dict[f"{prot}_{name}"][eng])
#     mean = np.mean(mae_list)

#     # check normally dist
#     if len(sem_list) < 50:
#         stat, p = stats.shapiro(sem_list)
#     else:
#         stat, p = stats.kstest(sem_list)
#     if p < 0.05:
#         pass
#     else:
#         print("not normal distribution")

#     # <30 samples
#     lower_ci,upper_ci = stats.t.interval(confidence=0.95, 
#                     loc=np.mean(mae_list), 
#                     scale=stats.sem(mae_list)) 
    
#     print(mean, (lower_ci, upper_ci))
#     mae_dict[name] = (mean, stats.sem(mae_list),(lower_ci, upper_ci), mae_list)

In [None]:
# Table built from sem_dict, with the row labels replaced by the numbers 1-4.
plain_df = (
    pd.DataFrame.from_dict(sem_dict)
    .dropna()
    .rename(index=dict(zip(("1ns", "2ns", "3ns", "subsampling"), (1, 2, 3, 4))))
)

# element 0 of every entry, and the two items of element 2
df = plain_df.map(lambda entry: entry[0])
df_low = plain_df.map(lambda entry: entry[2][0])
df_high = plain_df.map(lambda entry: entry[2][1])

In [None]:
# plot CIs: one line per engine with a shaded band between df_low and df_high
plt.rc("font", size=12)
fig, ax = plt.subplots(figsize=[10, 10])

# the colour lookup does not depend on the engine, so it is fetched once
colours = pipeline.analysis.set_colours()

for eng in ana_obj.engines:
    col = colours[eng]

    ax.plot(df.index, df[eng], label=eng, color=col)
    ax.fill_between(df.index, df_low[eng], df_high[eng], color=col, alpha=.2)

ax.set_title("", fontsize=20)
ax.set_ylabel("Error (kcal/mol)", fontsize=20)
ax.set_xlabel("Simulation time per window (ns)", fontsize=20)
ax.legend(fontsize=18)

# NOTE(review): `threshold` is not assigned in any visible cell - it must be
# defined before this cell is run.
fig.savefig(
    f"/backup/overall_analysis/SEM_w_time_outliers{threshold}removed.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# paired t-test to see if sig diff
# NOTE(review): this reads element 3 of mae_dict["1ns"] and mae_dict["plain"].
# The visible cell that fills mae_dict stores the compute_stats result under
# the analysis names it loops over - confirm that a "1ns" entry exists and
# that element 3 holds the paired per-system values.
stats.ttest_rel(mae_dict["1ns"][3], mae_dict["plain"][3])
# if p value less than 0.05  can reject null hypothesis ie the values are different

In [None]:
# plot the errors
# Value table plus lower / upper error-bar lengths, derived from the
# confidence intervals stored in mue_ci_dict.
df = pd.DataFrame.from_dict(mue_dict).transpose().dropna()
err_df = pd.DataFrame.from_dict(mue_ci_dict).transpose().dropna()
df_low = df - err_df.map(lambda x: x[0])
df_high = err_df.map(lambda x: x[1]) - df

# rows removed for every protein
rows_to_drop = [
    f"{prot}_{suffix}"
    for prot in ana_obj_dict.keys()
    for suffix in ("plain", "1ns", "2ns", "3ns", "TI")
]

# rows kept, with the protein name upper-cased
row_names = {}
for prot in ana_obj_dict.keys():
    row_names[f"{prot}_subsampling"] = f"{prot.upper()}"
    row_names[f"{prot}_autoeq"] = f"{prot.upper()}_autoeq"


def _tidy_rows(frame):
    """Drop the unwanted rows (ignoring ones that are absent) and rename the rest."""
    return frame.drop(index=rows_to_drop, errors="ignore").rename(index=row_names)


# the same clean-up is applied to all four tables so their indices stay aligned
df, err_df, df_low, df_high = (
    _tidy_rows(frame) for frame in (df, err_df, df_low, df_high)
)

In [None]:
# plot CIs: grouped bars, one group per row of `df` and one bar per engine
plt.rc("font", size=12)
fig, ax = plt.subplots(figsize=[30, 10])

width = 0.23
engines = list(ana_obj.engines)

# bar offsets centred on each x position; for three engines this gives
# [-width, 0, width], and it keeps working for any other number of engines
placement = (np.arange(len(engines)) - (len(engines) - 1) / 2) * width
placement_dict = dict(zip(engines, placement))  # for each engine

# the colour lookup does not depend on the engine, so it is fetched once
colours = pipeline.analysis.set_colours()

# determine positions for X axis labels.
x_locs = np.arange(len(df))

for eng in engines:
    # just always compare to experimental for this
    freenrg_df_plotting = df[eng]

    # offset on the x position so the bars of different engines don't overlap.
    ax.bar(
        x_locs + placement_dict[eng],
        height=freenrg_df_plotting,
        width=width,
        yerr=[df_low[eng], df_high[eng]],
        label=eng,
        color=colours[eng],
    )

plt.title("", fontsize=20)
plt.ylabel("MAE (kcal/mol)", fontsize=20)
plt.xlabel("Protein(_analysis)", fontsize=20)

plt.xticks(x_locs, df.index, rotation=70, ha="right")
plt.legend(fontsize=18)
# NOTE(review): `threshold` is not assigned in any visible cell - it must be
# defined before this cell is run.
plt.savefig(f"/backup/overall_analysis/MUE_outliers{threshold}removed.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# accuracy and precision as a function of time
# plot the errors
# Value table plus lower / upper error-bar lengths, derived from the
# confidence intervals stored in mue_ci_dict.
df = pd.DataFrame.from_dict(mue_dict).transpose().dropna()
err_df = pd.DataFrame.from_dict(mue_ci_dict).transpose().dropna()
df_low = df - err_df.map(lambda x: x[0])
df_high = err_df.map(lambda x: x[1]) - df

# rows removed for every protein
rows_to_drop = [
    f"{prot}_{suffix}"
    for prot in ana_obj_dict.keys()
    for suffix in ("plain", "autoeq", "TI")
]

# the subsampling row becomes "<protein>_4ns" in ALL four tables, so the value
# table and the error tables keep the same index
row_names = {f"{prot}_subsampling": f"{prot}_4ns" for prot in ana_obj_dict.keys()}


def _tidy_rows(frame):
    """Drop the unwanted rows (ignoring ones that are absent) and rename the rest."""
    return frame.drop(index=rows_to_drop, errors="ignore").rename(index=row_names)


df, err_df, df_low, df_high = (
    _tidy_rows(frame) for frame in (df, err_df, df_low, df_high)
)

In [None]:
# cycle closures

# cc_dict["<protein>_<analysis name>"] -> cycle_dict of that analysis object
cc_dict = {}

for prot in ana_obj_dict.keys():

    for name in ana_dicts:
        ana_obj = ana_obj_dict[prot][name]

        print(prot, name)
        ana_obj.compute_cycle_closures()
        cc_dict[f"{prot}_{name}"] = ana_obj.cycle_dict

# plot the cycle closures
# plot the errors
df = pd.DataFrame.from_dict(cc_dict).transpose()

# Cells with no entry are NaN in `df`. na_action="ignore" leaves them as NaN
# instead of indexing into them (which would raise), and the fillna(0) calls
# then replace them.
df_ci = df.map(lambda x: x[3], na_action="ignore")
df_mean = df.map(lambda x: x[1], na_action="ignore").fillna(0)
df_low = df_mean - df_ci.map(lambda x: x[0], na_action="ignore")
df_high = df_ci.map(lambda x: x[1], na_action="ignore") - df_mean
df_low = df_low.fillna(0)
df_high = df_high.fillna(0)
df_err = df.map(lambda x: x[2], na_action="ignore")
print(df_mean)
print(df_low)
print(df_high)

