In [None]:
# analysis paper
# import libraries
import seaborn as sns
import numpy as np
import scipy.stats as _stats
from functools import reduce
from pipeline.analysis import *
from pipeline.utils import validate
from pipeline import *
import logging
import networkx as nx
import glob
from scipy.stats import sem as sem
from matplotlib import colormaps
import sys
# sys.path.insert(1, "/home/anna/Documents/code/python/pipeline")
from matplotlib.ticker import MaxNLocator

import warnings
# Silence FutureWarning and RuntimeWarning for the rest of the session so the
# analysis output stays readable. Note that this also hides deprecation
# notices from the libraries used below.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)
# warnings.simplefilter(action='ignore', category=SettingWithCopyWarning)

# Only show log records of level ERROR and above on the root logger.
logging.getLogger().setLevel(logging.ERROR)


# Print which BSS installation is in use.
# NOTE(review): `BSS` is not imported explicitly in this cell; presumably it is
# provided by one of the `pipeline` star imports - confirm.
print(BSS.__file__)

In [2]:
def write_perts_file(
    val_dict,
    file_path: str,
    eng: Optional[str] = None,
    analysis_string: Optional[str] = None,
    method: Optional[str] = None,
):
    """Write per-perturbation free energies to ``<file_path>.csv``.

    Args:
        val_dict: mapping of perturbation name (``"lig_0~lig_1"``) to a
            sequence whose first two items are the free energy and its error.
            It is passed through ``validate.dictionary`` before use.
        file_path: output path WITHOUT the ``.csv`` extension (it is appended
            here).
        eng: value written in the ``engine`` column; ``None`` gives an empty
            cell.
        analysis_string: value written in the ``analysis`` column; ``None``
            gives an empty cell.
        method: value written in the ``method`` column. Any falsy value is
            written as the literal string ``"None"``.

    Raises:
        IndexError: if a key does not contain a ``~`` separator, or a value
            has fewer than two items.
    """
    # Local import so the function does not depend on `csv` happening to be
    # exported by one of the notebook's star imports.
    import csv

    val_dict = validate.dictionary(val_dict)

    if not method:
        method = "None"

    # newline="" hands line-ending control to the csv module; without it the
    # writer's "\r\n" terminator is translated again on Windows, producing a
    # blank line after every row.
    with open(f"{file_path}.csv", "w", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(["lig_0", "lig_1", "freenrg", "error",
                        "engine", "analysis", "method"])

        for key, value in val_dict.items():
            ligands = key.split("~")
            writer.writerow([ligands[0], ligands[1], value[0], value[1],
                             eng, analysis_string, method])

In [3]:
# Analysis protocols to run, keyed by a short label.
# The two active protocols share every setting (MBAR estimator via alchemlyb,
# "truncate lower" 0, "truncate upper" 100, no auto equilibration) and differ
# only in the "statistical inefficiency" flag.
#
# Protocols that are currently disabled (same settings as "subsampling"
# unless stated otherwise):
#   "1ns" / "2ns" / "3ns" : "truncate upper" 25 / 50 / 75
#   "autoeq"              : "auto equilibration" True
#   "TI"                  : "estimator" "TI"
#   "single_0/1/2"        : identical settings to "plain"
ana_dicts = {
    label: {
        "estimator": "MBAR",
        "method": "alchemlyb",
        "check overlap": True,
        "try pickle": True,
        "save pickle": True,
        "auto equilibration": False,
        "statistical inefficiency": subsample,
        "truncate lower": 0,
        "truncate upper": 100,
        "name": None,
    }
    for label, subsample in (("plain", False), ("subsampling", True))
}

In [None]:
# set the variables
network = "lomap"  # lomap rbfenn combined

prot_dict_name = {"tyk2":"TYK2", "mcl1":"MCL1", "p38":"P38α", "syk":"SYK", "hif2a":"HIF2A", "cmet":"CMET"}
eng_dict_name = {"AMBER":"AMBER22","SOMD":"SOMD1","GROMACS":"GROMACS23"}

# Root folder of the benchmark inputs; the same for every protein.
bench_folder = "/home/anna/Documents/benchmark"

# all the options: ana_obj_dict[protein][protocol label] -> analysis_network
ana_obj_dict = {}

for protein in ["tyk2", "mcl1", "p38", "syk", "hif2a", "cmet"]:

    ana_obj_dict[protein] = {}

    # ---- settings that depend on the protein only -------------------------
    # They are resolved once per protein instead of once per analysis
    # protocol, so the protein structure is read a single time.
    if protein in ("syk", "cmet"):
        main_dir = f"/backup/{protein}/neutral"
    else:
        main_dir = f"/backup/{protein}"

    protein_inputs = f"{bench_folder}/inputs/{protein}"

    # if need size of protein: try the GROMACS files first and fall back to
    # the AMBER files when they cannot be read.
    try:
        prot = BSS.IO.readMolecules(
            [f"{protein_inputs}/{protein}_prep/{protein}.gro", f"{protein_inputs}/{protein}_prep/{protein}.top"])[0]
    except Exception:  # not a bare except: Ctrl-C must still interrupt the cell
        prot = BSS.IO.readMolecules(
            [f"{protein_inputs}/{protein}_parameterised.prm7", f"{protein_inputs}/{protein}_parameterised.rst7"])[0]

    print(f"no of residues in the protein: {prot.nResidues()}")

    # choose location for the files
    if protein in ("syk", "cmet", "hif2a"):
        # the lomap network
        net_file = f"{main_dir}/execution_model/network_all.dat"
    else:
        net_file = f"{main_dir}/execution_model/network_{network}.dat"

    exp_file = f"{protein_inputs}/experimental/{protein}.yml" if False else f"{bench_folder}/inputs/experimental/{protein}.yml"
    output_folder = f"{main_dir}/outputs_extracted"

    # ligands folder, falling back to the neutral ligands when the default
    # folder does not exist
    ligands_folder = f"{protein_inputs}/ligands"
    if not os.path.isdir(ligands_folder):
        ligands_folder = f"{protein_inputs}/ligands_neutral"

    # ---- one analysis object per analysis protocol -------------------------
    for ana_dict in ana_dicts:
        ana_prot = analysis_protocol(ana_dicts[ana_dict])
        print(protein, ana_dict)

        # prot_file = f"{main_dir}/execution_model/protocol.dat" # no protocol used , name added after if needed
        pipeline_prot = pipeline_protocol(auto_validate=True)
        # pipeline_prot.name("")

        # initialise the network object
        all_analysis_object = analysis_network(
            output_folder,
            exp_file=exp_file,
            net_file=net_file,
            analysis_prot=ana_prot,
            method=pipeline_prot.name(),  # if the protocol had a name
            engines=pipeline_prot.engines(),
        )

        # compute
        all_analysis_object.compute_results()

        # NOTE(review): no key of `ana_dicts` equals "single" (the disabled
        # variants are called "single_0", "single_1", ...), so this branch
        # does not fire at the moment - confirm whether a prefix match was
        # intended.
        if ana_dict == "single":
            all_analysis_object.file_ext = all_analysis_object.file_ext+f"_{ana_dict}"

        all_analysis_object.add_ligands_folder(ligands_folder)

        ana_obj_dict[protein][ana_dict] = all_analysis_object

print(ana_obj_dict)

In [None]:
# Upper bound on the achievable R2 per protein, estimated from the
# experimental data of the "plain" analysis object.
r2_dict = {}
r2_error_dict = {}
for prot in ana_obj_dict:
    ana_obj = ana_obj_dict[prot]["plain"]
    print(prot, len(ana_obj.ligands))

    # each entry is indexed as (value, error)
    exper_entries = list(ana_obj.exper_val_dict.values())
    highest = max(exper_entries)[0]
    lowest = min(exper_entries)[0]
    print("max", highest, "min", lowest, "range", highest - lowest)

    avg = np.mean([entry[1] for entry in exper_entries])
    std = np.std([entry[0] for entry in exper_entries])
    print("mean of error", avg, "std of val", std)

    # experimental uncertainty is std of measurement error
    # max is measurement error / std dev of the affinity , squared
    # tyk2 mcl1 Ki 0.44
    # others IC50 0.75
    r2max = 1 - (avg / std) ** 2
    print(r2max)

    r2_dict[prot] = {"maximum": r2max}
    r2_error_dict[prot] = {"maximum": (0, 0)}

In [None]:
# exclude outliers
threshold = 10
outlier_suffix = f"_outliers{threshold}removed"
for prot in ana_obj_dict.keys():

    for name in ana_dicts.keys():
        print(prot, name)
        ana_obj = ana_obj_dict[prot][name]

        # Tag the file extension once per analysis object. Previously the
        # suffix was appended inside the engine loop, so it was repeated for
        # every engine (and again each time the cell was re-run).
        if ana_obj.engines and not ana_obj.file_ext.endswith(outlier_suffix):
            ana_obj.file_ext = ana_obj.file_ext + outlier_suffix

        for eng in ana_obj.engines:
            ana_obj.remove_outliers(threshold=threshold, name=eng)
        # print(ana_obj.file_ext)

In [81]:
# Print and then remove every perturbation that involves one ligand.
prot = "mcl1"
lig = "lig_45"
ana_obj = ana_obj_dict[prot]["subsampling"]
# Iterate over a snapshot: remove_perturbations() is called inside the loop,
# and if it shrinks ana_obj.perturbations in place a live iteration would
# skip the entry following each removed one.
for pert in list(ana_obj.perturbations):
    # Perturbation names have the form "lig_a~lig_b"; compare whole ligand
    # names so that "lig_45" does not also match e.g. "lig_450".
    if lig in pert.split("~"):
        print(pert)
        for eng in ana_obj.engines:
            print(eng, ana_obj.calc_pert_dict[eng][pert])
        print("exp", ana_obj.exper_pert_dict[pert])
        ana_obj.remove_perturbations([pert])

In [None]:
# Kendall rank statistic per engine for the analysis object currently bound to
# `ana_obj` (set by an earlier cell).
# The result keeps its historical name `mae`, but it holds the output of
# calc_kendalls_rank_engines; the printed label now says so.
# Presumably index 0 holds the values and index 2 the confidence intervals -
# TODO confirm against the pipeline API.
mae = ana_obj.calc_kendalls_rank_engines(pert_val="val", recalculate=True)

for eng in ana_obj.engines:
    print(
        f"{eng} Kendall tau: {mae[0][eng]['experimental']:.2f} {mae[2][eng]['experimental']}")


In [None]:
# Calls draw_graph() on the analysis object currently bound to `ana_obj`,
# i.e. whichever earlier cell assigned it last (presumably this plots the
# perturbation network - confirm against the pipeline API).
ana_obj.draw_graph()

In [34]:
# rename
from rdkit import Chem

# name_dict[protein] maps the molecule name stored in the SDF ("_Name"
# property) to "lig_<n>", where n is the 1-based position of the molecule in
# the file.
name_dict = {}
for prot in ("syk", "hif2a", "cmet"):
    jacs_file = f"/home/anna/Documents/benchmark/inputs/{prot}/results_edges_5ns.csv"
    mols = Chem.SDMolSupplier(
        f"/home/anna/Documents/benchmark/inputs/{prot}/ligands.sdf")
    renamed = {}
    name_dict[prot] = renamed
    for idx, mol in enumerate(mols, start=1):
        renamed[mol.GetProp("_Name")] = f"lig_{idx}"


In [41]:
# Comparison with the Schrödinger (FEP+) results.
# jacs_dict[protein]["lig_0~lig_1"] = (ddG, error) for every published edge.

jacs_file = "/home/anna/Documents/benchmark/inputs/jacs2015_inputs/jacs2015results.csv"
jacs_df = pd.read_csv(jacs_file)
# jacs_df

jacs_dict = {}  # convert into a dictionary of values

# JACS 2015 set: one shared table, rows selected by the "system" column
# (case-insensitive).
for prot in ["Tyk2", "MCL1", "P38"]:
    system_edges = {}
    jacs_dict[prot.lower()] = system_edges
    for index, row in jacs_df.iterrows():
        if row["system"].upper() != prot.upper():
            continue
        edge_name = f"{row['Ligand1']}~{row['Ligand2']}"
        system_edges[edge_name] = (row['bennett_ddG'], row['bennett_error'])

# Remaining systems: one table per protein, with the ligand names translated
# to the "lig_<n>" convention through name_dict.
for prot in ["syk", "hif2a", "cmet"]:
    jacs_file = f"/home/anna/Documents/benchmark/inputs/{prot}/results_edges_5ns.csv"
    jacs_df = pd.read_csv(jacs_file)
    system_edges = {}
    jacs_dict[prot.lower()] = system_edges
    for index, row in jacs_df.iterrows():
        first = name_dict[prot][row['Ligand1']]
        second = name_dict[prot][row['Ligand2']]
        system_edges[f"{first}~{second}"] = (row['FEP'], row['FEP Error'])


# Per-ligand experimental values and FEP+ predictions taken from the published
# result tables:
#   exper_dict_missing[protein][ligand] = (experimental dG, 0.44)
#   fep_lig_dict[protein][ligand]       = (predicted dG, predicted error)
# NOTE(review): the same 0.44 uncertainty is attached to every system here -
# confirm that this is intended for all of them.
exper_dict_missing = {}
fep_lig_dict = {}

jacs_file = "/home/anna/Documents/benchmark/inputs/jacs2015_inputs/jacs2015resultsdG.csv"
jacs_df = pd.read_csv(jacs_file, delimiter=",")

for prot in ["Tyk2", "MCL1", "P38"]:
    exper_dict_missing[prot.lower()] = {}
    fep_lig_dict[prot.lower()] = {}
    for index, row in jacs_df.iterrows():
        if row["Systems"].upper() == prot.upper():
            # Read everything first so that either both dictionaries get the
            # ligand or neither does.
            try:
                exper_entry = (row['Exp. dG'], 0.44)
                fep_entry = (row['Pred. dG'], row['Pred. Error'])
            except KeyError:
                # a missing column: report the ligand and carry on
                print(row['Ligand'])
                continue
            exper_dict_missing[prot.lower()][row['Ligand']] = exper_entry
            fep_lig_dict[prot.lower()][row['Ligand']] = fep_entry

for prot in ["syk", "hif2a", "cmet"]:
    jacs_file = f"/home/anna/Documents/benchmark/inputs/{prot}/results_5ns.csv"
    jacs_df = pd.read_csv(jacs_file, delimiter=",")
    exper_dict_missing[prot] = {}
    fep_lig_dict[prot] = {}
    for index, row in jacs_df.iterrows():
        try:
            lig_name = name_dict[prot][row['Ligand']]
            exper_entry = (row['Exp. ΔG'], 0.44)
            fep_entry = (row['Pred. ΔG'], row['Pred. Error'])
        except KeyError:
            # ligand without a "lig_<n>" name (or a missing column): report
            # it and carry on
            print(row['Ligand'])
            continue
        exper_dict_missing[prot][lig_name] = exper_entry
        fep_lig_dict[prot][lig_name] = fep_entry
     

# for cinnabar files: one FEP+ perturbation file per protein
for prot in ana_obj_dict:
    # write_perts_file appends the ".csv" extension itself
    fepplus_path = f"/home/anna/Documents/benchmark/inputs/{prot}/perts_file_fepplus"
    write_perts_file(jacs_dict[prot], file_path=fepplus_path)

In [45]:
# professa results: convert the published table into a perturbation file
prot = "tyk2"
ana_obj = ana_obj_dict[prot]["subsampling"]

file = f"/home/anna/Documents/benchmark/inputs/other_computed/professa/professa_{prot}_results.dat"
df = pd.read_csv(file, delimiter=",")

# perturbation name -> (ddG, ddG error)
perts_dict = {
    f"{row['perturbation']}": (float(row['ddG']), float(row['ddG_error']))
    for index, row in df.iterrows()
}

# write_perts_file appends the ".csv" extension itself
write_perts_file(
    perts_dict,
    file_path=f"/home/anna/Documents/benchmark/inputs/{prot}/perts_file_professa",
)

In [51]:
# hahn et al
# The output path uses `prot`, which is not set in this cell: it has to be
# defined by an earlier cell.
file = "/home/anna/Documents/benchmark/inputs/other_computed/hahn_tyk2_kjmol.dat"

df = pd.read_csv(file, delimiter=",")

# The table is in kJ/mol; multiplying by 0.239006 converts to kcal/mol.
perts_dict = {}
for index, row in df.iterrows():
    ddg_kj = float(row['ddG'])
    err_kj = float(row['ddG_error'])
    perts_dict[f"{row['perturbation']}"] = (ddg_kj*0.239006, err_kj*0.239006)

# write_perts_file appends the ".csv" extension itself
write_perts_file(
    perts_dict,
    file_path=f"/home/anna/Documents/benchmark/inputs/{prot}/perts_file_hahn",
)

In [52]:

# Per-ligand analysis (through cinnabar) of one external set of results.
# Uses `prot` and `ana_obj` from an earlier cell.
name = "hahn"

files = [f"/home/anna/Documents/benchmark/inputs/{prot}/perts_file_{name}.csv"]

calc_diff_dict = make_dict.comp_results(
    files
)  # older method

perts, ligs = get_info_network_from_dict(calc_diff_dict)

# pick the experimental reference that belongs to the result set
if name == "fepplus":
    exper_dict = exper_dict_missing[prot]
elif name == "hahn":
    exper_dict = pipeline.analysis.convert.yml_into_exper_dict(exp_file=ana_obj.exp_file, temperature=298)
else:
    exper_dict = ana_obj.exper_val_dict

convert.cinnabar_file(
    files,
    exper_dict,
    f"/home/anna/Documents/benchmark/inputs/{prot}/cinnabar_{name}",
    perturbations=perts,
    method=None,
)

# compute the per ligand for the network
# Bound to `fe_map` rather than `network`: `network` is the string setting
# used to build the network file path in the setup cell, and overwriting it
# with an FEMap object breaks that cell when it is run again.
fe_map = wrangle.FEMap(
    f"/home/anna/Documents/benchmark/inputs/{prot}/cinnabar_{name}.csv")

# for self plotting of per ligand
cinnabar_calc_val_dict = make_dict.from_cinnabar_network_node(fe_map, "calc")
cinnabar_calc_pert_dict = make_dict.from_cinnabar_network_edges(fe_map, "calc", perts)

# normalise exper dict: shift the experimental values so their mean is zero
normalised_exper_dict = {}
avg = np.mean([val[0] for val in exper_dict.values()])
for lig in exper_dict:
    normalised_exper_dict[lig] = (exper_dict[lig][0] - avg,exper_dict[lig][1])

# x: computed per-ligand values, y: normalised experimental values;
# ligands whose computed value is NaN are left out
x = []
y = []
xerr = []
yerr = []
for lig in ligs:
    if not np.isnan(cinnabar_calc_val_dict[lig][0]):
        x.append(cinnabar_calc_val_dict[lig][0])
        xerr.append(cinnabar_calc_val_dict[lig][1])
        y.append(normalised_exper_dict[lig][0])
        yerr.append(normalised_exper_dict[lig][1])

In [54]:
# print(x,y,xerr,yerr)
# R2 of the per-ligand values collected in the previous cell. The printed
# label names the statistic that is actually requested (it used to say "MAE").
res = stats_engines.compute_stats(x=x, xerr=xerr,
                                  y=y, yerr=yerr,
                                  statistic="R2")
print("R2 cinnabar", prot, res)

MAE cinnabar tyk2 (0.7100596494567039, 0.11323365001909216, [0.4406718881317579, 0.8892374700895855])


In [55]:
# RMSE between the computed and the experimental value of each perturbation.
exper_pert_dict = pipeline.analysis.make_dict.exper_from_perturbations(exper_dict, perts)

# perturbations whose computed value is NaN are left out
valid_perts = [
    pert for pert in perts
    if not np.isnan(cinnabar_calc_pert_dict[pert][0])
]

x = [cinnabar_calc_pert_dict[pert][0] for pert in valid_perts]
xerr = [cinnabar_calc_pert_dict[pert][1] for pert in valid_perts]
y = [exper_pert_dict[pert][0] for pert in valid_perts]
yerr = [exper_pert_dict[pert][1] for pert in valid_perts]

res = stats_engines.compute_stats(
    x=x, xerr=xerr,
    y=y, yerr=yerr,
    statistic="RMSE",
)
print("RMSE", prot, res)

RMSE tyk2 (0.9696921170470629, 0.13729152501996844, [0.7043448324660908, 1.2377697399861662])


In [None]:
# Per-ligand analysis of the FEP+ results for every protein, once through
# cinnabar (from the FEP+ edges) and once with the published per-ligand values.
cinnabar_dicts = {}
for prot in ana_obj_dict.keys():
    ana_obj = ana_obj_dict[prot]["subsampling"]

    files = [f"/home/anna/Documents/benchmark/inputs/{prot}/perts_file_fepplus.csv"]

    calc_diff_dict = make_dict.comp_results(
        files
    )  # older method

    perts, ligs = get_info_network_from_dict(calc_diff_dict)

    exper_dict = exper_dict_missing[prot]
    # exper_dict = ana_obj.exper_val_dict
    # for lig in ligs:
    #     if lig not in exper_dict.keys():
    #         exper_dict[lig] = exper_dict_missing[prot][lig]

    # same file list as above, instead of repeating the path literal
    convert.cinnabar_file(
        files,
        exper_dict,
        f"/home/anna/Documents/benchmark/inputs/{prot}/cinnabar",
        perturbations=perts,
        method=None,
    )

    # compute the per ligand for the network
    # Bound to `fe_map` rather than `network`: `network` is the string setting
    # used to build the network file path in the setup cell, and overwriting
    # it with an FEMap object breaks that cell when it is run again.
    fe_map = wrangle.FEMap(f"/home/anna/Documents/benchmark/inputs/{prot}/cinnabar.csv")

    # for self plotting of per ligand
    cinnabar_calc_val_dict = make_dict.from_cinnabar_network_node(fe_map, "calc")
    cinnabar_dicts[prot] = cinnabar_calc_val_dict

    # normalise exper dict: shift the experimental values so their mean is zero
    normalised_exper_dict = {}
    avg = np.mean([val[0] for val in exper_dict.values()])
    for lig in exper_dict:
        normalised_exper_dict[lig] = (exper_dict[lig][0] - avg,exper_dict[lig][1])

    # x / y         : cinnabar values against normalised experiment
    # xdata / ydata : published per-ligand predictions against raw experiment
    x = []
    y = []
    xdata = []
    ydata = []
    xerr = []
    yerr = []
    xerrdata = []
    yerrdata = []
    for lig in ligs:
        if not np.isnan(cinnabar_calc_val_dict[lig][0]):
            x.append(cinnabar_calc_val_dict[lig][0])
            xerr.append(cinnabar_calc_val_dict[lig][1])
            xdata.append(fep_lig_dict[prot][lig][0])
            xerrdata.append(fep_lig_dict[prot][lig][1])
            y.append(normalised_exper_dict[lig][0])
            yerr.append(normalised_exper_dict[lig][1])
            ydata.append(exper_dict[lig][0])
            yerrdata.append(exper_dict[lig][1])

    # print(x,y,xerr,yerr)
    res = stats_engines.compute_stats(x=x, xerr=xerr,
                                        y=y, yerr=yerr,
                                        statistic="MUE")
    print("MAE cinnabar", prot, res)
    res = stats_engines.compute_stats(x=xdata, xerr=xerrdata,
                                      y=ydata, yerr=yerrdata,
                                      statistic="MUE")
    print("MAE cc", prot, res)

In [None]:
# Replace the experimental values of every "subsampling" analysis object with
# the published ones (exper_dict_missing) and rebuild its stats object, so that
# later statistics are computed against the updated values.
for prot in ana_obj_dict:
    print(prot)
    ana_obj = ana_obj_dict[prot]["subsampling"]

    files = [f"/home/anna/Documents/benchmark/inputs/{prot}/perts_file_fepplus.csv"]
    calc_diff_dict = make_dict.comp_results(files)  # older method
    perts, ligs = get_info_network_from_dict(calc_diff_dict)

    ana_obj.exper_val_dict = exper_dict_missing[prot]

    # read the values back from the object and centre them on their mean
    exper_vals = ana_obj.exper_val_dict
    avg = np.mean([entry[0] for entry in exper_vals.values()])
    normalised_exper_dict = {
        lig: (exper_vals[lig][0] - avg, exper_vals[lig][1])
        for lig in exper_vals.keys()
    }
    ana_obj.normalised_exper_val_dict = normalised_exper_dict

    ana_obj._initialise_stats_object(check=False)


In [None]:
# Per-engine statistic against experiment for every protein.
# NOTE(review): the names `mae` / `mae_dict` are kept because a later cell
# reads `mae_dict`, but the values come from calc_r2_engines; the printed
# label now names that statistic. Confirm which statistic is wanted here.
# mae_dict[protein][engine] = (index 0, index 1, index 2 of the result for the
# 'experimental' entry) - presumably (value, error, confidence interval).
mae_dict = {}
for prot in ana_obj_dict.keys():
    print(prot)
    mae_dict[prot] = {}
    ana_obj = ana_obj_dict[prot]["subsampling"]

    mae = ana_obj.calc_r2_engines(pert_val="val", recalculate=True)

    for eng in ana_obj.engines:
        print(
            f"{eng} R2: {mae[0][eng]['experimental']:.2f} {mae[2][eng]['experimental']}")
        mae_dict[prot][eng] = (
            mae[0][eng]['experimental'], mae[1][eng]['experimental'], mae[2][eng]['experimental'])



In [None]:
# Table of the point estimates (first item of every stored tuple), engines as
# rows and proteins as columns. The first item is extracted before the frame
# is built, which avoids the deprecated DataFrame.applymap and leaves NaN
# for an engine that is missing for some protein instead of failing.
pd.DataFrame({
    prot_name: {eng_name: stats[0] for eng_name, stats in per_engine.items()}
    for prot_name, per_engine in mae_dict.items()
})

In [None]:
# Print, side by side, the experimental value held by the analysis object and
# the published one for every ligand of the FEP+ network that has both.
for prot in ana_obj_dict.keys():
    ana_obj = ana_obj_dict[prot]["subsampling"]

    files = [f"/home/anna/Documents/benchmark/inputs/{prot}/perts_file_fepplus.csv"]

    calc_diff_dict = make_dict.comp_results(
        files
    )  # older method

    perts, ligs = get_info_network_from_dict(calc_diff_dict)

    exper_dict = ana_obj.exper_val_dict
    published = exper_dict_missing.get(prot, {})
    for lig in ligs:
        # explicit membership test instead of a bare `except: pass`, so that
        # only ligands without a value are skipped and real errors surface
        if lig in exper_dict and lig in published:
            print(lig, exper_dict[lig], published[lig])

In [None]:
# Compare every engine with the FEP+ results, edge by edge, and both of them
# with experiment.
for prot in ana_obj_dict.keys():

    ana_obj = ana_obj_dict[prot]["subsampling"]

    # use_perts: edges present in FEP+ in the same direction
    # reverse_perts: edges present in FEP+ in the opposite direction
    use_perts = []
    reverse_perts = []
    for pert in ana_obj.perturbations:
        if pert in jacs_dict[prot].keys():
            use_perts.append(pert)
        if f"{pert.split('~')[1]}~{pert.split('~')[0]}" in jacs_dict[prot].keys():
            reverse_perts.append(pert)
    print(prot, len(use_perts))

    # (our edge, FEP+ key, sign applied to the FEP+ value); forward edges
    # first, then reversed ones, as before
    matched_perts = [(pert, pert, 1) for pert in use_perts]
    matched_perts += [
        (pert, f"{pert.split('~')[1]}~{pert.split('~')[0]}", -1)
        for pert in reverse_perts
    ]

    for eng in ana_obj.engines:
        x = []
        y = []
        yexp = []
        xerr = []
        yerr = []
        yerrexp = []
        for pert, fep_key, sign in matched_perts:
            # Look everything up BEFORE appending: a lookup that fails half
            # way must not leave x/y longer than yexp, otherwise the lists
            # passed to compute_stats no longer line up.
            try:
                calc = ana_obj.calc_pert_dict[eng][pert]
                if np.isnan(calc[0]):
                    continue
                fep = jacs_dict[prot][fep_key]
                if np.isnan(fep[0]):
                    print(f"{pert} shroedinger is none")
                    continue
                exper = ana_obj.exper_pert_dict[pert]
                values = (calc[0], calc[1], sign * fep[0], fep[1], exper[0], exper[1])
            except (KeyError, IndexError, TypeError) as e:
                # missing or malformed entry: report it and skip the edge
                # (forward edges used to be skipped silently)
                print(f"skipping {pert} for {eng}: {e!r}")
                continue

            x.append(values[0])
            xerr.append(values[1])
            y.append(values[2])
            yerr.append(values[3])
            yexp.append(values[4])
            yerrexp.append(values[5])

        # print(x,y,xerr,yerr)
        res = stats_engines.compute_stats(x=x, xerr=xerr,
                                            y=y, yerr=yerr,
                                            statistic="MUE")
        print("MUE", prot, eng, res)

        res = stats_engines.compute_stats(x=x, xerr=xerr,
                                          y=yexp, yerr=yerrexp,
                                          statistic="RMSE")
        print("RMSE me", prot, eng, res)

    # NOTE(review): this runs after the engine loop, so it uses the FEP+ and
    # experimental values of the edges that survived for the LAST engine only
    # - confirm that this subset is the intended one.
    res = stats_engines.compute_stats(x=y, xerr=yerr,
                                        y=yexp, yerr=yerrexp,
                                        statistic="RMSE")
    print("RMSE shroedinger", prot, res)

In [None]:
# List, per engine, the edges whose result differs from FEP+ by more than 0.5
# and less than 1 (same units as the stored values).
manual_ligs = ["lig_ejm44", "lig_ejm49", "lig_ejm53",
               "lig_23", "lig_26", "lig_29", "lig_38", "lig_40", "lig_42", "lig_44",
               "lig_2cc","lig_2dd","lig_2q", "lig_2u"
               ]

for prot in ana_obj_dict.keys():

    ana_obj = ana_obj_dict[prot]["subsampling"]
    # use_perts: edges present in FEP+ in the same direction
    # reverse_perts: edges present in FEP+ in the opposite direction
    use_perts = []
    reverse_perts = []
    for pert in ana_obj.perturbations:
        if pert in jacs_dict[prot].keys():
            use_perts.append(pert)
        if f"{pert.split('~')[1]}~{pert.split('~')[0]}" in jacs_dict[prot].keys():
            reverse_perts.append(pert)
    print(prot, len(use_perts), use_perts)

    for eng in ana_obj.engines:
        for pert in use_perts:
            try:
                diff = abs(ana_obj.calc_pert_dict[eng][pert][0]-jacs_dict[prot][pert][0])
            except (KeyError, IndexError, TypeError):
                # no usable result for this edge and engine
                continue
            if 1 > diff > 0.5:
                # for lig in manual_ligs:
                #     if lig in pert:
                    #     print(eng, pert, "manual ligand")
                    # else:
                print(eng, pert)
        for pert in reverse_perts:
            # The FEP+ value is stored under the REVERSED name. Looking it up
            # under `pert` raised a KeyError that the former bare `except`
            # hid, so reversed edges were never reported.
            reversed_key = f"{pert.split('~')[1]}~{pert.split('~')[0]}"
            try:
                diff = abs(ana_obj.calc_pert_dict[eng][pert][0]-(-jacs_dict[prot][reversed_key][0]))
            except (KeyError, IndexError, TypeError):
                # no usable result for this edge and engine
                continue
            if 1 > diff > 0.5:
                # for lig in manual_ligs:
                #     if lig in pert:
                #         print(eng, pert, "manual ligand")
                #     else:
                print(eng, pert)

In [None]:
from rdkit import Chem

# mols = Chem.SDMolSupplier(
#     "/home/anna/Documents/benchmark/inputs/jacs2015_inputs/p38_ligands.sdf")
# for mol in mols:
#     # print(mol.GetProp("_Name"))
#     with Chem.SDWriter(f'/home/anna/Documents/benchmark/inputs/jacs2015_inputs/p38_ligands/lig_{mol.GetProp("_Name").replace("p38a_","").replace("_","")}.sdf') as w:
#         w.write(mol)

In [None]:
# comparing the rmsd for the structures
# NOTE(review): no RMSD is computed in this cell yet. It reads the SDF, keeps
# the first item returned by readMolecules, and prints whatever iterating over
# that item yields.

ligands = BSS.IO.readMolecules(
    "/home/anna/Documents/benchmark/inputs/jacs2015_inputs/tyk2_ligands/lig_jmc28.sdf")[0]
for lig in ligands:
    print(lig)