# <span style="color:teal">RBFE Network - Analysis</span>


This notebook provides a basic outline of how to run the analysis of an RBFE network.

In [None]:
# import libraries

from scipy.stats import sem as sem
import sys
import glob

# Folder that is added to the module search path before importing `pipeline`.
# NOTE(review): machine-specific path - adjust it to your own installation.
code = "/home/anna/Documents/code/python"
# Only insert the folder if it is not already present, so that re-running
# this cell does not add duplicate entries to sys.path.
if code not in sys.path:
    sys.path.insert(1, code)
import pipeline

# NOTE(review): wildcard imports - presumably these provide names such as
# `pipeline_protocol` and `analysis_network` that are used further down;
# confirm against the package.
from pipeline import *
from pipeline.utils import validate
from pipeline.analysis import *

The following variables need to be set:

net_file - the network file that describes all the perturbations that were run and which engine they were run for. Usually generated in the execution_model folder during setup.

ana_file - the analysis protocol that was used to analyse the runs. This determines the extension that is used to open the results files. If none is provided, all extensions/analysis methods are considered.

exp_file - file containing the experimental results. This can be in yml format (better) or csv. The format of the yml file for each ligand should be:

```
lig_a:
  measurement:
    comment:
    doi: source of data
    error: error
    type: ki or ic50
    unit: uM or nM 
    value: value
  name: lig_a
```

results_folder - the location of the results files computed during the analysis stage after the run. The default for this is outputs_extracted/results. 

output_folder - the location for the graphs and tables generated during this notebook.

In [None]:
# Root folder of the benchmark data and the protein system to analyse.
# NOTE(review): machine-specific path - adjust it to your own setup.
bench_folder = "/home/anna/Documents/benchmark"
protein = "mcl1"
main_dir = f"{bench_folder}/extracted/{protein}"
# Alternative main directory, kept for reference:
# main_dir = f"/backup/GROMACS_reruns/{protein}"

# choose location for the files
net_file = f"{main_dir}/execution_model/network.dat"
ana_file = f"{main_dir}/execution_model/analysis_protocol.dat"
exp_file = f"{bench_folder}/inputs/experimental/{protein}.yml"
results_folder = f"{main_dir}/outputs_extracted/results"
# The output folder path is passed through validate.folder_path with
# create=True (presumably this creates the folder if it is missing - confirm).
output_folder = validate.folder_path(f"{main_dir}/analysis", create=True)

The protocol from the execution model can also be read in to gain additional parameters.

In [None]:
# Protocol file stored in the execution_model folder of the main directory.
prot_file = f"{main_dir}/execution_model/protocol.dat"
# Read the protocol into an object; it is used below to obtain the engines.
# NOTE(review): auto_validate=True presumably validates the settings on
# load - confirm.
pipeline_prot = pipeline_protocol(prot_file, auto_validate=True)

These can then be initialised into the analysis_network object, which will be used to run the rest of the functions in this notebook.

In [None]:
# Bundle the file locations chosen above into one analysis_network object.
# The engines are taken from the protocol that was read in earlier.
all_analysis_object = analysis_network(
    results_folder,
    exp_file=exp_file,
    net_file=net_file,
    output_folder=output_folder,
    analysis_prot=ana_file,
    engines=pipeline_prot.engines(),
    # method=pipeline_prot.name(),
)

The following will then analyse the entire network:

In [None]:
# Run the analysis for the whole network using the settings given at
# initialisation.
all_analysis_object.compute_results()

A ligands folder can be added to visualise any perturbations and draw the network graph of the successful runs. This is generally the folder that was also used at the start for all the ligand inputs.

In [None]:
# Folder with the ligand input files. The path is built from bench_folder and
# protein so that it always follows the configuration chosen above instead of
# repeating the benchmark root as a literal.
all_analysis_object.add_ligands_folder(f"{bench_folder}/inputs/{protein}/ligands")
# NOTE(review): private method called directly - presumably the graph object
# has to be (re)built once the ligands folder is known; confirm whether a
# public entry point exists.
all_analysis_object._initialise_graph_object()
# Draw the network graph.
all_analysis_object.draw_graph()

To check and visualise any failed perturbations:

In [None]:
# Failed perturbations for the GROMACS engine.
failed_perts = all_analysis_object.failed_runs("GROMACS")

# Print them one per line, in sorted order.
for pert in sorted(failed_perts):
    print(pert)

# Visualise the failed perturbations.
all_analysis_object.draw_failed_perturbations()

If the failed perturbations have resulted in any disconnected ligands, these can also be listed.

In [None]:
# List the ligands that are disconnected for the GROMACS engine.
# NOTE(review): the keyword here is `eng=` whereas other calls in this
# notebook use `engine=` - confirm against the method signature.
all_analysis_object.disconnected_ligands(eng="GROMACS")

If more extensive analysis has been performed, it is also possible to check for average convergence for the runs.

In [None]:
# Compute the convergence for the runs found under main_dir, then plot it.
# NOTE(review): per the text above this needs the more extensive analysis to
# have been performed beforehand.
all_analysis_object.compute_convergence(main_dir=main_dir)
all_analysis_object.plot_convergence()

There are different options for plotting:

bar (pert or lig)

scatter (pert or lig) - can be plotted using cinnabar

eng vs eng

other results (eg from other workflows)

outliers

histograms (for the error of the runs, sem, legs)

In [None]:
# Bar plots: ligands first, then perturbations.
all_analysis_object.plot_bar_lig()
all_analysis_object.plot_bar_pert()

# Scatter plots: one pass with the default options, one pass using cinnabar.
for scatter_options in ({}, {"use_cinnabar": True}):
    all_analysis_object.plot_scatter_lig(**scatter_options)
    all_analysis_object.plot_scatter_pert(**scatter_options)

# Per-engine plots.
for eng in all_analysis_object.engines:
    # Scatter plots for this engine only.
    all_analysis_object.plot_scatter_lig(engine=eng)
    all_analysis_object.plot_scatter_pert(engine=eng)

    # Outliers for this engine: default options, then pert_val="val".
    for outlier_options in ({}, {"pert_val": "val"}):
        all_analysis_object.plot_outliers(engine=eng, **outlier_options)

# Histograms: legs, repeats, then the SEM for both pert_val settings.
all_analysis_object.plot_histogram_legs()
all_analysis_object.plot_histogram_repeats()
for sem_kind in ("pert", "val"):
    all_analysis_object.plot_histogram_sem(pert_val=sem_kind)

The statistics of the MAD (comparing engines) and MAE (compared to experimental) can also be computed.

In [None]:
# MAD statistics (comparison between engines) with pert_val="pert".
# NOTE(review): pert_val presumably switches between perturbation ("pert")
# and ligand value ("val") results - confirm.
df = all_analysis_object.calc_mad_engines(pert_val="pert")
# all_analysis_object.calc_mad_engines(pert_val="val")
# Display the result as the cell output.
df

In [None]:
# MAE statistics (comparison to experimental) with pert_val="pert".
df = all_analysis_object.calc_mae_engines(pert_val="pert")
# all_analysis_object.calc_mae_engines(pert_val="val")
# Display the result as the cell output.
df

The ligands can be sorted by binding affinity, and Spearman's rank correlation coefficient (rho) can be calculated.

In [None]:
# Sort the ligands by the binding affinity obtained with the GROMACS engine.
all_analysis_object.sort_ligands_by_binding_affinity(engine="GROMACS")
# Sort the ligands by their experimental binding affinity.
all_analysis_object.sort_ligands_by_experimental_binding_affinity()

In [None]:
# Compute rho for the GROMACS results with pert_val="val" and print it.
# NOTE(review): this reaches into the private `_stats_object` attribute;
# the shape of the returned `values` is not visible here - confirm.
values = all_analysis_object._stats_object.compute_rho(pert_val="val",y="GROMACS")
print(values)

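Other results (for example from reruns or from other workflows) can also be read in and plotted alongside the main network: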

In [None]:
# Engine whose rerun results are read in.
eng = "GROMACS"
# Results folder of the reruns. It is built from `protein` so that the reruns
# always belong to the same system as the main network (previously the
# literal "tyk2" was used here regardless of the protein chosen above).
rerun_folder = f"/backup/GROMACS_reruns/{protein}/outputs_extracted/results"

# The free energy, bound leg and free leg files share one file-name suffix;
# only the prefix differs.
results_suffix = f"{eng}_MBAR_alchemlyb_None_eqfalse_statsfalse_truncate0end.csv"

other_results = glob.glob(f"{rerun_folder}/freenrg_*_{results_suffix}")
bound_results = glob.glob(f"{rerun_folder}/bound_*_{results_suffix}")
free_results = glob.glob(f"{rerun_folder}/free_*_{results_suffix}")

# glob returns an empty list for a wrong folder or pattern without raising,
# so make that case visible before the results are passed on.
if not other_results:
    print(f"Warning: no freenrg result files found in {rerun_folder}")

# Register the reruns under the name that the later cells refer to.
all_analysis_object.compute_other_results(
    other_results,
    name="GROMACS_reruns",
    bound_files=bound_results,
    free_files=free_results,
)


In [None]:
# Remove any perturbations from the other results that are not part of the
# main (GROMACS) network.
for eng in all_analysis_object.other_results_names:
    # Loop over a snapshot of the perturbations: remove_perturbations() is
    # called for this same name inside the loop, and removing entries from a
    # collection while iterating over it directly can skip elements.
    for pert in list(all_analysis_object._perturbations_dict[eng]):
        if pert not in all_analysis_object._perturbations_dict["GROMACS"]:
            all_analysis_object.remove_perturbations(pert, name=eng)

In [None]:
# Plot the outliers for the main GROMACS results and the reruns.
# NOTE(review): this call uses `engines=`/`no_outliers=` while the other
# plot_outliers calls in this notebook use `engine=`/`outliers=` - confirm
# which keywords the method accepts.
all_analysis_object.plot_outliers(engines=["GROMACS","GROMACS_reruns"], no_outliers=3)

In [None]:
# Histogram of the repeats for the main GROMACS results and the reruns.
all_analysis_object.plot_histogram_repeats(engines=["GROMACS","GROMACS_reruns"])

A single perturbation can be removed, for example after it has been identified as an outlier:

In [None]:
# check which perturbations are outliers
# NOTE(review): this call uses `engine=`/`outliers=` while another
# plot_outliers call in this notebook uses `engines=`/`no_outliers=` -
# confirm which keywords the method accepts.
all_analysis_object.plot_outliers(engine="GROMACS", outliers=5, pert_val="pert")
# all_analysis_object._plotting_object.outlier(engines="AMBER", outliers=5, name="Exscientia")

In [None]:
# remove perturbation for all engines
# No `name` is passed here, unlike the per-name removal used for the other
# results. "lig_c~lig_d" is an example perturbation name - replace it with
# the perturbation to remove.
all_analysis_object.remove_perturbations("lig_c~lig_d")

Outliers can also be removed using a threshold:

In [None]:
# Remove outliers using a threshold of 10.
# NOTE(review): the unit and exact definition of the threshold are not
# visible here - confirm against the method documentation.
all_analysis_object.remove_outliers(threshold=10)

# The analysis above can then be recalculated.