In [1]:
import os
import pandas as pd
from rdkit import Chem
from pathlib import Path 


## Benchmark meta dataset

CDK2 highly flexible kinase and a major cancer target.
Related dataset with the ensemble of five representative conformations in the ensemble.


In [4]:
full_dataset = pd.read_csv("./CDK2_dataset_labels.csv")

In [5]:
full_dataset

Unnamed: 0,pdb_code,ligand_name,dof,cyclin_bound,inhibitor_label,DFG_label,A_loop_label,activity_label,CDK_label
0,5A14,LQ5,10,no,type II,out,out-out,inactive,CDK2_human
1,4KD1,dinaciclib,9,no,type I,in,out-out,inactive,CDK2_human
2,5L2W,dinaciclib,9,yes,type I,in,in-in,inactive,CDK2_human
3,3PJ8,roscovitine,11,no,type I,in,out-out,inactive,CDK2_human
4,3DDQ,roscovitine,11,yes,type I,in,in-in,inactive,CDK2_human
5,6GUH,AZ5438,7,no,type I,in,none-out,inactive,CDK2_human
6,6GUE,AZ5438,7,yes,type I,in,in-in,inactive,CDK2_human
7,6GUK,CGP74514A,7,no,type I,in,out-out,inactive,CDK2_human
8,6GUF,CGP74514A,7,yes,type I,in,in-in,inactive,CDK2_human
9,6GUB,flavopiridol,8,yes,type I,in,out-out,inactive,CDK2_human


In [114]:
full_dataset[["pdb_code", "dof", "cyclin_bound", "inhibitor_label"]]

Unnamed: 0,pdb_code,dof,cyclin_bound,inhibitor_label
0,5A14,10,no,type II
1,4KD1,9,no,type I
2,5L2W,9,yes,type I
3,3PJ8,11,no,type I
4,3DDQ,11,yes,type I
5,6GUH,7,no,type I
6,6GUE,7,yes,type I
7,6GUK,7,no,type I
8,6GUF,7,yes,type I
9,6GUB,8,yes,type I


In [8]:
# load the results
import glob 
# find the ligands for which we have results
root_dir = "../benchmark-ensemble-redock-results"
results_pd = None
for i, row in full_dataset.iterrows():
    pdb_id = row["pdb_code"]
    pdb_id_l = pdb_id.lower()
    results_file = Path(root_dir) / "{}/analysis/results.csv".format(pdb_id)  
    print(results_file)
    if results_file.exists():
        print("exists")
        if results_pd is None:
            results_pd = pd.read_csv(results_file)
            results_pd["pdb_id"] = [pdb_id for i in range(results_pd.shape[0])]
        else:
            tmp = pd.read_csv(results_file)
            tmp["pdb_id"] = [pdb_id for i in range(tmp.shape[0])]
            results_pd = pd.concat([results_pd, tmp])

../benchmark-ensemble-redock-results/5A14/analysis/results.csv
exists
../benchmark-ensemble-redock-results/4KD1/analysis/results.csv
exists
../benchmark-ensemble-redock-results/5L2W/analysis/results.csv
exists
../benchmark-ensemble-redock-results/3PJ8/analysis/results.csv
exists
../benchmark-ensemble-redock-results/3DDQ/analysis/results.csv
exists
../benchmark-ensemble-redock-results/6GUH/analysis/results.csv
exists
../benchmark-ensemble-redock-results/6GUE/analysis/results.csv
exists
../benchmark-ensemble-redock-results/6GUK/analysis/results.csv
exists
../benchmark-ensemble-redock-results/6GUF/analysis/results.csv
exists
../benchmark-ensemble-redock-results/6GUB/analysis/results.csv
exists


In [9]:
results_pd

Unnamed: 0.1,Unnamed: 0,Energy (kcal/mol),RMSD,Cluster size rank,Receptor Name,pdb_id
0,0,-14.55,1.38,22,5a14_receptor_aligned,5A14
1,1,-11.24,12.10,4,5a14_receptor_aligned,5A14
2,2,-10.82,1.47,23,5a14_receptor_aligned,5A14
3,3,-9.46,4.67,10,6guh_receptor_aligned,5A14
4,4,-9.15,4.09,3,6guh_receptor_aligned,5A14
...,...,...,...,...,...,...
4,4,-10.05,5.66,6,6gub_receptor_aligned,6GUB
5,5,-9.43,5.49,3,6gub_receptor_aligned,6GUB
6,6,-8.44,7.71,8,6guh_receptor_aligned,6GUB
7,7,-7.48,6.37,7,6guh_receptor_aligned,6GUB


In [32]:
receptor_types = {
    "5A14": "DFG out",
    "4KD1": "no cyclin",
    "5L2W": "with cyclin",
    "3PJ8": "no cyclin",
    "3DDQ": "with cyclin",
    "6GUH": "no cyclin",
    "6GUE": "with cyclin",
    "6GUK": "no cyclin",
    "6GUF": "with cyclin",
    "6GUB": "with cyclin"
}

In [105]:
tmp_df = results_pd[results_pd["pdb_id"]=="5A14"].reset_index()
tmp_df["Receptor"] = tmp_df["Receptor Name"].apply(lambda x: x.split("_")[0])
tmp_df["Receptor Type"] = tmp_df["Receptor"].apply(lambda x: receptor_types[x.upper()])
tmp_df = tmp_df.sort_values("Receptor Type").reset_index()
symbols = ['circle', 'x', 'triangle-up']
fig = px.scatter(tmp_df, x="RMSD", y="Energy (kcal/mol)", color="Receptor", symbol="Receptor Type",
        template="ggplot2", symbol_sequence=symbols, 
        color_discrete_sequence=px.colors.sequential.Plasma_r)
fig.update_traces(
    marker=dict(size=8, line=dict(width=1, color="DarkSlateGrey")),
    selector=dict(mode="markers"),
)
fig.update_layout(title="Ensemble docking of {}".format("5A14"))
fig.write_image("5a14_results.svg")
fig.show()

In [100]:
# energy x RMSD scatter plots
import plotly.express as px
for pdb_id in full_dataset.pdb_code.unique():
    tmp_df = results_pd[results_pd["pdb_id"]==pdb_id].reset_index()
    tmp_df["Receptor"] = tmp_df["Receptor Name"].apply(lambda x: x.split("_")[0])
    tmp_df["Receptor Type"] = tmp_df["Receptor"].apply(lambda x: receptor_types[x.upper()])
    tmp_df = tmp_df.sort_values("Receptor Type").reset_index()
    symbols = ['circle', 'x', 'triangle-up']
    fig = px.scatter(tmp_df, x="RMSD", y="Energy (kcal/mol)", color="Receptor", symbol="Receptor Type",
            template="ggplot2", symbol_sequence=symbols, 
            color_discrete_sequence=px.colors.qualitative.Set3)
    fig.update_traces(
        marker=dict(size=8, line=dict(width=1, color="DarkSlateGrey")),
        selector=dict(mode="markers"),
    )
    fig.update_layout(title="Ensemble docking of {}".format(pdb_id))
    fig.show()

In [64]:
comparison_res = {
    "pdb_id" : [],
    "top3_rmsd": [],
    "best_rmsd": [],
    "type": []
}

for pdb_id in full_dataset.pdb_code.unique():
    tmp_df = results_pd[results_pd["pdb_id"]==pdb_id].reset_index()
    tmp_df["Receptor"] = tmp_df["Receptor Name"].apply(lambda x: x.split("_")[0].upper())
    redock_df = tmp_df[tmp_df["Receptor"]==pdb_id]
    redock_top_3 = redock_df.sort_values("Energy (kcal/mol)").head(5)
    top3_rmsd = redock_top_3.sort_values("RMSD").reset_index().iloc[0]["RMSD"]
    best_rmsd = redock_df.sort_values("RMSD").reset_index().iloc[0]["RMSD"]
    comparison_res["pdb_id"].append(pdb_id)
    comparison_res["top3_rmsd"].append(top3_rmsd)
    comparison_res["best_rmsd"].append(best_rmsd)
    comparison_res["type"].append("redock - single receptor")
    crossdock_df = tmp_df
    crossdock_top_3 = crossdock_df.sort_values("Energy (kcal/mol)").head(5)
    top3_rmsd = crossdock_top_3.sort_values("RMSD").reset_index().iloc[0]["RMSD"]
    best_rmsd = crossdock_df.sort_values("RMSD").reset_index().iloc[0]["RMSD"]
    comparison_res["pdb_id"].append(pdb_id)
    comparison_res["top3_rmsd"].append(top3_rmsd)
    comparison_res["best_rmsd"].append(best_rmsd)
    comparison_res["type"].append("crossdock - ensemble")

In [65]:
res_1_excluded = pd.DataFrame(comparison_res)

In [66]:
res_1_excluded

Unnamed: 0,pdb_id,top3_rmsd,best_rmsd,type
0,5A14,1.38,1.38,redock - single receptor
1,5A14,1.38,1.38,crossdock - ensemble
2,4KD1,2.7,2.7,redock - single receptor
3,4KD1,2.14,1.45,crossdock - ensemble
4,5L2W,7.68,7.68,redock - single receptor
5,5L2W,1.62,1.62,crossdock - ensemble
6,3PJ8,1.73,1.73,redock - single receptor
7,3PJ8,1.73,1.73,crossdock - ensemble
8,3DDQ,3.36,3.36,redock - single receptor
9,3DDQ,2.09,1.64,crossdock - ensemble


In [67]:
ens_res = res_1_excluded[res_1_excluded["type"]=="crossdock - ensemble"]
redock_res = res_1_excluded[res_1_excluded["type"]=="redock - single receptor"]


In [109]:
import plotly.io as pio
pio.templates.default = "gridon"

fig = px.scatter(x=redock_res.best_rmsd, y=ens_res.best_rmsd, 
                 width=420, height=400, 
                 color_discrete_sequence=px.colors.qualitative.Dark2)
fig.update_layout(yaxis_range=[0,8],xaxis_range=[0,8] )
fig.add_shape(type="line",
              x0=0, 
              y0=0, 
              x1=8, 
              y1=8)
fig.update_layout(yaxis_range=[0,8],xaxis_range=[0,8],
                  title="Best RMSD", 
                  xaxis_title="Re-Docking Best RMSD (Å)", 
                  yaxis_title="Ensemble Docking Best RMSD (Å)")
fig.update_traces(marker=dict(size=7,
                              line=dict(width=2,
                                        color='DarkSlateGrey')))
fig.add_shape(type="line",
              x0=0, 
              y0=0, 
              x1=8, 
              y1=8)
fig.write_image("CDK_ensemble_scatter_best.svg")
fig

In [113]:
fig = px.scatter(x=redock_res.top3_rmsd, y=ens_res.top3_rmsd, 
                 width=420, height=400,
                 color_discrete_sequence=px.colors.qualitative.Dark2)
fig.update_layout(yaxis_range=[0,8],xaxis_range=[0,8] )
fig.update_layout(yaxis_range=[0,8],xaxis_range=[0,8],
                  title="Top3 RMSD", 
                  xaxis_title="Re-Docking Top3 RMSD (Å)", 
                  yaxis_title="Ensemble Docking Top3 RMSD (Å)")
fig.update_traces(marker=dict(size=7,
                              line=dict(width=2,
                                        color='DarkSlateGrey')))
fig.add_shape(type="line",
              x0=0, 
              y0=0, 
              x1=8, 
              y1=8)
fig.write_image("CDK_ensemble_top3_best.svg")
fig

In [7]:
results_pd["ligand_name"] = results_pd["pdb_id"].apply(lambda x: full_dataset[full_dataset["pdb_code"]==x]["ligand_name"].iloc[0])

TypeError: 'NoneType' object is not subscriptable

In [80]:
results_pd

Unnamed: 0.3,Unnamed: 0.2,index,Unnamed: 0.1,energies,rmsds,model_id,replica_id,receptor_id,fragment_id,thread_id,Unnamed: 0,clust_energy_rank,clust_size_rank,job_id,pdb_id,ligand_name
0,0,0,0,73.696,11.803832,0.0,9.0,3pj8_receptor_aligned,1.0,9.0,0.0,1,2,job_3pj8_receptor_aligned_5a14_ligand,5A14,LQ5
1,1,10,0,74.006,11.807512,0.0,9.0,5l2w_receptor_aligned,1.0,9.0,0.0,1,1,job_5l2w_receptor_aligned_5a14_ligand,5A14,LQ5
2,2,20,1,74.822,11.957763,1.0,9.0,5l2w_receptor_aligned,1.0,9.0,1.0,2,1,job_5l2w_receptor_aligned_5a14_ligand,5A14,LQ5
3,3,30,0,75.426,11.832215,0.0,0.0,4kd1_receptor_aligned,1.0,0.0,0.0,1,3,job_4kd1_receptor_aligned_5a14_ligand,5A14,LQ5
4,4,40,2,75.733,11.789532,0.0,2.0,5l2w_receptor_aligned,1.0,2.0,0.0,3,1,job_5l2w_receptor_aligned_5a14_ligand,5A14,LQ5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
391,391,1564,36,-8.566,5.349913,9.0,3.0,6guh_receptor_aligned,0.0,3.0,9.0,7,2,job_6guh_receptor_aligned_6gub_ligand,6GUB,flavopiridol
392,392,1568,39,-8.507,7.313979,7.0,1.0,3ddq_receptor_aligned,0.0,1.0,7.0,10,1,job_3ddq_receptor_aligned_6gub_ligand,6GUB,flavopiridol
393,393,1572,36,-8.477,5.737315,10.0,1.0,5l2w_receptor_aligned,0.0,1.0,10.0,2,6,job_5l2w_receptor_aligned_6gub_ligand,6GUB,flavopiridol
394,394,1576,44,-8.473,5.735322,12.0,0.0,4kd1_receptor_aligned,0.0,0.0,12.0,7,4,job_4kd1_receptor_aligned_6gub_ligand,6GUB,flavopiridol


In [96]:
from itertools import combinations
ligand = "3DDQ"
res_df = pd.DataFrame({
    "ligand": [],
    "ensemble_size": [],
    "correctly_docked": []
})
ligands = list(full_dataset["ligand_name"].unique())
for ligand in ligands:
    # redock result
    per_ligand_results = results_pd[results_pd["ligand_name"]==ligand]
    ligand_receptors = [ "{}_receptor_aligned".format(x) for x in list(full_dataset[full_dataset["ligand_name"]==ligand]["pdb_code"].unique())]
    per_ligand_rec_t5_results = per_ligand_results #per_ligand_results.sort_values(["receptor_id", "energies"]).groupby("receptor_id").head(10).reset_index()
    redock_results = per_ligand_rec_t5_results[~(per_ligand_rec_t5_results["receptor_id"].isin(ligand_receptors))]
    correct_pose_cnt = redock_results[redock_results["rmsds"]<2].shape[0]
    print("Redock number of <2A poses")
    print(correct_pose_cnt)
    if correct_pose_cnt > 0:
        print("Redock <2A best RMSD")
        print(redock_results[redock_results["rmsds"]<2].sort_values(["rmsds"]).iloc[0].rmsds)
    # crossdock result
    rd_receptor = "{}_receptor_aligned".format(ligand.lower())
    cd_receptors = [ r for r in list(per_ligand_results["receptor_id"].unique()) if not r in ligand_receptors]
    correct_docks = []
    for ens_size in range(1, len(cd_receptors)+1):
        ens_combos = combinations(cd_receptors, ens_size)
        ens_combos_l = list(ens_combos)
        ens_i_correct = 0
        ens_i_correct_brmsd = []
        for ens_combo in ens_combos_l:
            ens_combo_l = list(ens_combo)
            cd_results = per_ligand_rec_t5_results[per_ligand_rec_t5_results["receptor_id"].isin(ens_combo_l)]
            correct_pose_cnt = cd_results[cd_results["rmsds"]<2].shape[0]
            if correct_pose_cnt > 1:
                ens_i_correct+=1
                ens_i_correct_brmsd.append(cd_results[cd_results["rmsds"]<2].sort_values(["rmsds"]).iloc[0].rmsds)
        
        print("Ensemble size {}".format(ens_size))
        print("(%) ensemble with a correct pose {}".format(ens_i_correct/len(ens_combos_l)))
        print("Best RMSDs in ensemble {}".format(ens_i_correct_brmsd))
        correct_docks.append(ens_i_correct/len(ens_combos_l))
        res_df = pd.concat([res_df, pd.DataFrame({
                                        "ligand": [ligand],
                                        "ensemble_size": [ens_size],
                                        "correctly_docked": [ens_i_correct/len(ens_combos_l)]
                                    })])

Redock number of <2A poses
0
Ensemble size 1
(%) ensemble with a correct pose 0.0
Best RMSDs in ensemble []
Ensemble size 2
(%) ensemble with a correct pose 0.0
Best RMSDs in ensemble []
Ensemble size 3
(%) ensemble with a correct pose 0.0
Best RMSDs in ensemble []
Ensemble size 4
(%) ensemble with a correct pose 0.0
Best RMSDs in ensemble []
Ensemble size 5
(%) ensemble with a correct pose 0.0
Best RMSDs in ensemble []
Ensemble size 6
(%) ensemble with a correct pose 0.0
Best RMSDs in ensemble []
Ensemble size 7
(%) ensemble with a correct pose 0.0
Best RMSDs in ensemble []
Ensemble size 8
(%) ensemble with a correct pose 0.0
Best RMSDs in ensemble []
Ensemble size 9
(%) ensemble with a correct pose 0.0
Best RMSDs in ensemble []
Ensemble size 10
(%) ensemble with a correct pose 0.0
Best RMSDs in ensemble []
Redock number of <2A poses
66
Redock <2A best RMSD
0.9378418981337836
Ensemble size 1
(%) ensemble with a correct pose 1.0
Best RMSDs in ensemble [1.128185757396594, 1.254028392108

In [98]:
res_df

Unnamed: 0,ligand,ensemble_size,correctly_docked
0,LQ5,1.0,0.0
0,LQ5,2.0,0.0
0,LQ5,3.0,0.0
0,LQ5,4.0,0.0
0,LQ5,5.0,0.0
0,LQ5,6.0,0.0
0,LQ5,7.0,0.0
0,LQ5,8.0,0.0
0,LQ5,9.0,0.0
0,LQ5,10.0,0.0


In [100]:
import plotly.express as px
fig = px.line(res_df, x="ensemble_size", y="correctly_docked", 
        color="ligand", template="ggplot2", width = 600, height=500, 
        color_discrete_sequence=px.colors.sequential.Plasma[1::1],
        markers=True,)
fig.update_layout(
    yaxis_title="% of ensembles with poses <2 Å",
    xaxis_title="Ensemble size"
)

In [66]:
res_df

Unnamed: 0,ligand,ensemble_size,correctly_docked
0,5A14,1.0,0.0
0,5A14,2.0,0.0
0,5A14,3.0,0.0
0,5A14,4.0,0.0
0,5A14,5.0,0.0
...,...,...,...
0,6GUB,5.0,1.0
0,6GUB,6.0,1.0
0,6GUB,7.0,1.0
0,6GUB,8.0,1.0


In [28]:
cd_results_top_5

Unnamed: 0.3,level_0,Unnamed: 0.2,index,Unnamed: 0.1,energies,rmsds,model_id,replica_id,receptor_id,fragment_id,thread_id,Unnamed: 0,clust_energy_rank,clust_size_rank,job_id,pdb_id
0,1,1,4,0,-9.269,1.75824,0.0,2.0,6guh_receptor_aligned,2.0,2.0,0.0,1,1,job_6guh_receptor_aligned_3ddq_ligand,3DDQ
1,2,2,8,0,-9.26,7.550465,0.0,1.0,5a14_receptor_aligned,2.0,1.0,0.0,1,8,job_5a14_receptor_aligned_3ddq_ligand,3DDQ
2,5,5,20,1,-9.222,1.605867,1.0,1.0,5a14_receptor_aligned,2.0,1.0,1.0,1,3,job_5a14_receptor_aligned_3ddq_ligand,3DDQ
3,6,6,24,0,-9.197,7.496868,0.0,0.0,3pj8_receptor_aligned,2.0,0.0,0.0,1,1,job_3pj8_receptor_aligned_3ddq_ligand,3DDQ
4,7,7,28,1,-9.178,4.413904,0.0,0.0,6guh_receptor_aligned,2.0,0.0,0.0,1,3,job_6guh_receptor_aligned_3ddq_ligand,3DDQ
5,9,9,36,2,-9.128,6.593075,1.0,2.0,6guh_receptor_aligned,2.0,2.0,1.0,1,5,job_6guh_receptor_aligned_3ddq_ligand,3DDQ
6,10,10,40,2,-9.128,7.198771,2.0,1.0,5a14_receptor_aligned,2.0,1.0,2.0,2,8,job_5a14_receptor_aligned_3ddq_ligand,3DDQ
7,11,11,44,3,-9.123,6.25467,2.0,2.0,6guh_receptor_aligned,2.0,2.0,2.0,1,6,job_6guh_receptor_aligned_3ddq_ligand,3DDQ
8,12,12,48,3,-9.084,7.544315,3.0,1.0,5a14_receptor_aligned,2.0,1.0,3.0,1,9,job_5a14_receptor_aligned_3ddq_ligand,3DDQ
9,13,13,52,4,-9.082,6.607444,4.0,1.0,5a14_receptor_aligned,2.0,1.0,4.0,1,1,job_5a14_receptor_aligned_3ddq_ligand,3DDQ


In [19]:
full_dataset["ligand_category"] = full_dataset["comment"].apply(lambda x: ligand_categories[x])

In [20]:
full_dataset = full_dataset[["pdb_id", "ligands", "rot_bonds", "n_atoms", "comment", "ligand_category"]]
full_dataset

Unnamed: 0,pdb_id,ligands,rot_bonds,n_atoms,comment,ligand_category
0,2G9X,NU5,12,34,"A-loop in, with cyclin",0
1,3ROY,22Z,11,27,"A-loop in, no cyclin",0
2,3R71,X86,10,26,"A-loop in, no cyclin",0
3,1H07,MFQ,10,33,"A-loop in, no cyclin",0
4,6RIJ,K4W,10,27,"A-loop in, with cyclin",0
5,2IW6,QQ2,10,33,"A-loop out, with cyclin",1
6,6INL,AJR,10,29,"A-loop flex, no cyclin",1
7,3RAI,X85,10,30,"A-loop in, no cyclin",0
8,3RPO,24Z,10,27,"A-loop in, no cyclin",0
9,1H07,MFP,10,33,"A-loop in, no cyclin",0


In [21]:
import glob 
# find the ligands for which we have results
ens_res_list = []
resuls_pd = None
for i, row in full_dataset.iterrows():
    pdb_id = row["pdb_id"]
    pdb_id_l = pdb_id.lower()
    data_dir = Path("./CDK2/ligands/{}".format(pdb_id_l))
    if data_dir.exists():
        ligands_in_dir = glob.glob(str(data_dir / "{}_ligand*.mol2".format(pdb_id_l)))
        for ligand in ligands_in_dir:
            lig_name = Path(ligand).stem
            output_dir = "../../benchmark-ens-results/{}/".format(lig_name)
            results_file = Path(output_dir) / "analysis/results.csv"
            if results_file.exists():
                ens_res_list.append(lig_name) 
                if resuls_pd is None:
                    resuls_pd = pd.read_csv(results_file)
                    resuls_pd["ligand_name"] = [lig_name for i in range(resuls_pd.shape[0])]
                else:
                    tmp = pd.read_csv(results_file)
                    tmp["ligand_name"] = [lig_name for i in range(tmp.shape[0])]
                    resuls_pd = pd.concat([resuls_pd, tmp])
print(len(ens_res_list))


21


In [22]:
resuls_pd[resuls_pd["ligand_name"]=="6inl_ligand"].sort_values(by="energies").sort_values(by="rmsds")

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,energies,rmsds,model_id,replica_id,receptor_id,fragment_id,thread_id,Unnamed: 0,job_id,ligand_name
46,46,2,-8.984,1.571993,0.0,3.0,8fp5_receptor_aligned,2.0,3.0,0.0,job_8fp5_receptor_aligned_6inl_ligand,6inl_ligand
45,45,2,-8.984,1.571993,0.0,3.0,8fp5_receptor_aligned,2.0,3.0,0.0,job_8fp5_receptor_aligned_6inl_ligand,6inl_ligand
44,44,2,-8.984,1.571993,0.0,3.0,8fp5_receptor_aligned,2.0,3.0,0.0,job_8fp5_receptor_aligned_6inl_ligand,6inl_ligand
47,47,2,-8.984,1.571993,0.0,3.0,8fp5_receptor_aligned,2.0,3.0,0.0,job_8fp5_receptor_aligned_6inl_ligand,6inl_ligand
63,63,3,-8.979,1.575625,0.0,2.0,8fp5_receptor_aligned,2.0,2.0,0.0,job_8fp5_receptor_aligned_6inl_ligand,6inl_ligand
...,...,...,...,...,...,...,...,...,...,...,...,...
481,481,22,-6.684,10.796423,6.0,2.0,3bhv_receptor_aligned,2.0,2.0,6.0,job_3bhv_receptor_aligned_6inl_ligand,6inl_ligand
417,417,22,-6.800,10.835908,6.0,3.0,2r3k_receptor_aligned,2.0,3.0,6.0,job_2r3k_receptor_aligned_6inl_ligand,6inl_ligand
418,418,22,-6.800,10.835908,6.0,3.0,2r3k_receptor_aligned,2.0,3.0,6.0,job_2r3k_receptor_aligned_6inl_ligand,6inl_ligand
419,419,22,-6.800,10.835908,6.0,3.0,2r3k_receptor_aligned,2.0,3.0,6.0,job_2r3k_receptor_aligned_6inl_ligand,6inl_ligand


In [23]:
resuls_pd["ligand_categories"] = resuls_pd["ligand_name"].apply(lambda x: full_dataset[full_dataset["pdb_id"]==x.split("_")[0].upper()].iloc[0]["ligand_category"])

In [24]:
resuls_pd["receptor_categories"] = resuls_pd["receptor_id"].apply(lambda x: receptor_categories[x])

In [51]:
results_summary = resuls_pd.sort_values("energies", ascending=True).groupby("ligand_name").head(10).groupby("ligand_name")["rmsds"].min().reset_index()
results_summary.columns = ["ligand_name", "best_rmsds_top5"]
results_summary ["best_rmsds"] = resuls_pd.sort_values("energies", ascending=True).groupby("ligand_name")["rmsds"].min().reset_index()["rmsds"]
#results_summary ["selected_rec"] = resuls_pd.sort_values("energies", ascending=True).groupby("ligand_name")["rmsds"].min().reset_index()["receptor_id"]
results_summary ["best_rmsds_top1"] = resuls_pd.sort_values("energies", ascending=True).groupby("ligand_name").head(1).groupby("ligand_name")["rmsds"].min().reset_index()["rmsds"]
results_summary["mean_rmsds_top5"] = resuls_pd.sort_values("energies", ascending=True).groupby("ligand_name").head(5).groupby("ligand_name")["rmsds"].mean().reset_index()["rmsds"]
results_summary["std_rmsds_top5"] = resuls_pd.sort_values("energies", ascending=True).groupby("ligand_name").head(5).groupby("ligand_name")["rmsds"].std().reset_index()["rmsds"]

In [52]:
results_summary = results_summary.sort_values("best_rmsds").reset_index(drop=True)
results_summary

Unnamed: 0,ligand_name,best_rmsds_top5,best_rmsds,best_rmsds_top1,mean_rmsds_top5,std_rmsds_top5
0,3ns9_ligand,6.88861,1.349567,6.895128,6.893824,0.002915
1,3r71_ligand,1.844475,1.467957,5.904052,5.899827,0.009446
2,6inl_ligand,1.587441,1.571993,1.587556,1.587533,5.1e-05
3,3rai_ligand,9.939425,1.606133,9.946488,9.948854,0.00529
4,3r73_ligand,5.750467,1.711741,5.754142,5.754216,0.000166
5,3ddp_ligand,4.492268,2.0891,4.502001,4.500054,0.004353
6,1h00_ligand_fcp,3.524417,3.005377,3.524417,3.524417,0.0
7,1h00_ligand_fap,4.667915,3.262525,4.667915,4.667915,0.0
8,3eid_ligand,5.173122,3.668447,5.173122,5.176745,0.008102
9,1h08_ligand,5.590708,3.734912,5.623077,5.618975,0.009173


In [27]:
selected_ligands = list(results_summary[:6]["ligand_name"])
selected_ligands

['3ns9_ligand',
 '3r71_ligand',
 '6inl_ligand',
 '3rai_ligand',
 '3r73_ligand',
 '3ddp_ligand']

In [62]:
import plotly.express as px
ligand_titles = {0: "A-loop IN", 1: "A-loop OUT"}
receptor_titles = {0: "A-loop IN", 1: "A-loop OUT"}
tmp_res = resuls_pd.sort_values("energies").groupby("ligand_name").head(10).reset_index()
tmp_res = tmp_res.sort_values("rmsds").groupby("ligand_name").head(1).reset_index()
tmp_res = tmp_res[["energies", "rmsds","receptor_categories", "ligand_categories"]]
tmp_res["receptor_categories"] = tmp_res["receptor_categories"].apply(lambda x: receptor_titles[x])
tmp_res["ligand_categories"] = tmp_res["ligand_categories"].apply(lambda x:ligand_titles[x])
tmp_res.columns = ["Best in Top 10 Energy (kcal/mol)", "Best in Top 10 RMSD", "Receptor Conformation Type", "Ligand Type"]

fig = px.box(tmp_res.sort_values("Receptor Conformation Type"), 
             x='Ligand Type', y='Best in Top 10 RMSD', 
             color='Receptor Conformation Type', points="all", 
             color_discrete_sequence = ["#FF6C90", "#00B0F6"],
             template="ggplot2")
'''
fig = px.scatter(tmp_res.sort_values("Receptor Conformation Type"), 
             x='Ligand Type', y='Best in Top 10 RMSD', 
             color='Receptor Conformation Type',
             color_discrete_sequence = ["#FF6C90", "#00B0F6"],
             template="ggplot2")
'''
fig.update_traces(marker_size=10)
fig.show()
fig.write_image("CDK2_results.svg")

In [63]:
fig._data_objs[0].pointpos = 0

fig._data_objs[0].jitter = 1
fig._data_objs[0].line = dict(color = 'rgba(0,0,0,0)')
fig._data_objs[0].fillcolor = 'rgba(0,0,0,0)'
fig._data_objs[1].pointpos = 0
fig._data_objs[1].line = dict(color = 'rgba(0,0,0,0)')
fig._data_objs[1].fillcolor = 'rgba(0,0,0,0)'


In [64]:
fig.show()

In [1]:
import plotly.io as pio
templ = pio.templates["ggplot2"]

In [13]:
templ._orphan_props["layout"]["colorscale"]["sequential"]

[[0, 'rgb(20,44,66)'], [1, 'rgb(90,179,244)']]