# Imports

In [None]:
from pathlib import Path
import pandas as pd, numpy as np
import plotly.express as px
from datetime import datetime
from tqdm import tqdm

In [None]:
# Root directory that holds the input files used in the cells below.
# NOTE(review): hard-coded absolute path to one user's machine — must be
# adjusted before running this notebook anywhere else.
main_path = Path("/Users/alexpayne/Scientific_Projects/mers-drug-discovery/sars2-retrospective-analysis")

In [None]:
# Path of the results CSV inside main_path.
# NOTE(review): the name `csv` would shadow the stdlib `csv` module if that
# module were ever imported in this notebook.
csv = main_path / "results_with_structure_dates.csv"

In [None]:
# Sanity check: evaluates to True when the CSV is present at the expected path.
csv.exists()

## Load as a pandas DataFrame

In [None]:
# Read the results table, using the first CSV column as the DataFrame index.
df = pd.read_csv(csv, index_col=0)

In [None]:
# Preview the first rows of the results table.
df.head()

# Load Molecules

In [None]:
from asapdiscovery.data.openeye import load_openeye_sdfs, oechem

In [None]:
# Path of the combined, labeled SDF file inside main_path.
combined_sdf = main_path / "Mpro_combined_labeled.sdf"

In [None]:
# Sanity check: evaluates to True when the SDF file is present.
combined_sdf.exists()

In [None]:
# Load the molecules from the SDF; the path is passed as a plain string.
# Presumably returns a list of OpenEye molecule objects (it is indexed and
# iterated below) — confirm against load_openeye_sdfs.
mols = load_openeye_sdfs(str(combined_sdf))

In [None]:
# Show the title of the first loaded molecule as a quick check of the load.
mols[0].GetTitle()

In [None]:
# Number of distinct compound IDs in the results table; dropna=False keeps a
# missing ID counted as its own value.
df["Compound_ID"].nunique(dropna=False)

In [None]:
# Keep only the molecules whose "Dataset" SD tag matches a structure source
# present in the results table.
# The set is built once: the original re-ran .unique() and scanned the
# resulting array for every molecule.
docked_structure_sources = set(df.Structure_Source.unique())
filtered_mols = [
    mol
    for mol in mols
    if oechem.OEGetSDData(mol, "Dataset") in docked_structure_sources
]

In [None]:
def get_n_to_n_tanimoto(mols, return_mtx=False):
    """Compute the all-against-all shape-overlap TanimotoCombo of ``mols``.

    Every molecule is used once as the overlap reference and every molecule
    (including the reference itself) is overlaid onto it.

    Parameters
    ----------
    mols
        Sequence of OpenEye molecules. Each must provide ``GetTitle()`` and a
        ``"Dataset"`` SD tag. Must be re-iterable and support ``len()``.
    return_mtx
        If True, return the scores as a square array instead of a DataFrame.

    Returns
    -------
    pandas.DataFrame or numpy.ndarray
        ``return_mtx=False``: a long-format DataFrame with one row per
        (reference, fit) pair and columns ``Complex_ID`` (built as
        ``"<fit title>_<reference Dataset tag>"``) and ``TanimotoCombo``.
        ``return_mtx=True``: an array of shape ``(len(mols), len(mols))``
        where row ``i`` is reference ``i`` and column ``j`` is fit molecule
        ``j``.

    Notes
    -----
    NOTE(review): ``OEOverlapPrep.Prep`` is applied directly to the input
    molecules, which presumably modifies them in place — confirm.
    """
    from openeye import oeshape
    from asapdiscovery.data.openeye import oechem

    prep = oeshape.OEOverlapPrep()
    func = oeshape.OEOverlapFunc()
    # A single results object is reused; it is overwritten by each Overlap call.
    results = oeshape.OEOverlapResults()

    complex_ids = []
    tanimoto_combos = []
    for reference in tqdm(mols):
        prep.Prep(reference)
        func.SetupRef(reference)
        for fitmol in mols:
            complex_ids.append(f"{fitmol.GetTitle()}_{oechem.OEGetSDData(reference, 'Dataset')}")
            prep.Prep(fitmol)
            func.Overlap(fitmol, results)
            tanimoto_combos.append(results.GetTanimotoCombo())

    if return_mtx:
        # The original built the matrix but then returned an unassigned local,
        # raising UnboundLocalError. A plain ndarray replaces the discouraged
        # np.matrix; scores were appended reference-major, so a row-major
        # reshape gives [reference, fit].
        return np.asarray(tanimoto_combos).reshape(len(mols), len(mols))

    return pd.DataFrame({"Complex_ID": complex_ids, "TanimotoCombo": tanimoto_combos})

In [None]:
# All-against-all TanimotoCombo scores for the filtered molecules; with the
# default return_mtx=False this is a long-format DataFrame keyed by Complex_ID.
tc_df = get_n_to_n_tanimoto(filtered_mols)

In [None]:
# Preview the first rows of the TanimotoCombo table.
tc_df.head()

In [None]:
# Attach the TanimotoCombo scores to the results table by matching Complex_ID
# (default inner join: rows without a match on either side are dropped).
df_with_tc = df.merge(tc_df, on="Complex_ID")

In [None]:
# Rows where the compound's source equals the structure name, i.e. the
# self-docked cases.
self_docked = df_with_tc.loc[df_with_tc["Compound_Source"].eq(df_with_tc["Structure_Name"])]

In [None]:
# Count self-docked rows whose TanimotoCombo is not 2.0 (expected: 0).
# Compare with a tolerance rather than exact float equality so that rounding
# in the overlap calculation cannot produce spurious mismatches. NaN scores
# are still counted as mismatches.
(~np.isclose(self_docked.TanimotoCombo, 2.0)).sum()

## Nice, all of the self-docked molecules have a TanimotoCombo (TC) of 2.0

# Now make the plot

In [None]:
def calculate_perc_good(df, cutoffs:list):
    """Tabulate pose success against reference-structure date and RMSD cutoff.

    Rows of ``df`` are ranked by descending ``POSIT``. For every cutoff and
    every entry of ``dates``, only rows with ``Structure_Date <= date`` are
    kept, the top-ranked row per ``Compound_ID`` is taken, and the fraction of
    those rows with ``RMSD <= cutoff`` is recorded.

    NOTE: ``dates`` is not a parameter; it is read from the enclosing
    (notebook) namespace and must be defined before this function is called.

    Parameters
    ----------
    df
        Table with ``POSIT``, ``Structure_Date``, ``Compound_ID``, ``RMSD``
        and ``Structure_Source`` columns.
    cutoffs
        RMSD thresholds to evaluate.

    Returns
    -------
    pandas.DataFrame
        One row per (cutoff, date) pair. ``Percentage`` holds a fraction
        (0-1), not a percent.
    """
    ranked = df.sort_values(["POSIT"], ascending=[False])
    columns = {
        "Date": [],
        "Cutoff (Å)": [],
        "Percentage": [],
        "Number of Reference Structures": [],
        "Number of Structures Used in Best Pose": [],
    }
    for rmsd_cutoff in tqdm(cutoffs):
        for date in dates:
            # Structures available on or before this date.
            available = ranked[ranked.Structure_Date <= date]
            # Rows are already ranked, so head(1) is the best pose per compound.
            best_poses = available.groupby("Compound_ID").head(1)
            n_good = (best_poses.RMSD <= rmsd_cutoff).sum()

            columns["Date"].append(date)
            columns["Cutoff (Å)"].append(rmsd_cutoff)
            columns["Percentage"].append(n_good / len(best_poses))
            columns["Number of Reference Structures"].append(
                len(available.Structure_Source.unique())
            )
            columns["Number of Structures Used in Best Pose"].append(
                len(best_poses.Structure_Source.unique())
            )
    return pd.DataFrame(columns)

In [None]:
def calculate_perc_good(n, posit_scores):
    """Return the fraction of best poses with ``RMSD <= n`` per POSIT threshold.

    NOTE: the data is not a parameter; it is read from ``p_df`` in the
    enclosing (notebook) namespace, which must be defined before the call.

    For each threshold in ``posit_scores``, rows of ``p_df`` with
    ``POSIT <= threshold`` are ranked by descending ``POSIT``, the top row per
    ``Compound_ID`` is taken, and the number of selected rows and the
    threshold are printed.

    Parameters
    ----------
    n
        RMSD threshold.
    posit_scores
        Iterable of POSIT thresholds.

    Returns
    -------
    list
        One fraction (0-1) per entry of ``posit_scores``.
    """
    ranked = p_df.sort_values(["POSIT"], ascending=[False])
    fractions = []
    for threshold in posit_scores:
        at_or_below = ranked[ranked.POSIT <= threshold]
        best_poses = at_or_below.groupby("Compound_ID").head(1)
        print(len(best_poses), threshold)
        n_good = (best_poses.RMSD <= n).sum()
        fractions.append(n_good / len(best_poses))
    return fractions

In [None]:
def calculate_perc_good(df, tc_scores, cutoffs: list = None):
    """Tabulate pose success against TanimotoCombo threshold and RMSD cutoff.

    Rows of ``df`` are ranked by descending ``POSIT``. For every cutoff and
    every TanimotoCombo threshold, only rows with
    ``TanimotoCombo <= threshold`` are kept, the top-ranked row per
    ``Compound_ID`` is taken, and the fraction of those rows with
    ``RMSD <= cutoff`` is recorded.

    Parameters
    ----------
    df
        Table with ``POSIT``, ``TanimotoCombo``, ``Compound_ID``, ``RMSD``
        and ``Structure_Source`` columns.
    tc_scores
        Iterable of TanimotoCombo thresholds (any iterable, including a
        one-shot generator).
    cutoffs
        RMSD thresholds to evaluate. Defaults to ``[2.0]``.

    Returns
    -------
    pandas.DataFrame
        One row per (cutoff, threshold) pair, cutoffs varying slowest, with
        columns ``TanimotoCombo``, ``Cutoff (Å)``, ``Percentage``,
        ``Number of Reference Structures`` and
        ``Number of Structures Used in Best Pose``. ``Percentage`` holds a
        fraction (0-1), not a percent, and is NaN when no row passes the
        TanimotoCombo threshold.
    """
    # None stands in for the former mutable default argument.
    if cutoffs is None:
        cutoffs = [2.0]
    # Materialise once so a generator is not exhausted after the first cutoff.
    tc_scores = list(tc_scores)

    ranked = df.sort_values(["POSIT"], ascending=[False])
    columns = {
        "TanimotoCombo": [],
        "Cutoff (Å)": [],
        "Percentage": [],
        "Number of Reference Structures": [],
        "Number of Structures Used in Best Pose": [],
    }
    for cutoff in tqdm(cutoffs):
        for tc_score in tc_scores:
            selected = ranked[ranked.TanimotoCombo <= tc_score]
            # Rows are already ranked, so head(1) is the best pose per compound.
            best_poses = selected.groupby("Compound_ID").head(1)

            columns["TanimotoCombo"].append(tc_score)
            columns["Cutoff (Å)"].append(cutoff)
            # The mean of the boolean mask is the fraction of good poses; it
            # yields NaN for an empty selection instead of dividing by zero.
            columns["Percentage"].append((best_poses.RMSD <= cutoff).mean())
            columns["Number of Reference Structures"].append(
                len(selected.Structure_Source.unique())
            )
            columns["Number of Structures Used in Best Pose"].append(
                len(best_poses.Structure_Source.unique())
            )
    return pd.DataFrame(columns)

In [None]:
# Evaluate pose success at 50 evenly spaced TanimotoCombo thresholds in
# [0, 2.0] for RMSD cutoffs of 1, 1.5 and 2 (the "Cutoff (Å)" column).
plotdata = calculate_perc_good(df_with_tc, tc_scores=np.linspace(0,2.0,50), cutoffs=[1,1.5,2])

In [None]:
# Convert the cutoffs to strings so px.scatter colors them as discrete
# categories; the plot's category_orders lists them as "2.0", "1.5", "1.0".
plotdata["Cutoff (Å)"] = plotdata["Cutoff (Å)"].astype(str)

In [None]:
# Scatter plot of the success fraction against the TanimotoCombo threshold,
# one color per RMSD cutoff, with the legend ordered 2.0, 1.5, 1.0.
fig = px.scatter(plotdata, x="TanimotoCombo", 
                 y="Percentage", 
                 color="Cutoff (Å)",
                 category_orders={"Cutoff (Å)":["2.0","1.5", "1.0", ]},
                 height=600, 
                 width=600,)
# The x range extends slightly past 2.0 so points at the maximum are not
# drawn on the plot border.
fig.update_xaxes(title="TanimotoCombo Score of Query to Reference Molecule", range=[0,2.1])
# NOTE(review): the title says "< Cutoff" but calculate_perc_good counts
# RMSD <= cutoff.
fig.update_yaxes(title="Fraction of Posed Molecules with RMSD to Crystal Structure < Cutoff", range=[0,1])
fig.show()

In [None]:
# Save the figure as a PNG.
# NOTE(review): the path is relative to the notebook's working directory, and
# Plotly static image export presumably needs an export engine such as
# kaleido installed — confirm.
fig.write_image("../../../../figures/20230525_sars_retrospective_TCscore.png")