# Imports

In [None]:
from pathlib import Path
import pandas as pd, numpy as np
import plotly.express as px
from datetime import datetime

In [None]:
# Directory holding the retrospective docking results.
# NOTE(review): hard-coded, user-specific absolute path — adjust before running elsewhere.
main_path = Path("/Users/alexpayne/lilac-mount-point/asap-datasets/retro_docking/sars_fragalysis_retrospective/20230411")

In [None]:
# Location of the cleaned results table inside the dataset directory.
csv = main_path.joinpath("all_results_cleaned.csv")

In [None]:
# Sanity check: displays whether the results file is reachable before loading it.
csv.exists()

## Load as a pandas DataFrame

In [None]:
# Read the results table; the first CSV column becomes the row index.
df = pd.read_csv(csv, index_col=0)

In [None]:
# Preview the first rows of the loaded table.
df.head()

# Keep only the Mpro-P structures

In [None]:
# Keep only rows where both the complex ID and the compound source contain
# the literal tag "Mpro-P". The vectorised, non-regex substring match replaces
# the per-row lambdas; na=False treats missing values as non-matches instead
# of raising TypeError on them.
p_df = df[
    df["Complex_ID"].str.contains("Mpro-P", regex=False, na=False)
    & df["Compound_Source"].str.contains("Mpro-P", regex=False, na=False)
]

In [None]:
# Number of distinct compound IDs among the selected rows
# (dropna=False so a missing ID is counted, exactly as len(unique()) would).
p_df["Compound_ID"].nunique(dropna=False)

In [None]:
# Display the filtered table for visual inspection.
p_df

# Iterate through POSIT Scores

In [None]:
# Ten evenly spaced POSIT cutoffs covering [0, 1] inclusive (spacing of 1/9).
posit_scores = np.linspace(start=0, stop=1, num=10)

In [None]:
# Report how many rows have a POSIT score at or above each cutoff.
for score in posit_scores:
    kept_rows = p_df.loc[p_df["POSIT"] >= score]
    print(len(kept_rows))

## I'd like to plot the % of compounds for which there is at least one structure with RMSD < 1 Å against the POSIT score

In [None]:
# NOTE: reference snippet only — it is not runnable in this notebook.
# It uses `filtered_df`, `sort_list`, `score_ascending` and `self`, none of
# which are defined here (apparently pasted from a method elsewhere), and its
# original mixed indentation was a syntax error when executed as a cell.
# Kept as a comment to document the "best row per compound" pattern.
#
# sorted_df = filtered_df.sort_values(
#     sort_list, ascending=[True] + score_ascending
# )
#
# # group by compound id and return the top row for each group
# g = sorted_df.groupby("Compound_ID")
# self.best_df = g.head(1)

In [None]:
# For every compound keep its three highest-POSIT rows.
selected = (
    p_df.sort_values(by="POSIT", ascending=False)
    .groupby("Compound_ID")
    .head(3)
)

In [None]:
# For each POSIT cutoff: restrict to rows with POSIT <= cutoff, take the
# highest-POSIT remaining row of every compound, and record the fraction of
# those rows whose RMSD is <= 2.
# NOTE(review): the row-count cell above filters with POSIT >= score; confirm
# that `<=` is the intended direction here.
sorted_df = p_df.sort_values(["POSIT"], ascending=[False])
perc_good = []
for score in posit_scores:
    selected = sorted_df[sorted_df.POSIT <= score].groupby("Compound_ID").head(1)
    print(len(selected), score)
    if len(selected) == 0:
        # No row at or below this cutoff: record NaN instead of dividing 0 by 0.
        perc_good.append(float("nan"))
    else:
        # Mean of a boolean mask == (number of True) / (number of rows).
        perc_good.append((selected.RMSD <= 2).mean())

In [None]:
# Quick look: fraction of compounds with RMSD <= 2 versus POSIT cutoff.
px.scatter(x=posit_scores, y=perc_good)

In [None]:
def calculate_perc_good(n, data=None, scores=None):
    """Return, per POSIT cutoff, the fraction of compounds with RMSD <= n.

    For each cutoff, only rows with ``POSIT <= cutoff`` are considered; the
    highest-POSIT remaining row of every ``Compound_ID`` is selected and the
    fraction of those selected rows whose ``RMSD`` is ``<= n`` is recorded.

    Args:
        n: RMSD threshold below (or at) which a selected row counts as good.
        data: DataFrame with ``Compound_ID``, ``POSIT`` and ``RMSD`` columns.
            Defaults to the notebook-level ``p_df``.
        scores: Iterable of POSIT cutoffs. Defaults to the notebook-level
            ``posit_scores``.

    Returns:
        A list with one fraction per cutoff, in the order of ``scores``.
        Cutoffs for which no row qualifies yield ``nan``.
    """
    # Fall back to the notebook globals so existing one-argument calls still work.
    if data is None:
        data = p_df
    if scores is None:
        scores = posit_scores

    sorted_df = data.sort_values(["POSIT"], ascending=[False])
    perc_good = []
    for score in scores:
        selected = sorted_df[sorted_df.POSIT <= score].groupby("Compound_ID").head(1)
        print(len(selected), score)
        if len(selected) == 0:
            # No row at or below this cutoff: record NaN instead of dividing 0 by 0.
            perc_good.append(float("nan"))
            continue
        # Mean of a boolean mask == (number of True) / (number of rows).
        perc_good.append((selected.RMSD <= n).mean())
    return perc_good

In [None]:
# Fractions of good compounds at RMSD thresholds 1, 2 and 3 (evaluated in that order).
n1, n2, n3 = (calculate_perc_good(rmsd_cutoff) for rmsd_cutoff in (1, 2, 3))

In [None]:
# Wide table: one row per POSIT cutoff, one column per RMSD threshold.
# NOTE: this rebinds `df`, replacing the results table loaded from the CSV.
df = pd.DataFrame(
    {
        "POSIT": posit_scores,
        "1Å": n1,
        "2Å": n2,
        "3Å": n3,
    }
)

In [None]:
# Reshape to long format: one row per (POSIT cutoff, RMSD-threshold column) pair.
tidy_df = pd.melt(df, id_vars=["POSIT"])

In [None]:
# Give the melted columns descriptive names for plotting.
# NOTE(review): the values appear to be fractions in [0, 1] rather than
# percentages — confirm before relying on the "Percentage" label.
tidy_df.columns = ["POSIT", "Cutoff", "Percentage"]

In [None]:
# Scatter of the recorded value against POSIT cutoff, one colour per RMSD threshold.
fig = px.scatter(
    data_frame=tidy_df,
    x="POSIT",
    y="Percentage",
    color="Cutoff",
)

In [None]:
# Render the figure.
fig.show()

In [None]:
# Save the figure as a PNG; the relative path is resolved against the current working directory.
fig.write_image("../../../../figures/20230518_sars_retrospective_POSIT_RMSD.png")