# Imports

In [None]:
import sys, os, pandas as pd, numpy as np
sys.path.append("../")
from asapdiscovery.docking import analysis as a
from importlib import reload
import plotly.express as px

In [None]:
# Load the combined docking results for the hybrid run from a local volume
# and preview the first rows of the raw table.
results_csv = "/Volumes/Rohirrim/local_test/mers_hallucination_hybrid/all_results.csv"
dr = a.DockingResults(results_csv)
dr.df.head()

In [None]:
# Number of rows in the results table.
dr.df.shape[0]

## minimal data cleanup

In [None]:
# Remove the "Unnamed: 0" column (presumably the index saved with the CSV)
# and keep only the second underscore-separated token of each
# MERS_structure value.
dr.df = dr.df.drop(columns="Unnamed: 0")
dr.df["MERS_structure"] = dr.df["MERS_structure"].map(
    lambda name: name.split("_")[1]
)
dr.df.head()

In [None]:
# Replace the column labels positionally; this relies on the table having
# exactly these six columns in this order.
column_names = [
    "Compound_ID",
    "Structure_Source",
    "Docked_File",
    "RMSD",
    "POSIT",
    "Chemgauss4",
]
dr.df.columns = column_names
dr.df.head()

# Get Grouped dfs

In [None]:
# Build the grouped summary tables. Later cells read dr.compound_df and
# dr.structure_df, which are presumably populated by these two calls.
dr.get_compound_df()
dr.get_structure_df()

# When did our scoring functions fail?

In [None]:
# For every compound, count the structures whose Docked_File entry is missing.
missing_pose = dr.df["Docked_File"].isna()
dr.df.loc[missing_pose].groupby("Compound_ID")["Structure_Source"].count()

In [None]:
# Compounds whose Not_NA_RMSD value is below 30.
dr.compound_df.loc[dr.compound_df["Not_NA_RMSD"].lt(30)]

In [None]:
# Not_NA_RMSD against Not_NA_Chemgauss4, one point per row of the compound table.
px.scatter(dr.compound_df, x="Not_NA_RMSD", y="Not_NA_Chemgauss4")

In [None]:
# Same comparison on the structure table, coloured by Structure_Source.
px.scatter(
    data_frame=dr.structure_df,
    x="Not_NA_RMSD",
    y="Not_NA_Chemgauss4",
    color="Structure_Source",
)

# Automate basic plotting functions

In [None]:
# List the columns of the structure table; the next cell picks plot features from them.
dr.structure_df.columns

In [None]:
# Keep the structure-table columns whose name starts with one of the summary
# prefixes (the text before the first underscore).
summary_prefixes = ["Not", "Good", "Mean", "Min"]
features = [
    column
    for column in dr.structure_df.columns
    if column.split("_")[0] in summary_prefixes
]

In [None]:
# Write one bar chart per summary feature of the structure table, with rows
# sorted by that feature, as PNG files under ../figures.
df = dr.structure_df
# write_image does not create missing directories, so make sure the output
# directory exists before the first figure is written.
os.makedirs("../figures", exist_ok=True)
for feature in features:
    fig = px.bar(df.sort_values(feature), y=feature, text_auto=".2s")
    file_path = f"../figures/MERS_fauxalysis_{feature}_by_structure.png"
    fig.write_image(file_path)

In [None]:
# Write one histogram per summary feature of the compound table as PNG files
# under ../figures.
# NOTE(review): `features` was built from dr.structure_df.columns; this assumes
# dr.compound_df carries the same summary columns - confirm.
df = dr.compound_df
# write_image does not create missing directories, so make sure the output
# directory exists before the first figure is written.
os.makedirs("../figures", exist_ok=True)
for feature in features:
    fig = px.histogram(df.sort_values(feature), x=feature, text_auto=".2s")
    file_path = f"../figures/MERS_fauxalysis_{feature}_compound_histograms.png"
    fig.write_image(file_path)

# Get best structure for each compound

## directly copying from this:
https://stackoverflow.com/questions/54470917/pandas-groupby-and-select-rows-with-the-minimum-value-in-a-specific-column

In [None]:
# Lowest RMSD observed for each compound, indexed by Compound_ID.
rmsd_by_compound = dr.df.groupby("Compound_ID")["RMSD"]
min_value = rmsd_by_compound.min()
min_value.head()

In [None]:
# Attach each compound's minimum RMSD to every one of its rows; the clashing
# RMSD column coming from min_value is suffixed to RMSD_min.
test_df = pd.merge(dr.df, min_value, on="Compound_ID", suffixes=("", "_min"))

In [None]:
# Inspect the merged table (original rows plus the RMSD_min column).
test_df

In [None]:
# Keep the rows that achieve their compound's minimum RMSD. Rows with a
# missing RMSD never compare equal, so they are dropped here.
is_lowest_rmsd = test_df["RMSD"].eq(test_df["RMSD_min"])
best_structures = test_df.loc[is_lowest_rmsd]

In [None]:
# Inspect the rows that achieve the lowest RMSD for their compound.
best_structures

## Need to do an extra filtering step because of some duplicates

In [None]:
# Break RMSD ties: among the lowest-RMSD rows of each compound keep the ones
# with the lowest Chemgauss4 value.
min_value = best_structures.groupby("Compound_ID")[["Chemgauss4"]].min()
best_structures = pd.merge(
    best_structures,
    min_value,
    on="Compound_ID",
    suffixes=("", "_min"),
)
is_lowest_score = best_structures["Chemgauss4"].eq(best_structures["Chemgauss4_min"])
very_best_structures = best_structures.loc[is_lowest_score]

In [None]:
# Inspect the rows left after the RMSD and Chemgauss4 filters.
very_best_structures

In [None]:
# Save the selected rows without the index column. to_csv does not create
# missing directories, so make sure the target directory exists first.
os.makedirs("../csvs", exist_ok=True)
very_best_structures.to_csv("../csvs/mers_fauxalysis.csv", index=False)

## try using API

In [None]:
# Use the DockingResults API for the same selection; the next cell reads the
# result from dr.best_df.
dr.get_best_structure_per_compound()

In [None]:
# Inspect the table read from dr.best_df after the API call above.
dr.best_df

## which compounds are missing?

In [None]:
# Compound IDs present in the results table but absent from the selection.
all_compounds = set(dr.df["Compound_ID"])
all_compounds.difference(very_best_structures["Compound_ID"])

### Of course, these are the weird ones for which we couldn't calculate any RMSDs

## histograms of best structures

In [None]:
# Show the distribution of each score over the selected rows.
for score in ("RMSD", "Chemgauss4"):
    fig = px.histogram(very_best_structures, x=score)
    fig.show()

# Try new way of finding best structures

In [None]:
# Count the compounds that have no row with RMSD below 2.5.
# Filtering the rows before grouping removes such compounds from the groupby
# altogether, so comparing the per-group count with 0 can never match.
# Evaluate the condition per row and aggregate over every compound instead.
has_good_rmsd = (dr.df["RMSD"] < 2.5).groupby(dr.df["Compound_ID"]).any()
int((~has_good_rmsd).sum())

In [None]:
# Count the compounds that have no row with both RMSD below 2.5 and
# Chemgauss4 below 0.
# Filtering the rows before grouping removes such compounds from the groupby
# altogether, so comparing the per-group count with 0 can never match.
# Evaluate the condition per row and aggregate over every compound instead.
is_good_pose = (dr.df["RMSD"] < 2.5) & (dr.df["Chemgauss4"] < 0)
has_good_pose = is_good_pose.groupby(dr.df["Compound_ID"]).any()
int((~has_good_pose).sum())

In [None]:
# Per-compound non-null counts over the rows with RMSD below 2.5; compounds
# without any such row do not appear in the index.
below_cutoff = dr.df["RMSD"].lt(2.5)
filtered_df = dr.df.loc[below_cutoff].groupby("Compound_ID").count()

In [None]:
# Inspect the per-compound counts of rows with RMSD below 2.5.
filtered_df

In [None]:
# Compounds listed in the compound table that have no row with RMSD below 2.5.
known_compounds = set(dr.compound_df["Compound_ID"])
known_compounds.difference(filtered_df.index)

In [None]:
# Inspect the per-compound summary table.
dr.compound_df

# Analyze posit results

In [None]:
# Re-import the analysis module so edits to it take effect without restarting
# the kernel. Objects created before the reload keep their old class.
reload(a)

## Load and process results csv

In [None]:
# Load the results of the posit_hybrid_no_relax run and apply the same
# cleanup as for the first data set.
posit_results_csv = "/Volumes/Rohirrim/local_test/mers_hallucination_hybrid/posit_hybrid_no_relax/all_results.csv"
dr = a.DockingResults(posit_results_csv)

# Remove the "Unnamed: 0" column and keep only the second
# underscore-separated token of each MERS_structure value.
dr.df = dr.df.drop(columns="Unnamed: 0")
dr.df["MERS_structure"] = dr.df["MERS_structure"].map(
    lambda name: name.split("_")[1]
)

# Positional rename; this file has one more column (Clash) than the first one.
dr.df.columns = [
    "Compound_ID",
    "Structure_Source",
    "Docked_File",
    "RMSD",
    "POSIT",
    "Chemgauss4",
    "Clash",
]

# Derived columns: the complement of POSIT and a compound/structure key.
dr.df["POSIT_R"] = 1 - dr.df["POSIT"]
dr.df["Complex_ID"] = dr.df["Compound_ID"] + "_" + dr.df["Structure_Source"]

# Order by compound with a fresh 0..n-1 index, then rebuild the grouped tables.
dr.df = dr.df.sort_values("Compound_ID").reset_index(drop=True)
dr.get_compound_df()
dr.get_structure_df()

## when did we fail?

In [None]:
# For every compound, count the structures whose Docked_File entry is missing.
missing_pose = dr.df["Docked_File"].isna()
dr.df.loc[missing_pose].groupby("Compound_ID")["Structure_Source"].count()

## Write out images

In [None]:
# Keep the structure-table columns whose name starts with one of the summary
# prefixes (the text before the first underscore).
summary_prefixes = ["Not", "Good", "Mean", "Min"]
features = [
    column
    for column in dr.structure_df.columns
    if column.split("_")[0] in summary_prefixes
]

In [None]:
# Write one bar chart per summary feature of the structure table, with rows
# sorted by that feature, as PNG files under ../figures.
# NOTE(review): the file names are identical to the ones used for the first
# data set earlier in this notebook, so those figures get overwritten -
# confirm this is intended.
df = dr.structure_df
# write_image does not create missing directories, so make sure the output
# directory exists before the first figure is written.
os.makedirs("../figures", exist_ok=True)
for feature in features:
    fig = px.bar(df.sort_values(feature), y=feature, text_auto=".2s")
    file_path = f"../figures/MERS_fauxalysis_{feature}_by_structure.png"
    fig.write_image(file_path)

In [None]:
# Write one histogram per summary feature of the compound table as PNG files
# under ../figures.
# NOTE(review): `features` was built from dr.structure_df.columns; this assumes
# dr.compound_df carries the same summary columns - confirm.
# NOTE(review): the file names are identical to the ones used for the first
# data set earlier in this notebook, so those figures get overwritten -
# confirm this is intended.
df = dr.compound_df
# write_image does not create missing directories, so make sure the output
# directory exists before the first figure is written.
os.makedirs("../figures", exist_ok=True)
for feature in features:
    fig = px.histogram(df.sort_values(feature), x=feature, text_auto=".2s")
    file_path = f"../figures/MERS_fauxalysis_{feature}_compound_histograms.png"
    fig.write_image(file_path)

## Get best structure per compound

In [None]:
# Select the best structure per compound through the DockingResults API; the
# next cell reads the result from dr.best_df.
dr.get_best_structure_per_compound()

In [None]:
# Look up a single complex in the best-structure table by its Complex_ID.
is_target_complex = dr.best_df["Complex_ID"].eq("Mpro-z7qbb_0A_bound_7DR8")
dr.best_df.loc[is_target_complex]