# Imports

In [None]:
from pathlib import Path
from tqdm import tqdm_notebook as tqdm
import numpy as np, pandas as pd, matplotlib.pyplot as plt, plotly.express as px

## Paths

In [None]:
rmsd_path = Path("/Users/alexpayne/lilac-mount-point/asap-datasets/retro_docking/sars_fragalysis_retrospective/20221208_analysis/")

## Import Raw Data

In [None]:
rmsd_arrays = [np.load(npy_file) for npy_file in tqdm(list(rmsd_path.glob("*.npy")))]

# Convert arrays to dataframe and save as single npy file!

In [None]:
arrays = np.concatenate(rmsd_arrays)

In [None]:
np.shape(arrays)

In [None]:
arrays[-1]

In [None]:
cmpd_ids = [npy_path.stem for npy_path in list(rmsd_path.glob("*.npy"))]

In [None]:
cmpd_ids

In [None]:
test_column = np.array([[cmpd_ids[0]]*10]).T

In [None]:
np.hstack((test_column, test_column))

In [None]:
# Prepend a column holding the compound id to every per-file RMSD array.
# NOTE(review): this pairs cmpd_ids[i] with rmsd_arrays[i], but the two lists are
# built from separate rmsd_path.glob("*.npy") calls; presumably both calls return
# the files in the same order -- confirm, or build both lists from one glob.
appended_arrays = [np.hstack((np.array([[cmpd_ids[i]]*len(array)]).T, array)) for i, array in enumerate(rmsd_arrays)]

In [None]:
appended_arrays[0]

## I just realized I don't actually need to do this

## Converting to dataframe

In [None]:
df = pd.DataFrame(arrays)

In [None]:
cmpd_ids = df[0].apply(lambda x:x.split("_Mpro")[0])

In [None]:
datasets = df[0].apply(lambda x:"_".join(x.split("_")[1:]))

In [None]:
import yaml

In [None]:
# Parse the compound -> fragment lookup table from the metadata YAML file.
# safe_load is used, so only plain YAML types are constructed.
with open("../../../../metadata/cmpd_to_frag.yaml") as f:
    parsed_yaml = yaml.safe_load(f)
cmpd_to_frag = parsed_yaml

In [None]:
cmpd_to_frag

In [None]:
frag_to_cmpd = {frag:cmpd for cmpd, frag in cmpd_to_frag.items()}

In [None]:
ref_ligands = datasets.apply(lambda x: frag_to_cmpd[x.split("_")[0]])

In [None]:
ref_ligands

In [None]:
pretty_df = pd.DataFrame({"Compound_ID": cmpd_ids, "Dataset": datasets, "Reference_Ligands":ref_ligands, "RMSD": df[1]})

In [None]:
pretty_df.sort_values("Compound_ID").reset_index(drop=True)

In [None]:
sum(pretty_df["Compound_ID"] == pretty_df["Reference_Ligands"])

In [None]:
pretty_df.to_csv(rmsd_path / "combined_array.csv")

# Load in df

In [None]:
df = pd.read_csv(rmsd_path / "combined_array.csv", index_col=0)

In [None]:
df

# Add IC50s

In [None]:
cdd_results = rmsd_path / "cdd_noncovalent_02_10_23_unfiltered.csv"

In [None]:
cdd_results.exists()

## load in csv file

In [None]:
cdd = pd.read_csv(cdd_results)

## function to filter cdd courtesy of ChatGPT

In [None]:
import pandas as pd

def filter_dataframe_by_column_values(df, column_name, values_list):
    """
    Keep the rows of ``df`` whose ``column_name`` entry is in ``values_list``,
    de-duplicated on that column.

    Args:
        df: Pandas DataFrame to filter
        column_name: Name of the column whose values are matched
        values_list: Collection of accepted values for that column

    Returns:
        A new Pandas DataFrame with the matching rows in their original order
        and with their original index labels. When several rows share the same
        value in ``column_name`` only the first one is retained.
    """
    # Boolean mask: True where the column value is one of the accepted values.
    is_wanted = df[column_name].isin(values_list)
    matching_rows = df[is_wanted]

    # Collapse repeated values of the key column down to their first occurrence.
    deduplicated = matching_rows.drop_duplicates(subset=column_name, keep='first')
    return deduplicated

In [None]:
filtered_cdd = filter_dataframe_by_column_values(cdd, "Canonical PostEra ID", df.Compound_ID.unique())

In [None]:
filtered_cdd = filtered_cdd.rename(columns={'Canonical PostEra ID': 'Compound_ID'})

In [None]:
filtered_cdd

In [None]:
pic50s = filtered_cdd['ProteaseAssay_Fluorescence_Dose-Response_Weizmann: Avg pIC50']

In [None]:
def _parse_pic50(raw_value):
    """Split one raw pIC50 cell into ``(value, assay_range)``.

    Handled forms (after ``str()`` conversion, split on single spaces):
        "nan"       -> (np.nan, "NA")
        "<number>"  -> (float, "within")
        "> <number>" -> (float, "above")
        "< <number>" -> (float, "below")

    Raises:
        NotImplementedError: for any other qualifier or token count.
        ValueError: if the numeric token cannot be converted to float.
    """
    tokens = str(raw_value).split(" ")

    if len(tokens) == 1:
        if tokens[0] == 'nan':
            return np.nan, "NA"
        return float(tokens[0]), "within"

    if len(tokens) == 2:
        qualifier, number = tokens
        if qualifier == ">":
            return float(number), "above"
        if qualifier == "<":
            # Previously labelled "above" (copy-paste of the ">" branch); a "<"
            # qualifier means the value lies below the reported bound.
            return float(number), "below"

    raise NotImplementedError(f"Unrecognised pIC50 entry: {raw_value!r}")


# Parse every entry once, then split into the two parallel lists used downstream.
parsed_pic50s = [_parse_pic50(pic50) for pic50 in pic50s]
pic50s_numbers = [value for value, _ in parsed_pic50s]
pic50s_types = [assay_range for _, assay_range in parsed_pic50s]

In [None]:
cdd_df = pd.DataFrame({"Compound_ID": filtered_cdd["Compound_ID"],
                       "pIC50": np.array(pic50s_numbers).astype(float),
                       "Assay_Range": pic50s_types}
                       )

In [None]:
cdd_df

## add cdd results to RMSDs

In [None]:
merged_df = pd.merge(df, 
                     cdd_df, 
                     on=["Compound_ID"], 
                     how='outer')

In [None]:
merged_df

In [None]:
# NOTE: this cell is disabled. `merge_dataframes_on_key` is not defined or imported
# in the visible notebook, so running it raised NameError. It would also have
# replaced `merged_df` (built by the `pd.merge(df, cdd_df, ...)` cell) with a frame
# that lacks the parsed "pIC50" and "Assay_Range" columns used by later cells.
# merged_df = merge_dataframes_on_key(df, filtered_cdd, ["Compound_ID"], ["ProteaseAssay_Fluorescence_Dose-Response_Weizmann: Avg pIC50"])

In [None]:
merged_df

In [None]:
# Show the compound id next to its raw pIC50 string. The id column was renamed
# from "Canonical PostEra ID" to "Compound_ID" earlier in the notebook, so the
# old name is no longer present in filtered_cdd.
filtered_cdd[["Compound_ID", "ProteaseAssay_Fluorescence_Dose-Response_Weizmann: Avg pIC50"]]

In [None]:
merged_df.sort_values(["Dataset", "RMSD"])

In [None]:
self_docked = merged_df[merged_df["Compound_ID"] == merged_df["Reference_Ligands"]]

In [None]:
self_docked_trimmed = self_docked[["Reference_Ligands", "pIC50", "Assay_Range"]]

In [None]:
self_docked_trimmed.columns = ["Reference_Ligands", "Reference_pIC50", "Reference_Assay_Range"]

In [None]:
self_docked_trimmed

In [None]:
merged_df

In [None]:
self_docked_trimmed[self_docked_trimmed['Reference_Ligands'] =="EDJ-MED-76744c27-4"]

In [None]:
# Attach the reference ligand's own pIC50 / assay range to every row.
# self_docked_trimmed can contain the same reference ligand more than once (one row
# per self-docked entry); merging on a repeated key would multiply the rows of
# merged_df. Keep a single row per reference ligand before joining.
# NOTE(review): assumes repeated rows for a ligand carry the same pIC50 values,
# since cdd_df is built from a frame de-duplicated on compound id -- confirm.
reference_pic50s = self_docked_trimmed.drop_duplicates(subset="Reference_Ligands")
merged_df_with_reference = pd.merge(merged_df, reference_pic50s, on=["Reference_Ligands"])

In [None]:
ranked_df = merged_df_with_reference.copy()

In [None]:
# Dense-rank the RMSD values within each reference ligand (ascending, so the
# smallest RMSD in a group gets rank 1). The result is aligned back onto
# ranked_df by index when assigned.
rmsd_sorted = merged_df_with_reference.sort_values(["Reference_Ligands", "RMSD"])
rmsd_per_reference = rmsd_sorted.groupby('Reference_Ligands')['RMSD']
ranked_df["RMSD_Rank"] = rmsd_per_reference.rank(method='dense')

In [None]:
# Dense-rank the pIC50 values within each reference ligand. Ranking is ascending,
# so the smallest pIC50 in a group gets rank 1.
# NOTE(review): confirm that ascending order is the intended direction here.
pic50_sorted = merged_df_with_reference.sort_values(["Reference_Ligands", "pIC50"])
pic50_per_reference = pic50_sorted.groupby('Reference_Ligands')['pIC50']
ranked_df["pIC50_Rank"] = pic50_per_reference.rank(method='dense')

In [None]:
ranked_df.sort_values(["Reference_Ligands", "RMSD_Rank"]).groupby(["Reference_Ligands"]).head(10)

## save the ranked dataframe

In [None]:
ranked_df.to_csv(rmsd_path / "ranked_dataframe.csv")

# Play with some visualization

## load ranked_df

In [None]:
# ranked_dataframe.csv was written with its index as the first column; read that
# column back as the index (as done for combined_array.csv) instead of letting it
# appear as a stray "Unnamed: 0" data column.
df = pd.read_csv(rmsd_path / "ranked_dataframe.csv", index_col=0)

In [None]:
test = df[df["Compound_ID"] == "ADA-UCB-6c2cb422-1"]
fig = px.scatter(test, x="RMSD_Rank", y="Reference_pIC50")

In [None]:
fig.show()

In [None]:
fig = px.density_heatmap(df, x="RMSD_Rank", y="pIC50_Rank")

In [None]:
# Plot a random subset of at most 40,000 rows. DataFrame.sample raises ValueError
# when n is larger than the frame (sampling without replacement), so cap n at the
# number of rows available.
heatmap_rows = df.sample(n=min(40000, len(df)))
fig = px.density_heatmap(heatmap_rows, x="RMSD", y="pIC50_Rank", hover_data=["Compound_ID", "Reference_Ligands"])

In [None]:
fig.show()

## Before diving into this further I want to outline what it is I actually want to learn from this