# Imports

In [None]:
from pathlib import Path
from asapdiscovery.data.openeye import load_openeye_sdfs, oechem
from asapdiscovery.data.fragalysis import parse_fragalysis
import numpy as np

## Paths

In [None]:
# Root directory of the Fragalysis dataset (absolute path on the author's machine).
frag_path = Path(
    "/Users/alexpayne/lilac-mount-point/asap-datasets/current/sars_00_fragalysis"
)
# Combined Mpro SDF that sits directly under the dataset root.
sdf = frag_path.joinpath("Mpro_combined.sdf")

In [None]:
# Second SDF under the dataset root; its file name advertises corrected bond orders.
sdf_labeled = frag_path.joinpath(
    "2022_12_02_fragalysis_correct_bond_orders_220_P_structures.sdf"
)

In [None]:
# The two inputs handed to parse_fragalysis: the aligned-structure directory
# and the metadata table, both under the dataset root.
structure_dir = frag_path.joinpath("aligned")
mpro_csv = frag_path.joinpath("metadata.csv")

In [None]:
# Sanity check: is the metadata CSV present at the expected path?
mpro_csv.exists()

In [None]:
# Sanity check: is the aligned-structure directory present at the expected path?
structure_dir.exists()

## Load Molecules

In [None]:
# Load the molecules from the combined SDF; the loader is given the path as a str.
mols = load_openeye_sdfs(str(sdf))

In [None]:
# Number of entries in mols.
len(mols)

In [None]:
# Gather every molecule title into an array; these titles are what the
# Fragalysis dataset names get compared against.
mol_titles = [entry.GetTitle() for entry in mols]
datasets = np.array(mol_titles)

In [None]:
# Display the array of molecule titles.
datasets

In [None]:
# Take the first entry of mols for inspection.
mol = mols[0]

In [None]:
# Show the class of the object currently bound to mol.
type(mol)

In [None]:
# Dump every SD tag/value pair stored on the currently selected molecule.
sd_pairs = oechem.OEGetSDDataPairs(mol)
for entry in sd_pairs:
    print(entry.GetTag(), entry.GetValue())

In [None]:
# Load the molecules from the second SDF (the one named as having correct bond orders).
labeled_mols = load_openeye_sdfs(str(sdf_labeled))

In [None]:
# Number of entries in labeled_mols.
len(labeled_mols)

In [None]:
# Rebind mol to the first entry of labeled_mols for inspection.
mol = labeled_mols[0]

In [None]:
# Materialise the molecule's SD tag/value pairs into a list.
pairs = list(oechem.OEGetSDDataPairs(mol))

In [None]:
# Print each SD tag next to its value, one pair per line.
for sd_pair in oechem.OEGetSDDataPairs(mol):
    tag, value = sd_pair.GetTag(), sd_pair.GetValue()
    print(tag, value)

# Parse Fragalysis

In [None]:
# Parse the Fragalysis metadata CSV together with the aligned-structure
# directory. The result is indexed and iterated by later cells, which read
# .dataset, .smiles and .compound_id from each entry.
xtal_compounds = parse_fragalysis(mpro_csv, structure_dir)

In [None]:
# Display the first parsed compound record.
xtal_compounds[0]

In [None]:
# Keep the first parsed compound record around for a quick check.
cmpd = xtal_compounds[0]

In [None]:
# Is this compound's dataset name among the molecule titles collected in datasets?
cmpd.dataset in datasets

# Combine Information

## convert xtal_compounds to array

In [None]:
# Two arrays built from the same list, in the same order: the dataset names
# and the compound records. A boolean mask on the names is then used to pick
# the corresponding records.
dataset_names = [entry.dataset for entry in xtal_compounds]
dataset_array = np.array(dataset_names)
xtal_compounds_array = np.array(xtal_compounds)

In [None]:
# Copy the Fragalysis SMILES, dataset name and compound ID onto each molecule
# as SD data, matching a molecule to its compound through the molecule title,
# and finally retitle the molecule with the compound ID.

# Index the compounds by dataset name once instead of masking the whole array
# for every molecule. setdefault keeps the first compound per dataset, which
# is the entry the previous `[mask][0]` lookup selected.
compound_by_dataset = {}
for name, xtal_cmpd in zip(dataset_array.tolist(), xtal_compounds_array):
    compound_by_dataset.setdefault(name, xtal_cmpd)

# Resolve every molecule before modifying any of them, so that an unmatched
# title cannot leave the list half relabeled.
matches = []
unmatched = []
for mol in mols:
    dataset = mol.GetTitle()
    cmpd = compound_by_dataset.get(dataset)
    if cmpd is None:
        unmatched.append(dataset)
    else:
        matches.append((mol, cmpd))

if unmatched:
    # The loop below overwrites each title with the compound ID, so running
    # this cell a second time on the same molecules also ends up here.
    raise LookupError(
        f"{len(unmatched)} molecule title(s) have no Fragalysis dataset entry, "
        f"e.g. {unmatched[:5]}; reload the SDF if this cell has already been run."
    )

for mol, cmpd in matches:
    oechem.OESetSDData(mol, "SMILES", cmpd.smiles)
    oechem.OESetSDData(mol, "Dataset", cmpd.dataset)
    oechem.OESetSDData(mol, "Compound_ID", cmpd.compound_id)
    # From here on the title no longer carries the dataset name.
    mol.SetTitle(cmpd.compound_id)

In [None]:
# Spot-check the SMILES tag on one of the molecules (index 192).
oechem.OEGetSDData(mols[192], "SMILES")

## Save multi-compound sdf

In [None]:
from asapdiscovery.data.openeye import save_openeye_sdfs

In [None]:
# Write all molecules in mols to a single SDF under the Fragalysis dataset root.
labeled_sdf_path = frag_path / "Mpro_combined_labeled.sdf"
save_openeye_sdfs(mols, str(labeled_sdf_path))

# Oops: I forgot that the Fragalysis SDF files don't have bond orders.

# Combined prepped sdf files

In [None]:
from asapdiscovery.data.utils import combine_files

In [None]:
# Combine the files matched by the glob (every .sdf one directory level below
# sars_01_prepped_v3) into the single output path; see combine_files for the
# exact merge semantics.
prepped_sdf_glob = "/Users/alexpayne/lilac-mount-point/asap-datasets/current/sars_01_prepped_v3/*/*.sdf"
combined_sdf_out = "/Users/alexpayne/lilac-mount-point/asap-datasets/current/sars_01_prepped_v3/combined.sdf"
combine_files(prepped_sdf_glob, combined_sdf_out)

## The combined prepped SDF files don't have any of the useful information

## load in all sdfs

In [None]:
# Glob pattern for the .sdf files exactly one directory level below sars_01_prepped_v3.
glob_string = "/Users/alexpayne/lilac-mount-point/asap-datasets/current/sars_01_prepped_v3/*/*.sdf"

In [None]:
import glob

# glob.glob returns paths in arbitrary, filesystem-dependent order. Sorting
# keeps list positions stable between runs, which matters because later cells
# inspect the loaded molecules by index. The suffix filter is kept as a guard.
sdfs = sorted(path for path in glob.glob(glob_string) if path.endswith(".sdf"))

In [None]:
from asapdiscovery.data.openeye import load_openeye_sdf

In [None]:
from tqdm.notebook import tqdm

# Run the single-file loader over every collected SDF path, in order, with a
# notebook progress bar wrapped around the path list.
mols2 = list(map(load_openeye_sdf, tqdm(sdfs)))

## add parse_fragalysis information to loaded sdfs

In [None]:
import re

# Label every molecule in mols2 with the Fragalysis SMILES, dataset name and
# compound ID of the crystal dataset named inside its title, then retitle the
# molecule with the compound ID.

# Build the pattern once, outside the loop.
xtal_pat = r"Mpro-.*?_[0-9][A-Z]"
xtal_regex = re.compile(xtal_pat)

# Index the compounds by dataset name once instead of masking the whole array
# for every molecule. setdefault keeps the first compound per dataset, which
# is the entry the previous `[mask][0]` lookup selected.
compound_by_dataset = {}
for name, xtal_cmpd in zip(dataset_array.tolist(), xtal_compounds_array):
    compound_by_dataset.setdefault(name, xtal_cmpd)

# Resolve every molecule before modifying any of them, so that one bad title
# cannot leave the list half relabeled.
matches = []
unparsed = []
unmatched = []
for mol in tqdm(mols2):
    cmplx = mol.GetTitle()
    found = xtal_regex.search(cmplx)
    if found is None:
        # Previously this surfaced as "'NoneType' object is not subscriptable".
        unparsed.append(cmplx)
        continue
    dataset = found[0]
    cmpd = compound_by_dataset.get(dataset)
    if cmpd is None:
        unmatched.append(dataset)
        continue
    matches.append((mol, cmpd))

if unparsed:
    # The loop below overwrites each title with the compound ID, so running
    # this cell a second time on the same molecules can end up here.
    raise ValueError(
        f"{len(unparsed)} molecule title(s) do not contain a dataset name "
        f"matching {xtal_pat!r}, e.g. {unparsed[:5]}; reload the SDFs if this "
        "cell has already been run."
    )
if unmatched:
    raise LookupError(
        f"{len(unmatched)} dataset name(s) have no Fragalysis entry, "
        f"e.g. {unmatched[:5]}."
    )

for mol, cmpd in matches:
    oechem.OESetSDData(mol, "SMILES", cmpd.smiles)
    oechem.OESetSDData(mol, "Dataset", cmpd.dataset)
    oechem.OESetSDData(mol, "Compound_ID", cmpd.compound_id)
    # From here on the title no longer carries the dataset name.
    mol.SetTitle(cmpd.compound_id)

In [None]:
# Title of the molecule at index 99 of mols2.
mols2[99].GetTitle()

In [None]:
# Title at the same index in mols (presumably for comparison with the cell above).
mols[99].GetTitle()

In [None]:
# Spot-check the SMILES tag on the molecule at index 99 of mols2.
oechem.OEGetSDData(mols2[99], "SMILES")

## Now try saving it again

In [None]:
from asapdiscovery.data.openeye import save_openeye_sdfs

# Write all molecules in mols2 to one SDF inside the sars_01_prepped_v3 directory.
labeled_prepped_out = "/Users/alexpayne/lilac-mount-point/asap-datasets/current/sars_01_prepped_v3/Mpro_combined_labeled.sdf"
save_openeye_sdfs(mols2, labeled_prepped_out)

## success?

In [None]:
# Title of the molecule at index 37 of mols2.
mols2[37].GetTitle()

In [None]:
# Read back the "Dataset" SD tag from the same molecule.
oechem.OEGetSDData(mols2[37], "Dataset")