# Imports

In [None]:
import os
from glob import glob
from pathlib import Path

import pandas

from asapdiscovery.data.fragalysis import parse_xtal
from asapdiscovery.data.schema import CrystalCompoundData

## Paths

In [None]:
# Hard-coded, machine-specific input locations; edit these before running elsewhere.
# Alternate location of the compound tracker CSV, left commented out:
# xtal_fn = Path("/data/chodera/asap-datasets/mpro_fragalysis_2022_10_12/extra_files/Mpro_compound_tracker_csv.csv")
# Compound tracker CSV; passed as the first argument to parse_xtal / parse_xtal_test below.
xtal_fn = Path("/Users/alexpayne/lilac-mount-point/asap-datasets/mpro_fragalysis_2022_10_12/extra_files/Mpro_compound_tracker_csv.csv")
# Directory passed as the second argument (x_dir); it is searched for per-dataset PDB files.
x_dir = Path("/Users/alexpayne/lilac-mount-point/asap-datasets/mpro_fragalysis_2022_10_12/aligned/")

In [None]:
help(parse_xtal)

In [None]:
xtal_compounds_p_only = parse_xtal(xtal_fn, x_dir)

In [None]:
len(xtal_compounds_p_only)

In [None]:
xtal_compounds = parse_xtal(xtal_fn, x_dir, p_only=False)

In [None]:
len(xtal_compounds)

# Implement version here:

In [None]:
def parse_xtal_test(x_fn, x_dir, p_only=True, keep_filter=True, enumerate_chains=False):
    """
    Load all crystal structures into schema.CrystalCompoundData objects.

    Parameters
    ----------
    x_fn : str
        CSV file giving information on each crystal structure. Must contain
        the columns "SMILES", "Dataset" and "Compound ID".
    x_dir : str
        Path to directory containing directories with crystal structure PDB
        files
    p_only : bool, default=True
        Whether to keep only rows whose "Dataset" entry contains "-P"
        (fragalysis structures of the format Mpro-P*). Rows whose "Dataset"
        entry is not a string are always dropped.
    keep_filter : bool, default=True
        If True, look up a structure file for every dataset, store its path
        under ``str_fn`` and drop datasets for which no file exists. If
        False, skip the lookup entirely and build objects for every row
        (without ``str_fn``).
    enumerate_chains : bool, default=False
        Accepted but currently ignored by this implementation.

    Returns
    -------
    List[schema.CrystalCompoundData]
        List of parsed crystal structures

    Raises
    ------
    AssertionError
        If ``keep_filter`` is True and no structure file was found for any
        dataset.
    """
    import os

    import pandas
    from asapdiscovery.data.schema import CrystalCompoundData
    from tqdm import tqdm

    df = pandas.read_csv(x_fn)

    if p_only:
        ## Find all P-files
        idx = [isinstance(d, str) and ("-P" in d) for d in df["Dataset"]]
    else:
        idx = [isinstance(d, str) for d in df["Dataset"]]

    ## Build argument dicts for the CrystalCompoundData objects
    xtal_dicts = [
        dict(zip(("smiles", "dataset", "compound_id"), r[1].values))
        for r in df.loc[idx, ["SMILES", "Dataset", "Compound ID"]].iterrows()
    ]

    if keep_filter:
        ## Add structure filename information and filter if not found
        filtered_xtal_dicts = []
        for d in tqdm(xtal_dicts):
            dataset = d["dataset"]
            # Candidates in priority order: seqres A, seqres B, bound A,
            # bound B. Built with a plain f-string so that braces in x_dir
            # or the dataset name cannot break the path.
            candidates = (
                f"{x_dir}/{dataset}_0{chain}/{dataset}_0{chain}_{suf}.pdb"
                for suf in ("seqres", "bound")
                for chain in ("A", "B")
            )
            # First existing candidate wins; each path is checked only once.
            fn = next((c for c in candidates if os.path.isfile(c)), None)
            if fn is None:
                print(f'No structure found for {d["dataset"]}.')
                continue
            d["str_fn"] = fn
            filtered_xtal_dicts.append(d)
        # Raised explicitly (rather than via `assert`) so the check is not
        # stripped when Python runs with -O.
        if not filtered_xtal_dicts:
            raise AssertionError("No structure filenames were found by parse_xtal")
        ## Build CrystalCompoundData objects for each row
        print(f"Loading {len(filtered_xtal_dicts)} structures")
        xtal_dicts = filtered_xtal_dicts
    xtal_compounds = [CrystalCompoundData(**d) for d in xtal_dicts]

    return xtal_compounds

In [None]:
xtal_compounds_test = parse_xtal_test(xtal_fn, x_dir, p_only=False, keep_filter=False)

In [None]:
len(xtal_compounds_test)

In [None]:
xtal_compounds_test_v2 = parse_xtal_test(xtal_fn, x_dir, p_only=False, keep_filter=True)
len(xtal_compounds_test_v2)

## where are the other 300 structures going?

In [None]:
df = pandas.read_csv(xtal_fn)

In [None]:
idx = [type(d) is str for d in df["Dataset"]]

In [None]:
sum(idx)

In [None]:
xtal_compounds_test

### They just aren't getting enumerated during the filename-addition step

In [None]:
def parse_xtal_test(x_fn, x_dir, p_only=True, enumerate_chains=False):
    """
    Load all crystal structures into schema.CrystalCompoundData objects.

    For every selected row of the CSV, the first existing structure file is
    recorded under ``str_fn``; rows without any structure file are reported
    and dropped.

    Parameters
    ----------
    x_fn : str
        CSV file giving information on each crystal structure. Must contain
        the columns "SMILES", "Dataset" and "Compound ID".
    x_dir : str
        Path to directory containing directories with crystal structure PDB
        files
    p_only : bool, default=True
        Whether to keep only rows whose "Dataset" entry contains "-P"
        (fragalysis structures of the format Mpro-P*). Rows whose "Dataset"
        entry is not a string are always dropped.
    enumerate_chains : bool, default=False
        Accepted but currently ignored: at most one structure file (the
        first match) is kept per dataset regardless of this flag.

    Returns
    -------
    List[schema.CrystalCompoundData]
        List of parsed crystal structures

    Raises
    ------
    AssertionError
        If no structure file was found for any dataset.
    """
    import os

    import pandas
    from asapdiscovery.data.schema import CrystalCompoundData
    from tqdm import tqdm

    df = pandas.read_csv(x_fn)

    if p_only:
        ## Find all P-files
        idx = [isinstance(d, str) and ("-P" in d) for d in df["Dataset"]]
    else:
        idx = [isinstance(d, str) for d in df["Dataset"]]

    ## Build argument dicts for the CrystalCompoundData objects
    xtal_dicts = [
        dict(zip(("smiles", "dataset", "compound_id"), r[1].values))
        for r in df.loc[idx, ["SMILES", "Dataset", "Compound ID"]].iterrows()
    ]

    ## Add structure filename information and filter if not found
    filtered_xtal_dicts = []
    for d in tqdm(xtal_dicts):
        dataset = d["dataset"]
        # Candidates in priority order: seqres A, seqres B, bound A, bound B.
        # Built with a plain f-string so that braces in x_dir or the dataset
        # name cannot break the path.
        candidates = (
            f"{x_dir}/{dataset}_0{chain}/{dataset}_0{chain}_{suf}.pdb"
            for suf in ("seqres", "bound")
            for chain in ("A", "B")
        )
        # First existing candidate wins; each path is checked only once.
        fn = next((c for c in candidates if os.path.isfile(c)), None)
        if fn is None:
            print(f'No structure found for {d["dataset"]}.')
            continue
        d["str_fn"] = fn
        filtered_xtal_dicts.append(d)

    # Raised explicitly (rather than via `assert`) so the check is not
    # stripped when Python runs with -O.
    if not filtered_xtal_dicts:
        raise AssertionError("No structure filenames were found by parse_xtal")

    ## Build CrystalCompoundData objects for each row
    print(f"Loading {len(filtered_xtal_dicts)} structures")
    xtal_compounds = [CrystalCompoundData(**d) for d in filtered_xtal_dicts]

    return xtal_compounds

In [None]:
test =  parse_xtal_test(xtal_fn, x_dir, p_only=False, enumerate_chains=True)

## split up function

In [None]:
# Re-read the compound tracker and flag the rows whose "Dataset" entry is a string.
df = pandas.read_csv(xtal_fn)
idx = [type(name) is str for name in df["Dataset"]]

## Build argument dicts for the CrystalCompoundData objects
# One dict per flagged row, mapping the CSV columns onto the keyword names
# smiles / dataset / compound_id.
xtal_dicts = [
    dict(zip(("smiles", "dataset", "compound_id"), row.values))
    for _, row in df.loc[idx, ["SMILES", "Dataset", "Compound ID"]].iterrows()
]

In [None]:
xtal_dicts

In [None]:
len(xtal_dicts)

## use glob instead

In [None]:
d0 = xtal_dicts[0]

In [None]:
glob_str = f"{d0['dataset']}*/*.pdb"
glob_str

In [None]:
from glob import glob

In [None]:
list(x_dir.glob(glob_str))

In [None]:
# Collect every PDB path matched by a per-dataset glob so the total can be
# compared against a glob over the whole directory.
all_fns = []
for d in xtal_dicts:
    glob_str = f"{d['dataset']}*/*.pdb"
    fns = list(x_dir.glob(glob_str))
    if fns:
        # Only the last match is kept on the dict itself.
        d['str_fn'] = fns[-1]
    all_fns.extend(fns)
# No existence filter is applied in this cell: every dict is kept, whether
# or not any file matched.
filtered_xtal_dicts = list(xtal_dicts)

In [None]:
len(filtered_xtal_dicts)

## full glob 

In [None]:
full_glob = list(x_dir.glob("*/*.pdb"))

In [None]:
len(full_glob)

## compare full glob to partial

In [None]:
# Paths found by the directory-wide glob but not by the per-dataset globs.
# The set is built once so each membership test is O(1) instead of a scan
# of the whole all_fns list; the order of full_glob is preserved.
all_fns_set = set(all_fns)
missing = [fn for fn in full_glob if fn not in all_fns_set]

In [None]:
missing

In [None]:
len(missing)

In [None]:
len(all_fns)

In [None]:
len(full_glob)

### The missing structures are listed in metadata.csv but not in the compound tracker CSV

# try using metadata.csv instead

## load csv

In [None]:
metadata_csv = Path("/Users/alexpayne/lilac-mount-point/asap-datasets/mpro_fragalysis_2022_10_12/metadata.csv")

In [None]:
metadata_csv

In [None]:
metadf = pandas.read_csv(metadata_csv, index_col=0)

In [None]:
metadf

## build argument dicts

In [None]:
## Build argument dicts for the CrystalCompoundData objects
# One dict per metadata row: smiles <- "smiles", dataset <- "crystal_name",
# compound_id <- "alternate_name".
xtal_dicts = [
    dict(zip(("smiles", "dataset", "compound_id"), row.values))
    for _, row in metadf[["smiles", "crystal_name", "alternate_name"]].iterrows()
]

In [None]:
len(xtal_dicts)

In [None]:
from tqdm import tqdm

# Build one argument dict per structure file found on disk.
filtered_xtal_dicts = []
all_fns = []
for d in tqdm(xtal_dicts):
    # NOTE(review): "<dataset>*" also matches directories of any other
    # dataset whose name merely starts with this one -- confirm the directory
    # naming makes that impossible.
    glob_str = f"{d['dataset']}*/*.pdb"
    for fn in x_dir.glob(glob_str):
        if not os.path.isfile(fn):
            continue
        # Append a fresh dict for every file. Re-appending and mutating the
        # shared `d` would leave every entry for a dataset pointing at the
        # last file matched.
        filtered_xtal_dicts.append({**d, "str_fn": str(fn)})
        all_fns.append(fn)

In [None]:
xtal_compounds = [CrystalCompoundData(**d) for d in filtered_xtal_dicts]

In [None]:
len(xtal_compounds)

## using new version from repo

In [None]:
from asapdiscovery.data import fragalysis as f
from importlib import reload
reload(f)

In [None]:
xtals = f.parse_fragalysis(metadata_csv, x_dir)

## testing filtering

In [None]:
# Boolean mask of rows whose crystal name contains the literal text "Mpro-P".
# regex=False keeps this a plain substring test; na=False marks missing names
# as non-matching instead of raising TypeError as `"Mpro-P" in nan` would.
idx = metadf["crystal_name"].str.contains("Mpro-P", regex=False, na=False)

In [None]:
metadf[idx]

## using new filtering version from repo

In [None]:
reload(f)

In [None]:
xtals = f.parse_fragalysis(metadata_csv, x_dir, name_filter="Mpro-P", drop_duplicate_datasets=True)

In [None]:
xtals = f.parse_fragalysis(metadata_csv, x_dir, drop_duplicate_datasets=False)

In [None]:
xtals = f.parse_fragalysis(metadata_csv, x_dir, name_filter=["MAT-POS"], name_filter_column="alternate_name", drop_duplicate_datasets=True)