In [31]:
import re
import sys

import yaml
from openeye import oechem, oespruce

sys.path.append("../")
from asapdiscovery.data import pdb
from asapdiscovery.data.fragalysis import parse_xtal
from asapdiscovery.data.openeye import (
    load_openeye_pdb,
    save_openeye_pdb,
    split_openeye_mol,
)
from asapdiscovery.data.schema import CrystalCompoundData
from asapdiscovery.data.utils import edit_pdb_file, seqres_to_res_list
from asapdiscovery.docking.modeling import (
    align_receptor,
    du_to_complex,
    mutate_residues,
    prep_receptor,
    remove_extra_ligands,
)

In [6]:
frag_dir = "/lila/data/chodera/asap-datasets/mpro_fragalysis_2022_10_12/aligned"
main_dir = "/lila/data/chodera/kaminowb/stereochemistry_pred/mers/"
frag_fn = f"{main_dir}/test_sequence_metadata_xtal.csv"
xtal_compounds = parse_xtal(frag_fn, frag_dir)
assert len(xtal_compounds) == 1
xtal = xtal_compounds[0]
## From prep_proteins.py
re_pat = rf"/{xtal.dataset}_([0-9][A-Z])/"
try:
    frag_chain = re.search(re_pat, xtal.str_fn).groups()[0]
except AttributeError:
    print(
        f"Regex chain search failed: {re_pat}, {xtal.str_fn}.",
        "Using A as default.",
        flush=True,
    )
    frag_chain = "0A"
xtal.output_name = f"{xtal.dataset}_{frag_chain}_{xtal.compound_id}"

## We also want the chain in the form of a single letter ('A', 'B'), etc
xtal.active_site_chain = frag_chain[-1]

In [7]:
print(xtal, flush=True)

smiles='CC(C)(C)C=1C=CC(=CC1)N([C@H](C(=O)NCCC=2C=CC=C(F)C2)C=3C=CC=NC3)C(=O)C4=CC=CO4' compound_id='MAT-POS-f2460aef-1' dataset='Mpro-P0009' str_fn='/lila/data/chodera/asap-datasets/mpro_fragalysis_2022_10_12/aligned/Mpro-P0009_0A/Mpro-P0009_0A_bound.pdb' sdf_fn=None active_site_chain='A' output_name='Mpro-P0009_0A_MAT-POS-f2460aef-1' active_site=None oligomeric_state=None chains=None protein_chains=None


In [15]:
seqres_fn = "../metadata/mpro_sars2_seqres.yaml"
with open(seqres_fn) as f:
    seqres_dict = yaml.safe_load(f)
seqres = seqres_dict["SEQRES"]
seqres_list = seqres_to_res_list(seqres)
seqres = " ".join(seqres_list)
print(seqres_list, flush=True)
print(seqres, flush=True)
print(len(seqres_list), flush=True)

['SER', 'GLY', 'PHE', 'ARG', 'LYS', 'MET', 'ALA', 'PHE', 'PRO', 'SER', 'GLY', 'LYS', 'VAL', 'GLU', 'GLY', 'CYS', 'MET', 'VAL', 'GLN', 'VAL', 'THR', 'CYS', 'GLY', 'THR', 'THR', 'THR', 'LEU', 'ASN', 'GLY', 'LEU', 'TRP', 'LEU', 'ASP', 'ASP', 'VAL', 'VAL', 'TYR', 'CYS', 'PRO', 'ARG', 'HIS', 'VAL', 'ILE', 'CYS', 'THR', 'SER', 'GLU', 'ASP', 'MET', 'LEU', 'ASN', 'PRO', 'ASN', 'TYR', 'GLU', 'ASP', 'LEU', 'LEU', 'ILE', 'ARG', 'LYS', 'SER', 'ASN', 'HIS', 'ASN', 'PHE', 'LEU', 'VAL', 'GLN', 'ALA', 'GLY', 'ASN', 'VAL', 'GLN', 'LEU', 'ARG', 'VAL', 'ILE', 'GLY', 'HIS', 'SER', 'MET', 'GLN', 'ASN', 'CYS', 'VAL', 'LEU', 'LYS', 'LEU', 'LYS', 'VAL', 'ASP', 'THR', 'ALA', 'ASN', 'PRO', 'LYS', 'THR', 'PRO', 'LYS', 'TYR', 'LYS', 'PHE', 'VAL', 'ARG', 'ILE', 'GLN', 'PRO', 'GLY', 'GLN', 'THR', 'PHE', 'SER', 'VAL', 'LEU', 'ALA', 'CYS', 'TYR', 'ASN', 'GLY', 'SER', 'PRO', 'SER', 'GLY', 'VAL', 'TYR', 'GLN', 'CYS', 'ALA', 'MET', 'ARG', 'PRO', 'ASN', 'PHE', 'THR', 'ILE', 'LYS', 'GLY', 'SER', 'PHE', 'LEU', 'ASN', 'GLY'

In [30]:
initial_complex = load_openeye_pdb(xtal.str_fn)
initial_split = split_openeye_mol(initial_complex)
initial_prot = initial_split["pro"]
prot_seq = [
    res.GetName()
    for res in oechem.OEGetResidues(initial_prot)
    if (res.GetExtChainID() == "A")
    # and (res.GetName() != "DMS")
    # and (res.GetName() != "LIG")
]
print(prot_seq)
print(len(prot_seq))

['SER', 'GLY', 'PHE', 'ARG', 'LYS', 'MET', 'ALA', 'PHE', 'PRO', 'SER', 'GLY', 'LYS', 'VAL', 'GLU', 'GLY', 'CYS', 'MET', 'VAL', 'GLN', 'VAL', 'THR', 'CYS', 'GLY', 'THR', 'THR', 'THR', 'LEU', 'ASN', 'GLY', 'LEU', 'TRP', 'LEU', 'ASP', 'ASP', 'VAL', 'VAL', 'TYR', 'CYS', 'PRO', 'ARG', 'HIS', 'VAL', 'ILE', 'CYS', 'THR', 'SER', 'GLU', 'ASP', 'MET', 'LEU', 'ASN', 'PRO', 'ASN', 'TYR', 'GLU', 'ASP', 'LEU', 'LEU', 'ILE', 'ARG', 'LYS', 'SER', 'ASN', 'HIS', 'ASN', 'PHE', 'LEU', 'VAL', 'GLN', 'ALA', 'GLY', 'ASN', 'VAL', 'GLN', 'LEU', 'ARG', 'VAL', 'ILE', 'GLY', 'HIS', 'SER', 'MET', 'GLN', 'ASN', 'CYS', 'VAL', 'LEU', 'LYS', 'LEU', 'LYS', 'VAL', 'ASP', 'THR', 'ALA', 'ASN', 'PRO', 'LYS', 'THR', 'PRO', 'LYS', 'TYR', 'LYS', 'PHE', 'VAL', 'ARG', 'ILE', 'GLN', 'PRO', 'GLY', 'GLN', 'THR', 'PHE', 'SER', 'VAL', 'LEU', 'ALA', 'CYS', 'TYR', 'ASN', 'GLY', 'SER', 'PRO', 'SER', 'GLY', 'VAL', 'TYR', 'GLN', 'CYS', 'ALA', 'MET', 'ARG', 'PRO', 'ASN', 'PHE', 'THR', 'ILE', 'LYS', 'GLY', 'SER', 'PHE', 'LEU', 'ASN', 'GLY'

In [57]:
metadata = oespruce.OEStructureMetadata()
all_prot_chains = {
    res.GetExtChainID() for res in oechem.OEGetResidues(initial_prot)
}
for chain in all_prot_chains:
    seq_metadata = oespruce.OESequenceMetadata()
    seq_metadata.SetChainID(chain)
    seq_metadata.SetSequence(seqres)
    metadata.AddSequenceMetadata(seq_metadata)
for seq in metadata.GetSequenceMetadata():
    print(seq.GetChainID())
    print(seq.GetSequence())

A
SER GLY PHE ARG LYS MET ALA PHE PRO SER GLY LYS VAL GLU GLY CYS MET VAL GLN VAL THR CYS GLY THR THR THR LEU ASN GLY LEU TRP LEU ASP ASP VAL VAL TYR CYS PRO ARG HIS VAL ILE CYS THR SER GLU ASP MET LEU ASN PRO ASN TYR GLU ASP LEU LEU ILE ARG LYS SER ASN HIS ASN PHE LEU VAL GLN ALA GLY ASN VAL GLN LEU ARG VAL ILE GLY HIS SER MET GLN ASN CYS VAL LEU LYS LEU LYS VAL ASP THR ALA ASN PRO LYS THR PRO LYS TYR LYS PHE VAL ARG ILE GLN PRO GLY GLN THR PHE SER VAL LEU ALA CYS TYR ASN GLY SER PRO SER GLY VAL TYR GLN CYS ALA MET ARG PRO ASN PHE THR ILE LYS GLY SER PHE LEU ASN GLY SER CYS GLY SER VAL GLY PHE ASN ILE ASP TYR ASP CYS VAL SER PHE CYS TYR MET HIS HIS MET GLU LEU PRO THR GLY VAL HIS ALA GLY THR ASP LEU GLU GLY ASN PHE TYR GLY PRO PHE VAL ASP ARG GLN THR ALA GLN ALA ALA GLY THR ASP THR THR ILE THR VAL ASN VAL LEU ALA TRP LEU TYR ALA ALA VAL ILE ASN GLY ASP ARG TRP PHE LEU ASN ARG PHE THR THR THR LEU ASN ASP PHE ASN LEU VAL ALA MET LYS TYR ASN TYR GLU PRO LEU THR GLN ASP HIS VAL ASP ILE LE

In [48]:
loop_db = "/lila/home/kaminowb/.openeye/rcsb_spruce.loop_db"

In [60]:
## Set up DU building options
opts = oespruce.OEMakeDesignUnitOptions()
opts.SetSuperpose(False)
if loop_db is not None:
    opts.GetPrepOptions().GetBuildOptions().GetLoopBuilderOptions().SetLoopDBFilename(
        loop_db
    )

## Options set from John's function ########################################
## (https://github.com/FoldingAtHome/covid-moonshot/blob/454098f4255467f4655102e0330ebf9da0d09ccb/synthetic-enumeration/sprint-14-quinolones/00-prep-receptor.py)
opts.GetPrepOptions().SetStrictProtonationMode(True)
# set minimal number of ligand atoms to 5, e.g. a 5-membered ring fragment\
opts.GetSplitOptions().SetMinLigAtoms(5)

# also consider alternate locations outside binding pocket, important for later filtering
opts.GetPrepOptions().GetEnumerateSitesOptions().SetCollapseNonSiteAlts(False)

# alignment options, only matches are important
opts.GetPrepOptions().GetBuildOptions().GetLoopBuilderOptions().SetSeqAlignMethod(
    oechem.OESeqAlignmentMethod_Identity
)
opts.GetPrepOptions().GetBuildOptions().GetLoopBuilderOptions().SetSeqAlignGapPenalty(
    -1
)
opts.GetPrepOptions().GetBuildOptions().GetLoopBuilderOptions().SetSeqAlignExtendPenalty(
    0
)

# Both N- and C-termini should be zwitterionic
# Mpro cleaves its own N- and C-termini
# See https://www.pnas.org/content/113/46/12997
opts.GetPrepOptions().GetBuildOptions().SetCapNTermini(False)
opts.GetPrepOptions().GetBuildOptions().SetCapCTermini(False)
# Don't allow truncation of termini, since force fields don't have
#  parameters for this
opts.GetPrepOptions().GetBuildOptions().GetCapBuilderOptions().SetAllowTruncate(
    False
)
# Build loops and sidechains
opts.GetPrepOptions().GetBuildOptions().SetBuildLoops(True)
opts.GetPrepOptions().GetBuildOptions().SetBuildSidechains(True)

# Generate ligand tautomers
opts.GetPrepOptions().GetProtonateOptions().SetGenerateTautomers(True)

## Allow spruce to build tails at the ends of the protein
opts.GetPrepOptions().GetBuildOptions().GetLoopBuilderOptions().SetBuildTails(
    True
)

True

In [61]:
initial_bound = initial_prot.CreateCopy()
oechem.OEAddMols(initial_bound, initial_split["lig"].CreateCopy())
dus = oespruce.OEMakeDesignUnits(initial_bound, metadata, opts)
print(dus)

<oespruce.OEDesignUnitIter; proxy of <Swig Object of type 'OESystem::OEIter< OEBio::OEDesignUnit > *' at 0x7f8deefb06c0> >


DPI: 20.00, RFree: 0.00, Resolution: 0.00
Processing BU # 1 with title: _chainA._chainB_LIG, chains AB
Found unresolved C-terminal with 4 residues after GLY 302   A 1  , with sequence VTFQ
Found unresolved C-terminal with 1 residues after PHE 305   B 2  , with sequence Q
Attempting to build a C terminal tail: Q after PHE 305   B 2  
Attempting to build a C terminal tail: VTFQ after GLY 302   A 1  
Opened database /lila/home/kaminowb/.openeye/rcsb_spruce.loop_db
LoopDatabase Info: 
    161412 loops from RCSB last synced on 04-10-2020, were added to LoopTemplateDatabase on 04-22-2020 using Spruce Toolkit 1.1.0.a.1
    The loop database was built with a max loop length of 22, a terminus crop length of 2. Regular secondary structures were excluded


In [62]:
for du in dus:
    print(du.HasProtein(), du.HasLigand())

True True


In [63]:
du_prot = oechem.OEGraphMol()
print(du.GetProtein(du_prot))

True


In [64]:
print(len(list(oechem.OEGetResidues(initial_prot))))
print(len(list(oechem.OEGetResidues(du_prot))))
print(len(seqres_list) * 2)
print(
    " ".join(
        [
            res.GetName()
            for res in oechem.OEGetResidues(initial_prot)
            if res.GetExtChainID() == "A"
        ]
    )
)
print("---")
print(
    " ".join(
        [
            res.GetName()
            for res in oechem.OEGetResidues(du_prot)
            if res.GetExtChainID() == "A"
        ]
    )
)
print("---")
print(seqres)
print("-----")
print(
    " ".join(
        [
            res.GetName()
            for res in oechem.OEGetResidues(initial_prot)
            if res.GetExtChainID() == "B"
        ]
    )
)
print("---")
print(
    " ".join(
        [
            res.GetName()
            for res in oechem.OEGetResidues(du_prot)
            if res.GetExtChainID() == "B"
        ]
    )
)
print("---")
print(seqres)

607
612
612
SER GLY PHE ARG LYS MET ALA PHE PRO SER GLY LYS VAL GLU GLY CYS MET VAL GLN VAL THR CYS GLY THR THR THR LEU ASN GLY LEU TRP LEU ASP ASP VAL VAL TYR CYS PRO ARG HIS VAL ILE CYS THR SER GLU ASP MET LEU ASN PRO ASN TYR GLU ASP LEU LEU ILE ARG LYS SER ASN HIS ASN PHE LEU VAL GLN ALA GLY ASN VAL GLN LEU ARG VAL ILE GLY HIS SER MET GLN ASN CYS VAL LEU LYS LEU LYS VAL ASP THR ALA ASN PRO LYS THR PRO LYS TYR LYS PHE VAL ARG ILE GLN PRO GLY GLN THR PHE SER VAL LEU ALA CYS TYR ASN GLY SER PRO SER GLY VAL TYR GLN CYS ALA MET ARG PRO ASN PHE THR ILE LYS GLY SER PHE LEU ASN GLY SER CYS GLY SER VAL GLY PHE ASN ILE ASP TYR ASP CYS VAL SER PHE CYS TYR MET HIS HIS MET GLU LEU PRO THR GLY VAL HIS ALA GLY THR ASP LEU GLU GLY ASN PHE TYR GLY PRO PHE VAL ASP ARG GLN THR ALA GLN ALA ALA GLY THR ASP THR THR ILE THR VAL ASN VAL LEU ALA TRP LEU TYR ALA ALA VAL ILE ASN GLY ASP ARG TRP PHE LEU ASN ARG PHE THR THR THR LEU ASN ASP PHE ASN LEU VAL ALA MET LYS TYR ASN TYR GLU PRO LEU THR GLN ASP HIS VAL 

In [65]:
for i, (init_res, du_res, seqres_res) in enumerate(
    zip(
        [
            res.GetName()
            for res in oechem.OEGetResidues(initial_prot)
            if res.GetExtChainID() == "A"
        ],
        [
            res.GetName()
            for res in oechem.OEGetResidues(du_prot)
            if res.GetExtChainID() == "A"
        ],
        seqres_list,
    )
):
    if (
        (init_res != du_res)
        or (init_res != seqres_res)
        or (du_res != seqres_res)
    ):
        print(i, init_res, du_res, seqres_res, flush=True)