# Imports

In [1]:
import mdtraj as md
import numpy as np
from datetime import datetime
from tqdm import tqdm
from pathlib import Path
import yaml

## Paths

In [2]:
idir = Path("/Users/alexpayne/lilac-mount-point/asap-datasets/current/sars_01_prepped_v3")
pdb_fn = Path("/Users/alexpayne/lilac-mount-point/asap-datasets/current/sars_01_prepped_v3/Mpro-x0072_0A_AAR-POS-d2a4d1df-1/Mpro-x0072_0A_AAR-POS-d2a4d1df-1_prepped_receptor_0.pdb")
pdb_fn2 = Path("/Users/alexpayne/lilac-mount-point/asap-datasets/current/sars_01_prepped_v3/Mpro-P0010_0A_PET-UNK-c9c1e0d8-4/Mpro-P0010_0A_PET-UNK-c9c1e0d8-4_prepped_receptor_0.pdb")
sars_selection_fn = Path("../../../metadata/sars2.yaml")

In [3]:
pdb_fn.exists()

True

In [4]:
pdb_fn2.exists()

True

## load files

In [5]:
pdb1 = md.load(pdb_fn)

In [6]:
pdb2 = md.load(pdb_fn2)

In [53]:
with open(sars_selection_fn, 'r') as f:
    sars_sel = yaml.safe_load(f)

In [54]:
sars_sel

{'ligand': 'resn LIG',
 'P1': 'resi 140-145+163+172 and polymer.protein',
 'P1_prime': 'resi 25-27 and polymer.protein',
 'P2': 'resi 41+49+54 and polymer.protein',
 'P3_4_5': 'resi 165-168+189-192 and polymer.protein',
 'sars_unique': 'resi 25+49+142+164+168+169+181+186+188+190+191 and polymer.protein'}

In [68]:
def pymol_to_mdtraj(pymol_string):
    """
    Simple function to convert a pymol-like residue string into mdtraj-like
    """
    x = pymol_string.replace('+', ' or residue ')
    x = x.replace('-', ' to ')
    x = x.replace('resi ', 'residue ')
    x = x.replace("polymer.protein", "")
    return x

In [69]:
for selection in sars_sel.values():
    print(pymol_to_mdtraj(selection))

resn LIG
residue 140 to 145 or residue 163 or residue 172 and 
residue 25 to 27 and 
residue 41 or residue 49 or residue 54 and 
residue 165 to 168 or residue 189 to 192 and 
residue 25 or residue 49 or residue 142 or residue 164 or residue 168 or residue 169 or residue 181 or residue 186 or residue 188 or residue 190 or residue 191 and 


# Examine topology

In [20]:
pdb1.topology

<mdtraj.Topology with 4 chains, 1070 residues, 10783 atoms, 11349 bonds at 0x17e59f1c0>

In [22]:
pdb2.topology

<mdtraj.Topology with 4 chains, 937 residues, 10376 atoms, 10809 bonds at 0x17e0be6a0>

In [31]:
pdb1.atom_slice(pdb1.topology.select("chainid 0"))

<mdtraj.Trajectory with 1 frames, 4682 atoms, 306 residues, without unitcells at 0x17df7ceb0>

In [29]:
pdb2.topology.chain(1).topology

<mdtraj.Topology with 4 chains, 937 residues, 10376 atoms, 10809 bonds at 0x17e0be6a0>

In [9]:
def make_selection(t, string_):
    print(t.atom_slice(t.topology.select(string_)))

In [42]:
string_ = "chainid 0"
for pdb in [pdb1, pdb2]:
    make_selection(pdb, string_)

<mdtraj.Trajectory with 1 frames, 4682 atoms, 306 residues, without unitcells>
<mdtraj.Trajectory with 1 frames, 4684 atoms, 306 residues, without unitcells>


In [43]:
string_ = "chainid 1"
for pdb in [pdb1, pdb2]:
    make_selection(pdb, string_)

<mdtraj.Trajectory with 1 frames, 48 atoms, 1 residues, without unitcells>
<mdtraj.Trajectory with 1 frames, 38 atoms, 1 residues, without unitcells>


In [44]:
string_ = "chainid 2"
for pdb in [pdb1, pdb2]:
    make_selection(pdb, string_)

<mdtraj.Trajectory with 1 frames, 4682 atoms, 306 residues, without unitcells>
<mdtraj.Trajectory with 1 frames, 4682 atoms, 306 residues, without unitcells>


In [45]:
string_ = "chainid 3"
for pdb in [pdb1, pdb2]:
    make_selection(pdb, string_)

<mdtraj.Trajectory with 1 frames, 1371 atoms, 457 residues, without unitcells>
<mdtraj.Trajectory with 1 frames, 972 atoms, 324 residues, without unitcells>


In [60]:
string_ = "(chainid 0 or chainid 1) and resid 41"
for pdb in [pdb1, pdb2]:
    make_selection(pdb, string_)

<mdtraj.Trajectory with 1 frames, 16 atoms, 1 residues, without unitcells>
<mdtraj.Trajectory with 1 frames, 16 atoms, 1 residues, without unitcells>


In [121]:
string_ = "not (name =~ 'H.*') and chainid 2"
for pdb in [pdb1, pdb2]:
    make_selection(pdb, string_)

<mdtraj.Trajectory with 1 frames, 2368 atoms, 306 residues, without unitcells>
<mdtraj.Trajectory with 1 frames, 2368 atoms, 306 residues, without unitcells>


## doin it

In [10]:
full_protein = "not element H and (chainid 0 or chainid 2)"
active_site_string = "not element H and (chainid 0 or chainid 2) and (residue 140 to 145 or residue 163 or residue 172 or residue 25 to 27 or residue 41 or residue 49 or residue 54 or residue 165 to 168 or residue 189 to 192)"

In [11]:
for pdb in [pdb1, pdb2]:
    make_selection(pdb, full_protein)

<mdtraj.Trajectory with 1 frames, 4736 atoms, 612 residues, without unitcells>
<mdtraj.Trajectory with 1 frames, 4736 atoms, 612 residues, without unitcells>


In [12]:
md.rmsd(pdb1, pdb2, atom_indices=pdb1.topology.select(active_site_string))

array([0.48162633], dtype=float32)

In [13]:
md.rmsd(pdb1, pdb2, atom_indices=pdb2.topology.select(active_site_string))

array([0.42694336], dtype=float32)

In [14]:
pdb1_active_site = pdb1.atom_slice(pdb1.topology.select(active_site_string))

In [15]:
pdb2_active_site = pdb2.atom_slice(pdb2.topology.select(active_site_string))

In [16]:
pdb2_active_site

<mdtraj.Trajectory with 1 frames, 354 atoms, 44 residues, without unitcells at 0x10c101520>

In [17]:
pdb1_active_site

<mdtraj.Trajectory with 1 frames, 354 atoms, 44 residues, without unitcells at 0x10c185730>

In [18]:
t = md.join([pdb1_active_site, pdb2_active_site])

In [19]:
t.save_pdb("combined_test.pdb")

In [20]:
help(t.save_pdb)

Help on method save_pdb in module mdtraj.core.trajectory:

save_pdb(filename, force_overwrite=True, bfactors=None) method of mdtraj.core.trajectory.Trajectory instance
    Save trajectory to RCSB PDB format
    
    Parameters
    ----------
    filename : path-like
        filesystem path in which to save the trajectory
    force_overwrite : bool, default=True
        Overwrite anything that exists at filename, if its already there
    bfactors : array_like, default=None, shape=(n_frames, n_atoms) or (n_atoms,)
        Save bfactors with pdb file. If the array is two dimensional it should
        contain a bfactor for each atom in each frame of the trajectory.
        Otherwise, the same bfactor will be saved in each frame.



In [24]:
md.shrake_rupley(t)

array([[4.47687835e-01, 9.30880159e-02, 1.25794613e-03, 3.15857202e-01,
        8.93141776e-02, 4.18538690e-01, 2.13850841e-01, 6.72101304e-02,
        2.64168680e-02, 0.00000000e+00, 1.64067164e-01, 1.20762825e-01,
        3.88403893e-01, 4.74245697e-01, 1.82264764e-02, 6.28973078e-03,
        1.71080679e-01, 3.57152998e-01, 2.13850848e-02, 1.25794616e-02,
        4.33991402e-01, 1.55985326e-01, 3.93008411e-01, 2.64168680e-02,
        1.92465752e-01, 1.93085849e-01, 1.50953531e-02, 0.00000000e+00,
        1.82264764e-02, 2.66684592e-01, 1.35858178e-01, 8.54366049e-02,
        4.59079385e-01, 4.40281145e-02, 2.75490195e-01, 2.12059602e-01,
        6.54131994e-02, 2.67942518e-01, 1.90338627e-01, 2.13850841e-01,
        4.46548671e-01, 7.04449862e-02, 3.05680901e-01, 5.12291372e-01,
        2.42783606e-01, 3.27065997e-02, 2.46557444e-01, 2.36493871e-01,
        1.16988987e-01, 3.23292166e-01, 6.66711479e-02, 2.29917243e-01,
        3.49720508e-01, 4.90598977e-02, 0.00000000e+00, 1.417451

# Iterate through all PDBs 

In [46]:
import logging
def analyze_mp(fn, out_dir):
    full_protein_selection = "not element H and (chainid 0 or chainid 2)"
    active_site_string = "not element H and (chainid 0 or chainid 2) and (residue 140 to 145 or residue 163 or residue 172 or residue 25 to 27 or residue 41 or residue 49 or residue 54 or residue 165 to 168 or residue 189 to 192)"
    output_name = fn.stem

    ## Prepare logger
    handler = logging.FileHandler(out_dir / f"{output_name}-log.txt", mode="w")
    prep_logger = logging.getLogger(output_name)
    prep_logger.setLevel(logging.INFO)
    prep_logger.addHandler(handler)
    prep_logger.info(datetime.isoformat(datetime.now()))

    prep_logger.info(f"Loading {fn}")
    pdb = md.load(fn)

    active_site_idx = pdb.topology.select(active_site_string)
    active_site = pdb.atom_slice(active_site_idx)

    full_protein_idx = pdb.topology.select(full_protein_selection)
    full_protein = pdb.atom_slice(full_protein_idx)

    prep_logger.info(f"Saving idx arrays to {out_dir}")
    np.save(out_dir / f"{output_name}_active_site.npy", active_site_idx)
    np.save(out_dir / f"{output_name}_full_protein.npy", full_protein_idx)

    prep_logger.info(f"Saving pdbs to {out_dir}")
    active_site.save(out_dir / f"{output_name}_active_site.pdb")
    full_protein.save(out_dir / f"{output_name}_full_protein.pdb")

    return True

In [47]:
analyze_mp(pdb_fn, Path("."))

True

In [32]:
pdb_fn.stem

'Mpro-x0072_0A_AAR-POS-d2a4d1df-1_prepped_receptor_0'