# Building data

## Assay data

In [1]:

import pandas as pd
from rdkit.Chem import PandasTools
from rdkit import Chem
from pathlib import Path
import yaml

## Create directories for plb

In [2]:
# Benchmark directory layout: one resolved path per output directory,
# keyed by the name used in the rest of the notebook.
paths = {
    '00_data': Path('../00_data').resolve(),
    '01_protein_crd': Path('../01_protein/crd').resolve(),
    '02_ligands': Path('../02_ligands').resolve()
}

# mkdir is called only for its side effect, so use a plain loop: a list
# comprehension would build (and display) a useless list of None values.
for directory in paths.values():
    directory.mkdir(parents=True, exist_ok=True)

## 00_data

In [3]:
# Edge metadata: mapper, planner and the edge list are all left unset (None).
edges = dict.fromkeys(('mapper', 'planner', 'edges'))

# Target metadata: date, name, PDB codes and the literature references for the
# calculations and the measurements. The net charge is recorded as unknown ('?').
target = dict(
    date='2021-02-04',
    name='A2A',
    netcharge='?',
    pdb=['5OLZ', '6GT3'],
    references=dict(
        calculations=['10.1021/acs.jcim.0c00449'],
        measurement=['10.1021/acs.jcim.0c00449: Table S2'],
    ),
)


In [4]:
# Measured affinity per ligand name, on a negative log10 scale: the ligands cell
# converts each entry to molar units with 10**(-value).
pKi = {
    '4f': 8.39, '4e': 8.85, '4o': 8.32, '4a': 6.93,
    '4b': 7.29, '4c': 8.4,  '4d': 7.67, '4m': 7.73,
    '4q': 6.51, '4g': 8.11, '4n': 8.47, '4h': 7.81,
    '4i': 7.56, '4j': 7.98, '4k': 8.46, '4l': 8.34,
    '4r': 8.4,
}
# Supplier over the ligand SDF; it is iterated in the next cell to build one
# ligands entry per molecule.
suppl = Chem.SDMolSupplier('A2A_FEPstudy8_LigandSet6.sdf')

In [5]:
# Build one ligands.yml entry per molecule in the SDF, keyed by ligand name.
ligands = dict()
for idx, mol in enumerate(suppl):
    # SDMolSupplier yields None for records RDKit cannot parse; fail with a clear
    # message instead of an opaque error from AddHs.
    if mol is None:
        raise ValueError(f"Could not parse molecule at index {idx} of the ligand SDF")

    mol = Chem.AddHs(mol)
    name = mol.GetProp("_Name").strip()

    # Every ligand needs a tabulated affinity; name the offending ligand if not.
    if name not in pKi:
        raise KeyError(f"No pKi value tabulated for ligand {name!r}")

    ligands[name] = {
        'measurement': {
            'charge': float(Chem.GetFormalCharge(mol)),
            'comment': 'raw data from Table S2',
            'doi': '10.1021/acs.jcim.0c00449',
            'error': 0,
            # It is actually kd values.
            # NOTE(review): the lookup table is named pKi while the type is
            # recorded as 'kd' -- confirm which quantity Table S2 reports.
            'type': 'kd',
            'unit': 'M',
            # Convert from the negative log10 scale to molar units.
            'value': 10**(-pKi[name]),
        },
        'name': name,
        # SMILES is generated from the molecule after AddHs.
        'smiles': Chem.MolToSmiles(mol)
    }


In [6]:
# Serialize each 00_data record to its own YAML file inside paths['00_data'].
for yml_name, payload in (
    ('edges.yml', edges),
    ('target.yml', target),
    ('ligands.yml', ligands),
):
    with open(paths['00_data'] / yml_name, 'w') as f:
        yaml.dump(payload, f)


## 01_protein_crd

I will process A2A_FEPstudy8_receptor.pdb

1. Process in CHARMM-GUI:
    * Process protein only
    * pH=7, apply
    * Run PPM 2.0
    * Hexagonal box
    * Water thickness: 20
    * Length of X and Y: 90 (initial guess)
    * POPC
    * Do not include ions
    * force field:
        - AMBER:
            - Protein: ff14sb
            - Lipid: SLipids
            - Water: TIP3P
    * Input Generation Options: GROMACS and OpenMM

Open `gromacs/step5_input.gro`. This file contains the crystal information. Convert it to PDB with:

`gmx editconf -f charmm-gui-2491603708/gromacs/step5_input.gro -o temporal.pdb`.

The resulting PDB (`temporal.pdb`) also contains the crystal information. Split this PDB into POPC and protein. In PyMOL:

* `select popc, resn POPC`: `../01_protein/crd/membrane.pdb`
* `select prot, (polymer.protein)`
* load `A2A_FEPstudy8_receptor.pdb` and align to prot:
`align A2A_FEPstudy8_receptor, prot`, and save `A2A_FEPstudy8_receptor` to `A2A_FEPstudy8_receptor-aligned-to-charmm-gui.pdb`
* In the `manual-fix` directory, split the structure into protein and Na atom, and remove the ANISOU records: `grep -v "ANISOU" ../A2A_FEPstudy8_receptor-aligned-to-charmm-gui.pdb > protein.pdb`.
* In this PDB file, remove all the Hs of MET1, replace `HXT` with `OXT` in SER305, and remove the remaining Hs as well. Add the crystal information from `temporal.pdb`. Leave the Na atom as it is.


Finally, copy the membrane and protein to `01_protein/crd`

 

In [7]:

import parmed
import os
from openmm import app
from openff.toolkit.topology import Molecule
# from openmmforcefields.generators import GAFFTemplateGenerator
from rdkit import Chem


# For every listed protein force field: add hydrogens to the manually fixed
# protein with OpenMM, write the resulting PDB, and export GROMACS .gro/.top
# files through ParmEd into a force-field-specific directory.
pdb_prot_file = "manual-fix/protein.pdb"
protein_force_fields = ['amber14-all']

for protein_force_field in protein_force_fields:
    forcefield = app.ForceField(f'{protein_force_field}.xml', 'tip3p.xml')

    # One output directory per force field, named after it.
    dirname = '../01_protein/crd'
    filename = os.path.basename(pdb_prot_file).split('.')[0]
    out_dir = os.path.join(dirname, f"protein-{protein_force_field}")
    os.makedirs(out_dir, exist_ok=True)
    print(out_dir)

    # Load the structure and let the Modeller add hydrogens at pH 7.
    pdb_obj = app.PDBFile(pdb_prot_file)
    openmm_topology = pdb_obj.topology
    modeller = app.Modeller(openmm_topology, pdb_obj.positions)
    modeller.addHydrogens(forcefield, pH=7.0)

    # Keep a copy of the modified structure so it can be inspected.
    with open('../01_protein/crd/protein.pdb', 'w') as protonated_pdb:
        app.PDBFile.writeFile(modeller.topology, modeller.positions, protonated_pdb)

    # Parametrize the modified topology with the selected force field.
    system = forcefield.createSystem(modeller.topology)

    try:
        struct = parmed.openmm.load_topology(
            modeller.topology,
            system=system,
            xyz=modeller.positions,
        )
        # Coordinates first, then topology, both named after the input PDB.
        for extension in ('gro', 'top'):
            struct.save(os.path.join(out_dir, f'{filename}.{extension}'), overwrite=True)
    except ValueError as e:
        # A ValueError is only reported; the loop continues with the next force field.
        print(e)

../01_protein/crd/protein-amber14-all


### Co-factor

Here we build the co-factor topology from the AMBER ion atom types (I took them from the topology of a simulation I had already run and checked them against the information in BindFlow `gmx_water_models`). We only rename the Na atom to yNa, so that it does not collide with the Na used for neutralization.

Then, `gmx editconf -f cofactor.pdb -o cofactor.gro`

## 02_ligands

We need to transfer the coordinates of the ligands to the aligned structure. For that we will use `transrot`.

In [2]:
from rdkit import Chem
from transrot import TransRot
from pathlib import Path


def get_rdkit_coords(rdkit_mol):
    """Return the atom coordinates of a molecule's conformer.

    Parameters
    ----------
    rdkit_mol
        Molecule exposing ``GetConformer()`` and ``GetNumAtoms()``; the
        conformer must expose ``GetAtomPosition(i)`` returning an object with
        ``x``, ``y`` and ``z`` attributes.

    Returns
    -------
    list of tuple
        One ``(x, y, z)`` tuple per atom, in atom-index order. Empty when the
        molecule has no atoms.
    """
    conf = rdkit_mol.GetConformer()
    coords = []
    for i in range(rdkit_mol.GetNumAtoms()):
        # Fetch the position once per atom rather than once per component.
        pos = conf.GetAtomPosition(i)
        coords.append((pos.x, pos.y, pos.z))
    return coords


def set_rdkit_coords(rdkit_mol, coords):
    """Overwrite the atom positions of a molecule's conformer in place.

    Parameters
    ----------
    rdkit_mol
        Molecule exposing ``GetNumAtoms()`` and ``GetConformer()``.
    coords
        Sized iterable with one coordinate per atom, in atom-index order.

    Raises
    ------
    ValueError
        If ``coords`` does not hold exactly one entry per atom.
    """
    n_atoms = rdkit_mol.GetNumAtoms()
    if len(coords) != n_atoms:
        raise ValueError("Number of new coordinates doesn't match number of atoms")

    conformer = rdkit_mol.GetConformer()
    # Lengths were checked above, so zip pairs every atom index with its coordinate.
    for atom_idx, position in zip(range(n_atoms), coords):
        conformer.SetAtomPosition(atom_idx, position)

In [3]:

# Fit a TransRot transformation between the receptor in its original frame
# (coords1) and the same receptor aligned to the CHARMM-GUI system (coords2).
# NOTE(review): presumably the fit maps coords1 onto coords2 -- confirm against
# the transrot documentation.
original_pdb = Chem.MolFromPDBFile("A2A_FEPstudy8_receptor.pdb")
original_aligned_to_charmm_gui_pdb = Chem.MolFromPDBFile("A2A_FEPstudy8_receptor-aligned-to-charmm-gui.pdb")

# MolFromPDBFile returns None when parsing fails; stop here with a clear message.
if original_pdb is None or original_aligned_to_charmm_gui_pdb is None:
    raise ValueError("RDKit could not parse one of the receptor PDB files")

transformation = TransRot(
    coords1=get_rdkit_coords(original_pdb),
    coords2=get_rdkit_coords(original_aligned_to_charmm_gui_pdb)
)
transformation.fit()


sdf_suppl = "A2A_FEPstudy8_LigandSet6.sdf"

out_path = Path("../02_ligands/sdf_split").resolve()
out_path.mkdir(exist_ok=True, parents=True)

# Apply the fitted transformation to every ligand and write one .mol file each.
with Chem.SDMolSupplier(sdf_suppl, removeHs = False) as suppl:
    for lig_idx, lig in enumerate(suppl):
        # The supplier yields None for records RDKit cannot parse.
        if lig is None:
            raise ValueError(f"Could not parse molecule at index {lig_idx} of {sdf_suppl}")
        # Strip the name, as the ligands-dictionary cell does, so the file names
        # agree with the keys written to ligands.yml.
        name = lig.GetProp("_Name").strip()
        new_coords = transformation.transform(coords=get_rdkit_coords(lig))
        set_rdkit_coords(lig, new_coords)
        Chem.MolToMolFile(lig, str(out_path / f"{name}.mol"))

In [1]:
# Build a unified SDF

from rdkit import Chem
from glob import glob

# Merge the per-ligand .mol files into a single SDF, keeping explicit hydrogens.
with Chem.SDWriter("../02_ligands/ligands.sdf") as w:
    # glob returns paths in arbitrary order; sort them so the ligand order in
    # ligands.sdf is reproducible between runs and machines.
    for p in sorted(glob("../02_ligands/sdf_split/*.mol")):
        mol = Chem.MolFromMolFile(p, removeHs=False)
        # MolFromMolFile returns None when the file cannot be parsed.
        if mol is None:
            raise ValueError(f"RDKit could not parse {p}")
        w.write(mol)