# Generating parameters for the burn-in set

This notebook generates parameters for selected systems for the "burn-in" set. 

The workflow starts by generating OpenMM systems for the protein and ligands. Parameters for the ligands are derived from the "Sage" force field with AM1-BCC charges, while the protein uses AMBER14SB and the TIP3P model is used for the water molecules. 

These OpenMM systems are converted for use in the ParmEd library, before being exported as GROMACS gro/top input files.

The burn-in set contains the following systems:
- BRD4(1) with 2 ligands
- cmet with 2 ligands
- CycloD with 2 fragments and a merged ligand
- jnk1 with 2 ligands
- p38 with 2 ligands
- thrombin with 2 ligands

This notebook was based off the super notebook examples that are available on the Open Force Field GitHub page.

### Imports and functions to assist in parameter generation

In [5]:
import os
import shutil

from openff.toolkit.topology import Molecule
from openff.toolkit.typing.engines.smirnoff import ForceField as OFF_ForceField

try:
    from openmm import XmlSerializer, app, unit
    from openmm.app import HBonds, NoCutoff, PDBFile, ForceField
except ImportError:
    from simtk import unit
    from simtk.openmm import XmlSerializer, app
    from simtk.openmm.app import HBonds, NoCutoff, PDBFile

import parmed

import subprocess
from subprocess import Popen, PIPE

In [6]:
def find_params_sage(ligand_sdf, ligand_pdb,
                     force_field_file="openff_unconstrained-2.0.0.offxml"):
    """Create a ParmEd object for a ligand using the specified Sage force field file.

    Based on the OpenMM github pages:
        https://github.com/openmm/openmmforcefields

    Bond order, charge and connectivity information is extracted from the .sdf file. The
    atomic positions are extracted from the .pdb file. This dual file model ensures that
    all information is provided for the force field calculation. It also allows .pdb files
    to not contain all required information e.g. files extracted from docking studies.

    Parameters
    ----------
    ligand_sdf : str
        Path to ligand SDF
    ligand_pdb : str
        Path to ligand PDB
    force_field_file : str, optional
        Name (or path) of the OpenFF .offxml file to load. Defaults to
        "openff_unconstrained-2.0.0.offxml", the file this notebook has always used.

    Returns
    -------
    parmed_structure
        A ParmEd object containing all information on the ligand.
    """

    # Load in the molecule (chemistry comes from the SDF, not the PDB)
    ligand_molecule = Molecule(ligand_sdf)
    print("    Ligand information extracted from the .sdf file...")

    # Load the requested "Sage" force field file
    force_field = OFF_ForceField(force_field_file)

    # Parametrize the ligand molecule by creating a Topology object from it
    ligand_system = force_field.create_openmm_system(ligand_molecule.to_topology())

    # Read in the coordinates of the ligand from the PDB file
    ligand_pdbfile = PDBFile(ligand_pdb)
    print("    Ligand positions extracted from the .pdb file...")

    # Convert the ligand system to a ParmEd object.
    # NOTE(review): the System is built from the SDF while the topology and positions
    # come from the PDB; this presumably requires both files to list the atoms in the
    # same order -- confirm for every ligand.
    parmed_structure = parmed.openmm.load_topology(ligand_pdbfile.topology,
                                                   ligand_system,
                                                   ligand_pdbfile.positions)

    return parmed_structure

In [7]:
def find_amber_params(receptor_pdb):
    """Build a ParmEd structure from a PDB file using OpenMM's AMBER14 force field files.

    The PDB is parameterised with "amber14/protein.ff14SB.xml" and
    "amber14/tip3p.xml". The OpenMM System is created without constraints and
    with flexible water (``constraints=None``, ``rigidWater=False``) using the
    ``NoCutoff`` nonbonded method.

    Parameters
    ----------
    receptor_pdb : str
        Path to the prepared PDB file (protein, or water molecules).

    Returns
    -------
    parmed_structure
        A ParmEd object combining the PDB topology, the OpenMM System and the
        PDB positions.
    """
    pdb = PDBFile(receptor_pdb)

    # AMBER ff14SB protein parameters plus the TIP3P water definitions.
    forcefield = app.ForceField("amber14/protein.ff14SB.xml", "amber14/tip3p.xml")

    # Keyword arguments are kept exactly as before so the resulting System is unchanged.
    system_options = dict(
        nonbondedCutoff=1*unit.nanometer,
        nonbondedMethod=app.NoCutoff,
        constraints=None,
        rigidWater=False,
    )
    system = forcefield.createSystem(pdb.topology, **system_options)

    # Hand topology, parameters and coordinates over to ParmEd.
    return parmed.openmm.load_topology(pdb.topology, system, xyz=pdb.positions)

In [8]:
def output_parmed(parmed_structure, prefix):
    """Write ``<prefix>.top`` and ``<prefix>.gro`` for a ParmEd structure.

    Parameters
    ----------
    parmed_structure
        Object exposing ``save(filename, overwrite=...)``.
    prefix : str
        Output path without extension; existing files are overwritten.
    """
    # Topology first, then coordinates -- same order as before.
    for extension in ('top', 'gro'):
        parmed_structure.save(f'{prefix}.{extension}', overwrite=True)
    print("     Input files written.")

### Generate parameters for each system:

In [9]:
# List of proteins we need to set up.
# Commented-out entries are skipped; uncomment them to (re)generate their parameters.
protein_list = ['thrombin',
                #'cmet',
                #'cyclod',
                #'jnk1',
                #'p38',
                #'brd4',
               ]

for p in protein_list:

    print(f"Starting on {p}...")

    structure_dir = f'./structures/{p}'
    parameter_dir = f'./parameters/{p}'

    # make a folder for parameters (no error if it already exists)
    os.makedirs(parameter_dir, exist_ok=True)

    # Generate protein parameters
    protein_parmed_structure = find_amber_params(f'{structure_dir}/protein.pdb')

    # Generate water parameters, if this system ships a water.pdb.
    # The check is done once per protein and the variable is reset to None so a
    # structure from a previous protein can never be reused by accident.
    water_pdb = f'{structure_dir}/water.pdb'
    has_water = os.path.isfile(water_pdb)
    water_parmed_structure = find_amber_params(water_pdb) if has_water else None

    # get ligand filenames (cyclod has a third entry):
    if p == 'cyclod':
        ligand_filenames = ['ligand1', 'ligand2', 'ligand3']
    else:
        ligand_filenames = ['ligand1', 'ligand2']

    # generate files for each ligand
    for ligand in ligand_filenames:

        print(f"  Working on {ligand}...")

        # directory for each ligand
        ligand_dir = f'{parameter_dir}/{ligand}'
        os.makedirs(ligand_dir, exist_ok=True)

        # Generate sage parameters
        ligand_parmed_struct = find_params_sage(f'{structure_dir}/{ligand}.sdf',
                                                f'{structure_dir}/{ligand}.pdb')

        # Generate some outputs
        output_parmed(ligand_parmed_struct, f'{ligand_dir}/ligand')

        # Build the complex:
        # Order: protein, ligand and then any waters
        complex_parmed_structure = protein_parmed_structure + ligand_parmed_struct
        if has_water:
            complex_parmed_structure = complex_parmed_structure + water_parmed_structure

        output_parmed(complex_parmed_structure, f'{ligand_dir}/complex')


Starting on thrombin...
  Working on ligand1...
    Ligand information extracted from the .sdf file...
    Ligand positions extracted from the .pdb file...
     Input files written.
     Input files written.
  Working on ligand2...
    Ligand information extracted from the .sdf file...
    Ligand positions extracted from the .pdb file...
     Input files written.
     Input files written.


<b>NOTE:</b> This function should not be rerun by participants of the burn-in set. This is to ensure that everyone is using the same parameters. If you need to play with the ParmEd system further, please use the example below to read in the AMBER input files.

### Generating solvated systems with GROMACS:

In [10]:
# Load GROMACS
# NOTE(review): IPython runs each `!` command in its own temporary subshell, so the
# environment changes made by `module load` presumably do not persist for later cells
# or for the subprocess calls further down. The `gmx -h` output recorded in the next
# cell reports 2021.3-bioconda rather than 2020.1 -- confirm which build is intended.
!module load gromacs/2020.1-CPU

In [11]:
# Print the gmx help text; its banner shows which GROMACS version is found on PATH.
!gmx -h

                     :-) GROMACS - gmx, 2021.3-bioconda (-:

                            GROMACS is written by:
     Andrey Alekseenko              Emile Apol              Rossen Apostolov     
         Paul Bauer           Herman J.C. Berendsen           Par Bjelkmar       
       Christian Blau           Viacheslav Bolnykh             Kevin Boyd        
     Aldert van Buuren           Rudi van Drunen             Anton Feenstra      
    Gilles Gouaillardet             Alan Gray               Gerrit Groenhof      
       Anca Hamuraru            Vincent Hindriksen          M. Eric Irrgang      
      Aleksei Iupinov           Christoph Junghans             Joe Jordan        
    Dimitrios Karkoulis            Peter Kasson                Jiri Kraus        
      Carsten Kutzner              Per Larsson              Justin A. Lemkul     
       Viveca Lindahl            Magnus Lundborg             Erik Marklund       
        Pascal Merz             Pieter Meulenhoff            Teemu M

In [38]:
# Remember the directory the notebook was started in. The solvation cell below calls
# os.chdir(home_dir) before doing any work, so this cell must be run first.
home_dir = os.getcwd()
#print(os.getcwd())

In [39]:
# Always start from the notebook's home directory so relative paths resolve consistently.
os.chdir(home_dir)
cwd = os.getcwd()

# List of proteins we need to set up:
protein_list = ['thrombin',
                'cmet',
                'cyclod',
                'jnk1',
                'p38',
                'brd4',
               ]

# Rename 'HOH' to 'SOL' in the co-ordinate files, to allow gmx solvate to work.
# Also: rename water atoms so not to overlap with ligand atom names.
_GRO_WATER_RENAMES = (
    ('HOH      O', 'SOL     OW'),
    ('HOH     H1', 'SOL    HW1'),
    ('HOH     H2', 'SOL    HW2'),
)

# Equivalent renames for topology files; applied in order, the catch-all comes last.
_TOP_WATER_RENAMES = (
    ('1         O1      1    HOH      O', '1         O1      1    SOL     OW'),
    ('2         H1      1    HOH     H1', '2         H1      1    SOL    HW1'),
    ('3         H1      1    HOH     H2', '3         H1      1    SOL    HW2 '),
    ('HOH', 'SOL'),
)

# Replacement for the '[ system ]' header: a SOL moleculetype followed by the header
# itself, so that the topology defines SOL before gmx solvate adds SOL molecules.
_SOL_MOLECULETYPE_THEN_SYSTEM = """
[ moleculetype ]
; Name            nrexcl
SOL          3

[ atoms ]
;   nr       type  resnr residue  atom   cgnr    charge       mass  typeB    chargeB      massB
; residue    1 SOL rtp SOL q 0.0
    1         O1      1    SOL     OW      1 -0.83400000  15.999430   ; qtot -0.834000
    2         H1      1    SOL    HW1      2 0.41700000   1.007947   ; qtot -0.417000
    3         H1      1    SOL    HW2       3 0.41700000   1.007947   ; qtot 0.000000

#ifdef FLEXIBLE

[ bonds ]
;    ai     aj funct         c0         c1         c2         c3
      2      1     1   0.09572 462750.400000
      3      1     1   0.09572 462750.400000

[ angles ]
;    ai     aj     ak funct         c0         c1         c2         c3
      2      1      3     1   104.5200000 836.800000


#else

[ settles ]
; i     funct   doh     dhh
1     1   0.09572000   0.15139007

#endif

[ exclusions ]
1  2  3
2  1  3
3  1  2

[ system ]"""


def _rewrite_file(src_path, dst_path, transform):
    """Read src_path, apply transform(text) and write the result to dst_path."""
    with open(src_path, 'r') as handle:
        text = handle.read()
    with open(dst_path, 'w') as handle:
        handle.write(transform(text))


def _rename_water_in_gro(text):
    """Apply the HOH -> SOL residue/atom renames to the text of a .gro file."""
    for old, new in _GRO_WATER_RENAMES:
        text = text.replace(old, new)
    return text


def _prepare_topology(text):
    """Rename water to SOL in a .top file, or add a SOL moleculetype if SOL is missing."""
    # Check line by line. Iterating over the string itself yields single characters,
    # which can never start with "SOL", so the rename branch was unreachable before.
    # NOTE(review): this looks for "SOL" in the topology *before* any HOH -> SOL
    # rename; if the incoming files only ever name water "HOH" the rename branch
    # will still not trigger -- confirm that this is the intended test.
    contains_SOL = any(line.startswith("SOL") for line in text.splitlines())

    if contains_SOL:
        for old, new in _TOP_WATER_RENAMES:
            text = text.replace(old, new)
        return text

    # If SOL is missing, it needs to be added just before the [ system ] section.
    return text.replace('[ system ]', _SOL_MOLECULETYPE_THEN_SYSTEM)


def _run_gmx(gmx_args, work_dir, stdout_log, stderr_log, stdin_data=None):
    """Run one `gmx` command inside work_dir and append its stdout/stderr to the logs.

    A non-zero exit status is reported with a printed warning; execution continues so
    one failing system does not abort the whole batch.
    """
    # cwd= confines the directory change to the child process, so the notebook's own
    # working directory is never left pointing somewhere else if an exception occurs.
    result = subprocess.run(['gmx'] + gmx_args, cwd=work_dir, input=stdin_data,
                            stdout=PIPE, stderr=PIPE)
    stdout_log.append(result.stdout)
    stderr_log.append(result.stderr)
    if result.returncode != 0:
        print(f"    WARNING: gmx {gmx_args[0]} exited with code {result.returncode} in {work_dir}")
    return result.returncode


def _solvate_and_add_ions(work_dir, stem, editconf_gro, mdp_file):
    """Box, solvate and ionise <stem>.gro / <stem>.top inside work_dir.

    Returns (stdout_log, stderr_log): lists with one bytes entry per gmx step, in the
    order editconf, solvate, grompp, genion.
    """
    stdout_log, stderr_log = [], []

    _run_gmx(['editconf', '-f', f'{stem}.gro',
              '-o', editconf_gro,
              '-bt', 'cubic',
              '-d', '1.0',
              '-c'],
             work_dir, stdout_log, stderr_log)

    _run_gmx(['solvate', '-cp', editconf_gro,
              '-o', f'{stem}_solvated.gro',
              '-cs', 'spc216',
              '-p', f'{stem}.top'],
             work_dir, stdout_log, stderr_log)

    # Run gmx grompp:
    # Warning: This will rename the SOL atom names to match the ones given by openmm
    # (O instead of OW, H1 instead of HW1 and H2 instead of HW2)
    _run_gmx(['grompp', '-c', f'{stem}_solvated.gro',
              '-f', mdp_file,
              '-p', f'{stem}.top',
              '-o', 'for_genion.tpr',
              '-maxwarn', '3'],
             work_dir, stdout_log, stderr_log)

    # Run gmx genion. The group to replace ("SOL") is fed on stdin, which replaces the
    # former `echo "SOL" | gmx genion ...` shell pipeline and lets the real genion
    # output be captured (previously the grompp output was logged a second time).
    _run_gmx(['genion', '-s', 'for_genion.tpr',
              '-o', f'{stem}_ions.gro',
              '-p', f'{stem}.top',
              '-conc', '0.15',
              '-neutral'],
             work_dir, stdout_log, stderr_log, stdin_data=b'SOL\n')

    return stdout_log, stderr_log


mdp_file = f'{cwd}/genion.mdp'

for p in protein_list:

    print(f"Starting on {p}...")

    # get ligand filenames:
    if p == 'cyclod':
        ligand_filenames = ['ligand1', 'ligand2', 'ligand3']
    else:
        ligand_filenames = ['ligand1', 'ligand2']

    # generate files for each ligand
    for ligand in ligand_filenames:

        print(f"  Working on {ligand}...")

        source_dir = os.path.join(cwd, 'parameters', p, ligand)
        work_dir = os.path.join(cwd, 'solvated_systems', p, ligand)

        # directory for each ligand (also creates ./solvated_systems/<protein>)
        os.makedirs(work_dir, exist_ok=True)

        # Copy over GROMACS files, renaming water on the way.
        # ...co-ordinate files
        for name in ('complex.gro', 'ligand.gro'):
            _rewrite_file(os.path.join(source_dir, name),
                          os.path.join(work_dir, name),
                          _rename_water_in_gro)

        # ...topology files
        for name in ('complex.top', 'ligand.top'):
            _rewrite_file(os.path.join(source_dir, name),
                          os.path.join(work_dir, name),
                          _prepare_topology)

        #####################
        # Solvate the complex
        #####################
        stdout_comp, sterr_comp = _solvate_and_add_ions(work_dir, 'complex',
                                                        'editconf.gro', mdp_file)

        #TODO: Save logs

        ####################
        # Solvate the ligand
        ####################
        stdout_lig, sterr_lig = _solvate_and_add_ions(work_dir, 'ligand',
                                                      'ligand_editconf.gro', mdp_file)

        #TODO: Save logs

        #TODO: Cleanup directory




Starting on thrombin...
  Working on ligand1...
  Working on ligand2...
Starting on cmet...
  Working on ligand1...
  Working on ligand2...
Starting on cyclod...
  Working on ligand1...
  Working on ligand2...
  Working on ligand3...
Starting on jnk1...
  Working on ligand1...
  Working on ligand2...
Starting on p38...
  Working on ligand1...
  Working on ligand2...
Starting on brd4...
  Working on ligand1...
  Working on ligand2...


In [36]:
#sterr_lig