# Building data

## Assay data

In [1]:

import pandas as pd
from rdkit.Chem import PandasTools
from rdkit import Chem
from pathlib import Path
import yaml

In [2]:
# Semicolon-separated table of experimental measurements from the SAMPL6 repository.
measurements_url = "https://raw.githubusercontent.com/samplchallenges/SAMPL6/master/host_guest/Analysis/ExperimentalMeasurements/experimental_measurements.csv"
experimental_data = pd.read_csv(measurements_url, sep=";")

# Keep only the rows whose ID starts with "OA-".
is_oa_guest = experimental_data['ID'].str.startswith("OA-")
guests = experimental_data[is_oa_guest]

## Create directories for plb

In [3]:
# Output directories used by the rest of the notebook, resolved to absolute
# paths relative to the current working directory.
paths = {
    '00_data': Path('../00_data').resolve(),
    '01_protein_crd': Path('../01_protein/crd').resolve(),
    '02_ligands': Path('../02_ligands').resolve(),
}

# Create every directory (and any missing parents). A plain loop is used instead
# of a list comprehension so no throwaway list of None values is built/displayed.
for directory in paths.values():
    directory.mkdir(parents=True, exist_ok=True)

[None, None, None]

## 00_data

In [4]:
# Edge metadata: all three fields are left unset (None) in this notebook.
edges = dict.fromkeys(('mapper', 'planner', 'edges'))

# Target metadata. The SAMPL6 repository is cited for both the calculations
# and the measurement; the DOI is cited for the calculations only.
sampl6_repo = 'https://github.com/samplchallenges/SAMPL6'
sampl6_doi = '10.1007/s10822-018-0170-6'
target_references = {
    'calculations': [sampl6_repo, sampl6_doi],
    'measurement': [sampl6_repo],
}
target = dict(
    date='2021-02-04',
    name='OA',
    netcharge='?',  # NOTE(review): placeholder value — the net charge is not filled in
    pdb='none',
    references=target_references,
)


In [None]:
# Serialise the edge and target metadata as YAML files inside the 00_data directory.
for yaml_name, payload in (('edges.yml', edges), ('target.yml', target)):
    with (paths['00_data'] / yaml_name).open('w') as handle:
        yaml.dump(payload, handle)


## 01_protein_crd
 

In [15]:
# Here get Espaloma parameters for the Host
! wget -P ../01_protein/crd https://raw.githubusercontent.com/samplchallenges/SAMPL6/master/host_guest/OctaAcidsAndGuests/OA.sdf

--2024-07-24 17:06:40--  https://raw.githubusercontent.com/samplchallenges/SAMPL6/master/host_guest/OctaAcidsAndGuests/OA.sdf
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8000::154, 2606:50c0:8001::154, 2606:50c0:8002::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8000::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 17519 (17K) [text/plain]
Saving to: ‘../01_protein/crd/OA.sdf’


2024-07-24 17:06:40 (68,8 MB/s) - ‘../01_protein/crd/OA.sdf’ saved [17519/17519]



In [None]:
from toff import Parameterize

# Settings handed to toff's Parameterize: espaloma-0.3.1 force field, 'gro' and
# 'top' requested as output extensions (presumably GROMACS files — confirm against
# the toff documentation), existing output overwritten.
parameterize_settings = dict(
    force_field_type='espaloma',
    force_field_code='espaloma-0.3.1',
    ext_types=['gro', 'top'],
    overwrite=True,
    safe_naming_prefix='y',
    out_dir="../01_protein/crd/espaloma-0.3.1",
)
parameterizer = Parameterize(**parameterize_settings)

# Run the parameterizer on the host SDF, using "HOST" as the residue name.
parameterizer("../01_protein/crd/OA.sdf", mol_resi_name="HOST")

## 02_ligands

In [None]:
for name in guests.ID:
    ! wget -P ../02_ligands/sdf_split https://raw.githubusercontent.com/samplchallenges/SAMPL6/master/host_guest/OctaAcidsAndGuests/{name}.sdf

--2024-07-24 17:06:41--  https://raw.githubusercontent.com/samplchallenges/SAMPL6/master/host_guest/OctaAcidsAndGuests/OA-G0.sdf
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8000::154, 2606:50c0:8001::154, 2606:50c0:8002::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8000::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1961 (1,9K) [text/plain]
Saving to: ‘../02_ligands/OA-G0.sdf’


2024-07-24 17:06:41 (15,6 MB/s) - ‘../02_ligands/OA-G0.sdf’ saved [1961/1961]

--2024-07-24 17:06:41--  https://raw.githubusercontent.com/samplchallenges/SAMPL6/master/host_guest/OctaAcidsAndGuests/OA-G1.sdf
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8000::154, 2606:50c0:8001::154, 2606:50c0:8002::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8000::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1663 (1

In [None]:
# Build a unified SDF: concatenate every per-ligand file in sdf_split into ligands.sdf.

from rdkit import Chem
from glob import glob

# Sort the per-ligand files so the record order in ligands.sdf is identical on every
# run; glob() on its own returns paths in arbitrary, filesystem-dependent order.
split_sdf_paths = sorted(glob("../02_ligands/sdf_split/*.sdf"))
if not split_sdf_paths:
    # Fail early instead of silently writing an empty ligands.sdf.
    raise FileNotFoundError("No SDF files found in ../02_ligands/sdf_split")

with Chem.SDWriter("../02_ligands/ligands.sdf") as w:
    for p in split_sdf_paths:
        # removeHs=False keeps the hydrogens present in the input file.
        mol = Chem.MolFromMolFile(p, removeHs=False)
        if mol is None:
            # RDKit returns None when it cannot parse a file; report which one.
            raise ValueError(f"RDKit could not parse {p}")
        w.write(mol)


In [6]:
# Provenance strings that are identical for every ligand entry (hoisted out of the loop).
measurement_source = 'https://raw.githubusercontent.com/samplchallenges/SAMPL6/master/host_guest/Analysis/ExperimentalMeasurements/experimental_measurements.csv'
measurement_doi = '10.1007/s10822-018-0170-6'

# Map each guest ID to its measurement record, name and SMILES.
ligands = dict()
for _, row in guests.iterrows():
    sdf_path = f"../02_ligands/sdf_split/{row.ID}.sdf"
    # Read with RDKit's default settings; the molecule supplies the formal charge
    # and the SMILES string stored below.
    mol = Chem.MolFromMolFile(sdf_path)
    if mol is None:
        # RDKit returns None when it cannot parse a file; report which one instead
        # of failing later inside GetFormalCharge/MolToSmiles.
        raise ValueError(f"RDKit could not parse {sdf_path}")

    ligands[row.ID] = {
        'measurement': {
            'charge': float(Chem.GetFormalCharge(mol)),
            'comment': measurement_source,
            'doi': measurement_doi,
            # 'error' and 'value' are taken as-is from the dDG and DG columns.
            'error': row['dDG'],
            'type': 'dg',
            'unit': 'kcal/mol',
            'value': row['DG'],
        },
        'name': row.ID,
        'smiles': Chem.MolToSmiles(mol)
    }


In [7]:
# Serialise the per-ligand records as ligands.yml inside the 00_data directory.
ligands_yml = paths['00_data'] / 'ligands.yml'
with ligands_yml.open('w') as handle:
    yaml.dump(ligands, handle)

# Ligand OA-G7 (OA-G7.sdf) was excluded from the analysis because it was unstable during the FEP simulation.