# Making Protein-ligand-benchmark-like data

In [36]:
import yaml
from pathlib import Path
from rdkit import Chem
import pandas as pd
import shutil

In [11]:
# Output directories of the benchmark layout, resolved to absolute paths so that
# later cells do not depend on how the relative segments are spelled.
paths = {
    '00_data': Path('../00_data').resolve(),
    '01_protein_crd': Path('../01_protein/crd').resolve(),
    '02_ligands': Path('../02_ligands').resolve()
}
# A plain loop rather than a list comprehension: mkdir is called purely for its
# side effect, so there is no list worth building (or echoing as the cell output).
for directory in paths.values():
    directory.mkdir(parents=True, exist_ok=True)

In [12]:
# Shallow-clone (--depth 1) the ABFE workflow repository, copy its CyclophilinD
# minimal_input example to ./input, then delete the clone.
# NOTE(review): if ./input already exists, `cp -r` places the copy inside it
# (input/minimal_input) instead of replacing it -- confirm before re-running.
! git clone --depth 1 https://github.com/bigginlab/ABFE_workflow abfe_workflow
! cp -r abfe_workflow/examples/CyclophilinD_data/minimal_input input
! rm -rf abfe_workflow
# Shallow-clone the fragment-opt-abfe-benchmark repository, copy
# free_energy_data/abfe.csv into the working directory, then delete the clone.
! git clone --depth 1 https://github.com/IAlibay/fragment-opt-abfe-benchmark data
! cp data/free_energy_data/abfe.csv .
! rm -rf data


Cloning into 'abfe_workflow'...
remote: Enumerating objects: 308, done.[K
remote: Counting objects: 100% (308/308), done.[K
remote: Compressing objects: 100% (160/160), done.[K
remote: Total 308 (delta 129), reused 251 (delta 111), pack-reused 0[K
Receiving objects: 100% (308/308), 5.68 MiB | 7.47 MiB/s, done.
Resolving deltas: 100% (129/129), done.
Cloning into 'data'...
remote: Enumerating objects: 197, done.[K
remote: Counting objects: 100% (197/197), done.[K
remote: Compressing objects: 100% (120/120), done.[K
remote: Total 197 (delta 37), reused 190 (delta 37), pack-reused 0[K
Receiving objects: 100% (197/197), 195.27 MiB | 11.82 MiB/s, done.
Resolving deltas: 100% (37/37), done.


## 00_data and 02_ligands

In [13]:
# Edges: every entry is left as None in this notebook; the dictionary is dumped
# unchanged to edges.yml.
edges = {
    'mapper': None,
    'planner': None,
    'edges': None
}

# Target metadata, dumped to target.yml.
target = {
    # Key was misspelled 'data' although the value is a date stamp.
    'date': '2023-12-02',
    'name': 'CyclophilinD',
    'netcharge': '+4 e',
    'pdb': 'https://github.com/bigginlab/ABFE_workflow/blob/main/examples/CyclophilinD_data/minimal_input/receptor_protein.pdb',
    'references': {
        'calculations': [
            # The leading '1' of the DOI prefix was missing ('0.1021/...').
            '10.1021/ja512751q',
            '10.1021/acs.jcim.9b00105',
            '10.1039/C9SC03754C',
        ],
        'measurement': [
            'https://github.com/IAlibay/fragment-opt-abfe-benchmark/blob/main/free_energy_data/abfe.csv'
        ]
    }
}

In [14]:
# Load the reference free energies and keep only the CycloD rows.
abfe_all = pd.read_csv('abfe.csv')
cyclod = (
    abfe_all[abfe_all.System == 'CycloD']
    # .assign returns a new frame, so the label column is never written into a
    # boolean-filtered slice (the pattern that raises SettingWithCopyWarning).
    .assign(ligand=lambda frame: frame['ligand_ID'].apply(lambda x: f"ligand-{x}"))
)

# Collapse to one row per ligand: mean and standard error of the mean of the
# calculated dG values, plus the first experimental dG seen for that ligand.
# Named aggregation yields the final flat column names directly, so no positional
# renaming of MultiIndex columns is needed.
ref_data = (
    cyclod
    .groupby('ligand', as_index=False)
    .agg(
        calc_dG=('calc_dG', 'mean'),
        sem_calc_dG=('calc_dG', 'sem'),
        exp_dG=('exp_dG', 'first'),
    )
)
ref_data

Unnamed: 0,ligand,calc_dG,sem_calc_dG,exp_dG
0,ligand-14,-12.916,0.193639,-11.22
1,ligand-16,-10.54,0.21631,-8.42
2,ligand-2,-7.686,0.167147,-9.06
3,ligand-27,-10.208,0.491573,-7.57
4,ligand-3,-4.706,0.098924,-2.93
5,ligand-39,-12.4885,0.129796,-8.43
6,ligand-4,-4.142,0.361613,-2.9
7,ligand-40,-11.778,0.234764,-8.08
8,ligand-7,-4.854,0.075273,-2.73
9,ligand-8,-7.24,0.263572,-4.04


In [37]:
# For every ligand in ref_data:
#   * append the molecule to the combined 02_ligands/ligands.sdf,
#   * copy its individual SDF file into 02_ligands/sdf_split/,
#   * record its metadata in `ligands` (dumped to ligands.yml in a later cell).
abfe_csv_url = 'https://github.com/IAlibay/fragment-opt-abfe-benchmark/blob/main/free_energy_data/abfe.csv'
split_dir = paths['02_ligands'] / 'sdf_split'
split_dir.mkdir(exist_ok=True)
ligands = dict()
with Chem.SDWriter(str(paths['02_ligands'] / 'ligands.sdf')) as writer:
    for _, row in ref_data.iterrows():
        ligand_sdf = f'input/ligands/{row.ligand}.sdf'
        # removeHs=False keeps the explicit hydrogens stored in the input file.
        mol = Chem.MolFromMolFile(ligand_sdf, removeHs=False)
        if mol is None:
            # RDKit signals a parse/sanitisation failure by returning None; stop
            # here with the offending file name instead of failing later inside
            # writer.write() with an opaque argument error.
            raise ValueError(f'RDKit could not read a molecule from {ligand_sdf}')
        writer.write(mol)
        shutil.copy(ligand_sdf, split_dir)
        ligands[row.ligand] = {
                'charge': float(Chem.GetFormalCharge(mol)),
                # Experimental value; no uncertainty is recorded, hence error 0.
                'measurement': {
                    'comment': 'Experimental Data',
                    'doi': abfe_csv_url,
                    'error': 0,
                    'type': 'dg',
                    'unit': 'kcal/mol',
                    'value': row.exp_dG,
                },
                # Calculated value: per-ligand mean with its standard error.
                'measurement_alt': {
                    'comment': 'Theoretical Data / GAFF2m',
                    'doi': [abfe_csv_url,
                            'https://www.nature.com/articles/s42004-022-00721-4'],
                    'error': row.sem_calc_dG,
                    'type': 'dg',
                    'unit': 'kcal/mol',
                    'value': row.calc_dG,
                },
                'name': row.ligand,
                'smiles': Chem.MolToSmiles(mol)
                }

In [38]:
# Serialise the three metadata dictionaries into the 00_data directory,
# one YAML file each, in the order listed.
yaml_outputs = (
    ('edges.yml', edges),
    ('target.yml', target),
    ('ligands.yml', ligands),
)
for filename, payload in yaml_outputs:
    with (paths['00_data'] / filename).open('w') as handle:
        yaml.dump(payload, handle)


## 01_protein_crd

In [40]:
# Copy the receptor structure into the protein coordinate directory. shutil is
# used instead of a shell `cp` so that a destination path containing spaces is
# passed as a single argument and the cell does not require a POSIX shell.
# The trailing semicolon suppresses the echoed return value in the notebook.
shutil.copy('input/receptor_protein.pdb', paths['01_protein_crd']);