# Format dataframe for constrained embedding task

In [1]:
import os
import glob
import pandas as pd
from rdkit.Chem import PandasTools

In [2]:
# Run parameters; both are interpolated into the input and output CSV file names.
# `pdb` is presumably the PDB ID of the reference structure - confirm.
pdb, method = '7JTO', 'difflinker'

In [3]:
# Folder holding the generated-molecule tables; the output CSV is written here too.
gen_folder = 'data/generated'
input_csv = os.path.join(gen_folder, f'{pdb}_sampled_{method}_valid.csv')

# Load the table for the selected pdb/method and preview the first rows.
df = pd.read_csv(input_csv)
df.head()

Unnamed: 0,ID,reference,lig_id,protac_smiles,linker_smiles,anchor_smiles,warhead_smiles,anchor_ev,warhead_ev,POI,...,gen_filename,frags,tanimoto,qed_linker,sa_linker,num_rings_linker,num_rot_bonds_linker,branched,PAINS,ring_arom
0,7JTO_difflinker_678,7JTO,MS33,Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O...,CCCCCC(=O)OCOCCOC,Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O...,CN1CCN(c2ccc(-c3cccc(CN4CCNCC4)c3)cc2NC(=O)C2=...,Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O...,CN1CCN(c2ccc(-c3cccc(CN4CCN([*:2])CC4)c3)cc2NC...,WDR5,...,output_48_7JTO__MS33_0_len14.sdf,[H]C(=O)N([H])C([H])(C([H])([H])[H])C([H])([H]...,0.125654,0.325778,2.056356,0,9,False,False,True
1,7JTO_difflinker_1584,7JTO,MS33,Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O...,CCCCCC(=O)N[C@H](C)CCCC,Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O...,CN1CCN(c2ccc(-c3cccc(CN4CCNCC4)c3)cc2NC(=O)C2=...,Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O...,CN1CCN(c2ccc(-c3cccc(CN4CCN([*:2])CC4)c3)cc2NC...,WDR5,...,output_2613_7JTO__MS33_0_len14.sdf,[H]C(=O)N([H])C([H])(C([H])([H])[H])C([H])([H]...,0.631579,0.597246,2.166247,0,8,True,False,True
2,7JTO_difflinker_3051,7JTO,MS33,Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O...,CCOCCC[C@@H]1CCC[C@H]1CC=O,Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O...,CN1CCN(c2ccc(-c3cccc(CN4CCNCC4)c3)cc2NC(=O)C2=...,Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O...,CN1CCN(c2ccc(-c3cccc(CN4CCN([*:2])CC4)c3)cc2NC...,WDR5,...,output_3442_7JTO__MS33_0_len14.sdf,[H]C(=O)N([H])C([H])(C([H])([H])[H])C([H])([H]...,0.164773,0.46424,3.286373,1,7,False,False,True


### Apply 2D filters (no PAINS flag, `ring_arom` set)

In [4]:
# Keep rows that are not flagged as PAINS and whose ring_arom flag is set.
passes_2d = ~df['PAINS'] & df['ring_arom']
df_fil = df[passes_2d]

# Row counts before and after the filter.
len(df), len(df_fil)

(3, 3)

### Filter by SAScore

In [5]:
# Keep only rows whose sa_linker value is strictly below 4.
sa_ok = df_fil['sa_linker'] < 4
df_fil = df_fil[sa_ok]
len(df_fil)

3

### Deduplicate

In [6]:
# Keep one row per distinct protac_smiles string (the first occurrence is retained).
df_dedupl = df_fil.drop_duplicates(subset='protac_smiles', keep='first')
len(df_dedupl)

3

### Remove molecules with formal charges

In [7]:
def is_charged(mol):
    """Return True if any atom of ``mol`` carries a non-zero formal charge.

    ``mol`` must provide ``GetAtoms()``, and each atom it yields must provide
    ``GetFormalCharge()``. A molecule with no atoms yields False.
    """
    return any(atom.GetFormalCharge() != 0 for atom in mol.GetAtoms())

In [8]:
# Work on an explicit copy: df_dedupl was derived from filtered frames, so adding
# columns to it directly can trigger pandas' SettingWithCopyWarning.
df_dedupl = df_dedupl.copy()

# Add a 'protac_mol' column of molecule objects built from the 'protac_smiles' column.
PandasTools.AddMoleculeColumnToFrame(df_dedupl, smilesCol='protac_smiles', molCol='protac_mol')

# Flag molecules with at least one atom of non-zero formal charge, then drop them.
# The 'charged' column stays in the frame (all False after this filter).
df_dedupl['charged'] = df_dedupl['protac_mol'].apply(is_charged)
df_dedupl = df_dedupl[~df_dedupl['charged']]
len(df_dedupl)

3

### Remove unnecessary columns and save

In [9]:
# List the columns currently in the frame, to see which ones can be dropped.
df_dedupl.columns

Index(['ID', 'reference', 'lig_id', 'protac_smiles', 'linker_smiles',
       'anchor_smiles', 'warhead_smiles', 'anchor_ev', 'warhead_ev', 'POI',
       'E3', 'gen_filename', 'frags', 'tanimoto', 'qed_linker', 'sa_linker',
       'num_rings_linker', 'num_rot_bonds_linker', 'branched', 'PAINS',
       'ring_arom', 'protac_mol', 'charged'],
      dtype='object')

In [10]:
# Drop the 'protac_mol' molecule-object column, renumber the rows from 0, and write
# the filtered table to disk.
# Reassigning instead of inplace=True, together with errors='ignore', keeps this cell
# safe to re-run: a second execution no longer raises KeyError for the removed column.
df_dedupl = (
    df_dedupl
    .drop(columns=['protac_mol'], errors='ignore')
    .reset_index(drop=True)
)
df_dedupl.to_csv(os.path.join(gen_folder, f'{pdb}_sampled_{method}_valid_fil.csv'), index=False)