# Get 3D metrics for the DiffLinker generated poses

In [1]:
import copy
import glob
import os
import sys
from typing import Union, Dict, Tuple, Optional, List

import numpy as np
import pandas as pd
from rdkit import Chem, RDLogger
from rdkit.Chem import AllChem
from rdkit.Chem.MolStandardize import rdMolStandardize

sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))
from utils.calc_sc_rdkit import calc_SC_RDKit_score
from utils.metrics_3d import lig_protein_clash_dist, lig_protein_clash_vdw, calc_torsion_energy, mcs_rmsd
from utils.chem_transforms import remove_atom_indices

Cannot find license file.
 The license files (or license server system network addresses) attempted are 
listed below.  Use LM_LICENSE_FILE to use a different license file,
 or contact your software provider for a license file.
Feature:       PYMOL_MAIN
Filename:      /Library/Application Support/Schrodinger/licenses
License path:  /Library/Application Support/Schrodinger/licenses:
FlexNet Licensing error:-1,359.  System Error: 2 "No such file or directory"


In [2]:
# `tqdm._tqdm_notebook` is a deprecated private alias; the public module is `tqdm.notebook`.
from tqdm.notebook import tqdm_notebook
# Registers `progress_apply` on pandas objects (used by the apply cells further down).
tqdm_notebook.pandas()

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  from tqdm._tqdm_notebook import tqdm_notebook


In [3]:
# Raise the RDKit logger threshold to CRITICAL so lower-severity RDKit messages are not printed.
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)

In [4]:
# Identifiers used to build every input/output file name in this notebook:
# the PDB entry of the reference complex and the generative method being scored.
pdb, method = '7JTP', 'difflinker'

## Load Data

### Generated data

In [5]:
gen_folder = 'data/generated'
filepath = glob.glob(os.path.join(gen_folder, f'{pdb}_sampled_{method}_valid.csv'))
print(filepath)
# Fail with an explicit message instead of an IndexError on `filepath[0]` when nothing matches.
if not filepath:
    raise FileNotFoundError(
        f'no generated-data CSV for {pdb}/{method} found in {gen_folder}'
    )
df = pd.read_csv(filepath[0])
df.head()

['data/generated/7JTP_sampled_difflinker_valid.csv']


Unnamed: 0,ID,reference,lig_id,protac_smiles,linker_smiles,anchor_smiles,warhead_smiles,anchor_ev,warhead_ev,POI,...,num_rot_bonds_linker,branched,PAINS,ring_arom,ori_E_torsion,ori_clashes_cutoff,ori_clashes_vdw,ori_gen_ptc_filename,ori_sc_rdkit,to_3d
0,7JTP_difflinker_0,7JTP,MS67,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,CNCO,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,C[C@@H]1CN(c2ccc(-c3ccccc3F)cc2NC(=O)c2c[nH]c(...,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,C[C@@H]1CN(c2ccc(-c3cc([*:2])ccc3F)cc2NC(=O)c2...,WDR5,...,1,False,False,True,98.117867,0.0,18.0,/Users/rebeccaneeser/Documents/data/protacs/pr...,0.954233,True
1,7JTP_difflinker_1,7JTP,MS67,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,CNC=O,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,C[C@@H]1CN(c2ccc(-c3ccccc3F)cc2NC(=O)c2c[nH]c(...,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,C[C@@H]1CN(c2ccc(-c3cc([*:2])ccc3F)cc2NC(=O)c2...,WDR5,...,1,False,False,True,94.980706,1.0,17.0,/Users/rebeccaneeser/Documents/data/protacs/pr...,0.946847,True
2,7JTP_difflinker_2,7JTP,MS67,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,CNC=O,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,C[C@@H]1CN(c2ccc(-c3ccccc3F)cc2NC(=O)c2c[nH]c(...,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,C[C@@H]1CN(c2ccc(-c3cc([*:2])ccc3F)cc2NC(=O)c2...,WDR5,...,1,False,False,True,92.584306,0.0,17.0,/Users/rebeccaneeser/Documents/data/protacs/pr...,0.960191,False
3,7JTP_difflinker_3,7JTP,MS67,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,CCCO,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,C[C@@H]1CN(c2ccc(-c3ccccc3F)cc2NC(=O)c2c[nH]c(...,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,C[C@@H]1CN(c2ccc(-c3cc([*:2])ccc3F)cc2NC(=O)c2...,WDR5,...,1,False,False,True,93.127171,0.0,15.0,/Users/rebeccaneeser/Documents/data/protacs/pr...,0.925398,True
4,7JTP_difflinker_4,7JTP,MS67,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,CCC=O,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,C[C@@H]1CN(c2ccc(-c3ccccc3F)cc2NC(=O)c2c[nH]c(...,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,C[C@@H]1CN(c2ccc(-c3cc([*:2])ccc3F)cc2NC(=O)c2...,WDR5,...,1,False,False,True,101.774595,0.0,17.0,/Users/rebeccaneeser/Documents/data/protacs/pr...,0.93544,True


In [6]:
# Number of generated molecules loaded from the CSV.
df.shape[0]

4901

#### Add generated pose to the data

In [7]:
# Directory holding the DiffLinker SDF poses for this PDB entry (resolved under the user's home).
gen_pose_path = os.path.join(
    os.path.expanduser('~'),
    'Documents/data/protacs/preprint_data/difflinker_results',
    pdb,
)

In [8]:
def get_gen_confs(filepath: str, smi: str) -> pd.Series:
    """Load the generated pose for one row and check it against the expected SMILES.

    Reads the first record of the SDF at ``gen_pose_path/filepath``, keeps its largest
    fragment and verifies that the molecule parsed from ``smi`` has a substructure
    match for that fragment.

    Args:
        filepath: SDF file name relative to the notebook-level ``gen_pose_path``.
        smi: SMILES the generated pose is expected to be part of.

    Returns:
        A one-element Series ``{'gen_mol': mol}`` holding the largest fragment.

    Raises:
        ValueError: if the SDF record or the SMILES cannot be parsed by RDKit.
        AssertionError: if the pose is not a substructure of ``smi``.
    """
    sdf_path = os.path.join(gen_pose_path, filepath)
    mol_dirty = Chem.SDMolSupplier(sdf_path)[0]
    # RDKit yields None for records it cannot parse; stop here with a readable error
    # rather than letting None reach the fragment chooser.
    if mol_dirty is None:
        raise ValueError(f'could not read a molecule from {sdf_path}')
    mol_smi = Chem.MolFromSmiles(smi)
    if mol_smi is None:
        raise ValueError(f'could not parse SMILES {smi}')
    mol = rdMolStandardize.LargestFragmentChooser().choose(mol_dirty)
    assert mol_smi.HasSubstructMatch(mol), f'{smi} does not match {filepath}'
    return pd.Series({'gen_mol': mol})

In [9]:
# Load the generated pose of every row (largest fragment of its SDF) into a `gen_mol` column.
df[['gen_mol']] = df.progress_apply(
    lambda row: get_gen_confs(row['gen_filename'], row['protac_smiles']),
    axis=1,
)

  0%|          | 0/4901 [00:00<?, ?it/s]

### Xtal references

In [10]:
# Crystal-structure reference files for this PDB entry: protein, full PROTAC, anchor and warhead.
xtal_folder = 'data/xtal_poses'
pdb_folder = os.path.join(xtal_folder, pdb, f'{pdb}_fragments')
xtal_protein_path = os.path.join(pdb_folder, f'{pdb}_protein.pdb')
# xtal_ext_linker = Chem.MolFromMolFile(os.path.join(pdb_folder,f'{pdb}_linker_extended.sdf'))
xtal_protac = Chem.MolFromMolFile(os.path.join(pdb_folder, f'{pdb}_protac.sdf'))
# xtal_linker = Chem.MolFromMolFile(os.path.join(pdb_folder, f'{pdb}_linker.sdf'))
xtal_anchor = Chem.MolFromMolFile(os.path.join(pdb_folder, f'{pdb}_anchor.sdf'))
xtal_warhead = Chem.MolFromMolFile(os.path.join(pdb_folder, f'{pdb}_warhead.sdf'))
# MolFromMolFile returns None when a file cannot be parsed; catch that here instead of
# failing later inside the per-row pose reconstruction.
assert xtal_protac is not None, f'could not parse {pdb}_protac.sdf in {pdb_folder}'
assert xtal_anchor is not None, f'could not parse {pdb}_anchor.sdf in {pdb_folder}'
assert xtal_warhead is not None, f'could not parse {pdb}_warhead.sdf in {pdb_folder}'

## Get 3D metrics

In [11]:
def gen_protac_conf(gen_filepath, protac_smiles, xtal_anchor, xtal_warhead) -> pd.Series:
    """Build a full-PROTAC pose from a generated SDF and the crystal anchor/warhead poses.

    The first record of the SDF at ``gen_pose_path/gen_filepath`` is read and reduced to
    its largest fragment. If its non-isomeric SMILES equals that of ``protac_smiles``,
    the generated molecule is returned as is together with the original file path.
    Otherwise a molecule built from ``protac_smiles`` is embedded and its coordinates
    are overwritten: atoms matched by the warhead and anchor take the crystal
    coordinates, remaining atoms matched by the generated molecule take the generated
    coordinates.

    Args:
        gen_filepath: SDF file name relative to the notebook-level ``gen_pose_path``.
        protac_smiles: SMILES of the complete PROTAC.
        xtal_anchor: RDKit molecule with a conformer, used as substructure query and
            coordinate source for the anchor.
        xtal_warhead: RDKit molecule with a conformer, used as substructure query and
            coordinate source for the warhead.

    Returns:
        Series with keys ``'gen_mol'`` and ``'ori_gen_ptc_filename'``; both values are
        None when a molecule cannot be parsed or the embedding fails.

    Raises:
        AssertionError: if the number of assigned coordinates differs from the number
            of atoms in the PROTAC.
    """
    filepath = os.path.join(gen_pose_path, gen_filepath)
    mol_protac = Chem.MolFromSmiles(protac_smiles)
    mol_gen = Chem.SDMolSupplier(filepath)[0]
    # RDKit returns None for unparsable input; report it through the failure result.
    if mol_protac is None or mol_gen is None:
        print(f'Could not parse molecule for {gen_filepath}')
        return pd.Series({'gen_mol': None, 'ori_gen_ptc_filename': None})
    mol_gen = rdMolStandardize.LargestFragmentChooser().choose(mol_gen)
    mol_gen_smi = Chem.MolToSmiles(mol_gen, isomericSmiles=False)
    protac_smiles_flat = Chem.MolToSmiles(mol_protac, isomericSmiles=False)
    if mol_gen_smi == protac_smiles_flat:
        # The generated pose already is the complete PROTAC.
        return pd.Series({'gen_mol': mol_gen, 'ori_gen_ptc_filename': filepath})

    gen_filename = gen_filepath.split('.')[0]
    new_filepath = os.path.join(gen_pose_path, f'{gen_filename}_expanded.sdf')

    # Embed once with default settings, retry once with a larger attempt budget, and
    # only give up if the retry failed as well. (The previous loop reported a failure
    # after the first retry regardless of its outcome.)
    status = AllChem.EmbedMolecule(mol_protac, useRandomCoords=True)
    if status != 0:
        status = AllChem.EmbedMolecule(mol_protac, maxAttempts=10000, useRandomCoords=True, ignoreSmoothingFailures=True)
    if status != 0:
        print(f'Embedding failed for {gen_filepath}')
        return pd.Series({'gen_mol': None, 'ori_gen_ptc_filename': None})

    conf = mol_protac.GetConformer()
    match_anchor = mol_protac.GetSubstructMatch(xtal_anchor)
    match_warhead = mol_protac.GetSubstructMatch(xtal_warhead)
    mol_gen_match = mol_protac.GetSubstructMatch(mol_gen)

    # A substructure match lists, per query atom, the index of the matched PROTAC atom,
    # so enumerate() pairs each query atom index with its target atom.
    reset_ptc_idx = []
    for wrh_idx, ptc_idx in enumerate(match_warhead):
        conf.SetAtomPosition(ptc_idx, xtal_warhead.GetConformer().GetAtomPosition(wrh_idx))
        reset_ptc_idx.append(ptc_idx)
    for anc_idx, ptc_idx in enumerate(match_anchor):
        conf.SetAtomPosition(ptc_idx, xtal_anchor.GetConformer().GetAtomPosition(anc_idx))
        reset_ptc_idx.append(ptc_idx)
    # Atoms already placed from the crystal fragments keep those coordinates.
    for gen_idx, ptc_idx in enumerate(mol_gen_match):
        if ptc_idx not in reset_ptc_idx:
            conf.SetAtomPosition(ptc_idx, mol_gen.GetConformer().GetAtomPosition(gen_idx))
            reset_ptc_idx.append(ptc_idx)
    # NOTE(review): the write below is disabled, so `new_filepath` is returned without
    # this function creating the file.
    # Chem.MolToMolFile(mol_protac, new_filepath)
    assert len(reset_ptc_idx) == len(mol_protac.GetAtoms()), f'{len(reset_ptc_idx)} != {len(mol_protac.GetAtoms())}'
    return pd.Series({'gen_mol': mol_protac, 'ori_gen_ptc_filename': new_filepath})

In [12]:
def _tag_atoms(mol, first_map_num):
    """Give the atoms of ``mol`` consecutive atom-map numbers and return their positions."""
    coords = []
    for i, atom in enumerate(mol.GetAtoms()):
        atom.SetAtomMapNum(i + first_map_num)
        coords.append(mol.GetConformer().GetAtomPosition(i))
    return coords


def _coincident_atoms(mol_gen, gen_coords, ref_mol, ref_coords):
    """Find generated atoms whose x/y/z are exactly equal to those of a reference atom."""
    overlap_map_nums = []
    gen2ref = {}
    for i, coord in enumerate(gen_coords):
        for j, ref_coord in enumerate(ref_coords):
            if coord.x == ref_coord.x and coord.y == ref_coord.y and coord.z == ref_coord.z:
                gen_map_num = mol_gen.GetAtomWithIdx(i).GetAtomMapNum()
                overlap_map_nums.append(gen_map_num)
                gen2ref[gen_map_num] = ref_mol.GetAtomWithIdx(j).GetAtomMapNum()
    return overlap_map_nums, gen2ref


def _boundary_bond(mol, overlap_map_nums):
    """Return (inside, outside) map numbers of the first bond leaving the overlap set, else (None, None)."""
    for bond in mol.GetBonds():
        begin = bond.GetBeginAtom().GetAtomMapNum()
        end = bond.GetEndAtom().GetAtomMapNum()
        if begin in overlap_map_nums and end not in overlap_map_nums:
            return begin, end
        if end in overlap_map_nums and begin not in overlap_map_nums:
            return end, begin
    return None, None


def _drop_one_h_at(mol, atom_idx):
    """Add explicit hydrogens to ``mol`` and remove one hydrogen bonded to ``atom_idx``, if any."""
    mol_h = Chem.AddHs(mol)
    for nbr in mol_h.GetAtomWithIdx(atom_idx).GetNeighbors():
        if nbr.GetSymbol() == 'H':
            return remove_atom_indices(mol_h, [nbr.GetIdx()])
    return mol_h


def gen_protac_conf_new(gen_filepath, protac_smiles, xtal_anchor, xtal_warhead):
    """Assemble a full-PROTAC pose from the generated atoms and the crystal fragments.

    The first record of the SDF at ``gen_pose_path/gen_filepath`` is read and reduced to
    its largest fragment. If its non-isomeric SMILES already equals that of
    ``protac_smiles``, the pose is returned unchanged with the original file path.
    Otherwise the generated atoms whose coordinates coincide exactly with anchor or
    warhead atoms are removed, and the remaining generated atoms are bonded (single
    bonds) to copies of the crystal anchor and warhead at the atoms where the generated
    molecule left each fragment. The result is accepted only if its non-isomeric SMILES
    equals that of ``protac_smiles``; it is then written to
    ``<gen_filename>_expanded.sdf`` inside ``gen_pose_path``.

    Args:
        gen_filepath: SDF file name relative to the notebook-level ``gen_pose_path``.
        protac_smiles: SMILES of the complete PROTAC.
        xtal_anchor: RDKit molecule with a conformer for the anchor (not modified;
            a deep copy is edited).
        xtal_warhead: RDKit molecule with a conformer for the warhead (not modified;
            a deep copy is edited).

    Returns:
        Series with keys ``'gen_mol'`` and ``'ori_gen_ptc_filename'``; both values are
        None when a molecule cannot be parsed, no attachment bond is found for either
        fragment, or the assembled molecule does not match ``protac_smiles``.
    """
    filepath = os.path.join(gen_pose_path, gen_filepath)
    mol_protac = Chem.MolFromSmiles(protac_smiles)
    mol_gen = Chem.SDMolSupplier(filepath)[0]
    # RDKit returns None for unparsable input; report it through the failure result.
    if mol_protac is None or mol_gen is None:
        print(f'ERROR: could not parse molecule for {gen_filepath}')
        return pd.Series({'gen_mol': None, 'ori_gen_ptc_filename': None})
    mol_gen = rdMolStandardize.LargestFragmentChooser().choose(mol_gen)
    mol_gen_smi = Chem.MolToSmiles(mol_gen, isomericSmiles=False)
    protac_smiles_flat = Chem.MolToSmiles(mol_protac, isomericSmiles=False)
    if mol_gen_smi == protac_smiles_flat:
        # Previously this case fell off the end of the function and returned None,
        # which cannot be expanded into the two result columns. Mirror what
        # gen_protac_conf does: keep the generated pose and its original file.
        return pd.Series({'gen_mol': mol_gen, 'ori_gen_ptc_filename': filepath})

    xtal_anc_copy = copy.deepcopy(xtal_anchor)
    xtal_wrh_copy = copy.deepcopy(xtal_warhead)

    # Atom-map numbers act as stable atom identifiers across the edits below.
    # NOTE(review): the fixed offsets 1/100/200 only stay disjoint while the generated
    # molecule has fewer than 99 atoms and the anchor fewer than 100 - confirm.
    mol_gen_coords = _tag_atoms(mol_gen, 1)
    dict_gen = {a.GetAtomMapNum(): a.GetIdx() for a in mol_gen.GetAtoms()}
    xtal_anc_cp_coords = _tag_atoms(xtal_anc_copy, 100)
    dict_anc = {a.GetAtomMapNum(): a.GetIdx() for a in xtal_anc_copy.GetAtoms()}
    xtal_wrh_cp_coords = _tag_atoms(xtal_wrh_copy, 200)
    dict_wrh = {a.GetAtomMapNum(): a.GetIdx() for a in xtal_wrh_copy.GetAtoms()}

    # check if matching coordinates between xtal and gen
    overlap_mapNum_anc, dict_gen2anc = _coincident_atoms(mol_gen, mol_gen_coords, xtal_anc_copy, xtal_anc_cp_coords)
    overlap_mapNum_wrh, dict_gen2wrh = _coincident_atoms(mol_gen, mol_gen_coords, xtal_wrh_copy, xtal_wrh_cp_coords)

    # find the bond between overlap and not overlap
    map_in_anc, map_not_in_anc = _boundary_bond(mol_gen, overlap_mapNum_anc)
    map_in_wrh, map_not_in_wrh = _boundary_bond(mol_gen, overlap_mapNum_wrh)
    if map_in_anc is None or map_in_wrh is None:
        return pd.Series({'gen_mol': None, 'ori_gen_ptc_filename': None})

    # remove atoms of overlap in gen; indices are passed in descending order
    # NOTE(review): relies on remove_atom_indices keeping atom-map numbers - verify.
    wrh_ind_toremove = sorted((dict_gen[x] for x in overlap_mapNum_wrh), reverse=True)
    mol_gen_adapt = remove_atom_indices(mol_gen, wrh_ind_toremove)
    # atom indices shifted after the first removal, so rebuild the map-number lookup
    dict_gen = {a.GetAtomMapNum(): a.GetIdx() for a in mol_gen_adapt.GetAtoms()}
    anc_ind_toremove = sorted((dict_gen[x] for x in overlap_mapNum_anc), reverse=True)
    mol_gen_adapt = remove_atom_indices(mol_gen_adapt, anc_ind_toremove)

    # remove one hydrogen at the attachment point of each crystal fragment
    xtal_anc_copy = _drop_one_h_at(xtal_anc_copy, dict_anc[dict_gen2anc[map_in_anc]])
    xtal_wrh_copy = _drop_one_h_at(xtal_wrh_copy, dict_wrh[dict_gen2wrh[map_in_wrh]])

    # join remaining generated atoms to the anchor
    combo_gen_anc = Chem.CombineMols(mol_gen_adapt, xtal_anc_copy)
    dict_combo = {a.GetAtomMapNum(): a.GetIdx() for a in combo_gen_anc.GetAtoms()}
    combo_gen_anc_edit = Chem.EditableMol(combo_gen_anc)
    combo_gen_anc_edit.AddBond(dict_combo[map_not_in_anc], dict_combo[dict_gen2anc[map_in_anc]], Chem.BondType.SINGLE)
    combo_gen_anc = combo_gen_anc_edit.GetMol()

    # ... and then to the warhead
    combo_all = Chem.CombineMols(combo_gen_anc, xtal_wrh_copy)
    dict_combo = {a.GetAtomMapNum(): a.GetIdx() for a in combo_all.GetAtoms()}
    combo_all_edit = Chem.EditableMol(combo_all)
    combo_all_edit.AddBond(dict_combo[map_not_in_wrh], dict_combo[dict_gen2wrh[map_in_wrh]], Chem.BondType.SINGLE)
    combo_all = combo_all_edit.GetMol()
    combo_all = Chem.RemoveHs(combo_all)

    # clear the bookkeeping map numbers before comparing and writing
    for a in combo_all.GetAtoms():
        a.SetAtomMapNum(0)
    flat_combo = Chem.MolToSmiles(combo_all, isomericSmiles=False)
    if flat_combo != protac_smiles_flat:
        print('ERROR: flat_combo != protac_smiles_flat')
        return pd.Series({'gen_mol': None, 'ori_gen_ptc_filename': None})

    gen_filename = gen_filepath.split('.')[0]
    new_filepath = os.path.join(gen_pose_path, f'{gen_filename}_expanded.sdf')
    Chem.MolToMolFile(combo_all, new_filepath)
    return pd.Series({'gen_mol': combo_all, 'ori_gen_ptc_filename': new_filepath})


In [13]:
# Rebuild the full-PROTAC pose for every row; rows that fail get None in both columns.
df[['gen_mol', 'ori_gen_ptc_filename']] = df.progress_apply(
    lambda row: gen_protac_conf_new(
        row['gen_filename'], row['protac_smiles'], xtal_anchor, xtal_warhead
    ),
    axis=1,
)

  0%|          | 0/4901 [00:00<?, ?it/s]

In [14]:
# Keep only the rows for which a full-PROTAC pose file is available. The explicit copy
# makes df_fil an independent frame, so the metric columns assigned to it in the
# following cells do not trigger pandas' SettingWithCopyWarning.
df_fil = df[~df.ori_gen_ptc_filename.isnull()].copy()
len(df_fil)

4901

In [15]:
# Percentage of rows in df that are missing from df_fil, i.e. have no pose file name.
(len(df)-len(df_fil))/len(df)*100

0.0

### SC RDKit

In [16]:
# Score every rebuilt pose against the crystal PROTAC with calc_SC_RDKit_score.
df_fil['ori_sc_rdkit'] = df_fil.apply(
    lambda row: calc_SC_RDKit_score(row.gen_mol, xtal_protac),
    axis=1,
)
print(df_fil['ori_sc_rdkit'].describe())

count    4901.000000
mean        0.976829
std         0.016649
min         0.950274
25%         0.962875
50%         0.977873
75%         0.993348
max         0.999666
Name: ori_sc_rdkit, dtype: float64


In [17]:
# Share of poses above each SC_RDKit threshold. The value is multiplied by 100, so it
# is reported as a percentage (the previous label called it a fraction).
for sc_threshold in (0.7, 0.8, 0.9):
    pct_above = len(df_fil[df_fil["ori_sc_rdkit"] > sc_threshold]) / len(df_fil) * 100
    print(f'percentage of conformers with SC_RDKIT > {sc_threshold}: {pct_above:.4f}')
print(f'average SC_RDKIT: {df_fil["ori_sc_rdkit"].mean(skipna=True):.4f}')

fraction of conformers with SC_RDKIT > 0.7: 100.0000
fraction of conformers with SC_RDKIT > 0.8: 100.0000
fraction of conformers with SC_RDKIT > 0.9: 100.0000
average SC_RDKIT: 0.9768


### Clashes with protein

In [18]:
# Evaluate each pose file against the crystal protein with lig_protein_clash_dist and
# lig_protein_clash_vdw. Only the file name of the stored path is used; it is
# re-rooted under gen_pose_path.
df_fil['ori_clashes_cutoff'] = df_fil.apply(
    lambda row: lig_protein_clash_dist(
        xtal_protein_path,
        os.path.join(gen_pose_path, row.ori_gen_ptc_filename.split('/')[-1]),
    ),
    axis=1,
)
df_fil['ori_clashes_vdw'] = df_fil.apply(
    lambda row: lig_protein_clash_vdw(
        xtal_protein_path,
        os.path.join(gen_pose_path, row.ori_gen_ptc_filename.split('/')[-1]),
    ),
    axis=1,
)
print(f'average clashes_cutoff: {df_fil["ori_clashes_cutoff"].mean(skipna=True):.4f}')
print(f'average clashes_vdw: {df_fil["ori_clashes_vdw"].mean(skipna=True):.4f}')

average clashes_cutoff: 0.0082
average clashes_vdw: 15.7392


### Torsion energy

In [19]:
# Run calc_torsion_energy on each pose file (file name re-rooted under gen_pose_path).
df_fil['ori_E_torsion'] = df_fil.apply(
    lambda row: calc_torsion_energy(
        os.path.join(gen_pose_path, row.ori_gen_ptc_filename.split('/')[-1])
    ),
    axis=1,
)
print(f'average E_torsion: {df_fil["ori_E_torsion"].mean(skipna=True):.4f}')

average E_torsion: 85.8418


In [20]:
# Row count of the scored frame.
df_fil.shape[0]

4901

In [21]:
# Show the column sets of the scored frame and the full frame side by side.
df_fil.columns, df.columns

(Index(['ID', 'reference', 'lig_id', 'protac_smiles', 'linker_smiles',
        'anchor_smiles', 'warhead_smiles', 'anchor_ev', 'warhead_ev', 'POI',
        'E3', 'gen_filename', 'frags', 'tanimoto', 'qed_linker', 'sa_linker',
        'num_rings_linker', 'num_rot_bonds_linker', 'branched', 'PAINS',
        'ring_arom', 'ori_E_torsion', 'ori_clashes_cutoff', 'ori_clashes_vdw',
        'ori_gen_ptc_filename', 'ori_sc_rdkit', 'to_3d', 'gen_mol'],
       dtype='object'),
 Index(['ID', 'reference', 'lig_id', 'protac_smiles', 'linker_smiles',
        'anchor_smiles', 'warhead_smiles', 'anchor_ev', 'warhead_ev', 'POI',
        'E3', 'gen_filename', 'frags', 'tanimoto', 'qed_linker', 'sa_linker',
        'num_rings_linker', 'num_rot_bonds_linker', 'branched', 'PAINS',
        'ring_arom', 'ori_E_torsion', 'ori_clashes_cutoff', 'ori_clashes_vdw',
        'ori_gen_ptc_filename', 'ori_sc_rdkit', 'to_3d', 'gen_mol'],
       dtype='object'))

In [22]:
# Rows without a pose file name, i.e. the complement of df_fil.
df_loss = df[df['ori_gen_ptc_filename'].isna()]
len(df_loss)

0

In [23]:
# Stack scored rows and failed rows back into one frame with a fresh 0..n-1 index.
df_save = pd.concat([df_fil, df_loss], axis=0).reset_index(drop=True)
df_save.head()

Unnamed: 0,ID,reference,lig_id,protac_smiles,linker_smiles,anchor_smiles,warhead_smiles,anchor_ev,warhead_ev,POI,...,branched,PAINS,ring_arom,ori_E_torsion,ori_clashes_cutoff,ori_clashes_vdw,ori_gen_ptc_filename,ori_sc_rdkit,to_3d,gen_mol
0,7JTP_difflinker_0,7JTP,MS67,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,CNCO,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,C[C@@H]1CN(c2ccc(-c3ccccc3F)cc2NC(=O)c2c[nH]c(...,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,C[C@@H]1CN(c2ccc(-c3cc([*:2])ccc3F)cc2NC(=O)c2...,WDR5,...,False,False,True,87.307063,0.0,18.0,/Users/rebeccaneeser/Documents/data/protacs/pr...,0.982011,True,<rdkit.Chem.rdchem.Mol object at 0x7f94a16f74c0>
1,7JTP_difflinker_1,7JTP,MS67,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,CNC=O,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,C[C@@H]1CN(c2ccc(-c3ccccc3F)cc2NC(=O)c2c[nH]c(...,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,C[C@@H]1CN(c2ccc(-c3cc([*:2])ccc3F)cc2NC(=O)c2...,WDR5,...,False,False,True,84.169901,1.0,17.0,/Users/rebeccaneeser/Documents/data/protacs/pr...,0.974625,True,<rdkit.Chem.rdchem.Mol object at 0x7f94a16f7640>
2,7JTP_difflinker_2,7JTP,MS67,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,CNC=O,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,C[C@@H]1CN(c2ccc(-c3ccccc3F)cc2NC(=O)c2c[nH]c(...,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,C[C@@H]1CN(c2ccc(-c3cc([*:2])ccc3F)cc2NC(=O)c2...,WDR5,...,False,False,True,81.773502,0.0,17.0,/Users/rebeccaneeser/Documents/data/protacs/pr...,0.987969,False,<rdkit.Chem.rdchem.Mol object at 0x7f94a16f7520>
3,7JTP_difflinker_3,7JTP,MS67,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,CCCO,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,C[C@@H]1CN(c2ccc(-c3ccccc3F)cc2NC(=O)c2c[nH]c(...,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,C[C@@H]1CN(c2ccc(-c3cc([*:2])ccc3F)cc2NC(=O)c2...,WDR5,...,False,False,True,82.316366,0.0,15.0,/Users/rebeccaneeser/Documents/data/protacs/pr...,0.953176,True,<rdkit.Chem.rdchem.Mol object at 0x7f94a16f7d00>
4,7JTP_difflinker_4,7JTP,MS67,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,CCC=O,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,C[C@@H]1CN(c2ccc(-c3ccccc3F)cc2NC(=O)c2c[nH]c(...,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,C[C@@H]1CN(c2ccc(-c3cc([*:2])ccc3F)cc2NC(=O)c2...,WDR5,...,False,False,True,90.963791,0.0,17.0,/Users/rebeccaneeser/Documents/data/protacs/pr...,0.963217,True,<rdkit.Chem.rdchem.Mol object at 0x7f94a16f7700>


In [24]:
# The gen_mol column is removed before the frame is written back to the
# `_valid.csv` file it was loaded from.
df_save = df_save.drop(columns='gen_mol')
df_save.to_csv(
    os.path.join(gen_folder, f'{pdb}_sampled_{method}_valid.csv'),
    index=False,
)

#### Reinsert the corrected values into the filtered dataframe

In [25]:
# Row count of the frame that was just saved.
df_save.shape[0]

4901

In [26]:
# Load the `_valid_fil.csv` companion file whose `ori_*` columns are refreshed below.
df_smaller = pd.read_csv(
    os.path.join(gen_folder, f'{pdb}_sampled_{method}_valid_fil.csv')
)
df_smaller.head()

Unnamed: 0,ID,reference,lig_id,protac_smiles,linker_smiles,anchor_smiles,warhead_smiles,anchor_ev,warhead_ev,POI,...,to_3d,E_torsion,clashes_cutoff,clashes_vdw,embedded_mol,embedded_path,rmsd_anc,rmsd_wrh,sc_rdkit,vinardo
0,7JTP_difflinker_0,7JTP,MS67,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,CNCO,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,C[C@@H]1CN(c2ccc(-c3ccccc3F)cc2NC(=O)c2c[nH]c(...,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,C[C@@H]1CN(c2ccc(-c3cc([*:2])ccc3F)cc2NC(=O)c2...,WDR5,...,True,92.469123,0.0,20.0,<rdkit.Chem.rdchem.Mol object at 0x7fe4195a4340>,selected_min_conf0__7JTP_difflinker_0.sdf,0.484488,0.490085,0.813721,-18.57411
1,7JTP_difflinker_1,7JTP,MS67,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,CNC=O,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,C[C@@H]1CN(c2ccc(-c3ccccc3F)cc2NC(=O)c2c[nH]c(...,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,C[C@@H]1CN(c2ccc(-c3cc([*:2])ccc3F)cc2NC(=O)c2...,WDR5,...,True,,,,,,,,,
2,7JTP_difflinker_3,7JTP,MS67,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,CCCO,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,C[C@@H]1CN(c2ccc(-c3ccccc3F)cc2NC(=O)c2c[nH]c(...,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,C[C@@H]1CN(c2ccc(-c3cc([*:2])ccc3F)cc2NC(=O)c2...,WDR5,...,True,,,,,,,,,
3,7JTP_difflinker_4,7JTP,MS67,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,CCC=O,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,C[C@@H]1CN(c2ccc(-c3ccccc3F)cc2NC(=O)c2c[nH]c(...,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,C[C@@H]1CN(c2ccc(-c3cc([*:2])ccc3F)cc2NC(=O)c2...,WDR5,...,True,,,,,,,,,
4,7JTP_difflinker_6,7JTP,MS67,CC(N=Cc1ccc(F)c(-c2ccc(N3C[C@H](C)N(C)[C@H](C)...,C=NCC,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,C[C@@H]1CN(c2ccc(-c3ccccc3F)cc2NC(=O)c2c[nH]c(...,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,C[C@@H]1CN(c2ccc(-c3cc([*:2])ccc3F)cc2NC(=O)c2...,WDR5,...,True,87.659153,0.0,17.0,<rdkit.Chem.rdchem.Mol object at 0x7fe4195a4a00>,selected_min_conf0__7JTP_difflinker_6.sdf,0.386168,0.480816,0.861511,-20.0069


In [27]:
# List the columns of the filtered file to see which `ori_*` columns it carries.
df_smaller.columns

Index(['ID', 'reference', 'lig_id', 'protac_smiles', 'linker_smiles',
       'anchor_smiles', 'warhead_smiles', 'anchor_ev', 'warhead_ev', 'POI',
       'E3', 'gen_filename', 'frags', 'tanimoto', 'qed_linker', 'sa_linker',
       'num_rings_linker', 'num_rot_bonds_linker', 'branched', 'PAINS',
       'ring_arom', 'ori_E_torsion', 'ori_clashes_cutoff', 'ori_clashes_vdw',
       'ori_gen_ptc_filename', 'ori_sc_rdkit', 'to_3d', 'E_torsion',
       'clashes_cutoff', 'clashes_vdw', 'embedded_mol', 'embedded_path',
       'rmsd_anc', 'rmsd_wrh', 'sc_rdkit', 'vinardo'],
      dtype='object')

In [28]:
# Overwrite the `ori_*` columns of df_smaller with the recomputed values from df_save,
# matching rows by ID.
columns_replace = ['ori_E_torsion', 'ori_clashes_cutoff', 'ori_clashes_vdw',
                   'ori_gen_ptc_filename', 'ori_sc_rdkit']
# Build the ID-indexed lookup once instead of once per column.
df_save_by_id = df_save.set_index('ID')
for col in columns_replace:
    df_smaller[col] = df_smaller['ID'].map(df_save_by_id[col])

In [29]:
# Write the refreshed values back to `_valid_fil.csv`, so that both the `_valid` and
# the `_valid_fil` files carry the corrected `ori_*` columns.
df_smaller.to_csv(
    os.path.join(gen_folder, f'{pdb}_sampled_{method}_valid_fil.csv'),
    index=False,
)