# Get 3D metrics for the constrained embedded poses

In [1]:
import os
import glob
import sys
from typing import Union, Dict, Tuple, Optional, List

import numpy as np
import pandas as pd
from rdkit import Chem, RDLogger

sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))
from utils.calc_sc_rdkit import calc_SC_RDKit_score
from utils.metrics_3d import lig_protein_clash_dist, lig_protein_clash_vdw, calc_torsion_energy, mcs_rmsd

Cannot find license file.
 The license files (or license server system network addresses) attempted are 
listed below.  Use LM_LICENSE_FILE to use a different license file,
 or contact your software provider for a license file.
Feature:       PYMOL_MAIN
Filename:      /Library/Application Support/Schrodinger/licenses
License path:  /Library/Application Support/Schrodinger/licenses:
FlexNet Licensing error:-1,359.  System Error: 2 "No such file or directory"


In [2]:
from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas()

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  from tqdm._tqdm_notebook import tqdm_notebook


In [3]:
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)

In [4]:
pdb = '7ZNT'
method = 'base'

## Load Data

### Generated data

In [5]:
gen_folder = 'data/generated'
filepath = glob.glob(os.path.join(gen_folder, f'{pdb}_sampled_{method}_valid_fil.csv'))
print(filepath)
df = pd.read_csv(filepath[0])
df.head()

['data/generated/7ZNT_sampled_base_valid_fil.csv']


Unnamed: 0,ID,reference,protac_smiles,linker_smiles,extended_linker_smiles,anchor_smiles,warhead_smiles,anchor_ev,warhead_ev,POI,...,qed_linker,num_rings_linker,num_rot_bonds_linker,branched,PAINS,ring_arom,sa_linker,chamfer_distance,rmsd,to_3d
0,7ZNT_base_0,7ZNT,COc1cc(CSC(C)(C)[C@@H](NC(=O)C2(F)CC2)C(=O)N2C...,CCn1cc(C(=O)c2ncc(C)cc2OC)nc1N,COc1cc(CSC)cnc1C(=O)c1cn(CCNC=O)c(N)n1,Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O...,Cc1sc2c(c1C)C(c1ccc(Cl)cc1)=N[C@@H](CC(N)=O)c1...,Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O...,Cc1sc2c(c1C)C(c1ccc(Cl)cc1)=N[C@@H](CC(=O)N*)c...,BRD4-BD2,...,0.840641,2,4,False,False,True,2.815789,4.314247,2.389408,True
1,7ZNT_base_2,7ZNT,Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O...,CCOc1cc(C)nc2cc(N)cnc12,CSCc1cc(OCCNC=O)c2ncc(N)cc2n1,Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O...,Cc1sc2c(c1C)C(c1ccc(Cl)cc1)=N[C@@H](CC(N)=O)c1...,Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O...,Cc1sc2c(c1C)C(c1ccc(Cl)cc1)=N[C@@H](CC(=O)N*)c...,BRD4-BD2,...,0.809549,2,2,False,False,True,2.354916,3.941218,5.534881,True
2,7ZNT_base_3,7ZNT,Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O...,CC(=O)OCc1cnc(C)o1,CSCc1ncc(COC(=O)CNC=O)o1,Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O...,Cc1sc2c(c1C)C(c1ccc(Cl)cc1)=N[C@@H](CC(N)=O)c1...,Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O...,Cc1sc2c(c1C)C(c1ccc(Cl)cc1)=N[C@@H](CC(=O)N*)c...,BRD4-BD2,...,0.598692,1,2,False,False,True,2.63585,1.938837,1.462539,True
3,7ZNT_base_4,7ZNT,COc1cccnc1NCc1nc(CSC(C)(C)[C@@H](NC(=O)C2(F)CC...,CC#CCOc1ccc(C)nc1CNc1ncccc1OC,COc1cccnc1NCc1nc(CSC)ccc1OCC#CCNC=O,Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O...,Cc1sc2c(c1C)C(c1ccc(Cl)cc1)=N[C@@H](CC(N)=O)c1...,Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O...,Cc1sc2c(c1C)C(c1ccc(Cl)cc1)=N[C@@H](CC(=O)N*)c...,BRD4-BD2,...,0.830917,2,6,False,False,True,2.656086,6.670227,3.77847,True
4,7ZNT_base_5,7ZNT,COc1c(C(=O)OCCNC(=O)C[C@H]2N=C(c3ccc(Cl)cc3)c3...,CCOCC(=O)Nc1ccc(N)c(OC(=O)c2cccc(C(=O)OCC)c2OC)c1,COc1c(C(=O)OCCNC=O)cccc1C(=O)Oc1cc(NC(=O)COCCS...,Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O...,Cc1sc2c(c1C)C(c1ccc(Cl)cc1)=N[C@@H](CC(N)=O)c1...,Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O...,Cc1sc2c(c1C)C(c1ccc(Cl)cc1)=N[C@@H](CC(=O)N*)c...,BRD4-BD2,...,0.362923,2,9,False,False,True,2.315432,8.777076,3.583126,True


#### Add constrained embedded pose to the data

In [6]:
const_embed_path = os.path.join(os.path.expanduser('~'), 'Documents', 'data', 'protacs', 'preprint_data', 'const_embed', f'{pdb}_{method}_embed')
const_embed_pose_folder = glob.glob(os.path.join(const_embed_path, 'selected_min_confs'))
print(const_embed_pose_folder)

['/Users/rebeccaneeser/Documents/data/protacs/preprint_data/const_embed/7ZNT_base_embed/selected_min_confs']


In [7]:
def get_embedded_confs(mol_id: str, folderpath: str) -> pd.Series:
    sdfpath = glob.glob(os.path.join(folderpath, f'*__{mol_id}.sdf'))
    if sdfpath:
        filename = os.path.basename(sdfpath[0])
        sdfpath = sdfpath[0]
        mol = Chem.SDMolSupplier(sdfpath)[0]
        vinardo = float(mol.GetProp('minimizedAffinity'))
    else:
        mol = None
        vinardo = np.nan
        filename = None
    return pd.Series({'vinardo': vinardo, 'embedded_mol': mol, 'embedded_path': filename}) 

In [8]:
# store embedded conformers and vinardo scores in dataframe using apply
df[['vinardo', 'embedded_mol', 'embedded_path']] = df.progress_apply(lambda x: get_embedded_confs(x['ID'], const_embed_pose_folder[0]), axis=1)

  0%|          | 0/4240 [00:00<?, ?it/s]

In [9]:
df_fil = df[~df['embedded_mol'].isna()]
print(f'failed embedding: {(len(df)-len(df_fil))/len(df)*100:.4f}%')

failed embedding: 54.1509%


In [10]:
len(df_fil)

1944

### Xtal references

In [11]:
xtal_folder = 'data/xtal_poses'
pdb_folder = os.path.join(xtal_folder, pdb, f'{pdb}_fragments')
xtal_protein_path = os.path.join(pdb_folder, f'{pdb}_protein.pdb')
# xtal_ext_linker = Chem.MolFromMolFile(os.path.join(pdb_folder,f'{pdb}_linker_extended.sdf'))
xtal_protac = Chem.MolFromMolFile(os.path.join(pdb_folder, f'{pdb}_protac.sdf'))
# xtal_linker = Chem.MolFromMolFile(os.path.join(pdb_folder, f'{pdb}_linker.sdf'))
xtal_anchor = Chem.MolFromMolFile(os.path.join(pdb_folder, f'{pdb}_anchor.sdf'))
xtal_warhead = Chem.MolFromMolFile(os.path.join(pdb_folder, f'{pdb}_warhead.sdf'))

## Get 3D metrics

In [12]:
# rename ramds_anc to rmsd_anc and ramds_wrh to rmsd_wrh
df_fil.rename(columns={'ramds_anc': 'rmsd_anc', 'ramds_wrh': 'rmsd_wrh'}, inplace=True)
df.rename(columns={'ramds_anc': 'rmsd_anc', 'ramds_wrh': 'rmsd_wrh'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fil.rename(columns={'ramds_anc': 'rmsd_anc', 'ramds_wrh': 'rmsd_wrh'}, inplace=True)


In [13]:
# only calculate if metrics are nan
if 'rmsd_anc' in df_fil.columns:
    df_calc = df_fil[df_fil['rmsd_wrh'].isna()]
else:
    df_calc = df_fil.copy()
len(df_calc)

1944

### RMSD

In [14]:
df_calc['rmsd_anc'] = df_calc.apply(lambda x: mcs_rmsd(x['embedded_mol'], xtal_anchor), axis=1)
df_calc['rmsd_wrh'] = df_calc.apply(lambda x: mcs_rmsd(x['embedded_mol'], xtal_warhead), axis=1)
print(f'average anchor RMSD: {df_calc["rmsd_anc"].mean(skipna=True):.4f}')
print(f'average warhead RMSD: {df_calc["rmsd_wrh"].mean(skipna=True):.4f}')

average anchor RMSD: 0.4658
average warhead RMSD: 0.8312


### SC RDKit

In [15]:
df_calc['sc_rdkit'] = df_calc.apply(lambda x: calc_SC_RDKit_score(x.embedded_mol, xtal_protac), axis=1)
print(df_calc['sc_rdkit'].describe())

count    1944.000000
mean        0.691572
std         0.037700
min         0.512151
25%         0.665467
50%         0.690869
75%         0.716167
max         0.803309
Name: sc_rdkit, dtype: float64


In [16]:
print(f'average SC_RDKIT: {df_calc["sc_rdkit"].mean(skipna=True):.4f}')

average SC_RDKIT: 0.6916


### Clashes with protein

In [17]:
df_calc['clashes_cutoff'] = df_calc.apply(lambda x: lig_protein_clash_dist(xtal_protein_path, os.path.join(const_embed_pose_folder[0], x.embedded_path)), axis=1)
df_calc['clashes_vdw'] = df_calc.apply(lambda x: lig_protein_clash_vdw(xtal_protein_path, os.path.join(const_embed_pose_folder[0], x.embedded_path)), axis=1)
print(f'average clashes_cutoff: {df_calc["clashes_cutoff"].mean(skipna=True):.4f}')
print(f'average clashes_vdw: {df_calc["clashes_vdw"].mean(skipna=True):.4f}')

average clashes_cutoff: 0.0000
average clashes_vdw: 10.2803


### Torsion energy

In [18]:
df_calc['E_torsion'] = df_calc.apply(lambda x: calc_torsion_energy(os.path.join(const_embed_pose_folder[0], x.embedded_path)), axis=1)
print(f'average E_torsion: {df_calc["E_torsion"].mean(skipna=True):.4f}')

average E_torsion: 73.4453


## Format output and combine results

In [19]:
# replace respective rows in df_fil with df_calc based on ID
df_fil.loc[df_calc.index] = df_calc

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fil.loc[df_calc.index] = df_calc


In [20]:
len(df_fil), len(df), len(df_calc)

(1944, 4240, 1944)

In [21]:
df_fil.columns
# use df_fil if already calculated something before! (must have those rows so taht are replaced by df_calc)

Index(['ID', 'reference', 'protac_smiles', 'linker_smiles',
       'extended_linker_smiles', 'anchor_smiles', 'warhead_smiles',
       'anchor_ev', 'warhead_ev', 'POI', 'E3', 'tanimoto', 'qed_linker',
       'num_rings_linker', 'num_rot_bonds_linker', 'branched', 'PAINS',
       'ring_arom', 'sa_linker', 'chamfer_distance', 'rmsd', 'to_3d',
       'vinardo', 'embedded_mol', 'embedded_path'],
      dtype='object')

In [22]:
# drop embedded_mol column
add_cols = ['E_torsion',
  'clashes_cutoff',
  'clashes_vdw',
  'embedded_mol',
  'embedded_path',
  'rmsd_anc',
  'rmsd_wrh',
  'sc_rdkit',
  'vinardo']
# add cols to val_to3d by mapping ID
for col in add_cols:
    df[col] = df['ID'].map(df_calc.set_index('ID')[col])
df = df.drop(columns=['embedded_mol'])

In [23]:
df.head()

Unnamed: 0,ID,reference,protac_smiles,linker_smiles,extended_linker_smiles,anchor_smiles,warhead_smiles,anchor_ev,warhead_ev,POI,...,rmsd,to_3d,vinardo,embedded_path,E_torsion,clashes_cutoff,clashes_vdw,rmsd_anc,rmsd_wrh,sc_rdkit
0,7ZNT_base_0,7ZNT,COc1cc(CSC(C)(C)[C@@H](NC(=O)C2(F)CC2)C(=O)N2C...,CCn1cc(C(=O)c2ncc(C)cc2OC)nc1N,COc1cc(CSC)cnc1C(=O)c1cn(CCNC=O)c(N)n1,Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O...,Cc1sc2c(c1C)C(c1ccc(Cl)cc1)=N[C@@H](CC(N)=O)c1...,Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O...,Cc1sc2c(c1C)C(c1ccc(Cl)cc1)=N[C@@H](CC(=O)N*)c...,BRD4-BD2,...,2.389408,True,,,,,,,,
1,7ZNT_base_2,7ZNT,Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O...,CCOc1cc(C)nc2cc(N)cnc12,CSCc1cc(OCCNC=O)c2ncc(N)cc2n1,Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O...,Cc1sc2c(c1C)C(c1ccc(Cl)cc1)=N[C@@H](CC(N)=O)c1...,Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O...,Cc1sc2c(c1C)C(c1ccc(Cl)cc1)=N[C@@H](CC(=O)N*)c...,BRD4-BD2,...,5.534881,True,-16.92254,selected_min_conf1__7ZNT_base_2.sdf,64.545705,0.0,9.0,0.355224,0.566088,0.762957
2,7ZNT_base_3,7ZNT,Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O...,CC(=O)OCc1cnc(C)o1,CSCc1ncc(COC(=O)CNC=O)o1,Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O...,Cc1sc2c(c1C)C(c1ccc(Cl)cc1)=N[C@@H](CC(N)=O)c1...,Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O...,Cc1sc2c(c1C)C(c1ccc(Cl)cc1)=N[C@@H](CC(=O)N*)c...,BRD4-BD2,...,1.462539,True,-16.5098,selected_min_conf2__7ZNT_base_3.sdf,64.828444,0.0,9.0,0.58143,0.74855,0.731996
3,7ZNT_base_4,7ZNT,COc1cccnc1NCc1nc(CSC(C)(C)[C@@H](NC(=O)C2(F)CC...,CC#CCOc1ccc(C)nc1CNc1ncccc1OC,COc1cccnc1NCc1nc(CSC)ccc1OCC#CCNC=O,Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O...,Cc1sc2c(c1C)C(c1ccc(Cl)cc1)=N[C@@H](CC(N)=O)c1...,Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O...,Cc1sc2c(c1C)C(c1ccc(Cl)cc1)=N[C@@H](CC(=O)N*)c...,BRD4-BD2,...,3.77847,True,,,,,,,,
4,7ZNT_base_5,7ZNT,COc1c(C(=O)OCCNC(=O)C[C@H]2N=C(c3ccc(Cl)cc3)c3...,CCOCC(=O)Nc1ccc(N)c(OC(=O)c2cccc(C(=O)OCC)c2OC)c1,COc1c(C(=O)OCCNC=O)cccc1C(=O)Oc1cc(NC(=O)COCCS...,Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O...,Cc1sc2c(c1C)C(c1ccc(Cl)cc1)=N[C@@H](CC(N)=O)c1...,Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O...,Cc1sc2c(c1C)C(c1ccc(Cl)cc1)=N[C@@H](CC(=O)N*)c...,BRD4-BD2,...,3.583126,True,,,,,,,,


In [24]:
len(df[df.rmsd_anc.isnull()]), len(df)

(2296, 4240)

In [25]:
df.to_csv(os.path.join(gen_folder, f'{pdb}_sampled_{method}_valid_fil.csv'), index=False)

### Summary of metrics

In [None]:
print(f'average anchor RMSD: {df["rmsd_anc"].mean(skipna=True):.4f}')
print(f'average warhead RMSD: {df["rmsd_wrh"].mean(skipna=True):.4f}')
print(f'average SC_RDKIT: {df["sc_rdkit"].mean(skipna=True):.4f}')
print(f'average clashes_vdw: {df["clashes_vdw"].mean(skipna=True):.4f}')
print(f'average E_torsion: {df["E_torsion"].mean(skipna=True):.4f}')