In [None]:
import pandas as pd
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import Draw
import py3Dmol
import tqdm

from rdkit import Chem
from rdkit.Chem import RDConfig
import os
import sys

# Ensure the 'Contrib/SA_Score' directory is in your path to import sascorer
# If sascorer.py is in the same directory, you can just import it directly
sys.path.append(os.path.join(RDConfig.RDContribDir, 'SA_Score'))
import sascorer
from rdkit.Chem import Descriptors


In [None]:
# Load the dataset analysed by the cells below. The path is a placeholder:
# replace it with the location of your own CSV before running.
df_main = pd.read_csv('/path/to/dataset.csv')



In [None]:
from rdkit.Chem import QED
from rdkit.Chem import Crippen
from rdkit.Chem import AllChem, DataStructs


def get_fp_similarity(smile1, smile2):
    """Return the Tanimoto similarity between two molecules given as SMILES.

    Each molecule is encoded as a Morgan fingerprint bit vector (radius 2,
    2048 bits, chirality ignored) before the comparison.

    Args:
        smile1: SMILES string of the first molecule.
        smile2: SMILES string of the second molecule.

    Returns:
        The Tanimoto similarity of the two fingerprints, or 0.0 when either
        input cannot be parsed or fingerprinting fails.
    """
    try:
        mol1 = Chem.MolFromSmiles(smile1)
        mol2 = Chem.MolFromSmiles(smile2)
        # MolFromSmiles reports an unparseable SMILES by returning None
        # rather than raising, so handle that case explicitly.
        if mol1 is None or mol2 is None:
            return 0.0
        fp1 = AllChem.GetMorganFingerprintAsBitVect(mol1, 2, nBits=2048, useChirality=False)
        fp2 = AllChem.GetMorganFingerprintAsBitVect(mol2, 2, nBits=2048, useChirality=False)
        return DataStructs.TanimotoSimilarity(fp1, fp2)
    except Exception:
        # Best effort: any failure (e.g. a non-string input) counts as "no
        # similarity". Catching Exception instead of using a bare except lets
        # KeyboardInterrupt / SystemExit propagate so long runs can be stopped.
        return 0.0

def enrich(df, mol_col, org_mol_col, mol_aff_col, org_mol_aff_col, min_similarity=0.4):
    """Add similarity, success and molecular-descriptor columns to ``df`` in place.

    Columns written: ``Similarity``, ``Success``, ``Diversity``, ``SA``,
    ``mw``, ``LogP`` and ``QED``. An existing lowercase ``similarity`` column
    is removed first.

    Args:
        df: DataFrame to enrich; it is modified in place.
        mol_col: Name of the column holding the SMILES to describe.
        org_mol_col: Name of the column holding the original SMILES that
            ``mol_col`` is compared against.
        mol_aff_col: Name of the affinity column for ``mol_col``.
        org_mol_aff_col: Name of the affinity column for ``org_mol_col``.
        min_similarity: Exclusive lower bound on ``Similarity`` for a row to
            count as a success. Defaults to 0.4.

    Returns:
        The same ``df`` object, with the new columns added.
    """
    def _to_mol(smi):
        # Non-string cells (e.g. NaN from the CSV) cannot be parsed; treat
        # them like an invalid SMILES instead of letting RDKit raise.
        return Chem.MolFromSmiles(smi) if isinstance(smi, str) else None

    mols = df[mol_col].apply(_to_mol)

    def _descriptor(fn):
        # Unparseable molecules get NaN so a single bad row does not abort
        # the whole enrichment.
        return mols.apply(lambda m: fn(m) if m is not None else float('nan'))

    # Remove the lowercase 'similarity' column if the frame has one.
    if 'similarity' in df.columns:
        del df['similarity']

    df['Similarity'] = [
        get_fp_similarity(smi, org_smi)
        for smi, org_smi in zip(df[mol_col], df[org_mol_col])
    ]
    # Success: affinity value is strictly lower than the original's, while the
    # molecule is similar enough to the original but not identical to it.
    df['Success'] = (
        (df[mol_aff_col] < df[org_mol_aff_col])
        & (df['Similarity'] > min_similarity)
        & (df['Similarity'] < 1)
    )
    df['Diversity'] = 1 - df['Similarity']
    df['SA'] = _descriptor(sascorer.calculateScore)
    df['mw'] = _descriptor(Descriptors.MolWt)
    df['LogP'] = _descriptor(Descriptors.MolLogP)
    df['QED'] = _descriptor(QED.qed)
    return df

# Enrich the main dataset in place; the returned frame is the cell's output.
enrich(
    df_main,
    mol_col='optimized_smiles',
    org_mol_col='original_smiles',
    mol_aff_col='optimized_affinity',
    org_mol_aff_col='original_affinity',
)

In [None]:
# Bookkeeping columns that are removed before the summary statistics are computed.
del_cols = ['protein_pdb', 'original_sdf','optimized_smiles', 'original_affinity','original_smiles']

def join_dfs(**dfs):
    """Summarise several result frames over the original molecules they share.

    Rows containing missing values are dropped from every frame. The frames
    are then restricted to the ``original_smiles`` values present in all of
    them, the columns listed in ``del_cols`` are removed, and the mean and
    variance of each remaining numeric/boolean column are computed.

    Args:
        **dfs: Mapping of a display name to a DataFrame. Every frame must
            have an ``original_smiles`` column.

    Returns:
        A DataFrame with one row per input frame (indexed by its name) and
        one column per (column, statistic) pair. Empty when no frames are
        passed.
    """
    # Drop incomplete rows once per frame and reuse the result in both passes.
    complete = {name: df.dropna() for name, df in dfs.items()}

    smiles_sets = [set(df['original_smiles'].unique()) for df in complete.values()]
    # With no input frames there is nothing in common (previously len(None) raised).
    common_mols = set.intersection(*smiles_sets) if smiles_sets else set()
    print(f'Common molecules: {len(common_mols)}')

    summary_rows = []
    for name, df in complete.items():
        df = df[df['original_smiles'].isin(common_mols)]
        # Frames need not carry every bookkeeping column, so ignore missing ones.
        df = df.drop(columns=del_cols, errors='ignore')
        # mean/var are only defined for numeric data; leftover text columns
        # would make agg raise.
        df = df.select_dtypes(include=['number', 'bool'])
        row = df.agg(['mean', 'var']).unstack()
        row.name = name
        summary_rows.append(row)
    return pd.DataFrame(summary_rows)

# Reference baseline: every original molecule stands in as its own "optimized" result.
df_ref = df_main.copy()
df_ref['optimized_smiles'] = df_ref['original_smiles']
df_ref['optimized_affinity'] = df_ref['original_affinity']
# df_main was already enriched before being copied, so the copied
# Similarity/Success/Diversity/SA/mw/LogP/QED columns still describe the
# optimized molecules. Recompute them so the "Ref" row reflects the originals.
enrich(df_ref, 'optimized_smiles', 'original_smiles', 'optimized_affinity', 'original_affinity')

summary_df = join_dfs(Ref=df_ref, MODOLO=df_main)
# summary_df = join_dfs(Ref=df_ref, MODOLO=df_main, CFOM=df_cfom, DiffSBDD=df_diffsbdd)
print(summary_df.to_string(float_format=lambda x: f'{x:.2f}',col_space=6))