In [1]:
import pandas as pd
from pathlib import Path

# Collect every .pdbqt file found recursively under the current directory.
# The paths are sorted so the row order does not depend on filesystem
# traversal order, and the column is named in the constructor so an empty
# search still yields a frame with a 'file' column (assigning `df.columns`
# on a zero-column frame raises ValueError).
pdbqt_paths = sorted(str(path) for path in Path("./").rglob("*.pdbqt"))
df = pd.DataFrame({'file': pdbqt_paths})

In [2]:
import re
import sys
sys.path.append('..')
from pdbqt import get_first_mol_from_pdbqt

def extract_id(file_name):
    """Return the first run of decimal digits in ``file_name`` as a string.

    The whole string is scanned left to right, so digits appearing earlier
    in the text (for example in a directory component) win over later ones.

    Raises:
        IndexError: if ``file_name`` contains no digit at all.
    """
    id_pattern = re.compile(r"([0-9]+)")  # ids are just numbers
    digit_runs = id_pattern.findall(file_name)
    return digit_runs[0]

In [3]:
# Parse each pdbqt file and expand the helper's result into two columns.
# NOTE(review): presumably get_first_mol_from_pdbqt returns a (smiles, score)
# pair when return_type='smiles' -- confirm against the pdbqt module.
first_mol_fields = df['file'].apply(
    lambda pdbqt_path: pd.Series(get_first_mol_from_pdbqt(pdbqt_path, return_type='smiles'))
)
df[['smiles', 'score']] = first_mol_fields

In [4]:
from rdkit.Chem import AllChem
# Shared Morgan fingerprint generator used by the fingerprinting code below
# (radius 2, fpSize 512).
MORGAN_RADIUS = 2
MORGAN_FP_SIZE = 512
morgan = AllChem.GetMorganGenerator(radius=MORGAN_RADIUS, fpSize=MORGAN_FP_SIZE)

def morgan_fp(smiles):
    """Compute a Morgan fingerprint for a molecule given as a SMILES string.

    The fingerprint is produced by the module-level ``morgan`` generator.

    Args:
        smiles: SMILES string describing the molecule.

    Returns:
        The object returned by ``morgan.GetFingerprint`` for the parsed molecule.

    Raises:
        ValueError: if RDKit cannot parse ``smiles`` into a molecule.
    """
    # The notebook only imports `AllChem`; the bare name `Chem` was never
    # bound, so this function raised NameError on a fresh kernel.
    from rdkit import Chem

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail with
        # the offending input instead of an opaque error from the generator.
        raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")
    return morgan.GetFingerprint(mol)

# Fingerprint every molecule; the results are stored next to the SMILES
# so the clustering step can read them from the 'morgan' column.
smiles_column = df['smiles']
df['morgan'] = smiles_column.apply(morgan_fp)

In [5]:
from rdkit import DataStructs
from rdkit.ML.Cluster import Butina

def get_cluster_representative(df, cutoff):
    """Cluster molecules with Butina and keep one representative per cluster.

    Pairwise distances are ``1 - Tanimoto similarity`` between the
    fingerprints in ``df['morgan']``. Within each cluster the row with the
    lowest ``df['score']`` is kept.

    Args:
        df: DataFrame with a 'morgan' column (fingerprints) and a 'score'
            column. Any index is accepted; rows are addressed by position.
        cutoff: distance threshold passed to ``Butina.ClusterData`` as
            ``distThresh``.

    Returns:
        The rows of ``df`` chosen as representatives, one per cluster, in the
        order the clusters are returned by ``Butina.ClusterData``.
    """
    # Use a plain list so every lookup below is positional. Indexing the
    # Series directly (`fps[i]`) is label-based and breaks, or silently picks
    # the wrong row, when `df` does not have a default 0..n-1 index.
    fps = list(df['morgan'])
    n_fps = len(fps)

    # Condensed lower-triangle distance list in the order ClusterData expects:
    # (1,0), (2,0), (2,1), (3,0), ...
    dists = []
    for i in range(1, n_fps):
        sims = DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
        dists.extend(1 - x for x in sims)

    clusters = Butina.ClusterData(data=dists, nPts=n_fps, distThresh=cutoff, isDistData=True)

    # Butina reports cluster members as positions. Re-index the scores by
    # position so idxmin() also yields a position, never an index label.
    scores = df['score'].reset_index(drop=True)

    # choose one representative per cluster: pick molecule with best docking score
    selected_pos = []
    for cl in clusters:
        members = list(cl)
        if len(members) == 1:
            # Singleton cluster: nothing to compare.
            selected_pos.append(members[0])
            continue
        selected_pos.append(scores.iloc[members].idxmin())

    return df.iloc[selected_pos]

In [6]:
# One representative per Butina cluster (distance cutoff 0.4), ranked by
# ascending score; the first ten rows are kept as leads.
cluster_representatives = get_cluster_representative(df, 0.4)
ranked_representatives = cluster_representatives.sort_values(by='score', ascending=True)
leads = ranked_representatives.head(10)

In [7]:
# Write the SMILES and score of the selected leads to disk, without the index.
lead_table = leads[['smiles', 'score']]
lead_table.to_csv('top10.csv', index=False)