In [None]:
## PART 4: VIRTUAL SCREENING FOR TEST DATASET (DRUGBANK DATASET2) USING TRAINED MODEL

In [None]:
## Data source: DrugBank open-data release (https://go.drugbank.com/releases/latest#open-data), downloaded in SDF format as "open structures.sdf".

## The SDF file was opened in Discovery Studio; the common names and SMILES strings were extracted and saved as "VS_new_compounds.csv".

In [16]:
# 1 Prepare the virtual screening library

# Collect potential compounds for screening as SMILES.


In [17]:
!pip install rdkit



In [18]:
import pandas as pd
from rdkit import Chem

# Read the screening library (the file provides "Common_Name" and "SMILES" columns)
# and keep only the rows that actually carry a SMILES string.
# The explicit copy yields an independent frame, so columns can be added to it later.
vs_data = (
    pd.read_csv("VS_new_compounds.csv")
    .dropna(subset=['SMILES'])
    .copy()
)


In [19]:
# Quick sanity check of the loaded library; pandas abbreviates the middle rows when printing.
print(vs_data)

              Common_Name                                             SMILES
0             Bivalirudin  CC[C@H](C)[C@H](NC(=O)[C@H](CCC(=O)O)NC(=O)[C@...
1               Goserelin  CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...
2            Gramicidin D  CC(C)C[C@@H](NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(...
3            Desmopressin  N=C(N)NCCC[C@@H](NC(=O)[C@@H]1CCCN1C(=O)[C@@H]...
4              Cetrorelix  CC(=O)N[C@H](Cc1ccc2ccccc2c1)C(=O)N[C@H](Cc1cc...
...                   ...                                                ...
12304       Tulmimetostat  COC1CN([C@H]2CC[C@H]([C@@]3(C)Oc4c(Cl)cc(C(=O)...
12305        Ibuzatrelvir  COC(=O)N[C@H](C(=O)N1C[C@H](C(F)(F)F)C[C@H]1C(...
12306        Cetyl oleate         CCCCCCCC/C=C\CCCCCCCC(=O)OCCCCCCCCCCCCCCCC
12307  Cetyl myristoleate             CCCC/C=C\CCCCCCCC(=O)OCCCCCCCCCCCCCCCC
12308  Cetyl palmitoleate           CCCCCC/C=C\CCCCCCCC(=O)OCCCCCCCCCCCCCCCC

[12309 rows x 2 columns]


In [20]:
# 2 Compute Morgan fingerprints for the new compounds

# Use the same fingerprint generator as for training:

In [21]:
from rdkit.Chem import rdFingerprintGenerator
import numpy as np

# Build the Morgan fingerprint generator used to featurise the screening library.
# NOTE(review): radius and fpSize must match the generator used when the model was
# trained — confirm against the training notebook.
FP_RADIUS = 2
FP_SIZE = 1024
morgan_gen = rdFingerprintGenerator.GetMorganGenerator(radius=FP_RADIUS, fpSize=FP_SIZE)


def smiles_to_morgan(smiles):
    """Convert one SMILES string into a Morgan bit-vector fingerprint.

    Parameters
    ----------
    smiles : str
        SMILES representation of a single compound.

    Returns
    -------
    numpy.ndarray
        Array of length ``FP_SIZE``. When RDKit cannot parse the SMILES, an
        all-zero placeholder is returned so the row stays aligned with ``vs_data``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Use uint8 (the dtype RDKit returns for bit fingerprints) so a single
        # unparsable row does not upcast the whole stacked matrix to float64.
        return np.zeros(FP_SIZE, dtype=np.uint8)
    return morgan_gen.GetFingerprintAsNumPy(mol)


# One fingerprint per compound, then stacked into an (n_compounds, FP_SIZE) matrix.
vs_data['MorganFingerprint'] = vs_data['SMILES'].apply(smiles_to_morgan)
X_vs = np.vstack(vs_data['MorganFingerprint'].to_numpy())

# An all-zero row carries no structural information, yet it would still be scored
# by the model; report such rows instead of letting them pass silently.
n_empty_fps = int((~X_vs.any(axis=1)).sum())
if n_empty_fps:
    print(
        f"Warning: {n_empty_fps} compound(s) produced an empty fingerprint "
        "(unparsable or empty SMILES); their predictions are not meaningful."
    )


[14:56:00] Unusual charge on atom 42 number of radical electrons set to zero


In [None]:
# 3 Load trained model

In [23]:
import joblib

# Restore the previously trained classifier from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
# only load model files that come from a trusted source.
model = joblib.load("xgb_model.pkl")


In [24]:
# 4 Make predictions

In [25]:
# Score every compound in the library. Column 1 of predict_proba is taken as the
# probability of the "active" class.
# NOTE(review): assumes the positive class sits at index 1 — confirm via model.classes_.
class_probabilities = model.predict_proba(X_vs)
y_proba = class_probabilities[:, 1]

# Store the raw score next to a hard 0/1 call (1 when the probability reaches 0.5).
vs_data['Predicted_Prob'] = y_proba
vs_data['Predicted_Activity'] = np.where(y_proba >= 0.5, 1, 0)


In [None]:
# 5 Rank compounds

# rank by predicted probability for prioritizing experimental validation:

In [27]:
# Rank compounds from the highest to the lowest predicted probability.
vs_data_sorted = vs_data.sort_values(by='Predicted_Prob', ascending=False)

# Export the ranking without the fingerprint column: each cell holds a NumPy array,
# which to_csv would write as its abbreviated text form ("[0 0 0 ... 0 0 0]"),
# adding noise without storing the actual bits. The in-memory frame keeps the column.
(
    vs_data_sorted
    .drop(columns=['MorganFingerprint'], errors='ignore')
    .to_csv("virtual_screening_results.csv", index=False)
)


In [30]:
# Show the ranked table (highest predicted probability first).
print(vs_data_sorted)

                                             Common_Name  \
5436   (3Z)-6-(4-HYDROXY-3-METHOXYPHENYL)-3-(1H-PYRRO...   
11110                                            PEN-866   
1778                     Furo[2,3d]Pyrimidine Antifolate   
11063                                 Carotegrast methyl   
5880   4-{5-[(Z)-(2-Imino-4-Oxo-1,3-Thiazolidin-5-Yli...   
...                                                  ...   
8164                                         Cilengitide   
10348                                           AZD-4017   
7140                                Gabapentin enacarbil   
7414                                             AS-8112   
10443                                           AZD-8186   

                                                  SMILES  \
5436     COc1cc(-c2ccc3c(c2)NC(=O)/C3=C\c2ccc[nH]2)ccc1O   
11110  CCc1c2c(nc3ccc(OC(=O)N4CCC(CCn5ccc6cc(-n7c(-c8...   
1778   CN(Cc1coc2nc(N)nc(N)c12)c1ccc(C(=O)N[C@@H](CCC...   
11063  COC(=O)[C@H](Cc1ccc(-n2c(=O)c3cc