In [12]:
from rdkit import Chem
import pandas as pd

In [14]:
# Function to read an SDF file and return a list of dictionaries with molecule data
def read_sdf_to_dict(sdf_file):
    """Read an SDF file and return one record per parseable molecule.

    Each record is a dict with the keys:
        "Molecule_Name": the molecule's ``_Name`` property, or "Unknown"
            when the property is absent.
        "SMILES": the string returned by ``Chem.MolToSmiles`` for the molecule.
        "Ligand": "actives" when the name starts with "CHEM", "decoys" when
            it starts with "ZINC", and "Unknown" for any other name.

    Entries that the supplier yields as ``None`` are skipped.

    Args:
        sdf_file: Path of the SDF file, passed to ``Chem.SDMolSupplier``.

    Returns:
        list[dict]: The records, in the order the supplier yields them.
    """
    # Maps the first four characters of the molecule name to its ligand class.
    prefix_to_kind = {"CHEM": "actives", "ZINC": "decoys"}

    supplier = Chem.SDMolSupplier(sdf_file)
    molecule_data = []

    for mol in supplier:
        if mol is None:
            continue

        # Get the SMILES string
        smiles = Chem.MolToSmiles(mol)

        # Get the molecule's name (if available)
        mol_name = mol.GetProp("_Name") if mol.HasProp("_Name") else "Unknown"

        # Always assign a label: without the default, an unrecognised name
        # would either raise UnboundLocalError (first molecule) or silently
        # inherit the label of the previous molecule.
        kind = prefix_to_kind.get(mol_name[:4], "Unknown")

        # You can extract more properties if needed using mol.GetProp("property_name")

        # Append to the list as a dictionary
        molecule_data.append({
            "Molecule_Name": mol_name,
            "SMILES": smiles,
            "Ligand": kind
        })

    return molecule_data

In [15]:
# Input SDF files (actives set, decoys set); relative paths resolve against the working directory
actives_file, decoys_file = "actives_final.sdf", "decoys_final.sdf"

In [16]:
# Parse each SDF file into a list of per-molecule records (actives first, then decoys)
actives_data = read_sdf_to_dict(sdf_file=actives_file)
decoys_data = read_sdf_to_dict(sdf_file=decoys_file)

In [17]:
# Concatenate the record lists into a new list: all actives records followed by all decoys records
combined_ligands = [*actives_data, *decoys_data]

In [18]:
# One row per record; columns come from the record keys
df = pd.DataFrame(data=combined_ligands)

In [21]:
# index=True is the pandas default, spelled out here: the row index is written as the first (unnamed) column
df.to_csv("HMDH_combined_ligand_SMILES.csv", index=True)