In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
import selfies as sf

In [2]:
# Load the existing HDF5 file into a DataFrame
hdf5_file = "np_selfies_dataset.h5"
df = pd.read_hdf(hdf5_file, key='data')

In [3]:

# Define a function to calculate properties from an RDKit molecule
def calculate_properties(mol):
    if mol is None:  # Return None values if molecule conversion fails
        print("Molecule conversion failed")
        return None, None, None, None, None
    
    # Calculate properties
    mol_weight = Descriptors.MolWt(mol)
    log_p = Descriptors.MolLogP(mol)
    tpsa = Descriptors.TPSA(mol)
    h_donors = Descriptors.NumHDonors(mol)
    h_acceptors = Descriptors.NumHAcceptors(mol)
    
    return mol_weight, log_p, tpsa, h_donors, h_acceptors

In [4]:
# Prepare lists for storing results
smiles_list, mw_list, logp_list, tpsa_list, h_donors_list, h_acceptors_list = [], [], [], [], [], []

# Process each SELFIES string in the DataFrame
count = 0
for selfie in df['SELFIES']:
    # Convert SELFIES to SMILES
    try:
        smiles = sf.decoder(selfie)
        mol = Chem.MolFromSmiles(smiles)
    except Exception as e:
        print("Failed to convert from SELFIES to SMILES")
        smiles, mol = None, None  # Handle conversion errors
    
    smiles_list.append(smiles)
    
    # Calculate properties
    mw, logp, tpsa, h_donors, h_acceptors = calculate_properties(mol)
    mw_list.append(mw)
    logp_list.append(logp)
    tpsa_list.append(tpsa)
    h_donors_list.append(h_donors)
    h_acceptors_list.append(h_acceptors)

    # Log progress
    count += 1
    if count % 100 == 0:
        print(f"Processed {count}/{len(df['SELFIES'])}")

# Add the results as new columns to the DataFrame
df['SMILES'] = smiles_list
df['molecular_weight'] = mw_list
df['logP'] = logp_list
df['tpsa'] = tpsa_list
df['h_bond_donors'] = h_donors_list
df['h_bond_acceptors'] = h_acceptors_list

# Save the updated DataFrame to a new HDF5 file
output_hdf5_file = "np_selfies_with_properties.h5"
df.to_hdf(output_hdf5_file, key='data', mode='w', format='table')

print("New HDF5 file created with SELFIES, SMILES, and calculated properties.")

Processed 100/404318
Processed 200/404318
Processed 300/404318
Processed 400/404318
Processed 500/404318
Processed 600/404318
Processed 700/404318
Processed 800/404318
Processed 900/404318
Processed 1000/404318
Processed 1100/404318
Processed 1200/404318
Processed 1300/404318
Processed 1400/404318
Processed 1500/404318
Processed 1600/404318
Processed 1700/404318
Processed 1800/404318
Processed 1900/404318
Processed 2000/404318
Processed 2100/404318
Processed 2200/404318
Processed 2300/404318
Processed 2400/404318
Processed 2500/404318
Processed 2600/404318
Processed 2700/404318
Processed 2800/404318
Processed 2900/404318
Processed 3000/404318
Processed 3100/404318
Processed 3200/404318
Processed 3300/404318
Processed 3400/404318
Processed 3500/404318
Processed 3600/404318
Processed 3700/404318
Processed 3800/404318
Processed 3900/404318
Processed 4000/404318
Processed 4100/404318
Processed 4200/404318
Processed 4300/404318
Processed 4400/404318
Processed 4500/404318
Processed 4600/4043