In [2]:
import pandas as pd
from rdkit import Chem

# Script: augment a drug dataset with randomized (non-canonical) SMILES.
#
# Reads 'drug_dataset.csv', locates its SMILES column, generates randomized
# SMILES strings for the parseable molecules until TARGET_COUNT unique
# strings have been collected (or no further progress is being made), and
# writes them to OUTPUT_PATH under the same column name as the input.

INPUT_PATH = 'drug_dataset.csv'
OUTPUT_PATH = 'new_gen_smiles1.csv'
TARGET_COUNT = 2000
# Number of consecutive full passes over the molecules that may add nothing
# new before we give up. Without this, inputs that cannot yield TARGET_COUNT
# distinct randomized SMILES would keep the loop running forever.
MAX_STALLED_ROUNDS = 50

# Load the original dataset
df = pd.read_csv(INPUT_PATH)

# Determine the SMILES column; only the spellings 'smiles' and 'SMILES'
# are recognized, checked in that order.
smiles_col = next(
    (col for col in ('smiles', 'SMILES') if col in df.columns),
    None,
)
if smiles_col is None:
    raise ValueError("No SMILES column found in the data. Expected a column named 'smiles' or 'SMILES'.")

# Get unique SMILES from the dataset (ignoring missing values)
original_smiles = df[smiles_col].dropna().unique()
print(f"Found {len(original_smiles)} unique original SMILES.")

# Parse every input exactly once. Chem.MolFromSmiles returns None for
# strings it cannot parse; those entries are skipped, as before.
parsed_mols = (Chem.MolFromSmiles(s) for s in original_smiles)
molecules = [mol for mol in parsed_mols if mol is not None]
if not molecules:
    # Previously this situation made the generation loop spin forever.
    raise ValueError(
        f"None of the SMILES in column '{smiles_col}' could be parsed; "
        "cannot generate synthetic SMILES."
    )

# Set to hold the synthetic (randomized) SMILES
synthetic_smiles_set = set()

# Each round emits one randomized SMILES per molecule; the set removes
# duplicates. Stop once the target is reached or progress has stalled.
stalled_rounds = 0
while len(synthetic_smiles_set) < TARGET_COUNT and stalled_rounds < MAX_STALLED_ROUNDS:
    size_before = len(synthetic_smiles_set)
    for mol in molecules:
        # Generate a randomized SMILES (non-canonical)
        synthetic_smiles_set.add(Chem.MolToSmiles(mol, doRandom=True))
        # Break early if we've reached our target
        if len(synthetic_smiles_set) >= TARGET_COUNT:
            break
    if len(synthetic_smiles_set) == size_before:
        stalled_rounds += 1
    else:
        stalled_rounds = 0

if len(synthetic_smiles_set) < TARGET_COUNT:
    print(
        f"Warning: stopped after {MAX_STALLED_ROUNDS} rounds without new SMILES; "
        f"target was {TARGET_COUNT}."
    )
print(f"Generated {len(synthetic_smiles_set)} unique synthetic SMILES.")

# Convert the set to a DataFrame and save to CSV
synthetic_smiles_list = list(synthetic_smiles_set)
synthetic_df = pd.DataFrame({smiles_col: synthetic_smiles_list})
synthetic_df.to_csv(OUTPUT_PATH, index=False)

# Report the path that was actually written (the old message named a
# different file than the one saved).
print(f"Synthetic SMILES saved to '{OUTPUT_PATH}'")


Found 36 unique original SMILES.
Generated 2000 unique synthetic SMILES.
Synthetic SMILES saved to 'synthetic_dengue_smiles1.csv'
