# In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
# --- Drug structures --------------------------------------------------------
# The SMILES file is read without a header row; its two columns are labelled
# "Drug" and "SMILES" here.
smiles_df = pd.read_csv(
    "smiles.csv",
    header=None,
    names=["Drug", "SMILES"],
)
print("Loaded SMILES file:")
print(smiles_df.head())

# Turn each SMILES string into an RDKit molecule and warn when any entry
# comes back null.
smiles_df["Mol"] = smiles_df["SMILES"].apply(Chem.MolFromSmiles)
has_unparsed = smiles_df["Mol"].isna().any()
if has_unparsed:
    print("Warning: some SMILES failed to parse.")

# --- Structural alerts ------------------------------------------------------
# Keep only the description and SMARTS columns of the alert collection and
# give them the names used by the rest of the script.
alert_column_names = {"description": "Name", "smarts": "SMARTS"}
alerts_df = pd.read_csv("rd_filters/rd_filters/data/alert_collection.csv")
alerts_df = alerts_df[list(alert_column_names)].rename(columns=alert_column_names)
print("Loaded SMARTS alerts:")
print(alerts_df.head())
def safe_smarts(smarts):
    """Compile a SMARTS pattern into an RDKit query molecule, or return None.

    Args:
        smarts: The SMARTS pattern text. Non-string values (for example the
            NaN pandas produces for an empty CSV cell) are treated as
            invalid.

    Returns:
        Whatever ``Chem.MolFromSmarts`` returns for a string input, or None
        when the input is not a string or the call raises.
    """
    # Reject non-string cells up front instead of relying on the RDKit
    # binding to raise for them.
    if not isinstance(smarts, str):
        return None
    try:
        return Chem.MolFromSmarts(smarts)
    except Exception:
        # Best-effort by design: one bad pattern must not abort the run.
        # Unlike a bare ``except``, this lets KeyboardInterrupt and
        # SystemExit propagate.
        return None
# Compile every alert SMARTS into a query molecule and keep only the alerts
# that compiled successfully.
alerts_df["Mol"] = alerts_df["SMARTS"].apply(safe_smarts)
alerts_df = alerts_df[alerts_df["Mol"].notnull()].reset_index(drop=True)
print(f"Valid SMARTS patterns: {len(alerts_df)}")

# Drugs whose SMILES failed to parse have a null "Mol"; calling
# HasSubstructMatch on them would raise AttributeError. They are excluded
# from the output rather than written as all-zero rows, which would read as
# "no alerts found".
parsed_mask = smiles_df["Mol"].notnull()
if not parsed_mask.all():
    skipped = smiles_df.loc[~parsed_mask, "Drug"].tolist()
    print(f"Skipping {len(skipped)} drug(s) with unparseable SMILES: {skipped}")
screened_df = smiles_df[parsed_mask].reset_index(drop=True)

# Binary fingerprint: rows are screened drugs, columns are alerts, and a 1
# means the drug contains that alert substructure.
alert_mols = list(alerts_df["Mol"])
feature_matrix = np.zeros((len(screened_df), len(alert_mols)), dtype=int)
for i, drug_mol in enumerate(screened_df["Mol"]):
    for j, alert_mol in enumerate(alert_mols):
        if drug_mol.HasSubstructMatch(alert_mol):
            feature_matrix[i, j] = 1

# Label the columns with the alert names, attach the drug identifiers
# (both frames share a fresh 0..n-1 index, so rows line up) and export.
tox_df = pd.DataFrame(feature_matrix, columns=alerts_df["Name"])
tox_df["Drug"] = screened_df["Drug"]
tox_df.to_csv("drug_toxicophore_features_descriptive.csv", index=False)
print("Toxicophore features saved successfully")