In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
smiles_df = pd.read_csv("smiles.csv", header=None)
smiles_df.columns = ["Drug", "SMILES"]
print("Loaded SMILES:")
display(smiles_df.head())
def generate_ecfp6(smiles, radius=6, nBits=1024):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits)
    arr = np.zeros((nBits,), dtype=int)
    Chem.DataStructs.ConvertToNumpyArray(fp, arr)
    return arr
fingerprints = []
for idx, row in smiles_df.iterrows():
    drug = row["Drug"]
    smiles = row["SMILES"]
    fp = generate_ecfp6(smiles)
    if fp is None:
        print(f"Could not parse SMILES for drug {drug}")
        continue
    fp_row = [drug] + fp.tolist()
    fingerprints.append(fp_row)
columns = ["Drug"] + [f"ECFP6_{i}" for i in range(1024)]
ecfp_df = pd.DataFrame(fingerprints, columns=columns)

print("Generated ECFP6 shape:", ecfp_df.shape)
display(ecfp_df.head())
output_file = "ECFP6_1024bits.xlsx"
ecfp_df.to_excel(output_file, index=False)

Loaded SMILES:


Unnamed: 0,Drug,SMILES
0,5-FU,O=c1[nH]cc(F)c(=O)[nH]1
1,ABT-888,CC1(c2nc3c(C(N)=O)cccc3[nH]2)CCCN1
2,AZD1775,C=CCn1c(=O)c2cnc(Nc3ccc(N4CCN(C)CC4)cc3)nc2n1-...
3,BEZ-235,Cn1c(=O)n(-c2ccc(C(C)(C)C#N)cc2)c2c3cc(-c4cnc5...
4,BORTEZOMIB,CC(C)CC(NC(=O)C(Cc1ccccc1)NC(=O)c1cnccn1)B(O)O




Generated ECFP6 shape: (38, 1025)


Unnamed: 0,Drug,ECFP6_0,ECFP6_1,ECFP6_2,ECFP6_3,ECFP6_4,ECFP6_5,ECFP6_6,ECFP6_7,ECFP6_8,...,ECFP6_1014,ECFP6_1015,ECFP6_1016,ECFP6_1017,ECFP6_1018,ECFP6_1019,ECFP6_1020,ECFP6_1021,ECFP6_1022,ECFP6_1023
0,5-FU,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ABT-888,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,AZD1775,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,BEZ-235,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,BORTEZOMIB,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0



ECFP6 file saved as: ECFP6_1024bits.xlsx
