In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors
import numpy as np
from tqdm import tqdm

In [2]:
data_prefix = 'data_cleaned/'
df = pd.read_csv(data_prefix + 'original_data.csv')
df

Unnamed: 0,MOL_1,MOL_2,logV,MolFrac_1,T,benchmark,type,combos
0,B(Br)(Br)Br,CCO,-0.140112,1.0,298.00,train,original,B(Br)(Br)Br-CCO
1,BrBr,CCO,-0.011187,1.0,298.00,train,original,BrBr-CCO
2,BrC(Br)(Br)Br,c1ccccc1,-0.219755,0.0,298.15,train,original,BrC(Br)(Br)Br-c1ccccc1
3,BrC(Br)(Br)Br,c1ccccc1,-0.080765,0.1,298.15,train,original,BrC(Br)(Br)Br-c1ccccc1
4,BrC(Br)(Br)Br,c1ccccc1,0.050070,0.2,298.15,train,original,BrC(Br)(Br)Br-c1ccccc1
...,...,...,...,...,...,...,...,...
81805,CCCO,c1ccncc1,-0.055304,0.4,308.15,test,expanded,CCCO-c1ccncc1
81806,CCCO,c1ccncc1,-0.073118,0.3,308.15,test,expanded,CCCO-c1ccncc1
81807,CCCO,c1ccncc1,-0.082963,0.2,308.15,test,expanded,CCCO-c1ccncc1
81808,CCCO,c1ccncc1,-0.091413,0.1,308.15,test,expanded,CCCO-c1ccncc1


In [3]:
unique_smiles_1 = df['MOL_1'].unique()
unique_smiles_2 = df['MOL_2'].unique()
all_unique_smiles = pd.Series(list(set(unique_smiles_1) | set(unique_smiles_2)))

print(f"MOL_1 unique SMILES: {len(unique_smiles_1)}")
print(f"MOL_2 unique SMILES: {len(unique_smiles_2)}")
print(f"Total unique SMILES: {len(all_unique_smiles)}")

df_feats = pd.DataFrame({'SMILES': all_unique_smiles})
df_feats

MOL_1 unique SMILES: 1753
MOL_2 unique SMILES: 1753
Total unique SMILES: 1753


Unnamed: 0,SMILES
0,CC1C=CCC1
1,CCOCC(C)OCC(C)O
2,C
3,CCCCC(CC)C(=O)O
4,CC/C=C\CCO
...,...
1748,C1CC(C)C2CCCCC2C1
1749,C(Cl)C(Cl)CC
1750,COB(OC)OC
1751,CC(C)C(=O)C(C)C


In [4]:
# 计算所有RDKit特征的函数
def calculate_rdkit_features(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    
    features = {}
    
    for desc_name, desc_func in Descriptors.descList:
        try:
            features[desc_name] = desc_func(mol)
        except:
            features[desc_name] = np.nan
    return features

In [5]:
print(f"Calculating rdkit features for {len(df_feats)} SMILES...")

all_feature_names = [desc_name for desc_name, _ in Descriptors.descList]
print(f"In total {len(all_feature_names)} Rdkit descriptors")

features_list = []
failed_smiles = []

for idx, smiles in tqdm(enumerate(df_feats['SMILES']), total=len(df_feats)):
    features = calculate_rdkit_features(smiles)
    if features is not None:
        features_list.append(features)
    else:
        failed_smiles.append(smiles)
        features_list.append({key: np.nan for key in all_feature_names})


features_df = pd.DataFrame(features_list)
df_feats = pd.concat([df_feats, features_df], axis=1)

print(f"\nCompleted")
print(f"Success: {len(df_feats) - len(failed_smiles)}")
print(f"Failure: {len(failed_smiles)}")
if failed_smiles:
    print(f"Failed SMILES: {failed_smiles[:5]}...")  

df_feats

Calculating rdkit features for 1753 SMILES...
In total 208 Rdkit descriptors


100%|██████████| 1753/1753 [00:12<00:00, 145.02it/s]


Completed
Success: 1753
Failure: 0





Unnamed: 0,SMILES,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,CC1C=CCC1,2.277778,0.865741,2.277778,0.865741,0.391810,82.146,72.066,82.078250,34,...,0,0,0,0,0,0,0,0,0,0
1,CCOCC(C)OCC(C)O,8.860855,-0.389744,8.860855,0.073657,0.627797,162.229,144.085,162.125594,68,...,0,0,0,0,0,0,0,0,0,0
2,C,0.000000,0.000000,0.000000,0.000000,0.359785,16.043,12.011,16.031300,8,...,0,0,0,0,0,0,0,0,0,0
3,CCCCC(CC)C(=O)O,10.436393,-0.642639,10.436393,0.111157,0.642372,144.214,128.086,144.115030,60,...,0,0,0,0,0,0,0,0,0,0
4,CC/C=C\CCO,8.241633,0.274306,8.241633,0.274306,0.530402,100.161,88.065,100.088815,42,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1748,C1CC(C)C2CCCCC2C1,2.474352,1.050046,2.474352,1.050046,0.497143,152.281,132.121,152.156501,64,...,0,0,0,0,0,0,0,0,0,0
1749,C(Cl)C(Cl)CC,5.517747,0.172840,5.517747,0.172840,0.498697,127.014,118.950,126.000306,38,...,0,0,0,0,0,0,0,0,0,0
1750,COB(OC)OC,4.604167,-0.513889,4.604167,0.513889,0.467988,103.914,94.842,104.064475,42,...,0,0,0,0,0,0,0,0,0,0
1751,CC(C)C(=O)C(C)C,10.842593,0.203704,10.842593,0.203704,0.535256,114.188,100.076,114.104465,48,...,0,0,0,0,0,0,0,0,0,0


In [6]:
df_merged = df.merge(
    df_feats.add_prefix('MOL1_'), 
    left_on='MOL_1', 
    right_on='MOL1_SMILES',
    how='left'
)

df_merged = df_merged.merge(
    df_feats.add_prefix('MOL2_'),
    left_on='MOL_2',
    right_on='MOL2_SMILES',
    how='left'
)

df_merged = df_merged.drop(columns=['MOL1_SMILES', 'MOL2_SMILES'])

print(f"original data shape: {df.shape}")
print(f"data_feature shape: {df_merged.shape}")
print(f"number of features added: {df_merged.shape[1] - df.shape[1]}")

original data shape: (81810, 8)
data_feature shape: (81810, 424)
number of features added: 416


In [7]:
df_merged

Unnamed: 0,MOL_1,MOL_2,logV,MolFrac_1,T,benchmark,type,combos,MOL1_MaxEStateIndex,MOL1_MinEStateIndex,...,MOL2_fr_sulfide,MOL2_fr_sulfonamd,MOL2_fr_sulfone,MOL2_fr_term_acetylene,MOL2_fr_tetrazole,MOL2_fr_thiazole,MOL2_fr_thiocyan,MOL2_fr_thiophene,MOL2_fr_unbrch_alkane,MOL2_fr_urea
0,B(Br)(Br)Br,CCO,-0.140112,1.0,298.00,train,original,B(Br)(Br)Br-CCO,3.104167,0.270833,...,0,0,0,0,0,0,0,0,0,0
1,BrBr,CCO,-0.011187,1.0,298.00,train,original,BrBr-CCO,2.750000,2.750000,...,0,0,0,0,0,0,0,0,0,0
2,BrC(Br)(Br)Br,c1ccccc1,-0.219755,0.0,298.15,train,original,BrC(Br)(Br)Br-c1ccccc1,3.125000,-0.250000,...,0,0,0,0,0,0,0,0,0,0
3,BrC(Br)(Br)Br,c1ccccc1,-0.080765,0.1,298.15,train,original,BrC(Br)(Br)Br-c1ccccc1,3.125000,-0.250000,...,0,0,0,0,0,0,0,0,0,0
4,BrC(Br)(Br)Br,c1ccccc1,0.050070,0.2,298.15,train,original,BrC(Br)(Br)Br-c1ccccc1,3.125000,-0.250000,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81805,CCCO,c1ccncc1,-0.055304,0.4,308.15,test,expanded,CCCO-c1ccncc1,7.875000,0.319444,...,0,0,0,0,0,0,0,0,0,0
81806,CCCO,c1ccncc1,-0.073118,0.3,308.15,test,expanded,CCCO-c1ccncc1,7.875000,0.319444,...,0,0,0,0,0,0,0,0,0,0
81807,CCCO,c1ccncc1,-0.082963,0.2,308.15,test,expanded,CCCO-c1ccncc1,7.875000,0.319444,...,0,0,0,0,0,0,0,0,0,0
81808,CCCO,c1ccncc1,-0.091413,0.1,308.15,test,expanded,CCCO-c1ccncc1,7.875000,0.319444,...,0,0,0,0,0,0,0,0,0,0


In [12]:
df_merged.to_parquet(data_prefix + "all_data_with_features.parquet", compression="zstd")
