In [None]:
! pip install tensorflow_addons
! pip install nfp

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(context='talk', style='ticks',
        color_codes=True, rc={'legend.frameon': False})

import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras import layers

# Enable memory growth on every GPU TensorFlow can see (no-op on CPU-only machines).
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
        
%matplotlib inline

2025-03-04 17:19:42.152935: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-03-04 17:19:42.293455: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-04 17:19:42.865621: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/g16/bsd:/usr/local/g16:/usr/local/cuda-11.0/lib64:/usr/local/cuda-11.0/extras/CUPTI/lib64::/usr/local/gv/lib
2025-03-04 17:19:42.865720: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] 

In [2]:
import nfp
from preprocess_inputs_cfc import preprocessor
# Configure the imported preprocessor from the JSON file stored next to the
# training data (presumably the state saved at training time — TODO confirm).
preprocessor.from_json('model_3_tfrecords_multi_halo_cfc/preprocessor.json')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
class Slice(layers.Layer):
    """Keras layer that keeps the first half of axis 1 of a rank-3 input.

    It is passed to ``load_model`` through ``custom_objects`` so that a saved
    model referring to a layer named ``Slice`` can be rebuilt.
    """

    def call(self, inputs):
        """Return ``inputs[:, :n // 2, :]`` where ``n`` is the runtime size of axis 1."""
        input_shape = tf.shape(inputs)
        # Floor division keeps the slice size an integer tensor. True division
        # (`/`) produces a float, which only works as a slice size by relying
        # on an implicit cast back to an integer dtype.
        num_bonds = input_shape[1] // 2
        output = tf.slice(inputs, [0, 0, 0], [-1, num_bonds, -1])
        # The dynamic slice size drops static shape information; restore what is known.
        output.set_shape(self.compute_output_shape(inputs.shape))
        return output

    def compute_output_shape(self, input_shape):
        """Static output shape: axes 0 and 2 are kept, axis 1 is left unknown."""
        return [input_shape[0], None, input_shape[2]]
    
# Everything nfp registers, plus the locally defined Slice layer.
custom_objects = dict(nfp.custom_objects, Slice=Slice)

model = tf.keras.models.load_model(
    'model_3_multi_halo_cfc/best_model.hdf5',
    custom_objects=custom_objects,
)

2025-03-04 17:19:45.172002: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
# Distinct SMILES strings of the SHAP table; each one is fed to the model once.
data = pd.read_csv("./CN_shap_250213.csv")
molecule_smiles = data["Canonical_SMILES"].unique()

test = np.array(molecule_smiles)

In [5]:
def get_data(smiles):
    """Featurize one SMILES string and attach its atom and bond counts.

    Calls the module-level ``preprocessor`` on ``smiles`` and adds two
    entries to the result: ``'n_atom'`` and ``'n_bond'``, the lengths of the
    ``'atom'`` and ``'bond'`` entries respectively.
    """
    features = preprocessor(smiles)
    for count_key, source_key in (('n_atom', 'atom'), ('n_bond', 'bond')):
        features[count_key] = len(features[source_key])
    return features

# Element signature and padding values: the preprocessor's own entries plus
# the two scalar counts added by get_data.
dataset_signature = {
    **preprocessor.output_signature,
    'n_atom': tf.TensorSpec(shape=(), dtype=tf.int32, name=None),
    'n_bond': tf.TensorSpec(shape=(), dtype=tf.int32, name=None),
}
dataset_padding = {
    **preprocessor.padding_values,
    'n_atom': tf.constant(0, dtype="int32"),
    'n_bond': tf.constant(0, dtype="int32"),
}


def _featurized_molecules():
    """Yield the model inputs for every SMILES string in ``test``."""
    for smiles in test:
        yield get_data(smiles)


test_dataset = tf.data.Dataset.from_generator(
    _featurized_molecules,
    output_signature=dataset_signature,
).padded_batch(batch_size=1000, padding_values=dataset_padding)

In [6]:
# Run the model over the padded batches. The next cell treats the result as a
# 3-D array whose last axis has length 2 — presumably (molecule, bond slot, target).
predicted_bdes = model.predict(test_dataset, verbose=True)

  inputs = self._flatten_to_reference_inputs(inputs)




In [7]:
# Flatten the predictions to one row per (molecule, bond slot).
# Assumes predicted_bdes is (n_molecules, n_bond_slots, 2) with the last axis
# ordered (BDE, BDFE) — TODO confirm against the model definition.
df = pd.DataFrame(predicted_bdes.reshape(-1, 2), columns=['pred_bde','pred_bdfe'])
# Label every row with the SMILES string it belongs to.
df.index = test[np.repeat(np.arange(predicted_bdes.shape[0]), predicted_bdes.shape[1])]

def func(x):
    """Number the rows of one molecule's group 0..n_bond_slots-1 as ``bond_index``."""
    x['bond_index'] = range(0, predicted_bdes.shape[1])
    return x

pred_bdes = df.reset_index().rename(columns={'index': 'molecule'})
pred_bdes = pred_bdes.groupby('molecule',group_keys=False).apply(func)
# Drop padded bond slots, which are assumed to be predicted as exactly 0 for
# both targets. The filter previously tested 'pred_bde' twice; the second
# test belongs to 'pred_bdfe'.
pred_bdes = pred_bdes[(pred_bdes['pred_bde'] != 0.000000) & (pred_bdes['pred_bdfe'] != 0.000000)]

In [8]:
# Persist the per-bond predictions; the analysis cells below reload this file.
pred_bdes.to_csv('full_bde.csv',index=False)

In [9]:
import pandas as pd
import numpy as np
from rdkit import Chem

# df: per-bond predictions written by the prediction cell; dd: per-molecule SHAP table.
df = pd.read_csv('full_bde.csv')
dd = pd.read_csv('CN_shap_250213.csv')

def get_atom_indices(smiles, bond_index):
    """Return the two atom indices joined by bond ``bond_index`` of a molecule.

    The molecule is parsed from ``smiles`` and explicit hydrogens are added
    before the bond is looked up, so the indices refer to the
    hydrogen-expanded RDKit molecule.

    Returns:
        ``(begin_atom_idx, end_atom_idx)``, or ``(None, None)`` when the
        SMILES cannot be parsed or ``bond_index`` lies outside
        ``[0, number of bonds)``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure with None; AddHs would raise on it.
        return None, None
    mol = Chem.AddHs(mol)
    if not 0 <= bond_index < mol.GetNumBonds():
        return None, None
    bond = mol.GetBondWithIdx(bond_index)
    return bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()

def convert_to_list(value):
    """Parse a stringified numeric array such as ``"[0.1 -0.2 0.3]"`` into floats.

    Elements may be separated by whitespace and/or commas. A list or tuple is
    converted element-wise, so applying the function twice is harmless.
    Anything that cannot be parsed — a non-string cell such as NaN, or a
    non-numeric token — yields an empty list.
    """
    if isinstance(value, str):
        # Strip outer whitespace first so the brackets are really at the ends.
        tokens = value.strip().strip('[]').replace(',', ' ').split()
    elif isinstance(value, (list, tuple)):
        tokens = value
    else:
        # e.g. NaN from an empty CSV cell, which has no .strip().
        return []
    try:
        return [float(token) for token in tokens]
    except (TypeError, ValueError):
        return []

def get_max_min_indices(atomwise_list):
    """Return the positions that hold the largest or the smallest value.

    Tied positions are all included. An empty input gives an empty set.

    Args:
        atomwise_list: 1-D sequence (list or NumPy array) of numbers.

    Returns:
        Set of plain ``int`` positions.
    """
    # Convert explicitly instead of relying on `list == numpy scalar` broadcasting.
    values = np.asarray(atomwise_list, dtype=float)
    if values.size == 0:
        return set()
    is_extreme = (values == values.max()) | (values == values.min())
    return {int(position) for position in np.flatnonzero(is_extreme)}

# Parse the stringified SHAP arrays, then record per molecule which positions
# hold the extreme (largest / smallest) values.
dd['atomwise_shap'] = dd['atomwise_shap'].apply(convert_to_list)
dd['max_min_indices'] = dd['atomwise_shap'].apply(get_max_min_indices)

# bde_results[k] collects the bonds whose predicted BDE is among the k
# smallest distinct values of their molecule (cumulative, k = 1..5).
bde_results = {rank: [] for rank in range(1, 6)}

for smiles, group in df.groupby('molecule'):
    sorted_group = group.sort_values('pred_bde')
    bde_thresholds = sorted_group['pred_bde'].unique()[:5]
    selected_group = sorted_group[sorted_group['pred_bde'].isin(bde_thresholds)]

    # NOTE(review): a molecule with fewer than 5 distinct BDE values adds
    # nothing to the higher ranks, so the rank files cover different sets of
    # molecules — confirm this is intended.
    for _, row in selected_group.iterrows():
        bond_index = int(row['bond_index'])
        # Resolve the bond once per row; every rank the row qualifies for
        # reuses the result instead of re-parsing the SMILES.
        atom_indices = get_atom_indices(smiles, bond_index)
        if atom_indices[0] is None:
            continue
        record = {
            'molecule': smiles,
            'atom_index_1': atom_indices[0],
            'atom_index_2': atom_indices[1],
            'bond_index': bond_index,
            'pred_bde': row['pred_bde']
        }
        for rank, threshold in enumerate(bde_thresholds, start=1):
            if row['pred_bde'] <= threshold:
                bde_results[rank].append(dict(record))

for rank in range(1, 6):
    result_df = pd.DataFrame(bde_results[rank])
    result_df.to_csv(f'filtered_bde_{rank}_smallest.csv', index=False)

input_files = [f'filtered_bde_{rank}_smallest.csv' for rank in range(1, 6)]


def _bond_atom_indices(group):
    """Every atom index that is an endpoint of a bond in ``group``."""
    return set(group['atom_index_1']).union(set(group['atom_index_2']))


def check_match(smiles, max_min_indices):
    """True when one of ``max_min_indices`` is an endpoint atom of a selected bond of ``smiles``."""
    atom_index_set = smiles_atom_index_map.get(smiles)
    if atom_index_set is None:
        return False
    return not max_min_indices.isdisjoint(atom_index_set)


for i, input_file in enumerate(input_files, start=1):
    result_df = pd.read_csv(input_file)

    # Attach each molecule's SHAP row and keep one row per distinct bond.
    merged_df = pd.merge(
        result_df, dd, left_on='molecule', right_on='Canonical_SMILES', how='left'
    ).drop_duplicates(subset=['molecule', 'bond_index', 'atom_index_1', 'atom_index_2'])

    # molecule -> endpoint atoms of all of its selected bonds (read by check_match)
    smiles_atom_index_map = merged_df.groupby('molecule').apply(_bond_atom_indices).to_dict()

    merged_df['match'] = merged_df.apply(
        lambda row: check_match(row['molecule'], row['max_min_indices']),
        axis=1,
    )

    # One verdict per molecule.
    final_result = merged_df[['molecule', 'match']].drop_duplicates()
    true_count = final_result['match'].sum()
    print(f"Total number of True values in '{input_file}': {true_count}")
    output_file = f"rank_match_{i}.csv"
    final_result.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}\n")

Total number of True values in 'filtered_bde_1_smallest.csv': 291
Results saved to rank_match_1.csv

Total number of True values in 'filtered_bde_2_smallest.csv': 378
Results saved to rank_match_2.csv

Total number of True values in 'filtered_bde_3_smallest.csv': 444
Results saved to rank_match_3.csv

Total number of True values in 'filtered_bde_4_smallest.csv': 490
Results saved to rank_match_4.csv

Total number of True values in 'filtered_bde_5_smallest.csv': 516
Results saved to rank_match_5.csv



In [10]:
import pandas as pd
import numpy as np
from rdkit import Chem

# df: per-bond predictions written by the prediction cell; dd: per-molecule SHAP table.
# Both are reloaded here, so this cell starts from the files rather than from earlier frames.
df = pd.read_csv('full_bde.csv')
dd = pd.read_csv('CN_shap_250213.csv')

def get_atom_indices(smiles, bond_index):
    """Return the two atom indices joined by bond ``bond_index`` of a molecule.

    The molecule is parsed from ``smiles`` and explicit hydrogens are added
    before the bond is looked up, so the indices refer to the
    hydrogen-expanded RDKit molecule.

    Returns:
        ``(begin_atom_idx, end_atom_idx)``, or ``(None, None)`` when the
        SMILES cannot be parsed or ``bond_index`` lies outside
        ``[0, number of bonds)``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure with None; AddHs would raise on it.
        return None, None
    mol = Chem.AddHs(mol)
    if not 0 <= bond_index < mol.GetNumBonds():
        return None, None
    bond = mol.GetBondWithIdx(bond_index)
    return bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()

def convert_to_list(value):
    """Parse a stringified numeric array such as ``"[0.1 -0.2 0.3]"`` into floats.

    Elements may be separated by whitespace and/or commas. A list or tuple is
    converted element-wise, so applying the function twice is harmless.
    Anything that cannot be parsed — a non-string cell such as NaN, or a
    non-numeric token — yields an empty list.
    """
    if isinstance(value, str):
        # Strip outer whitespace first so the brackets are really at the ends.
        tokens = value.strip().strip('[]').replace(',', ' ').split()
    elif isinstance(value, (list, tuple)):
        tokens = value
    else:
        # e.g. NaN from an empty CSV cell, which has no .strip().
        return []
    try:
        return [float(token) for token in tokens]
    except (TypeError, ValueError):
        return []

def get_max_min_indices(atomwise_list):
    """Return the positions that hold the largest or the smallest value.

    Tied positions are all included. An empty input gives an empty set.

    Args:
        atomwise_list: 1-D sequence (list or NumPy array) of numbers.

    Returns:
        Set of plain ``int`` positions.
    """
    # Convert explicitly instead of relying on `list == numpy scalar` broadcasting.
    values = np.asarray(atomwise_list, dtype=float)
    if values.size == 0:
        return set()
    is_extreme = (values == values.max()) | (values == values.min())
    return {int(position) for position in np.flatnonzero(is_extreme)}

# Parse the stringified SHAP arrays, then record per molecule which positions
# hold the extreme (largest / smallest) values.
dd['atomwise_shap'] = dd['atomwise_shap'].apply(convert_to_list)
dd['max_min_indices'] = dd['atomwise_shap'].apply(get_max_min_indices)

# bde_results["plus_N"] collects the bonds whose predicted BDE is at most N
# above the weakest predicted BDE of their molecule (same units as 'pred_bde').
bde_results = {"plus_0": [], "plus_2": [], "plus_4": [], "plus_6": [], "plus_8": [], "plus_10": []}

for smiles, group in df.groupby('molecule'):
    sorted_group = group.sort_values('pred_bde')
    weakest_bde = sorted_group['pred_bde'].min()

    bde_ranges = {
        "plus_0": weakest_bde,
        "plus_2": weakest_bde + 2,
        "plus_4": weakest_bde + 4,
        "plus_6": weakest_bde + 6,
        "plus_8": weakest_bde + 8,
        "plus_10": weakest_bde + 10,
    }

    # Only rows inside the widest window can end up in any result list.
    selected_group = sorted_group[sorted_group['pred_bde'] <= max(bde_ranges.values())]
    for _, row in selected_group.iterrows():
        bond_index = int(row['bond_index'])
        # Resolve the bond once per row; every window the row falls into
        # reuses the result instead of re-parsing the SMILES.
        atom_indices = get_atom_indices(smiles, bond_index)
        if atom_indices[0] is None:
            continue
        record = {
            'molecule': smiles,
            'atom_index_1': atom_indices[0],
            'atom_index_2': atom_indices[1],
            'bond_index': bond_index,
            'pred_bde': row['pred_bde']
        }
        for key, bde_limit in bde_ranges.items():
            if row['pred_bde'] <= bde_limit:
                bde_results[key].append(dict(record))

for key in bde_results:
    result_df = pd.DataFrame(bde_results[key])
    result_df.to_csv(f'filtered_bde_{key}.csv', index=False)

input_files = [f'filtered_bde_{key}.csv' for key in bde_results]


def _bond_atom_indices(group):
    """Every atom index that is an endpoint of a bond in ``group``."""
    return set(group['atom_index_1']).union(set(group['atom_index_2']))


def check_match(smiles, max_min_indices):
    """True when one of ``max_min_indices`` is an endpoint atom of a selected bond of ``smiles``."""
    atom_index_set = smiles_atom_index_map.get(smiles)
    if atom_index_set is None:
        return False
    return not max_min_indices.isdisjoint(atom_index_set)


for i, input_file in enumerate(input_files, start=1):
    result_df = pd.read_csv(input_file)

    # Attach each molecule's SHAP row and keep one row per distinct bond.
    merged_df = pd.merge(
        result_df, dd, left_on='molecule', right_on='Canonical_SMILES', how='left'
    ).drop_duplicates(subset=['molecule', 'bond_index', 'atom_index_1', 'atom_index_2'])

    # molecule -> endpoint atoms of all of its selected bonds (read by check_match)
    smiles_atom_index_map = merged_df.groupby('molecule').apply(_bond_atom_indices).to_dict()

    merged_df['match'] = merged_df.apply(
        lambda row: check_match(row['molecule'], row['max_min_indices']),
        axis=1,
    )

    # One verdict per molecule.
    final_result = merged_df[['molecule', 'match']].drop_duplicates()
    true_count = final_result['match'].sum()
    print(f"Total number of True values in '{input_file}': {true_count}")
    output_file = f"energy_match_{i}.csv"
    final_result.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}\n")

Total number of True values in 'filtered_bde_plus_0.csv': 291
Results saved to energy_match_1.csv

Total number of True values in 'filtered_bde_plus_2.csv': 369
Results saved to energy_match_2.csv

Total number of True values in 'filtered_bde_plus_4.csv': 444
Results saved to energy_match_3.csv

Total number of True values in 'filtered_bde_plus_6.csv': 456
Results saved to energy_match_4.csv

Total number of True values in 'filtered_bde_plus_8.csv': 470
Results saved to energy_match_5.csv

Total number of True values in 'filtered_bde_plus_10.csv': 479
Results saved to energy_match_6.csv



In [None]:
import pandas as pd
from rdkit import Chem

# Per-molecule match table for rank 1, written by the rank-based matching cell.
file_path = "rank_match_1.csv"
df = pd.read_csv(file_path)

def has_ring(smiles):
    """Return True if the molecule parsed from ``smiles`` has at least one ring atom.

    A SMILES string that RDKit cannot parse gives False.
    """
    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        return False
    ring_atom_pattern = Chem.MolFromSmarts("[R]")
    return mol.HasSubstructMatch(ring_atom_pattern)

df["has_ring"] = df["molecule"].map(has_ring)

# Number of molecules for every (match, has_ring) combination.
group_counts = (
    df.groupby(["match", "has_ring"])
    .size()
    .reset_index(name="count")
)

group_counts

In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors

# Rank-1 match table, with the 'match' column forced to a boolean dtype.
file_path = "rank_match_1.csv"
df = pd.read_csv(file_path).astype({"match": bool})

def compute_molecular_features(smiles):
    """Compute six RDKit descriptors for a SMILES string.

    Returns:
        A list ordered as MolWt, MolLogP, TPSA, RingCount, NumHDonors,
        NumHAcceptors, or a list of six ``None`` values when the SMILES
        cannot be parsed.
    """
    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        return [None] * 6

    descriptor_functions = (
        Descriptors.MolWt,
        Descriptors.MolLogP,
        Descriptors.TPSA,
        Descriptors.RingCount,
        Descriptors.NumHDonors,
        Descriptors.NumHAcceptors,
    )
    return [compute(mol) for compute in descriptor_functions]

# Column names, in the order compute_molecular_features returns its values.
feature_columns = ["MolWt", "LogP", "TPSA", "RingCount", "NumHDonors", "NumHAcceptors"]

# Work on a copy so the loaded match table keeps its original columns.
df_features = df.copy()
# Expand each molecule's descriptor list into one column per descriptor.
df_features[feature_columns] = df_features["molecule"].apply(lambda x: compute_molecular_features(x)).apply(pd.Series)

# Drop rows with any missing value (e.g. molecules whose SMILES could not be parsed).
df_features = df_features.dropna()

# Split by the match flag.
df_true = df_features[df_features["match"] == True]
df_false = df_features[df_features["match"] == False]

def select_quantile_representatives(df_subset, num_samples=5):
    """Pick up to ``num_samples`` rows lying closest to feature quantiles.

    For every column named in the module-level ``feature_columns`` and every
    quantile in (0, 0.25, 0.5, 0.75, 1.0), the row whose value is nearest to
    that quantile is collected. Duplicate rows are then removed and the first
    ``num_samples`` remaining rows are returned, in collection order.

    An empty ``df_subset`` is returned as an empty copy instead of raising
    from ``idxmin`` on an empty column.
    """
    if df_subset.empty:
        return df_subset.copy()

    quantiles = [0, 0.25, 0.5, 0.75, 1.0]
    selected_molecules = []

    # NOTE(review): only the first ``num_samples`` distinct rows survive, so the
    # result is dominated by the first feature column — confirm this is intended.
    for col in feature_columns:
        column = df_subset[col]
        for q_value in column.quantile(quantiles).values:
            closest_idx = (column - q_value).abs().idxmin()
            selected_molecules.append(df_subset.loc[closest_idx])

    return pd.DataFrame(selected_molecules).drop_duplicates().head(num_samples)

# Up to five representative molecules from each of the two groups.
true_representatives = select_quantile_representatives(df_true, 5)
false_representatives = select_quantile_representatives(df_false, 5)

# The printed headings are Korean for "Representative True molecules" and
# "Representative False molecules".
print("=== 대표적인 True 분자 ===")
print(true_representatives[["molecule", *feature_columns]])

print("\n=== 대표적인 False 분자 ===")
print(false_representatives[["molecule", *feature_columns]])

In [25]:
import pandas as pd

file_path = "CN_shap_250213.csv"
df = pd.read_csv(file_path)

# Molecules to look up, in the order they should appear in the result.
target_smiles = [
    "CC1=CC=C(C=C1)C(C)C",
    "COc1ccccc1OC",
    "CCOCCCCOCC",
    "COCCOCCO",
    "CCCCCCCCOC(=O)C(C)OC",
    "CCCC1=CC=C(C=C1)O",
    "CCCC=C(CCCC)CCCCCCC",
    "CCCCCCCCCCCCCC(=O)OCC",
    "CC(C)CCOC(=O)C(C)O",
    "CCCCCCCC(=O)OCCCC"
]

is_target = df["Canonical_SMILES"].isin(target_smiles)
# Select the three columns of interest, then reorder the rows to follow target_smiles.
filtered_df = (
    df.loc[is_target, ["Canonical_SMILES", "CN", "predicted"]]
    .set_index("Canonical_SMILES")
    .reindex(target_smiles)
    .reset_index()
)
filtered_df

Unnamed: 0,Canonical_SMILES,CN,predicted
0,CC1=CC=C(C=C1)C(C)C,4.0,4.175619
1,COc1ccccc1OC,9.8,9.674105
2,CCOCCCCOCC,97.0,99.70122
3,COCCOCCO,38.3,44.25049
4,CCCCCCCCOC(=O)C(C)OC,57.5,57.060303
5,CCCC1=CC=C(C=C1)O,8.6,11.20769
6,CCCC=C(CCCC)CCCCCCC,45.0,46.557545
7,CCCCCCCCCCCCCC(=O)OCC,66.9,73.024704
8,CC(C)CCOC(=O)C(C)O,39.72,37.7431
9,CCCCCCCC(=O)OCCCC,39.6,40.39207


In [39]:
import pandas as pd

file_path_shap = "CN_shap_250213.csv"
rank_files = [f"rank_match_{i}.csv" for i in range(1, 6)]

df_shap = pd.read_csv(file_path_shap)[["Canonical_SMILES", "CN", "predicted"]]


def _mean_abs_error(frame):
    """Mean absolute difference between the 'CN' and 'predicted' columns of ``frame``."""
    return (frame["CN"] - frame["predicted"]).abs().mean()


results = []

for file_path_rank in rank_files:
    df_rank = pd.read_csv(file_path_rank)[["molecule", "match"]]

    # Attach the measured and predicted CN of every molecule in the rank file.
    df_merged = pd.merge(
        df_rank, df_shap, left_on="molecule", right_on="Canonical_SMILES", how="left"
    )

    df_true = df_merged.loc[df_merged["match"] == True]
    df_false = df_merged.loc[df_merged["match"] == False]

    mae_true = _mean_abs_error(df_true)
    mae_false = _mean_abs_error(df_false)

    results.append({"File": file_path_rank, "MAE_True": mae_true, "MAE_False": mae_false})

df_results = pd.DataFrame(results)
df_results

Unnamed: 0,File,MAE_True,MAE_False
0,rank_match_1.csv,2.248966,2.609559
1,rank_match_2.csv,2.256251,2.72312
2,rank_match_3.csv,2.270585,2.866681
3,rank_match_4.csv,2.367086,2.737389
4,rank_match_5.csv,2.344781,2.915481
