In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem
from joblib import load
from tqdm import tqdm
from sklearn.metrics import pairwise_distances

# Locations of the two CSV inputs read by this notebook.
train_data_path = r'E:\OneDrive - UCLA IT Services\UCLA\PhD Project\CoMPAIT\Filtered_Train.csv'
pred_data_path = r'E:\OneDrive - UCLA IT Services\UCLA\PhD Project\CoMPAIT\Prediction_Results.csv'

# Read both tables, then pull out their SMILES columns.
# Note that the two files spell the SMILES column name with different capitalisation.
train_data, pred_data = (pd.read_csv(csv_path) for csv_path in (train_data_path, pred_data_path))
SMILES_train, SMILES_prediction = train_data['QSAR_READY_SMILES'], pred_data['QSAR_ready_SMILES']

# Generate descriptors for training data.
# Every descriptor registered in Descriptors._descList is evaluated for each molecule.
# A SMILES that RDKit cannot parse, or a descriptor function that raises, yields NaN.
descriptor_names = [desc_name for desc_name, _ in Descriptors._descList]
train_desc_list = []
for sm in tqdm(SMILES_train):
    mol = Chem.MolFromSmiles(sm)
    if mol is None:
        # Keep a placeholder row so the descriptor table stays aligned with SMILES_train.
        train_desc_list.append([np.nan] * len(descriptor_names))
        continue
    vals = []
    for _, func in Descriptors._descList:
        try:
            vals.append(func(mol))
        except Exception:
            # Catch Exception (not a bare except) so Ctrl-C / SystemExit still abort the loop.
            vals.append(np.nan)
    train_desc_list.append(vals)

train_desc_df = pd.DataFrame(train_desc_list, columns=descriptor_names)

# Replace inf with large number, etc. exactly as training code did.
# Infinite values become 1e10, which then causes their whole column to be dropped by the
# |value| > 1e5 column filter below; remaining NaN cells are filled with 0.
train_desc_df = train_desc_df.replace([np.inf, -np.inf], 1e10)
mask = (train_desc_df.abs() > 1e5).any()
# Non-inplace fillna avoids mutating the frame returned by .loc slicing.
train_desc_df = train_desc_df.loc[:, ~mask].fillna(0)

# Load the same scaler used in training.
# NOTE: joblib.load unpickles the file; only load scaler files from a trusted source.
scaler = load('scaler.pkl')
X_train_desc = scaler.transform(train_desc_df.values)

# Morgan fingerprints for training data
def _train_morgan_bits(smiles):
    """Return the 1024-bit, radius-2 Morgan fingerprint of `smiles` as a NumPy array.

    A SMILES string that RDKit cannot parse yields an all-zero vector of length 1024,
    so the fingerprint matrix keeps one row per training SMILES.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is None:
        return np.array([0]*1024)
    return np.array(AllChem.GetMorganFingerprintAsBitVect(parsed, radius=2, nBits=1024))


train_fp_list = [
    _train_morgan_bits(smiles)
    for smiles in tqdm(SMILES_train, desc="Morgan fingerprints (Train)")
]

# Stack the scaled descriptors and the fingerprint bits side by side (descriptors first).
X_train_fp = np.array(train_fp_list)
X_train_combined = np.hstack((X_train_desc, X_train_fp))

print("X_train_combined shape:", X_train_combined.shape)

# Filter SMILES that cannot be converted to fingerprints.
# A per-row validity mask is recorded alongside the SMILES lists so that the filtered
# DataFrame built at the end is row-for-row aligned with SMILES_prediction_filtered
# (and therefore with the feature matrices derived from it).
filtered_SMILES_fp = []
invalid_SMILES_fp = []
valid_row_mask = []

for sm in SMILES_prediction:
    mol = Chem.MolFromSmiles(sm)
    is_valid = mol is not None
    if is_valid:
        try:
            AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=1024)
        except Exception:
            is_valid = False
    valid_row_mask.append(is_valid)
    if is_valid:
        filtered_SMILES_fp.append(sm)
    else:
        invalid_SMILES_fp.append(sm)

print(f"Total SMILES invalid for fingerprints: {len(invalid_SMILES_fp)}")

# Verify remaining SMILES can be converted by RDKit.
# Every entry of filtered_SMILES_fp was already parsed successfully by Chem.MolFromSmiles
# in the loop above, so re-parsing cannot find new failures; the invalid list is empty
# by construction. Both names are kept because the rest of the notebook reads them.
filtered_SMILES_pred = list(filtered_SMILES_fp)
invalid_SMILES_rdkit = []

print(f"Total SMILES invalid for RDKit: {len(invalid_SMILES_rdkit)}")
SMILES_prediction_filtered = filtered_SMILES_pred

# Create a new filtered prediction dataset.
# .copy() makes this an independent frame, so adding columns to it later does not raise
# SettingWithCopyWarning or depend on view-vs-copy semantics.
filtered_prediction_data = pred_data[np.asarray(valid_row_mask, dtype=bool)].copy()

# Generate RDKit descriptor features for prediction data
mol_list_pred = [Chem.MolFromSmiles(sm) for sm in SMILES_prediction_filtered]
descriptor_names = [name for name, _ in Descriptors._descList]

descriptors_pred = []
for mol in tqdm(mol_list_pred, desc="Calculating RDKit descriptors for prediction data"):
    if mol is None:
        # Placeholder row keeps the table aligned with SMILES_prediction_filtered.
        descriptors_pred.append([np.nan] * len(descriptor_names))
        continue
    descriptor_values = []
    for desc_name, desc_fn in Descriptors._descList:
        try:
            descriptor_values.append(desc_fn(mol))
        except Exception:
            # A descriptor that fails on this molecule is recorded as missing.
            descriptor_values.append(np.nan)
    descriptors_pred.append(descriptor_values)

# Convert descriptors to DataFrame
descriptor_pred_df = pd.DataFrame(descriptors_pred, columns=descriptor_names)

# Replace inf and -inf with NaN explicitly
num_inf_replaced = descriptor_pred_df.isin([np.inf, -np.inf]).sum().sum()
descriptor_pred_df = descriptor_pred_df.replace([np.inf, -np.inf], np.nan)
print(f"Replaced {num_inf_replaced} cells with inf or -inf to NaN.")

# Replace extremely large values (>1e5) with NaN
is_large = descriptor_pred_df.abs() > 1e5
num_large_values_replaced = is_large.sum().sum()
descriptor_pred_df[is_large] = np.nan
print(f"Replaced {num_large_values_replaced} cells with values >1e5 to NaN.")

# Fill NaN values with 0 (the count includes the inf / large cells converted above)
num_nan_replaced = descriptor_pred_df.isna().sum().sum()
descriptor_pred_df = descriptor_pred_df.fillna(0)
print(f"Replaced {num_nan_replaced} NaN cells with 0.")

# Drop columns not in training feature list.
# NOTE: joblib.load unpickles the file; only load files from a trusted source.
train_descriptor_columns = load('train_descriptor_columns.pkl')  # Load saved column names from training
# Select the shared columns in the ORDER of the training column list rather than the
# order of this frame: scaler.transform is positional, so a different column order
# would scale each descriptor with another descriptor's statistics.
available_columns = set(descriptor_pred_df.columns)
train_column_order = [col for col in train_descriptor_columns if col in available_columns]
descriptor_pred_df = descriptor_pred_df.loc[:, train_column_order]
print(f"there are {descriptor_pred_df.shape} feature in pred dataset feature")

# Load the scaler saved during training
scaler = load('scaler.pkl')
X_des_pred = scaler.transform(descriptor_pred_df.values)

# Generate Morgan fingerprints for prediction data
def _pred_morgan_bits(smiles):
    """Return the 1024-bit, radius-2 Morgan fingerprint of `smiles` as a NumPy array.

    If RDKit cannot parse the SMILES, or fingerprint generation raises, a length-1024
    vector of NaN is returned instead; those NaNs are zero-filled further down.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is not None:
        try:
            return np.array(AllChem.GetMorganFingerprintAsBitVect(parsed, radius=2, nBits=1024))
        except Exception:
            pass
    return np.array([np.nan] * 1024)


Morgan_fpts_pred = [
    _pred_morgan_bits(smiles)
    for smiles in tqdm(SMILES_prediction_filtered, desc="Generating Morgan fingerprints")
]

# Convert fingerprints to a DataFrame with one 'Bit_<i>' column per bit,
# replacing any NaN placeholder values with 0.
bit_columns = [f'Bit_{i}' for i in range(1024)]
X_fp_pred = pd.DataFrame(Morgan_fpts_pred, columns=bit_columns).fillna(0)

# Combine descriptors and fingerprints (descriptors first, then fingerprint bits)
X_combined_pred = np.hstack((X_des_pred, X_fp_pred.values))

100%|██████████| 594/594 [00:02<00:00, 264.09it/s]
Morgan fingerprints (Train): 100%|██████████| 594/594 [00:00<00:00, 2146.74it/s]


X_train_combined shape: (594, 1233)




Total SMILES invalid for fingerprints: 0
Total SMILES invalid for RDKit: 0


Calculating RDKit descriptors for prediction data: 100%|██████████| 47714/47714 [05:06<00:00, 155.64it/s]


Replaced 4 cells with inf or -inf to NaN.
Replaced 13028 cells with values >1e5 to NaN.
Replaced 14396 NaN cells with 0.
there are (47714, 209) feature in pred dataset feature


Generating Morgan fingerprints: 100%|██████████| 47714/47714 [00:26<00:00, 1798.86it/s]


In [2]:
# Ensure prediction features match training features.
# The two matrices must have the same number of columns. Truncating the prediction
# matrix to the training width would silently drop trailing fingerprint bits and leave
# the remaining columns misaligned, so a mismatch is reported instead of being hidden.
X_combined_train_features = X_train_combined  # Training features
if X_combined_pred.shape[1] != X_combined_train_features.shape[1]:
    raise ValueError(
        "Feature count mismatch: prediction matrix has "
        f"{X_combined_pred.shape[1]} columns but training matrix has "
        f"{X_combined_train_features.shape[1]}."
    )
X_combined_pred_features = X_combined_pred

print(f"Final combined prediction feature shape: {X_combined_pred_features.shape}")

Final combined prediction feature shape: (47714, 1233)


In [3]:
# 3-NN Distance-Based AD
# Each sample is scored by the mean Euclidean distance to its 3 nearest training samples.
# The 95th percentile of that score over the training set is the applicability-domain
# threshold; prediction samples scoring above it are flagged as out of domain.
if X_train_combined.shape[0] <= 3:
    # With 3 or fewer training rows the inf placed on the diagonal below would leak into
    # the 3-NN mean, making the threshold infinite and every sample "in domain".
    raise ValueError("Need more than 3 training samples to compute 3-NN distances.")

dist_train = pairwise_distances(X_train_combined, X_train_combined, metric='euclidean')

# We don't want the diagonal (distance to itself = 0), so set it to inf
dist_train_no_diag = dist_train.copy()
np.fill_diagonal(dist_train_no_diag, np.inf)

# Sort each row so the smallest distances come first
dist_train_sorted = np.sort(dist_train_no_diag, axis=1)

# Take the first 3 distances (the 3 nearest neighbors) and average them
train_dist_3nn = dist_train_sorted[:, :3].mean(axis=1)

# Determine a threshold as the 95th percentile of these 3-NN distances
train_3nn_95th = np.percentile(train_dist_3nn, 95)
print(f"95th percentile of average 3-NN distances (train): {train_3nn_95th:.4f}")

# For prediction data: compute distance to every training sample
dist_pred = pairwise_distances(X_combined_pred_features, X_train_combined, metric='euclidean')

# Sort, take the first 3 for each prediction sample, average them
dist_pred_sorted = np.sort(dist_pred, axis=1)
pred_dist_3nn = dist_pred_sorted[:, :3].mean(axis=1)

# If that 3-NN average distance is greater than train_3nn_95th, consider out-of-domain
out_of_domain_mask = (pred_dist_3nn > train_3nn_95th)

# Rebind to a new frame via assign() instead of setting columns on what may be a slice
# of pred_data; this avoids SettingWithCopyWarning and keeps the cell safe to re-run.
filtered_prediction_data = filtered_prediction_data.assign(**{
    '3nn_dist_to_train': pred_dist_3nn,
    'AD_flag': np.where(out_of_domain_mask, 'Out_of_AD', 'In_AD'),
})

# Summaries
in_domain_count = np.sum(~out_of_domain_mask)
out_domain_count = np.sum(out_of_domain_mask)
print(f"Number of prediction samples in AD: {in_domain_count}")
print(f"Number of prediction samples out of AD: {out_domain_count}")

95th percentile of average 3-NN distances (train): 18.5853
Number of prediction samples in AD: 32156
Number of prediction samples out of AD: 15558


In [4]:
import matplotlib.pyplot as plt

# Pull the per-sample 3-NN distances and AD labels out of filtered_prediction_data
distances_pred = filtered_prediction_data['3nn_dist_to_train'].values
ad_flags = filtered_prediction_data['AD_flag'].values

# Boolean selectors for the two AD classes
in_AD_mask = (ad_flags == 'In_AD')
out_AD_mask = (ad_flags == 'Out_of_AD')

# One 10x6 inch figure with a single axes, driven through the explicit fig/ax interface
fig, ax = plt.subplots(figsize=(10, 6))

# In-domain samples: x is the row position within the prediction set, y the 3-NN distance
ax.scatter(np.flatnonzero(in_AD_mask), distances_pred[in_AD_mask],
           alpha=0.5, color='blue', label='In_AD')

# Out-of-domain samples, drawn as red crosses
ax.scatter(np.flatnonzero(out_AD_mask), distances_pred[out_AD_mask],
           alpha=0.8, color='red', marker='x', label='Out_of_AD')

# Dashed horizontal line at the training-set threshold computed earlier (train_3nn_95th)
ax.axhline(y=train_3nn_95th, color='r', linestyle='--', linewidth=2,
           label='3-NN 95th Threshold')

# Axis labels, title, legend and grid
ax.set_xlabel('Prediction Sample Index')
ax.set_ylabel('3-NN Distance to Training Set')
ax.set_title('3-NN Distance-Based AD for Prediction Set')
ax.legend()
ax.grid(True)
fig.tight_layout()

# Write the figure to disk (adjust path as needed), then release it
fig.savefig(r'E:\OneDrive - UCLA IT Services\UCLA\PhD Project\CoMPAIT\3NN_AD_Prediction_Set.png', dpi=300)
plt.close(fig)

In [5]:
# Add a new column 'AD' to store the applicability domain results
# (1 = inside the applicability domain, 0 = outside, derived from out_of_domain_mask).
# assign() returns a new frame, so this does not write through .loc onto what may be a
# slice of pred_data (which raises SettingWithCopyWarning) and is safe to re-run.
filtered_prediction_data = filtered_prediction_data.assign(AD=np.where(out_of_domain_mask, 0, 1))

# Save the updated DataFrame to a new CSV file
filtered_prediction_data.to_csv(r'E:\OneDrive - UCLA IT Services\UCLA\PhD Project\CoMPAIT\Filtered_Prediction_with_AD.csv', index=False)