In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem
import xgboost as xgb
from joblib import load
from tqdm import tqdm
import matplotlib.pyplot as plt

# ---------------------------------------------------------------------------
# Inputs: the training table and the previously saved XGBoost booster.
# Raw strings keep the Windows backslashes literal.
# ---------------------------------------------------------------------------
train_data_path = r'E:\OneDrive - UCLA IT Services\UCLA\PhD Project\CoMPAIT\Filtered_Train.csv'
model_path = r'E:\OneDrive - UCLA IT Services\UCLA\PhD Project\CoMPAIT\xgb_best_model.json'

# Training table: structures come from the 'QSAR_READY_SMILES' column and the
# observed target from the '4_hr_value_ppm' column (as a NumPy array).
train_data = pd.read_csv(train_data_path)
SMILES_train = train_data['QSAR_READY_SMILES']
y_train = train_data['4_hr_value_ppm'].values

# Load the best model straight from its JSON file.
best_model = xgb.Booster(model_file=model_path)

## Calculate features
# Descriptors._descList is RDKit's list of (name, function) pairs. The names
# become the DataFrame column labels, in the same order the values are
# computed in below.
descriptor_names = [desc_name for desc_name, _ in Descriptors._descList]

# Calculate descriptors: one row per training SMILES, in input order, so the
# rows stay aligned with y_train.
train_descriptors = []
for sm in tqdm(SMILES_train, desc="Calculating RDKit descriptors (Train)"):
    mol = Chem.MolFromSmiles(sm)
    if mol is None:
        # Invalid molecule: keep the row but fill it with NaN
        train_descriptors.append([np.nan] * len(descriptor_names))
        continue

    desc_values = []
    for _, desc_fn in Descriptors._descList:
        try:
            desc_values.append(desc_fn(mol))
        except Exception:
            # Best effort: a descriptor that fails on this molecule becomes
            # NaN. `Exception` (rather than a bare `except`) lets
            # KeyboardInterrupt / SystemExit propagate instead of being
            # silently recorded as a missing value.
            desc_values.append(np.nan)
    train_descriptors.append(desc_values)

descriptor_df = pd.DataFrame(train_descriptors, columns=descriptor_names)

# Map +/-inf to a large finite sentinel so that the magnitude filter below
# catches those columns as well.
descriptor_df = descriptor_df.replace([np.inf, -np.inf], 1e10)

# Drop every column containing at least one value with |value| > 1e5.
mask = (descriptor_df.abs() > 1e5).any()
columns_to_drop = descriptor_df.columns[mask]

# Remaining NaNs (invalid molecules, failed descriptors) are set to 0.
# Assigning the result instead of calling fillna(inplace=True) on a `.loc`
# selection avoids pandas chained-assignment warnings.
descriptor_df = descriptor_df.loc[:, ~mask].fillna(0)

print("Descriptor shape after column-dropping:", descriptor_df.shape)

# Load the SAME scaler used in training
# NOTE(review): the column filter above depends on the data in this table,
# whereas the scaler expects the feature set it was fitted on. Presumably both
# were derived from this same training table -- confirm the columns match.
# NOTE: joblib.load unpickles the file; only load scaler files you trust.
scaler = load('scaler.pkl')
X_des_train = scaler.transform(descriptor_df.values)

# Calculate Morgan fingerprints (radius 2, 1024 bits), one row per training
# SMILES in input order.
fp_list = []
for sm in tqdm(SMILES_train, desc="Generating Morgan fingerprints (Train)"):
    mol = Chem.MolFromSmiles(sm)
    if mol is None:
        # Invalid SMILES -> all-zero placeholder row of the same width
        fp_list.append(np.zeros(1024, dtype=int))
        continue
    fpt = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=1024)
    fp_list.append(np.array(fpt))

# One column per bit position, named Bit_0 ... Bit_1023.
bit_columns = [f"Bit_{i}" for i in range(1024)]
X_fp_train = pd.DataFrame(fp_list, columns=bit_columns)

# Feature matrix: scaled descriptors followed by the fingerprint bits,
# joined column-wise.
X_train_combined = np.concatenate([X_des_train, X_fp_train.values], axis=1)

# Predict on the training set with the loaded booster.
dtrain = xgb.DMatrix(X_train_combined)
y_pred_train = best_model.predict(dtrain)

# Residuals = Observed - Predicted
residuals = y_train - y_pred_train

# 5th and 95th percentiles of the residuals, computed in a single call.
lower_bound, upper_bound = np.percentile(residuals, [5, 95])

print(f"Residual 5th percentile: {lower_bound:.4f}")
print(f"Residual 95th percentile: {upper_bound:.4f}")

# A sample is an 'outlier' when its residual lies strictly outside the
# [5th, 95th] percentile band; everything else is an inlier.
outlier_mask = np.logical_or(residuals < lower_bound, residuals > upper_bound)
count_outlier = outlier_mask.sum()
count_inlier = (~outlier_mask).sum()

print(f"Number of inlier samples (within 5th-95th percentiles): {count_inlier}")
print(f"Number of outlier samples: {count_outlier}")

# Visualization of residual-based approach: residual vs. training-set index,
# with the 5th/95th percentile bounds drawn as horizontal lines.
plt.figure(figsize=(10, 6))

# Plot only the residuals inside the percentile band under the
# 'Low Residuals' label. Previously every residual was drawn in this series,
# so the flagged points were plotted twice and the legend entry did not
# describe the data it represented.
inlier_mask = ~outlier_mask
inlier_indices = np.where(inlier_mask)[0]
plt.scatter(
    inlier_indices,
    residuals[inlier_mask],
    alpha=0.5,
    label='Low Residuals (5%-95%)',
    color='blue'
)

# Highlight outliers in red
outlier_indices = np.where(outlier_mask)[0]
plt.scatter(
    outlier_indices,
    residuals[outlier_mask],
    color='red',
    marker='x',
    label='High Residuals (<5% or >95%) '
)

# Horizontal lines at 5th and 95th percentiles
plt.axhline(y=lower_bound, color='r', linestyle='--', linewidth=2, label='5th Percentile')
plt.axhline(y=upper_bound, color='r', linestyle='--', linewidth=2, label='95th Percentile')

plt.xlabel('Training Compound Index')
plt.ylabel('Residual (Observed - Predicted, ppm)')
plt.title('Residual-based Assessment for Training Set')
plt.legend()
plt.grid(True)
plt.tight_layout()

# Save the figure
plt.savefig(r'E:\OneDrive - UCLA IT Services\UCLA\PhD Project\CoMPAIT\AD_train.png', dpi=300)

# Close the plot so the figure is released and not rendered inline
plt.close()


Calculating RDKit descriptors (Train): 100%|██████████| 594/594 [00:02<00:00, 288.42it/s]


Descriptor shape after column-dropping: (594, 209)


Generating Morgan fingerprints (Train): 100%|██████████| 594/594 [00:00<00:00, 2048.42it/s]


Residual 5th percentile: -0.7143
Residual 95th percentile: 0.5843
Number of inlier samples (within 5th-95th percentiles): 534
Number of outlier samples: 60
