# Train & Test Splitting

This script processes molecular descriptors and drug potency data to prepare datasets for regression and classification tasks. Descriptors include physicochemical, quantum, topological, and 3D features. For regression, interaction terms are generated between the dummy variable (indicating imputed potency values) and the physicochemical, quantum, and topological descriptors to capture feature-specific adjustments for imputed data. The dataset undergoes randomized train-test splitting (85%-15%) and 5-fold cross-validation, with molecule IDs retained for traceability. For classification, the data is stratified by potency classes to ensure balanced train-test splits, followed by 5-fold cross-validation to evaluate model performance. All processed datasets, including features and targets, are saved as CSV files for downstream analysis.


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.preprocessing import PolynomialFeatures
import numpy as np

# Notebook-level split settings
seed = 42  # random seed handed to the splitters below for reproducibility
test_size = 0.15  # fraction of the dataset held out as the test set

## Step 1: Load and Merge Data

In [2]:
# Load the four standardized descriptor tables. Only the 3D table keeps its
# 'Molecule ChEMBL ID' column, which later serves as the merge key.
descriptors_3D = pd.read_csv('../2_descriptors/3D-descriptors-standardized.csv')
descriptors_quantum = pd.read_csv(
    '../2_descriptors/quantum-descriptors-standardized.csv'
).drop(columns=['Molecule ChEMBL ID'])
descriptors_topological = pd.read_csv(
    '../2_descriptors/topological-descriptors-standardized.csv'
).drop(columns=['Molecule ChEMBL ID'])
descriptors_physicochemical = pd.read_csv(
    '../2_descriptors/physicochemical-descriptors-standardized.csv'
).drop(columns=['Molecule ChEMBL ID'])

# Stack the tables side by side. concat(axis=1) aligns on the row index, so
# this presumably relies on all four files listing molecules in the same
# order — TODO confirm against the descriptor-generation step.
descriptor_tables = [
    descriptors_3D,
    descriptors_quantum,
    descriptors_topological,
    descriptors_physicochemical,
]
descriptors_all = pd.concat(descriptor_tables, axis=1)


def _neg_log_scaled(value):
    """Return -ln(value * 1e-9).

    np.log is the natural logarithm, so the result is NOT a base-10 pIC50.
    NOTE(review): the 1e-9 factor presumably converts nM to M — confirm units.
    """
    return -np.log(value * 1e-9)


# Activity table: keep the molecule ID, the measured value and the potency class
ic50 = pd.read_csv('../1_preprocess/TRPM8-homosapien-compounds-activities-processed.csv')[
    ['Molecule ChEMBL ID', 'Standard Value', 'Potency']
]
ic50['-logIC50'] = ic50['Standard Value'].apply(_neg_log_scaled)

# Attach descriptors to the targets; the raw value is dropped once transformed
targets = ic50.drop(columns='Standard Value')
dataset = targets.merge(descriptors_all, on='Molecule ChEMBL ID')
dataset.to_csv('descriptors_all.csv', index=False)

dataset.head()

Unnamed: 0,Molecule ChEMBL ID,Potency,-logIC50,PMI1,PMI2,PMI3,Asphericity,Eccentricity,InertialShapeFactor,NPR1,...,HAcceptors,HDonors,heteroatoms,rotatableBonds,saturatedCarbocycles,saturatedHeterocycles,satureatedRings,ringCount,molLogP,molMR
0,CHEMBL3235962,High Potency \n(less than 1 $\mu$M),16.304425,0.369589,-0.453167,-0.359669,-0.786832,-0.492323,-0.507247,0.67437,...,-1.25873,-0.131926,-0.580284,-1.503645,-0.374196,-0.315596,-0.499087,0.265767,0.851695,-0.158908
1,CHEMBL3235983,High Potency \n(less than 1 $\mu$M),18.420681,-0.045649,-0.442597,-0.347368,-0.371923,0.097744,-0.407798,0.154252,...,-1.742971,-0.131926,0.362978,-1.503645,-0.374196,-0.315596,-0.499087,-0.787636,0.615532,-0.912202
2,CHEMBL1650511,High Potency \n(less than 1 $\mu$M),21.607574,-0.542816,0.502877,0.305028,0.800887,0.884693,-0.05805,-0.883505,...,-0.774489,-0.131926,0.362978,-1.503645,2.229254,-0.315596,1.579913,1.319169,1.89026,-0.192731
3,CHEMBL2443068,High Potency \n(less than 1 $\mu$M),15.283449,-1.170932,1.289978,1.01896,1.930956,1.151781,0.722147,-1.599513,...,-0.774489,-0.131926,0.048557,-1.945814,-0.374196,2.960593,1.579913,0.265767,0.567644,-0.405199
4,CHEMBL3959823,High Potency \n(less than 1 $\mu$M),13.954773,-0.491711,-1.122955,-1.256339,-1.154136,-1.342932,-0.188332,1.264532,...,-1.25873,-0.131926,-1.523545,0.265031,-0.374196,-0.315596,-0.499087,-0.787636,-0.617269,-0.365892


## Step 2: Prepare Regression Dataset

In [3]:
# Regression features/target, both indexed by molecule ID so every row stays
# traceable after shuffling.
indexed_reg = dataset.set_index('Molecule ChEMBL ID')
X_reg = indexed_reg.drop(columns=['-logIC50', 'Potency', 'Unnamed: 0'])
y_reg = indexed_reg[['-logIC50']]

# Randomized (unstratified) hold-out split
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg,
    y_reg,
    test_size=test_size,
    random_state=seed,
    shuffle=True,
)

# Write the feature tables; the molecule ID is saved as the index column
X_train_reg.to_csv('train_reg.csv')
X_test_reg.to_csv('test_reg.csv')

# 5-fold cross-validation over the training portion only
kf = KFold(n_splits=5, shuffle=True, random_state=seed)

for fold, (train_index, val_index) in enumerate(kf.split(X_train_reg), start=1):
    # Positional indices from KFold -> row slices of the training set
    X_train_fold = X_train_reg.iloc[train_index]
    X_val_fold = X_train_reg.iloc[val_index]
    # Matching target slices (not written to disk by this cell)
    y_train_fold = y_train_reg.iloc[train_index]
    y_val_fold = y_train_reg.iloc[val_index]

    # One train/validation feature file per fold, numbered from 1
    X_train_fold.to_csv(f'train_reg_{fold}.csv')
    X_val_fold.to_csv(f'val_reg_{fold}.csv')
    

## Step 3: Prepare Classification Dataset

In [4]:
# Classification features and class labels, both indexed by molecule ID so
# every row stays traceable after shuffling.
X_class = dataset.drop(columns=['-logIC50', 'Potency', 'Unnamed: 0']).set_index('Molecule ChEMBL ID')
y_class = dataset[['Potency', 'Molecule ChEMBL ID']].set_index('Molecule ChEMBL ID')

# Perform stratified train-test split.
# The hold-out fraction comes from the notebook-level `test_size` setting
# (previously a hard-coded 0.2 here), so this split follows the same
# configuration as the rest of the notebook. Stratifying on the labels keeps
# the potency-class proportions the same in train and test.
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(
    X_class, y_class, test_size=test_size, random_state=seed, stratify=y_class
)

# Save train and test feature tables; the molecule ID is saved as the index column
X_train_class.to_csv('train_class.csv')
X_test_class.to_csv('test_class.csv')

# Perform Stratified 5-Fold Cross-Validation on Training Data
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

for fold, (train_index, val_index) in enumerate(
    skf.split(X_train_class, y_train_class['Potency']), start=1
):
    # Positional indices from the splitter -> row slices of the training set
    X_train_fold = X_train_class.iloc[train_index]
    X_val_fold = X_train_class.iloc[val_index]
    # Matching label slices (not written to disk by this cell)
    y_train_fold = y_train_class.iloc[train_index]
    y_val_fold = y_train_class.iloc[val_index]

    # Save each fold as CSV, numbered from 1
    X_train_fold.to_csv(f'train_class_{fold}.csv')
    X_val_fold.to_csv(f'val_class_{fold}.csv')