In [1]:
import pandas as pd
import os
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, average_precision_score, mean_squared_error, r2_score, mean_absolute_error
from scipy.stats import pearsonr

import numpy as np

# --- Base-model prediction tables ------------------------------------------

# valid2 split: this is what the meta-model is trained on.
(
    delaney_chemberta2_valid2,
    delaney_molformer_valid2,
    delaney_molbert_valid2,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        './chemberta2/results/delaney/chemberta2_valid2_delaney_3_predictions.csv',
        './molformer/results/delaney/molformer_valid2_delaney_3_99.csv',
        './molbert/results/delaney/molbert_valid2_delaney_3.csv',
    )
)

# test split: one prediction table per base model.
(
    delaney_chemberta2_test,
    delaney_molformer_test,
    delaney_molbert_test,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        './chemberta2/results/delaney/chemberta2_test_delaney_3_predictions.csv',
        './molformer/results/delaney/molformer_test_delaney_3_99.csv',
        './molbert/results/delaney/molbert_test_delaney_3.csv',
    )
)

# Constants used further down to z-score the targets and to map z-scored
# predictions back to the raw scale.
# NOTE(review): presumably the mean / standard deviation of the training-split
# targets -- confirm where these numbers were computed.
train_mean, train_sd = -3.0955443786982246, 2.121246879189704

# --- Base-model feature tables (valid2 first, then test) -------------------

# ChemBERTa-2
delaney_chemberta2_features_valid2, delaney_chemberta2_features_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        './chemberta2/features/delaney/chemberta2_valid2_delaney_3_features.csv',
        './chemberta2/features/delaney/chemberta2_test_delaney_3_features.csv',
    )
)

# MolFormer
delaney_molformer_features_valid2, delaney_molformer_features_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        './molformer/features/delaney/molformer_valid2_delaney_3_features.csv',
        './molformer/features/delaney/molformer_test_delaney_3_features.csv',
    )
)

# MolBERT
delaney_molbert_features_valid2, delaney_molbert_features_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        './molbert/features/delaney/molbert_valid2_delaney_3_features.csv',
        './molbert/features/delaney/molbert_test_delaney_3_features.csv',
    )
)

Delaney dataset (regression task)

In [2]:
# Ground truth and raw-scale predictions on the test split, one pair per
# base model.

# ChemBERTa-2
delaney_chemberta_actual, delaney_chemberta_pred = (
    delaney_chemberta2_test['target'],
    delaney_chemberta2_test['pred_raw'],
)

# MolFormer
delaney_molformer_actual, delaney_molformer_pred = (
    delaney_molformer_test['target'],
    delaney_molformer_test['pred_raw'],
)

# MolBERT -- its table exposes the ground truth as 'target_raw', not 'target'.
delaney_molbert_actual, delaney_molbert_pred = (
    delaney_molbert_test['target_raw'],
    delaney_molbert_test['pred_raw'],
)

In [3]:
# Test-set regression metrics of every base model on its own, keyed by
# model name.
delaney_metrics_results = {}

for model_name, actual, pred in zip(
    ("Chemberta2", "Molformer", "Molbert"),
    (delaney_chemberta_actual, delaney_molformer_actual, delaney_molbert_actual),
    (delaney_chemberta_pred, delaney_molformer_pred, delaney_molbert_pred),
):
    delaney_metrics_results[model_name] = {
        "MAE": mean_absolute_error(actual, pred),
        "RMSE": np.sqrt(mean_squared_error(actual, pred)),
        "R2 Score": r2_score(actual, pred),
        # pearsonr returns (coefficient, p-value); keep the coefficient only
        "Correlation": pearsonr(actual, pred)[0],
    }

delaney_metrics_results

{'Chemberta2': {'MAE': 0.5606664320638193,
  'RMSE': 0.7305590849172536,
  'R2 Score': 0.8573821186737384,
  'Correlation': 0.9294289768725041},
 'Molformer': {'MAE': 0.4865245807831858,
  'RMSE': 0.662620137396756,
  'R2 Score': 0.8826744633917042,
  'Correlation': 0.9422375703567979},
 'Molbert': {'MAE': 0.5384853131061946,
  'RMSE': 0.6906433653118413,
  'R2 Score': 0.8725408620066292,
  'Correlation': 0.9387197751694945}}

In [4]:
# Meta-model labels: valid2 targets z-scored with train_mean / train_sd.
# The ChemBERTa-2 table supplies the targets for all three models.
# NOTE(review): this assumes the three valid2 tables list the same molecules
# in the same row order -- confirm.
delaney_y_ensemble_valid2 = (
    delaney_chemberta2_valid2['target'].sub(train_mean).div(train_sd)
)

# Meta-model training features (regression, z-score scale):
#   * residuals_<model>: base-model prediction minus the true label
#   * <model>:           the base-model prediction itself
# `keys` names the columns directly, so every column label is unique.
# NOTE(review): on valid2 the residual columns are computed from the true
# labels, so label = prediction - residual holds exactly for these rows.
delaney_X_ensemble_valid2 = pd.concat(
    [
        delaney_chemberta2_valid2['pred_z'] - delaney_y_ensemble_valid2,
        delaney_molformer_valid2['pred_z'] - delaney_y_ensemble_valid2,
        delaney_molbert_valid2['pred_z'] - delaney_y_ensemble_valid2,
        delaney_chemberta2_valid2['pred_z'],
        delaney_molformer_valid2['pred_z'],
        delaney_molbert_valid2['pred_z'],
    ],
    axis=1,
    keys=[
        'residuals_chemberta', 'residuals_molformer', 'residuals_molbert',
        'chemberta', 'molformer', 'molbert',
    ],
)

In [5]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, mean_squared_error
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.early_stop import no_progress_loss
import numpy as np

# Fix NumPy's global RNG so reruns behave the same.
np.random.seed(0)

# Targets of the per-model residual regressors: z-scored prediction minus
# z-scored true label on valid2.
chemberta_y_residual = delaney_chemberta2_valid2['pred_z'].sub(delaney_y_ensemble_valid2)
molformer_y_residual = delaney_molformer_valid2['pred_z'].sub(delaney_y_ensemble_valid2)
molbert_y_residual = delaney_molbert_valid2['pred_z'].sub(delaney_y_ensemble_valid2)

# Inputs of the residual regressors: the model's own pred_z followed by its
# feature columns. The ChemBERTa-2 table skips its first two columns, the
# other two skip only the first.
# NOTE(review): the skipped leading columns are presumably identifiers -- confirm.
chemberta_X = pd.concat(
    [delaney_chemberta2_valid2['pred_z'], delaney_chemberta2_features_valid2.iloc[:, 2:]],
    axis=1,
)
molformer_X = pd.concat(
    [delaney_molformer_valid2['pred_z'], delaney_molformer_features_valid2.iloc[:, 1:]],
    axis=1,
)
molbert_X = pd.concat(
    [delaney_molbert_valid2['pred_z'], delaney_molbert_features_valid2.iloc[:, 1:]],
    axis=1,
)

# One StandardScaler per model, fitted on that model's valid2 inputs; the
# fitted scalers are kept for the test split.
scaler_chemberta = StandardScaler()
chemberta_X_scaled = scaler_chemberta.fit_transform(chemberta_X)

scaler_molformer = StandardScaler()
molformer_X_scaled = scaler_molformer.fit_transform(molformer_X)

scaler_molbert = StandardScaler()
molbert_X_scaled = scaler_molbert.fit_transform(molbert_X)

# RMSE metric, wrapped with make_scorer inside the hyperopt objective
def rmse_scorer(y_true, y_pred):
    """Return the root mean squared error between `y_true` and `y_pred`."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Search space of the residual regressors. quniform samples come back as
# floats and are cast to int before they reach XGBRegressor.
# NOTE(review): hyperopt documents quniform as round(uniform(low, high) / q) * q,
# so ('max_depth', 3, 7, 2) yields even depths (4, 6, ...) rather than 3/5/7 --
# confirm this is the intended grid.
xgb_hyperopt_space = dict(
    n_estimators=hp.quniform('n_estimators', 50, 200, 50),
    max_depth=hp.quniform('max_depth', 3, 7, 2),
    learning_rate=hp.uniform('learning_rate', 0.001, 0.3),
    subsample=hp.uniform('subsample', 0.5, 1.0),
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1.0),
)

# Objective function for hyperopt with XGBRegressor
def objective(params, X_train, y_train):
    """Hyperopt objective: mean 5-fold cross-validated RMSE of an XGBRegressor.

    Parameters
    ----------
    params : dict
        One sample from the hyperopt search space. It is not modified.
    X_train, y_train
        Training inputs and targets passed on to `cross_val_score`.

    Returns
    -------
    dict
        ``{'loss': <mean RMSE across folds>, 'status': STATUS_OK}``.
    """
    # hp.quniform yields floats; cast on a copy so the dict handed in by the
    # caller is left untouched.
    xgb_params = dict(params)
    xgb_params['n_estimators'] = int(xgb_params['n_estimators'])
    xgb_params['max_depth'] = int(xgb_params['max_depth'])

    # Initialize XGBRegressor with the given parameters
    model = xgb.XGBRegressor(**xgb_params, random_state=0)

    # greater_is_better=False makes the scorer report -RMSE for every fold
    neg_rmse = make_scorer(rmse_scorer, greater_is_better=False)
    fold_scores = cross_val_score(model, X_train, y_train, scoring=neg_rmse, cv=5)

    # Flip the sign back: the loss hyperopt minimises is the positive mean RMSE
    return {'loss': -fold_scores.mean(), 'status': STATUS_OK}

# Function to optimize and train the model for each dataset
def optimize_and_train(X_train, y_train, max_evals=50, patience=10, seed=0):
    """Tune an XGBRegressor with hyperopt (TPE) and refit it on all the data.

    Parameters
    ----------
    X_train, y_train
        Training inputs and targets; used both for the cross-validated search
        and for the final fit.
    max_evals : int, default 50
        Upper bound on the number of hyperopt trials.
    patience : int, default 10
        Passed to `no_progress_loss`: the search stops early after this many
        trials without an improvement of the best loss.
    seed : int, default 0
        Seeds hyperopt's sampler and the final model's `random_state`. The
        cross-validated models built inside `objective` use their own fixed
        `random_state`.

    Returns
    -------
    xgb.XGBRegressor
        Model fitted on (`X_train`, `y_train`) with the best parameters found.
    """
    # Run the Bayesian optimization over xgb_hyperopt_space
    trials = Trials()
    best_params = fmin(
        fn=lambda params: objective(params, X_train, y_train),
        space=xgb_hyperopt_space,
        algo=tpe.suggest,
        max_evals=max_evals,
        trials=trials,
        rstate=np.random.default_rng(seed),  # Seed for reproducibility in hyperopt
        early_stop_fn=no_progress_loss(patience),
    )

    # fmin reports quniform parameters as floats; XGBoost needs integers
    best_params['n_estimators'] = int(best_params['n_estimators'])
    best_params['max_depth'] = int(best_params['max_depth'])

    # Refit on the full training data with the best parameters
    model = xgb.XGBRegressor(**best_params, random_state=seed)
    model.fit(X_train, y_train)

    return model

# Tune and fit one residual regressor per base model on the scaled valid2
# inputs; each one learns that model's z-scored residual.

# ChemBERTa-2
print("Optimizing for chemberta dataset...")
best_model_chemberta = optimize_and_train(
    X_train=chemberta_X_scaled, y_train=chemberta_y_residual
)

# MolFormer
print("Optimizing for molformer dataset...")
best_model_molformer = optimize_and_train(
    X_train=molformer_X_scaled, y_train=molformer_y_residual
)

# MolBERT
print("Optimizing for molbert dataset...")
best_model_molbert = optimize_and_train(
    X_train=molbert_X_scaled, y_train=molbert_y_residual
)

# All three residual regressors are fitted at this point
print("Models trained for chemberta, molformer, and molbert datasets.")

Optimizing for chemberta dataset...
 26%|██▌       | 13/50 [00:33<01:34,  2.55s/trial, best loss: 0.30637958249116276]
Optimizing for molformer dataset...
 58%|█████▊    | 29/50 [02:11<01:35,  4.54s/trial, best loss: 0.27885283602477545]
Optimizing for molbert dataset...
 88%|████████▊ | 44/50 [02:56<00:24,  4.01s/trial, best loss: 0.2997175379425164] 
Models trained for chemberta, molformer, and molbert datasets.


In [6]:
# For each base model: build the test-split inputs exactly like the valid2
# ones (pred_z + feature columns), scale them with the scaler fitted on
# valid2, and predict the residual with the trained XGBoost regressor.

# ChemBERTa-2
chemberta_X_test = pd.concat(
    [delaney_chemberta2_test['pred_z'], delaney_chemberta2_features_test.iloc[:, 2:]],
    axis=1,
)
chemberta_X_test_scaled = scaler_chemberta.transform(chemberta_X_test)
y_pred_residuals_chemberta_test = best_model_chemberta.predict(chemberta_X_test_scaled)

# MolFormer
molformer_X_test = pd.concat(
    [delaney_molformer_test['pred_z'], delaney_molformer_features_test.iloc[:, 1:]],
    axis=1,
)
molformer_X_test_scaled = scaler_molformer.transform(molformer_X_test)
y_pred_residuals_molformer_test = best_model_molformer.predict(molformer_X_test_scaled)

# MolBERT
molbert_X_test = pd.concat(
    [delaney_molbert_test['pred_z'], delaney_molbert_features_test.iloc[:, 1:]],
    axis=1,
)
molbert_X_test_scaled = scaler_molbert.transform(molbert_X_test)
y_pred_residuals_molbert_test = best_model_molbert.predict(molbert_X_test_scaled)

# Predicted residuals of the three models, side by side
predicted_residuals_test = pd.DataFrame({
    'Residuals_Chemberta': y_pred_residuals_chemberta_test,
    'Residuals_Molformer': y_pred_residuals_molformer_test,
    'Residuals_Molbert': y_pred_residuals_molbert_test
})

# Meta-model test features: predicted residuals first, then the base-model
# predictions -- same column order as the valid2 feature frame.
delaney_X_ensemble_test = pd.concat(
    [
        predicted_residuals_test,
        delaney_chemberta2_test['pred_z'],
        delaney_molformer_test['pred_z'],
        delaney_molbert_test['pred_z'],
    ],
    axis=1,
)

# Same unique column names as the valid2 feature frame
delaney_X_ensemble_test.columns = ['residuals_chemberta', 'residuals_molformer', 'residuals_molbert', 'chemberta', 'molformer', 'molbert']

# True test labels on the raw (un-standardised) scale
delaney_y_ensemble_test = delaney_chemberta2_test['target']

In [7]:
# scale the features
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the valid2 meta-features only, then apply the same
# transformation to both splits and keep the column names.
scaler = StandardScaler().fit(delaney_X_ensemble_valid2)

delaney_X_ensemble_valid2_scaled = pd.DataFrame(
    scaler.transform(delaney_X_ensemble_valid2),
    columns=delaney_X_ensemble_valid2.columns,
)
delaney_X_ensemble_test_scaled = pd.DataFrame(
    scaler.transform(delaney_X_ensemble_test),
    columns=delaney_X_ensemble_test.columns,
)

In [8]:
# No feature selection is applied: the "selected" frames are the scaled
# frames themselves.
delaney_X_ensemble_valid2_selected, delaney_X_ensemble_test_selected = (
    delaney_X_ensemble_valid2_scaled,
    delaney_X_ensemble_test_scaled,
)

# check shapes (valid2 first, then test)
print(
    delaney_X_ensemble_valid2_selected.shape,
    delaney_X_ensemble_test_selected.shape,
    sep="\n",
)

(226, 6)
(113, 6)


In [9]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer
import numpy as np
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.early_stop import no_progress_loss
import random

# Seed NumPy's and Python's global RNGs so reruns behave the same.
np.random.seed(0)
random.seed(0)

# Search space of the meta-model. quniform samples come back as floats and
# are cast to int before they reach XGBRegressor.
# NOTE(review): hyperopt documents quniform as round(uniform(low, high) / q) * q,
# so ('max_depth', 3, 7, 2) yields even depths (4, 6, ...) rather than 3/5/7 --
# confirm this is the intended grid.
delaney_xgb_hyperopt_space = dict(
    n_estimators=hp.quniform('n_estimators', 50, 200, 50),
    max_depth=hp.quniform('max_depth', 3, 7, 2),
    learning_rate=hp.uniform('learning_rate', 0.001, 0.3),
    subsample=hp.uniform('subsample', 0.5, 1.0),
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1.0),
)

# RMSE metric; same computation as the rmse_scorer defined for the residual models
def rmse_scorer(y_true, y_pred):
    """Return the root mean squared error between `y_true` and `y_pred`."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Hyperopt objective of the meta-model
def objective(params, X_train=None, y_train=None):
    """Hyperopt objective: mean 5-fold cross-validated RMSE of an XGBRegressor.

    This definition replaces the earlier three-argument `objective` in the
    notebook namespace. `X_train` / `y_train` are therefore still accepted, so
    that `optimize_and_train` keeps working if it is called after this cell
    has run.

    Parameters
    ----------
    params : dict
        One sample from the hyperopt search space. It is not modified.
    X_train, y_train : optional
        Training inputs and targets. When omitted, the scaled valid2
        meta-features (`delaney_X_ensemble_valid2_selected`) and the z-scored
        valid2 labels (`delaney_y_ensemble_valid2`) are used.

    Returns
    -------
    dict
        ``{'loss': <mean RMSE across folds>, 'status': STATUS_OK}``.
    """
    if X_train is None:
        X_train = delaney_X_ensemble_valid2_selected
    if y_train is None:
        y_train = delaney_y_ensemble_valid2

    # hp.quniform yields floats; cast on a copy so the dict handed in by the
    # caller is left untouched.
    xgb_params = dict(params)
    xgb_params['n_estimators'] = int(xgb_params['n_estimators'])
    xgb_params['max_depth'] = int(xgb_params['max_depth'])
    model = xgb.XGBRegressor(**xgb_params, random_state=0)

    # greater_is_better=False makes the scorer report -RMSE for every fold
    score = cross_val_score(model, X_train, y_train,
                            scoring=make_scorer(rmse_scorer, greater_is_better=False), cv=5)

    # Flip the sign back: the loss hyperopt minimises is the positive mean RMSE
    return {'loss': -score.mean(), 'status': STATUS_OK}

# TPE search over the meta-model space: at most 100 trials, stopping early
# after 10 trials without an improvement of the best loss.
trials = Trials()
delaney_xgb_best_params = fmin(
    fn=objective,
    space=delaney_xgb_hyperopt_space,
    algo=tpe.suggest,
    trials=trials,
    max_evals=100,
    early_stop_fn=no_progress_loss(10),
    rstate=np.random.default_rng(0),  # Seed for hyperopt
)

# Values are reported as sampled, i.e. the quniform parameters are still floats
print("Best hyperparameters:", delaney_xgb_best_params)


 25%|██▌       | 25/100 [00:07<00:21,  3.56trial/s, best loss: 0.12060325694165033]
Best hyperparameters: {'colsample_bytree': 0.9558186054871323, 'learning_rate': 0.09634469734464615, 'max_depth': 4.0, 'n_estimators': 200.0, 'subsample': 0.8082925817075997}


In [10]:
# Refit the meta-model on all of valid2 with the tuned hyperparameters and
# score it on the test split.

# hyperopt reports the quniform parameters as floats; XGBoost needs integers
delaney_xgb_best_params.update(
    n_estimators=int(delaney_xgb_best_params['n_estimators']),
    max_depth=int(delaney_xgb_best_params['max_depth']),
)

# Train on the scaled valid2 meta-features against the z-scored labels
delaney_xgb_model = xgb.XGBRegressor(random_state=0, **delaney_xgb_best_params)
delaney_xgb_model.fit(delaney_X_ensemble_valid2_selected, delaney_y_ensemble_valid2)

# The model predicts z-scores; map them back to the raw target scale
delaney_xgb_best_pred = (
    delaney_xgb_model.predict(delaney_X_ensemble_test_selected) * train_sd + train_mean
)

# Same four metrics as reported for the base models
delaney_xgb_best_metrics = {
    "MAE": mean_absolute_error(delaney_y_ensemble_test, delaney_xgb_best_pred),
    "RMSE": np.sqrt(mean_squared_error(delaney_y_ensemble_test, delaney_xgb_best_pred)),
    "R2 Score": r2_score(delaney_y_ensemble_test, delaney_xgb_best_pred),
    # pearsonr returns (coefficient, p-value); keep the coefficient only
    "Correlation": pearsonr(delaney_y_ensemble_test, delaney_xgb_best_pred)[0],
}

delaney_xgb_best_metrics

{'MAE': 0.46665421318796885,
 'RMSE': 0.6256620272577275,
 'R2 Score': 0.895397302598155,
 'Correlation': 0.9484961749434442}

In [11]:
# Add the ensemble to the per-model results
delaney_metrics_results["XGBoost"] = delaney_xgb_best_metrics

# One row per model, one column per metric, rounded to 3 decimals
delaney_metrics_df = pd.DataFrame(delaney_metrics_results).T.round(3)

# Persist the comparison table
delaney_metrics_df.to_csv('./split3_delaney_metrics_xgb.csv')