In [1]:
import pandas as pd
import os
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, average_precision_score, mean_squared_error, r2_score, mean_absolute_error
from scipy.stats import pearsonr

import numpy as np

# preds

# Load the training set of meta-model
bace_chemberta2_valid2 = pd.read_csv('./chemberta2/results/bace/chemberta2_valid2_bace_2_predictions.csv')
bace_molformer_valid2 = pd.read_csv('./molformer/results/bace/molformer_valid2_bace_2_epoch49.csv')
bace_molbert_valid2 = pd.read_csv('./molbert/results/bace/molbert_valid2_bace_2.csv')

# Load the test data for each model
bace_chemberta2_test = pd.read_csv('./chemberta2/results/bace/chemberta2_test_bace_2_predictions.csv')
bace_molformer_test = pd.read_csv('./molformer/results/bace/molformer_test_bace_2_epoch49.csv')
bace_molbert_test = pd.read_csv('./molbert/results/bace/molbert_test_bace_2.csv')

# features

# Load the features from chemberta
bace_chemberta2_features_valid2 = pd.read_csv('./chemberta2/features/bace/chemberta2_valid2_bace_2_features.csv')
bace_chemberta2_features_test = pd.read_csv('./chemberta2/features/bace/chemberta2_test_bace_2_features.csv')

# Load the features from molformer
bace_molformer_features_valid2 = pd.read_csv('./molformer/features/bace/molformer_valid2_bace_2_features.csv')
bace_molformer_features_test = pd.read_csv('./molformer/features/bace/molformer_test_bace_2_features.csv')

# Load the features from molbert
bace_molbert_features_valid2 = pd.read_csv('./molbert/features/bace/molbert_valid2_bace_2_features.csv')
bace_molbert_features_test = pd.read_csv('./molbert/features/bace/molbert_test_bace_2_features.csv')

For BACE (Classification)

In [2]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score

# Preparing the actual and predicted values
# Chemberta2
bace_chemberta_actual = bace_chemberta2_test['Class']
bace_chemberta_pred = bace_chemberta2_test['y_pred']
bace_chemberta_probs = bace_chemberta2_test[['softmax_class_0_prob', 'softmax_class_1_prob']]

# Molformer
bace_molformer_actual = bace_molformer_test['Actual']
bace_molformer_pred = (bace_molformer_test['Prob_Class_1'] > 0.5).astype(int)
bace_molformer_probs = bace_molformer_test[['Prob_Class_0', 'Prob_Class_1']]

# Molbert
bace_molbert_actual = bace_molbert_test['target']
bace_molbert_pred = bace_molbert_test['pred']
bace_molbert_probs = bace_molbert_test['prob']

# Calculating metrics
bace_metrics_results = {}

for model_name, actual, pred, probs in [("Chemberta2", bace_chemberta_actual, bace_chemberta_pred, bace_chemberta_probs['softmax_class_1_prob']),
                                         ("Molformer", bace_molformer_actual, bace_molformer_pred, bace_molformer_probs['Prob_Class_1']),
                                         ("Molbert", bace_molbert_actual, bace_molbert_pred, bace_molbert_probs)]:
    bace_metrics_results[model_name] = {
        "Accuracy": accuracy_score(actual, pred),
        "F1 Score": f1_score(actual, pred),
        "ROC-AUC": roc_auc_score(actual, probs),
        "PR-AUC": average_precision_score(actual, probs)
    }

bace_metrics_results

{'Chemberta2': {'Accuracy': 0.6535947712418301,
  'F1 Score': 0.6442953020134228,
  'ROC-AUC': 0.7465949820788532,
  'PR-AUC': 0.8058944338111258},
 'Molformer': {'Accuracy': 0.6797385620915033,
  'F1 Score': 0.6797385620915033,
  'ROC-AUC': 0.8521505376344086,
  'PR-AUC': 0.8774798970672688},
 'Molbert': {'Accuracy': 0.7189542483660131,
  'F1 Score': 0.7361963190184049,
  'ROC-AUC': 0.775089605734767,
  'PR-AUC': 0.8548501462313669}}

In [3]:
# check shapes
print(bace_chemberta2_valid2.shape)
print(bace_molformer_valid2.shape)
print(bace_molbert_valid2.shape)
print(bace_chemberta2_features_valid2.shape)
print(bace_molformer_features_valid2.shape)
print(bace_molbert_features_valid2.shape)

(305, 8)
(305, 5)
(305, 4)
(305, 386)
(305, 769)
(305, 769)


In [4]:
import pandas as pd
import numpy as np

bace_y_ensemble_valid2 = bace_chemberta2_valid2['Class']

# Convert the ensemble target to a Series if not already done
bace_y_ensemble_valid2_s = pd.Series(bace_y_ensemble_valid2).reset_index(drop=True)

# Create dataframes for each model's class 1 probability
bace_chemberta2_prob = pd.DataFrame({'chemberta2': bace_chemberta2_valid2['softmax_class_1_prob']})
bace_chemberta2_prob.reset_index(drop=True, inplace=True)

bace_molformer_prob = pd.DataFrame({'molformer': bace_molformer_valid2['Prob_Class_1']})
bace_molformer_prob.reset_index(drop=True, inplace=True)

bace_molbert_prob = pd.DataFrame({'molbert': bace_molbert_valid2['prob']})
bace_molbert_prob.reset_index(drop=True, inplace=True)

# do the same for features bace_chemberta2_features_valid2.iloc[:, 2:]
bace_chemberta2_features = pd.DataFrame(bace_chemberta2_features_valid2.iloc[:, 2:])
bace_chemberta2_features.reset_index(drop=True, inplace=True)

bace_molformer_features = pd.DataFrame(bace_molformer_features_valid2.iloc[:, 1:])
bace_molformer_features.reset_index(drop=True, inplace=True)

bace_molbert_features = pd.DataFrame(bace_molbert_features_valid2.iloc[:, 1:])
bace_molbert_features.reset_index(drop=True, inplace=True)

# bace_features = pd.concat([bace_chemberta2_features, bace_molformer_features, bace_molbert_features], axis=1)

# Combine probabilities into one dataframe
train_bace_prob = pd.concat([bace_chemberta2_prob, bace_molformer_prob, bace_molbert_prob], axis=1)

# Function to calculate BCE for each row
def calculate_bce_rowwise(y_true, y_pred):
    return -(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

# Calculate row-wise BCE for each model
bce_chemberta = calculate_bce_rowwise(bace_y_ensemble_valid2_s, bace_chemberta2_prob['chemberta2'])
bce_molformer = calculate_bce_rowwise(bace_y_ensemble_valid2_s, bace_molformer_prob['molformer'])
bce_molbert = calculate_bce_rowwise(bace_y_ensemble_valid2_s, bace_molbert_prob['molbert'])

# Create a dataframe for row-wise BCE losses
bce_loss_df = pd.DataFrame({
    'bce_chemberta': bce_chemberta,
    'bce_molformer': bce_molformer,
    'bce_molbert': bce_molbert
})

# Final ensemble X matrix: Combine row-wise BCE losses, predictions, and features
bace_X_ensemble_valid2 = pd.concat([bce_loss_df, train_bace_prob], axis=1)

In [5]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, mean_squared_error
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.early_stop import no_progress_loss
import numpy as np

# Set seeds for reproducibility
np.random.seed(0)

# Combine probabilities with their respective feature sets
chemberta_X = pd.concat([bace_chemberta2_prob, bace_chemberta2_features], axis=1)
molformer_X = pd.concat([bace_molformer_prob, bace_molformer_features], axis=1)
molbert_X = pd.concat([bace_molbert_prob, bace_molbert_features], axis=1)

# Standardize each dataset
scaler_chemberta = StandardScaler().fit(chemberta_X)
scaler_molformer = StandardScaler().fit(molformer_X)
scaler_molbert = StandardScaler().fit(molbert_X)

chemberta_X_scaled = scaler_chemberta.transform(chemberta_X)
molformer_X_scaled = scaler_molformer.transform(molformer_X)
molbert_X_scaled = scaler_molbert.transform(molbert_X)

# Define the binary cross-entropy loss values as target variables (y)
chemberta_y_bce = bce_chemberta  # Row-wise BCE loss calculated earlier
molformer_y_bce = bce_molformer  # Row-wise BCE loss calculated earlier
molbert_y_bce = bce_molbert      # Row-wise BCE loss calculated earlier

# Define RMSE scorer
def rmse_scorer(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Define the hyperparameter space for regression using continuous distributions
xgb_hyperopt_space = {
    'n_estimators': hp.quniform('n_estimators', 50, 200, 50),
    'max_depth': hp.quniform('max_depth', 3, 7, 2),
    'learning_rate': hp.uniform('learning_rate', 0.001, 0.3),
    'subsample': hp.uniform('subsample', 0.5, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0)
}

# Objective function for hyperopt with XGBRegressor
def objective(params, X_train, y_train):
    # Convert float outputs of hp.quniform to int for certain parameters
    params['n_estimators'] = int(params['n_estimators'])
    params['max_depth'] = int(params['max_depth'])
    
    # Initialize XGBRegressor with the given parameters
    model = xgb.XGBRegressor(**params, random_state=0)
    
    # RMSE as the scoring metric
    rmse = make_scorer(rmse_scorer, greater_is_better=False)
    score = cross_val_score(model, X_train, y_train, scoring=rmse, cv=5)
    
    # Minimize the negative RMSE score
    return {'loss': -score.mean(), 'status': STATUS_OK}

# Function to optimize and train the model for each dataset
def optimize_and_train(X_train, y_train):
    # Run the Bayesian optimization
    trials = Trials()
    best_params = fmin(fn=lambda params: objective(params, X_train, y_train), 
                       space=xgb_hyperopt_space, 
                       algo=tpe.suggest, 
                       max_evals=50, 
                       trials=trials,
                       rstate=np.random.default_rng(0),  # Seed for reproducibility in hyperopt
                       early_stop_fn=no_progress_loss(10))

    # Convert the best hyperparameters to the correct data types
    best_params['n_estimators'] = int(best_params['n_estimators'])
    best_params['max_depth'] = int(best_params['max_depth'])

    # Initialize and train the XGBoost model with the best parameters
    model = xgb.XGBRegressor(**best_params, random_state=0)
    model.fit(X_train, y_train)
    
    return model

# Optimizing and training for chemberta dataset
print("Optimizing for chemberta dataset...")
best_model_chemberta = optimize_and_train(chemberta_X_scaled, chemberta_y_bce)

# Optimizing and training for molformer dataset
print("Optimizing for molformer dataset...")
best_model_molformer = optimize_and_train(molformer_X_scaled, molformer_y_bce)

# Optimizing and training for molbert dataset
print("Optimizing for molbert dataset...")
best_model_molbert = optimize_and_train(molbert_X_scaled, molbert_y_bce)

# Print a confirmation message
print("Models trained for chemberta, molformer, and molbert datasets.")


Optimizing for chemberta dataset...
 28%|██▊       | 14/50 [00:44<01:55,  3.20s/trial, best loss: 0.6654075905966204]
Optimizing for molformer dataset...
 26%|██▌       | 13/50 [01:15<03:35,  5.81s/trial, best loss: 0.8850283357115307]
Optimizing for molbert dataset...
 36%|███▌      | 18/50 [01:47<03:11,  5.99s/trial, best loss: 1.8050319311462963]
Models trained for chemberta, molformer, and molbert datasets.


In [6]:
import numpy as np
from sklearn.metrics import log_loss

# Test data for each model
bace_chemberta2_prob_test = pd.DataFrame({'chemberta2': bace_chemberta2_test['softmax_class_1_prob']})
bace_chemberta2_prob_test.reset_index(drop=True, inplace=True)

bace_molformer_prob_test = pd.DataFrame({'molformer': bace_molformer_test['Prob_Class_1']})
bace_molformer_prob_test.reset_index(drop=True, inplace=True)

bace_molbert_prob_test = pd.DataFrame({'molbert': bace_molbert_test['prob']})
bace_molbert_prob_test.reset_index(drop=True, inplace=True)

bace_chemberta2_features_t = pd.DataFrame(bace_chemberta2_features_test.iloc[:, 2:])
bace_chemberta2_features_test.reset_index(drop=True, inplace=True)

bace_molformer_features_t  = pd.DataFrame(bace_molformer_features_test.iloc[:, 1:])
bace_molformer_features_test.reset_index(drop=True, inplace=True)

bace_molbert_features_t = pd.DataFrame(bace_molbert_features_test.iloc[:, 1:])
bace_molbert_features_test.reset_index(drop=True, inplace=True)

# Combine probabilities with the respective feature sets for the test set
chemberta_X_test = pd.concat([bace_chemberta2_prob_test, bace_chemberta2_features_t], axis=1)
molformer_X_test = pd.concat([bace_molformer_prob_test, bace_molformer_features_t], axis=1)
molbert_X_test = pd.concat([bace_molbert_prob_test, bace_molbert_features_t], axis=1)

# Standardize the test set based on the previously fitted scalers
chemberta_X_test_scaled = scaler_chemberta.transform(chemberta_X_test)
molformer_X_test_scaled = scaler_molformer.transform(molformer_X_test)
molbert_X_test_scaled = scaler_molbert.transform(molbert_X_test)

# Predict using the best models from valid2
chemberta_pred_test = best_model_chemberta.predict(chemberta_X_test_scaled)
molformer_pred_test = best_model_molformer.predict(molformer_X_test_scaled)
molbert_pred_test = best_model_molbert.predict(molbert_X_test_scaled)

# Convert the predictions (numpy arrays) to pandas Series
chemberta_pred_test_series = pd.Series(chemberta_pred_test, name='bce_chemberta')
molformer_pred_test_series = pd.Series(molformer_pred_test, name='bce_molformer')
molbert_pred_test_series = pd.Series(molbert_pred_test, name='bce_molbert')

# Now concatenate the series with the test set probabilities
bace_X_ensemble_test = pd.concat([
    chemberta_pred_test_series,                     # BCE for Chemberta
    molformer_pred_test_series,                     # BCE for Molformer
    molbert_pred_test_series,                       # BCE for Molbert
    bace_chemberta2_prob_test['chemberta2'],        # Chemberta test probabilities
    bace_molformer_prob_test['molformer'],          # Molformer test probabilities
    bace_molbert_prob_test['molbert']               # Molbert test probabilities
], axis=1)

bace_X_ensemble_test.columns = ['bce_chemberta', 'bce_molformer', 'bce_molbert', 'chemberta2', 'molformer', 'molbert']

# optional for evaluation
bace_y_ensemble_test = bace_chemberta2_test['Class']

In [7]:
# use standard scaler
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
bace_X_ensemble_valid2_scaled = scaler.fit_transform(bace_X_ensemble_valid2)
bace_X_ensemble_test_scaled = scaler.transform(bace_X_ensemble_test)

# transform back to dataframe
bace_X_ensemble_valid2_scaled = pd.DataFrame(bace_X_ensemble_valid2_scaled, columns=bace_X_ensemble_valid2.columns)
bace_X_ensemble_test_scaled = pd.DataFrame(bace_X_ensemble_test_scaled, columns=bace_X_ensemble_test.columns)

In [8]:
bace_X_ensemble_valid2_selected = bace_X_ensemble_valid2_scaled
bace_X_ensemble_test_selected = bace_X_ensemble_test_scaled
# check shapes
print(bace_X_ensemble_valid2_selected.shape)
print(bace_X_ensemble_test_selected.shape)

(305, 6)
(153, 6)


In [9]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score, make_scorer
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.early_stop import no_progress_loss

# Set seeds for reproducibility
np.random.seed(0)

# Define the hyperparameter space using continuous distributions
bace_xgb_hyperopt_space = {
    'n_estimators': hp.quniform('n_estimators', 50, 200, 50),
    'max_depth': hp.quniform('max_depth', 3, 7, 2),
    'learning_rate': hp.uniform('learning_rate', 0.001, 0.3),
    'subsample': hp.uniform('subsample', 0.5, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0)
}

# Convert float outputs of hp.quniform to int for certain parameters
def objective(params):
    params['n_estimators'] = int(params['n_estimators'])
    params['max_depth'] = int(params['max_depth'])
    
    model = xgb.XGBClassifier(**params, random_state=0)
    
    # Create a scorer that calculates the roc_auc score using predicted probabilities
    roc_auc = make_scorer(roc_auc_score, response_method=None)
    score = cross_val_score(model, bace_X_ensemble_valid2_selected, bace_y_ensemble_valid2, scoring=roc_auc, cv=5)
    
    # Minimize the negative ROC AUC score
    return {'loss': -score.mean(), 'status': STATUS_OK}

# Run the Bayesian optimization
trials = Trials()
bace_xgb_best_params = fmin(fn=objective, 
                            space=bace_xgb_hyperopt_space, 
                            algo=tpe.suggest, 
                            max_evals=50, 
                            trials=trials,
                            rstate=np.random.default_rng(0),  # Seed for hyperopt
                            early_stop_fn=no_progress_loss(10))

print("Best hyperparameters:", bace_xgb_best_params)


 36%|███▌      | 18/50 [00:03<00:05,  5.85trial/s, best loss: -0.9904395604395605]
Best hyperparameters: {'colsample_bytree': 0.8507154512772237, 'learning_rate': 0.20167619835283912, 'max_depth': 6.0, 'n_estimators': 50.0, 'subsample': 0.5797666220819824}


In [10]:
# Convert parameters obtained from Hyperopt to the correct data type
bace_xgb_best_params['n_estimators'] = int(bace_xgb_best_params['n_estimators'])
bace_xgb_best_params['max_depth'] = int(bace_xgb_best_params['max_depth'])

# Initialize and train the XGBoost model with the best parameters
bace_xgb_model = xgb.XGBClassifier(**bace_xgb_best_params, random_state=0)
bace_xgb_model.fit(bace_X_ensemble_valid2_selected, bace_y_ensemble_valid2)

# Predict the test set
bace_xgb_best_pred = bace_xgb_model.predict(bace_X_ensemble_test_selected)
bace_xgb_best_probs = bace_xgb_model.predict_proba(bace_X_ensemble_test_selected)

# Calculate the metrics
bace_xgb_best_metrics = {
    "Accuracy": accuracy_score(bace_y_ensemble_test, bace_xgb_best_pred),
    "F1 Score": f1_score(bace_y_ensemble_test, bace_xgb_best_pred),
    "ROC-AUC": roc_auc_score(bace_y_ensemble_test, bace_xgb_best_probs[:, 1]),
    "PR-AUC": average_precision_score(bace_y_ensemble_test, bace_xgb_best_probs[:, 1])
}

bace_xgb_best_metrics

{'Accuracy': 0.6601307189542484,
 'F1 Score': 0.7045454545454546,
 'ROC-AUC': 0.7310035842293907,
 'PR-AUC': 0.8225589406742587}

In [11]:
# report all the metrics for ct
bace_metrics_results["XGBoost"] = bace_xgb_best_metrics

bace_metrics_df = pd.DataFrame(bace_metrics_results).T

# keep 3 digits after the decimal point
bace_metrics_df = bace_metrics_df.round(3)

# export as csv
bace_metrics_df.to_csv('./split2_bace_metrics_xgb.csv')