In [25]:
import pandas as pd
import os
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, average_precision_score, mean_squared_error, r2_score, mean_absolute_error
from scipy.stats import pearsonr

import numpy as np

# --- Base-model prediction files -------------------------------------------

# valid2 split: one prediction file per base model. Later cells use these
# frames to build the meta-model's training data.
(
    delaney_chemberta2_valid2,
    delaney_molformer_valid2,
    delaney_molbert_valid2,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        './chemberta2/results/delaney/chemberta2_valid2_delaney_1_predictions.csv',
        './molformer/results/delaney/molformer_valid2_delaney_1_99.csv',
        './molbert/results/delaney/molbert_valid2_delaney_1.csv',
    )
)

# test split: one prediction file per base model.
(
    delaney_chemberta2_test,
    delaney_molformer_test,
    delaney_molbert_test,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        './chemberta2/results/delaney/chemberta2_test_delaney_1_predictions.csv',
        './molformer/results/delaney/molformer_test_delaney_1_99.csv',
        './molbert/results/delaney/molbert_test_delaney_1.csv',
    )
)

# Constants used later to standardize the valid2 targets:
#     z = (target - train_mean) / train_sd
# NOTE(review): presumably the mean / standard deviation of the training-set
# targets; their provenance is not shown in this notebook - confirm.
train_mean, train_sd = -2.990841715976331, 2.0813296797119367

# --- Base-model feature files ----------------------------------------------

# For each base model (chemberta2, molformer, molbert): valid2 features, then
# test features.
(
    delaney_chemberta2_features_valid2,
    delaney_chemberta2_features_test,
    delaney_molformer_features_valid2,
    delaney_molformer_features_test,
    delaney_molbert_features_valid2,
    delaney_molbert_features_test,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        './chemberta2/features/delaney/chemberta2_valid2_delaney_1_features.csv',
        './chemberta2/features/delaney/chemberta2_test_delaney_1_features.csv',
        './molformer/features/delaney/molformer_valid2_delaney_1_features.csv',
        './molformer/features/delaney/molformer_test_delaney_1_features.csv',
        './molbert/features/delaney/molbert_valid2_delaney_1_features.csv',
        './molbert/features/delaney/molbert_test_delaney_1_features.csv',
    )
)

For delaney (regression)

In [26]:
# Pull the ground-truth and raw-prediction columns out of each base model's
# test-set frame; the next cell scores every (actual, pred) pair.

# Chemberta2
delaney_chemberta_actual, delaney_chemberta_pred = (
    delaney_chemberta2_test['target'],
    delaney_chemberta2_test['pred_raw'],
)

# Molformer
delaney_molformer_actual, delaney_molformer_pred = (
    delaney_molformer_test['target'],
    delaney_molformer_test['pred_raw'],
)

# Molbert
# NOTE(review): the ground truth is read from 'target_raw' here, whereas the
# other two models use 'target' - confirm the Molbert file really has a
# different column layout.
delaney_molbert_actual, delaney_molbert_pred = (
    delaney_molbert_test['target_raw'],
    delaney_molbert_test['pred_raw'],
)

In [27]:
# Calculating metrics
def _delaney_regression_metrics(y_true, y_pred):
    """Return MAE, RMSE, R2 and the Pearson correlation of y_pred against y_true."""
    return {
        "MAE": mean_absolute_error(y_true, y_pred),
        "RMSE": np.sqrt(mean_squared_error(y_true, y_pred)),
        "R2 Score": r2_score(y_true, y_pred),
        # pearsonr returns (coefficient, p-value); only the coefficient is kept
        "Correlation": pearsonr(y_true, y_pred)[0],
    }


# Display name of each base model -> (actual, predicted) test-set series.
_delaney_test_pairs = {
    "Chemberta2": (delaney_chemberta_actual, delaney_chemberta_pred),
    "Molformer": (delaney_molformer_actual, delaney_molformer_pred),
    "Molbert": (delaney_molbert_actual, delaney_molbert_pred),
}

delaney_metrics_results = {}
for model_name, (actual, pred) in _delaney_test_pairs.items():
    delaney_metrics_results[model_name] = _delaney_regression_metrics(actual, pred)

delaney_metrics_results

{'Chemberta2': {'MAE': 0.4756639858529221,
  'RMSE': 0.6352937523015375,
  'R2 Score': 0.8983541925476519,
  'Correlation': 0.9496737010935903},
 'Molformer': {'MAE': 0.5006743870707965,
  'RMSE': 0.6749164430905726,
  'R2 Score': 0.8852796879749746,
  'Correlation': 0.9549263564534224},
 'Molbert': {'MAE': 0.49171757511504427,
  'RMSE': 0.6436237764976567,
  'R2 Score': 0.8956711404586751,
  'Correlation': 0.9464944351824109}}

In [28]:
# Standardize the valid2 labels with the constants from the loading cell.
# The 'target' column of the Chemberta2 frame is used as the label for all
# three models.
delaney_y_ensemble_valid2 = (delaney_chemberta2_valid2['target'] - train_mean) / train_sd

# Standardized valid2 predictions ('pred_z') of each base model.
# NOTE: delaney_molformer_pred and delaney_molbert_pred were bound to the
# test-set 'pred_raw' columns in the cell that prepares actual/predicted
# values; they are rebound here, so re-running the metrics cell after this one
# would score different series.
delaney_chemberta2_pred = delaney_chemberta2_valid2['pred_z']
delaney_molformer_pred = delaney_molformer_valid2['pred_z']
delaney_molbert_pred = delaney_molbert_valid2['pred_z']

_valid2_pred_z = [delaney_chemberta2_pred, delaney_molformer_pred, delaney_molbert_pred]

# Ensemble design matrix: the three residuals (pred_z minus the standardized
# label) followed by the three pred_z columns themselves.
delaney_X_ensemble_valid2 = pd.concat(
    [pred_z - delaney_y_ensemble_valid2 for pred_z in _valid2_pred_z] + _valid2_pred_z,
    axis=1,
)

# Give every column a unique name (the concatenated series do not carry
# distinct names of their own).
delaney_X_ensemble_valid2.columns = [
    'residuals_chemberta',
    'residuals_molformer',
    'residuals_molbert',
    'chemberta',
    'molformer',
    'molbert',
]

In [29]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# For each base model, build on the valid2 split:
#   *_y_residual : pred_z minus the standardized label
#   *_X          : pred_z followed by that model's feature columns
#   scaler_*     : StandardScaler fitted on *_X (reused for the test split)
#   *_X_scaled   : *_X transformed by that scaler
#
# NOTE(review): the ChemBERTa feature frame drops its first two columns while
# the other two drop only the first - presumably leading index/identifier
# columns; verify against the feature CSVs.

# ChemBERTa
chemberta_y_residual = delaney_chemberta2_valid2['pred_z'] - delaney_y_ensemble_valid2
chemberta_X = pd.concat(
    [delaney_chemberta2_valid2['pred_z'], delaney_chemberta2_features_valid2.iloc[:, 2:]],
    axis='columns',
)
scaler_chemberta = StandardScaler()
chemberta_X_scaled = scaler_chemberta.fit_transform(chemberta_X)

# MolFormer
molformer_y_residual = delaney_molformer_valid2['pred_z'] - delaney_y_ensemble_valid2
molformer_X = pd.concat(
    [delaney_molformer_valid2['pred_z'], delaney_molformer_features_valid2.iloc[:, 1:]],
    axis='columns',
)
scaler_molformer = StandardScaler()
molformer_X_scaled = scaler_molformer.fit_transform(molformer_X)

# MolBERT
molbert_y_residual = delaney_molbert_valid2['pred_z'] - delaney_y_ensemble_valid2
molbert_X = pd.concat(
    [delaney_molbert_valid2['pred_z'], delaney_molbert_features_valid2.iloc[:, 1:]],
    axis='columns',
)
scaler_molbert = StandardScaler()
molbert_X_scaled = scaler_molbert.fit_transform(molbert_X)

In [30]:
import pandas as pd

# Write the valid2 (meta-model training) artefacts to ./processed_data.

# to_csv does not create missing parent directories, so make sure the output
# folder exists before the first write (no-op when it is already there).
os.makedirs('./processed_data', exist_ok=True)

# Standardized valid2 predictions ('pred_z') of each base model
delaney_chemberta2_pred.to_csv('./processed_data/delaney_chemberta_pred.csv', index=False)
delaney_molformer_pred.to_csv('./processed_data/delaney_molformer_pred.csv', index=False)
delaney_molbert_pred.to_csv('./processed_data/delaney_molbert_pred.csv', index=False)

# Scaled feature matrices: numpy arrays wrapped in DataFrames for export
# (columns are therefore written with positional integer headers)
pd.DataFrame(chemberta_X_scaled).to_csv('./processed_data/delaney_chemberta_X.csv', index=False)
pd.DataFrame(molformer_X_scaled).to_csv('./processed_data/delaney_molformer_X.csv', index=False)
pd.DataFrame(molbert_X_scaled).to_csv('./processed_data/delaney_molbert_X.csv', index=False)

# Residual targets (pred_z minus the standardized valid2 label) per base model
chemberta_y_residual.to_csv('./processed_data/delaney_chemberta_y.csv', index=False)
molformer_y_residual.to_csv('./processed_data/delaney_molformer_y.csv', index=False)
molbert_y_residual.to_csv('./processed_data/delaney_molbert_y.csv', index=False)

# Standardized valid2 labels used as the ensemble target
delaney_y_ensemble_valid2.to_csv('./processed_data/delaney_y_ensemble_valid2.csv', index=False)

In [31]:
# Test-split counterparts of the valid2 inputs: pred_z plus the model's
# feature columns, scaled with the scaler that was fitted on valid2.

# Standardized test-set predictions of each base model
delaney_chemberta2_pred_test = delaney_chemberta2_test['pred_z']
delaney_molformer_pred_test = delaney_molformer_test['pred_z']
delaney_molbert_pred_test = delaney_molbert_test['pred_z']

# ChemBERTa (same column slice as on valid2: features from the third column on)
chemberta_X_test = pd.concat(
    [delaney_chemberta2_pred_test, delaney_chemberta2_features_test.iloc[:, 2:]],
    axis='columns',
)
chemberta_X_test_scaled = scaler_chemberta.transform(chemberta_X_test)

# MolFormer (features from the second column on)
molformer_X_test = pd.concat(
    [delaney_molformer_pred_test, delaney_molformer_features_test.iloc[:, 1:]],
    axis='columns',
)
molformer_X_test_scaled = scaler_molformer.transform(molformer_X_test)

# MolBERT (features from the second column on)
molbert_X_test = pd.concat(
    [delaney_molbert_pred_test, delaney_molbert_features_test.iloc[:, 1:]],
    axis='columns',
)
molbert_X_test_scaled = scaler_molbert.transform(molbert_X_test)

In [32]:
import pandas as pd

# Export the standardized test-set predictions ('pred_z') of each base model
for _pred_series, _csv_path in (
    (delaney_chemberta2_pred_test, './processed_data/delaney_chemberta_pred_test.csv'),
    (delaney_molformer_pred_test, './processed_data/delaney_molformer_pred_test.csv'),
    (delaney_molbert_pred_test, './processed_data/delaney_molbert_pred_test.csv'),
):
    _pred_series.to_csv(_csv_path, index=False)

# Export the scaled test feature matrices (numpy arrays wrapped in DataFrames)
for _scaled_array, _csv_path in (
    (chemberta_X_test_scaled, './processed_data/delaney_chemberta_X_test.csv'),
    (molformer_X_test_scaled, './processed_data/delaney_molformer_X_test.csv'),
    (molbert_X_test_scaled, './processed_data/delaney_molbert_X_test.csv'),
):
    pd.DataFrame(_scaled_array).to_csv(_csv_path, index=False)

# Export the test labels for the ensemble (regression targets).
# NOTE(review): these are written exactly as read from the 'target' column,
# whereas the valid2 labels are standardized with train_mean / train_sd before
# export - confirm the downstream consumer expects unstandardized test labels.
delaney_y_ensemble_test = delaney_chemberta2_test['target']
delaney_y_ensemble_test.to_csv('./processed_data/delaney_y_ensemble_test.csv', index=False)