In [6]:
import pandas as pd
import os
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, average_precision_score, mean_squared_error, r2_score, mean_absolute_error
from scipy.stats import pearsonr

import numpy as np

# ---- Base-model outputs for BACE (split 2) ----
# For every base model four CSVs are read, in this order:
#   1. predictions on valid2 (these rows become the meta-model's training set)
#   2. predictions on the test split
#   3. exported features for valid2
#   4. exported features for the test split

# Chemberta2
bace_chemberta2_valid2 = pd.read_csv('./chemberta2/results/bace/chemberta2_valid2_bace_2_predictions.csv')
bace_chemberta2_test = pd.read_csv('./chemberta2/results/bace/chemberta2_test_bace_2_predictions.csv')
bace_chemberta2_features_valid2 = pd.read_csv('./chemberta2/features/bace/chemberta2_valid2_bace_2_features.csv')
bace_chemberta2_features_test = pd.read_csv('./chemberta2/features/bace/chemberta2_test_bace_2_features.csv')

# Molformer
bace_molformer_valid2 = pd.read_csv('./molformer/results/bace/molformer_valid2_bace_2_epoch49.csv')
bace_molformer_test = pd.read_csv('./molformer/results/bace/molformer_test_bace_2_epoch49.csv')
bace_molformer_features_valid2 = pd.read_csv('./molformer/features/bace/molformer_valid2_bace_2_features.csv')
bace_molformer_features_test = pd.read_csv('./molformer/features/bace/molformer_test_bace_2_features.csv')

# Molbert
bace_molbert_valid2 = pd.read_csv('./molbert/results/bace/molbert_valid2_bace_2.csv')
bace_molbert_test = pd.read_csv('./molbert/results/bace/molbert_test_bace_2.csv')
bace_molbert_features_valid2 = pd.read_csv('./molbert/features/bace/molbert_valid2_bace_2_features.csv')
bace_molbert_features_test = pd.read_csv('./molbert/features/bace/molbert_test_bace_2_features.csv')

For BACE (Classification)

In [7]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score

# Per-model test-set inputs: true labels, hard predictions and class probabilities.
# Each model's CSV uses its own column names, so they are mapped explicitly here.

# Chemberta2: labels in 'Class', hard predictions in 'y_pred', softmax outputs per class.
bace_chemberta_actual = bace_chemberta2_test['Class']
bace_chemberta_pred = bace_chemberta2_test['y_pred']
bace_chemberta_probs = bace_chemberta2_test[['softmax_class_0_prob', 'softmax_class_1_prob']]

# Molformer: no hard-prediction column is read, so the class-1 probability
# is thresholded at 0.5 to obtain one.
bace_molformer_actual = bace_molformer_test['Actual']
bace_molformer_pred = (bace_molformer_test['Prob_Class_1'] > 0.5).astype(int)
bace_molformer_probs = bace_molformer_test[['Prob_Class_0', 'Prob_Class_1']]

# Molbert: a single 'prob' column is used as the positive-class score.
bace_molbert_actual = bace_molbert_test['target']
bace_molbert_pred = bace_molbert_test['pred']
bace_molbert_probs = bace_molbert_test['prob']


def _bace_classification_scores(actual, pred, probs):
    """Score one model: accuracy/F1 from hard labels, ROC-AUC/PR-AUC from scores.

    `actual` holds the true labels, `pred` the hard class predictions and
    `probs` the positive-class score used for the two ranking metrics.
    """
    return {
        "Accuracy": accuracy_score(actual, pred),
        "F1 Score": f1_score(actual, pred),
        "ROC-AUC": roc_auc_score(actual, probs),
        "PR-AUC": average_precision_score(actual, probs)
    }


# (display name, labels, hard predictions, positive-class score) for each base model
_bace_model_outputs = [
    ("Chemberta2", bace_chemberta_actual, bace_chemberta_pred, bace_chemberta_probs['softmax_class_1_prob']),
    ("Molformer", bace_molformer_actual, bace_molformer_pred, bace_molformer_probs['Prob_Class_1']),
    ("Molbert", bace_molbert_actual, bace_molbert_pred, bace_molbert_probs),
]

# Metrics keyed by model name
bace_metrics_results = {}
for model_name, actual, pred, probs in _bace_model_outputs:
    bace_metrics_results[model_name] = _bace_classification_scores(actual, pred, probs)

bace_metrics_results

{'Chemberta2': {'Accuracy': 0.6535947712418301,
  'F1 Score': 0.6442953020134228,
  'ROC-AUC': 0.7465949820788532,
  'PR-AUC': 0.8058944338111258},
 'Molformer': {'Accuracy': 0.6797385620915033,
  'F1 Score': 0.6797385620915033,
  'ROC-AUC': 0.8521505376344086,
  'PR-AUC': 0.8774798970672688},
 'Molbert': {'Accuracy': 0.7189542483660131,
  'F1 Score': 0.7361963190184049,
  'ROC-AUC': 0.775089605734767,
  'PR-AUC': 0.8548501462313669}}

In [8]:
# Sanity check: print (rows, columns) of the three valid2 prediction frames,
# then of the three valid2 feature frames, one shape per line.
for _valid2_frame in (
    bace_chemberta2_valid2,
    bace_molformer_valid2,
    bace_molbert_valid2,
    bace_chemberta2_features_valid2,
    bace_molformer_features_valid2,
    bace_molbert_features_valid2,
):
    print(_valid2_frame.shape)

(305, 8)
(305, 5)
(305, 4)
(305, 386)
(305, 769)
(305, 769)


In [9]:
import pandas as pd
import numpy as np

# Ensemble target: the 'Class' column of the Chemberta2 valid2 predictions.
bace_y_ensemble_valid2 = bace_chemberta2_valid2['Class']

# Same target with a fresh 0..n-1 index so it lines up with the frames built below.
bace_y_ensemble_valid2_s = pd.Series(bace_y_ensemble_valid2).reset_index(drop=True)

# One single-column frame per base model holding its class-1 probability on valid2.
bace_chemberta2_prob = (
    bace_chemberta2_valid2['softmax_class_1_prob'].to_frame(name='chemberta2').reset_index(drop=True)
)
bace_molformer_prob = (
    bace_molformer_valid2['Prob_Class_1'].to_frame(name='molformer').reset_index(drop=True)
)
bace_molbert_prob = (
    bace_molbert_valid2['prob'].to_frame(name='molbert').reset_index(drop=True)
)

# Feature matrices on valid2. The leading columns are skipped (two for Chemberta2,
# one for Molformer and Molbert) — presumably identifier/label columns rather than
# features; TODO confirm against the feature CSVs.
bace_chemberta2_features = bace_chemberta2_features_valid2.iloc[:, 2:].reset_index(drop=True)
bace_molformer_features = bace_molformer_features_valid2.iloc[:, 1:].reset_index(drop=True)
bace_molbert_features = bace_molbert_features_valid2.iloc[:, 1:].reset_index(drop=True)

# The three feature frames are intentionally kept separate (not concatenated).

# Side-by-side class-1 probabilities of the three models, one column per model.
train_bace_prob = pd.concat(
    [bace_chemberta2_prob, bace_molformer_prob, bace_molbert_prob],
    axis=1,
)

# Function to calculate BCE for each row
def calculate_bce_rowwise(y_true, y_pred, eps=1e-15):
    """Return the element-wise binary cross-entropy of `y_pred` against `y_true`.

    Computes ``-(y * log(p) + (1 - y) * log(1 - p))`` for every element, with no
    averaging, so the result has one loss value per row.

    Parameters
    ----------
    y_true : array-like (e.g. pandas Series or numpy array)
        Binary labels (0 or 1).
    y_pred : array-like
        Predicted probability of the positive class.
    eps : float, default 1e-15
        Probabilities are clipped to ``[eps, 1 - eps]`` before taking logs.
        Without clipping, a probability of exactly 0 or 1 produces ``log(0)``,
        which yields ``inf`` for a confidently wrong prediction and ``NaN``
        (``0 * -inf``) for a confidently right one. Probabilities already
        inside the interval are left unchanged, so their loss is unaffected.
        The default assumes float64 inputs; for lower-precision inputs pass a
        larger `eps`.

    Returns
    -------
    Same array-like kind as produced by the arithmetic on the inputs
    (a Series when `y_true` is a Series), holding finite, non-negative losses.
    """
    # Keep probabilities strictly inside (0, 1) so both logarithms are finite.
    y_pred = np.clip(y_pred, eps, 1 - eps)
    return -(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

# Per-row BCE of each base model's class-1 probability against the valid2 labels
bce_chemberta = calculate_bce_rowwise(bace_y_ensemble_valid2_s, bace_chemberta2_prob['chemberta2'])
bce_molformer = calculate_bce_rowwise(bace_y_ensemble_valid2_s, bace_molformer_prob['molformer'])
bce_molbert = calculate_bce_rowwise(bace_y_ensemble_valid2_s, bace_molbert_prob['molbert'])

# Collect the three loss vectors as columns of one frame
bce_loss_df = pd.DataFrame(dict(
    bce_chemberta=bce_chemberta,
    bce_molformer=bce_molformer,
    bce_molbert=bce_molbert,
))

# Meta-model design matrix on valid2: the three BCE-loss columns followed by the
# three class-1 probability columns (the raw features are NOT part of this matrix).
bace_X_ensemble_valid2 = pd.concat(
    [bce_loss_df, train_bace_prob],
    axis=1,
)

In [10]:
import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# Seed passed as random_state to the three LASSO regressors below
seed = 0

# Only alpha is tuned (l1_ratio stays fixed at 1): 10 log-spaced values in [1e-4, 1e1]
param_grid = {
    'alpha': np.logspace(-4, 1, 10)
}


def _new_lasso():
    """Return an unfitted ElasticNet with l1_ratio=1.0, i.e. a pure-L1 (LASSO) regressor."""
    return ElasticNet(max_iter=10000, tol=0.0001, random_state=seed, l1_ratio=1.0)


def _new_alpha_search(estimator):
    """Wrap `estimator` in a 5-fold grid search over `param_grid`, scored by negative MSE."""
    return GridSearchCV(estimator=estimator, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)


# For each base model: inputs = its class-1 probability plus its own features
# (standardised), target = its row-wise BCE loss on valid2. The tuned regressor
# therefore learns to predict how wrong that base model is on a given row.
#
# NOTE(review): the recorded output of this cell is a long run of
# `cd_fast.enet_coordinate_descent` warning lines — presumably scikit-learn
# ConvergenceWarnings for some alpha values; confirm, and consider a larger
# max_iter or a narrower alpha grid.

# --- Chemberta2 ---
chemberta_X = pd.concat([bace_chemberta2_prob, bace_chemberta2_features], axis=1)
scaler_chemberta = StandardScaler().fit(chemberta_X)
chemberta_X_scaled = scaler_chemberta.transform(chemberta_X)
chemberta_y_bce = bce_chemberta  # row-wise BCE computed earlier
lasso_chemberta = _new_lasso()
cv_chemberta = _new_alpha_search(lasso_chemberta)
cv_chemberta.fit(chemberta_X_scaled, chemberta_y_bce)
best_model_chemberta = cv_chemberta.best_estimator_

# --- Molformer ---
molformer_X = pd.concat([bace_molformer_prob, bace_molformer_features], axis=1)
scaler_molformer = StandardScaler().fit(molformer_X)
molformer_X_scaled = scaler_molformer.transform(molformer_X)
molformer_y_bce = bce_molformer  # row-wise BCE computed earlier
lasso_molformer = _new_lasso()
cv_molformer = _new_alpha_search(lasso_molformer)
cv_molformer.fit(molformer_X_scaled, molformer_y_bce)
best_model_molformer = cv_molformer.best_estimator_

# --- Molbert ---
molbert_X = pd.concat([bace_molbert_prob, bace_molbert_features], axis=1)
scaler_molbert = StandardScaler().fit(molbert_X)
molbert_X_scaled = scaler_molbert.transform(molbert_X)
molbert_y_bce = bce_molbert  # row-wise BCE computed earlier
lasso_molbert = _new_lasso()
cv_molbert = _new_alpha_search(lasso_molbert)
cv_molbert.fit(molbert_X_scaled, molbert_y_bce)
best_model_molbert = cv_molbert.best_estimator_

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

In [11]:
import numpy as np
from sklearn.metrics import log_loss

# Build the meta-model's test design matrix.
# True BCE losses cannot be used at test time (they require the labels), so each
# model's loss is *predicted* by the LASSO regressor that was fitted on valid2.

# Class-1 probability of each base model on the test split, one single-column frame each
bace_chemberta2_prob_test = pd.DataFrame(
    {'chemberta2': bace_chemberta2_test['softmax_class_1_prob']}
).reset_index(drop=True)

bace_molformer_prob_test = pd.DataFrame(
    {'molformer': bace_molformer_test['Prob_Class_1']}
).reset_index(drop=True)

bace_molbert_prob_test = pd.DataFrame(
    {'molbert': bace_molbert_test['prob']}
).reset_index(drop=True)

# Test features, skipping the same leading columns as on valid2.
# The index reset is applied to the sliced `_t` frames that are concatenated
# below (mirroring the valid2 preparation); the loaded `*_features_test` frames
# are no longer modified in place, so re-running this cell is side-effect free.
bace_chemberta2_features_t = bace_chemberta2_features_test.iloc[:, 2:].reset_index(drop=True)
bace_molformer_features_t = bace_molformer_features_test.iloc[:, 1:].reset_index(drop=True)
bace_molbert_features_t = bace_molbert_features_test.iloc[:, 1:].reset_index(drop=True)

# Combine probabilities with the respective feature sets for the test set
# (same column layout as the matrices the scalers were fitted on)
chemberta_X_test = pd.concat([bace_chemberta2_prob_test, bace_chemberta2_features_t], axis=1)
molformer_X_test = pd.concat([bace_molformer_prob_test, bace_molformer_features_t], axis=1)
molbert_X_test = pd.concat([bace_molbert_prob_test, bace_molbert_features_t], axis=1)

# Standardize the test set with the scalers fitted on valid2 (no re-fitting on test)
chemberta_X_test_scaled = scaler_chemberta.transform(chemberta_X_test)
molformer_X_test_scaled = scaler_molformer.transform(molformer_X_test)
molbert_X_test_scaled = scaler_molbert.transform(molbert_X_test)

# Predicted row-wise BCE of each base model, from the best LASSO models tuned on valid2
chemberta_pred_test = best_model_chemberta.predict(chemberta_X_test_scaled)
molformer_pred_test = best_model_molformer.predict(molformer_X_test_scaled)
molbert_pred_test = best_model_molbert.predict(molbert_X_test_scaled)

# Convert the predictions (numpy arrays) to pandas Series
chemberta_pred_test_series = pd.Series(chemberta_pred_test, name='bce_chemberta')
molformer_pred_test_series = pd.Series(molformer_pred_test, name='bce_molformer')
molbert_pred_test_series = pd.Series(molbert_pred_test, name='bce_molbert')

# Predicted BCE columns first, then the raw test probabilities
bace_X_ensemble_test = pd.concat([
    chemberta_pred_test_series,                     # predicted BCE for Chemberta
    molformer_pred_test_series,                     # predicted BCE for Molformer
    molbert_pred_test_series,                       # predicted BCE for Molbert
    bace_chemberta2_prob_test['chemberta2'],        # Chemberta test probabilities
    bace_molformer_prob_test['molformer'],          # Molformer test probabilities
    bace_molbert_prob_test['molbert']               # Molbert test probabilities
], axis=1)

# Explicit column names for the six columns assembled above
bace_X_ensemble_test.columns = ['bce_chemberta', 'bce_molformer', 'bce_molbert', 'chemberta2', 'molformer', 'molbert']

# Test labels (optional, used only for evaluation)
bace_y_ensemble_test = bace_chemberta2_test['Class']

In [12]:
# use standard scaler
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the valid2 ensemble matrix only, then apply it to both splits.
scaler = StandardScaler().fit(bace_X_ensemble_valid2)

# The scaler output is wrapped back into DataFrames so the column names
# of the unscaled matrices are kept.
bace_X_ensemble_valid2_scaled = pd.DataFrame(
    scaler.transform(bace_X_ensemble_valid2),
    columns=bace_X_ensemble_valid2.columns,
)
bace_X_ensemble_test_scaled = pd.DataFrame(
    scaler.transform(bace_X_ensemble_test),
    columns=bace_X_ensemble_test.columns,
)

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score

# Meta-model: L1-penalised (LASSO) logistic regression, fitted with the saga solver
lasso_model = LogisticRegression(penalty='l1', solver='saga', max_iter=5000, random_state=0)

# Small grid of regularisation strengths; LogisticRegression is parameterised by
# C, which the original notebook treats as the inverse of alpha, so each alpha
# is converted to C = 1 / alpha.
alphas = [0.01, 0.1, 1, 3]
Cs = [1 / strength for strength in alphas]
params = {'C': Cs}

# 5-fold cross-validated search over C, selecting by ROC-AUC on the valid2 ensemble matrix
grid_search = GridSearchCV(lasso_model, param_grid=params, cv=5, scoring='roc_auc')
grid_search.fit(bace_X_ensemble_valid2_scaled, bace_y_ensemble_valid2)

# Keep the refitted best estimator and show which C won
bace_best_lasso_model = grid_search.best_estimator_
print(grid_search.best_params_)

# Test-set outputs: hard labels, and the predict_proba column at index 1 as the score
bace_lasso_pred = bace_best_lasso_model.predict(bace_X_ensemble_test_scaled)
bace_lasso_probs = bace_best_lasso_model.predict_proba(bace_X_ensemble_test_scaled)[:, 1]

# Same four metrics as reported for the individual base models
bace_lasso_metrics = {}
bace_lasso_metrics["Accuracy"] = accuracy_score(bace_y_ensemble_test, bace_lasso_pred)
bace_lasso_metrics["F1 Score"] = f1_score(bace_y_ensemble_test, bace_lasso_pred)
bace_lasso_metrics["ROC-AUC"] = roc_auc_score(bace_y_ensemble_test, bace_lasso_probs)
bace_lasso_metrics["PR-AUC"] = average_precision_score(bace_y_ensemble_test, bace_lasso_probs)

bace_lasso_metrics

{'C': 0.3333333333333333}


{'Accuracy': 0.6862745098039216,
 'F1 Score': 0.68,
 'ROC-AUC': 0.829032258064516,
 'PR-AUC': 0.8844576341121501}

In [14]:
# Report all the metrics for BACE: add the ensemble's scores next to the base models'
bace_metrics_results["Lasso"] = bace_lasso_metrics

# One row per model, one column per metric, rounded to 3 decimal places
bace_metrics_df = pd.DataFrame(bace_metrics_results).transpose().round(3)

# Export as csv
bace_metrics_df.to_csv('./split2_bace_metrics_lasso.csv')