In [1]:
import pandas as pd
import os
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, average_precision_score, mean_squared_error, r2_score, mean_absolute_error
from scipy.stats import pearsonr

import numpy as np

# ---- Base-model predictions -------------------------------------------------
# valid2 split: serves as the training set of the stacking meta-model below.
(lipo_chemberta2_valid2,
 lipo_molformer_valid2,
 lipo_molbert_valid2) = (
    pd.read_csv(csv_path)
    for csv_path in (
        './chemberta2/results/lipo/chemberta2_valid2_lipo_2_predictions.csv',
        './molformer/results/lipo/molformer_valid2_lipo_2.csv',
        './molbert/results/lipo/molbert_valid2_lipo_2.csv',
    )
)

# test split: used only for the final evaluation of every model.
(lipo_chemberta2_test,
 lipo_molformer_test,
 lipo_molbert_test) = (
    pd.read_csv(csv_path)
    for csv_path in (
        './chemberta2/results/lipo/chemberta2_test_lipo_2_predictions.csv',
        './molformer/results/lipo/molformer_test_lipo_2.csv',
        './molbert/results/lipo/molbert_test_lipo_2.csv',
    )
)

# Constants used later to z-score the labels and to map z-scored predictions
# back to the raw scale.
# NOTE(review): presumably the mean / standard deviation of the training-split
# target -- confirm where these numbers were computed.
train_mean = 2.1799722222222226
train_sd = 1.2045431891731355

# ---- Auxiliary per-molecule features (valid2 / test) for each base model ----
(lipo_chemberta2_features_valid2,
 lipo_chemberta2_features_test) = (
    pd.read_csv(csv_path)
    for csv_path in (
        './chemberta2/features/lipo/chemberta2_valid2_lipo_2_features.csv',
        './chemberta2/features/lipo/chemberta2_test_lipo_2_features.csv',
    )
)

(lipo_molformer_features_valid2,
 lipo_molformer_features_test) = (
    pd.read_csv(csv_path)
    for csv_path in (
        './molformer/features/lipo/molformer_valid2_lipo_2_features.csv',
        './molformer/features/lipo/molformer_test_lipo_2_features.csv',
    )
)

(lipo_molbert_features_valid2,
 lipo_molbert_features_test) = (
    pd.read_csv(csv_path)
    for csv_path in (
        './molbert/features/lipo/molbert_valid2_lipo_2_features.csv',
        './molbert/features/lipo/molbert_test_lipo_2_features.csv',
    )
)

For Lipo (Regression)

In [2]:
# Test-split ground truth and raw-scale prediction for each base model,
# pulled out as (actual, pred) pairs for the metric computation that follows.
# NOTE(review): MolBERT's label is read from 'target_raw' whereas the other two
# frames use 'target' -- confirm both columns hold the label on the same scale.
lipo_chemberta_actual, lipo_chemberta_pred = (
    lipo_chemberta2_test['target'],
    lipo_chemberta2_test['pred_raw'],
)

lipo_molformer_actual, lipo_molformer_pred = (
    lipo_molformer_test['target'],
    lipo_molformer_test['pred_raw'],
)

lipo_molbert_actual, lipo_molbert_pred = (
    lipo_molbert_test['target_raw'],
    lipo_molbert_test['pred_raw'],
)

In [3]:
# Test-set regression metrics of each stand-alone base model, keyed by model name.
lipo_metrics_results = {}

for model_name, actual, pred in zip(
    ("Chemberta2", "Molformer", "Molbert"),
    (lipo_chemberta_actual, lipo_molformer_actual, lipo_molbert_actual),
    (lipo_chemberta_pred, lipo_molformer_pred, lipo_molbert_pred),
):
    lipo_metrics_results[model_name] = {
        "MAE": mean_absolute_error(actual, pred),
        "RMSE": np.sqrt(mean_squared_error(actual, pred)),
        "R2 Score": r2_score(actual, pred),
        # pearsonr returns (coefficient, p-value); keep the coefficient only
        "Correlation": pearsonr(actual, pred)[0],
    }

lipo_metrics_results

{'Chemberta2': {'MAE': 0.47080364152170984,
  'RMSE': 0.6194242164406615,
  'R2 Score': 0.725099924775132,
  'Correlation': 0.8597960575490352},
 'Molformer': {'MAE': 0.45518342900000003,
  'RMSE': 0.6064332745501879,
  'R2 Score': 0.7365097534650544,
  'Correlation': 0.867404233883593},
 'Molbert': {'MAE': 0.5100067769880953,
  'RMSE': 0.6507887204149233,
  'R2 Score': 0.6965560154119396,
  'Correlation': 0.8370118646634633}}

In [4]:
# z-score the valid2 labels with the training statistics so they live on the
# same scale as the base models' 'pred_z' outputs.
lipo_y_ensemble_valid2 = (lipo_chemberta2_valid2['target'] - train_mean) / train_sd

# Meta-model training frame: for each base model, its residual on valid2
# (z-scored prediction minus z-scored label) followed by the z-scored
# prediction itself. Passing a dict to concat names the columns directly.
#
# NOTE(review): the residual columns built here are the TRUE residuals, while
# the test-time frame fills the same columns with residuals PREDICTED by the
# per-model GroupLasso fits. Since 'chemberta' - 'residuals_chemberta' equals
# the label exactly, a meta-model trained on this frame can read the target off
# its inputs, so valid2 cross-validation losses are likely optimistic and the
# train/test feature distributions differ. Consider out-of-fold predicted
# residuals here -- confirm this is intended.
lipo_X_ensemble_valid2 = pd.concat(
    {
        'residuals_chemberta': lipo_chemberta2_valid2['pred_z'] - lipo_y_ensemble_valid2,
        'residuals_molformer': lipo_molformer_valid2['pred_z'] - lipo_y_ensemble_valid2,
        'residuals_molbert': lipo_molbert_valid2['pred_z'] - lipo_y_ensemble_valid2,
        'chemberta': lipo_chemberta2_valid2['pred_z'],
        'molformer': lipo_molformer_valid2['pred_z'],
        'molbert': lipo_molbert_valid2['pred_z'],
    },
    axis=1,
)

In [5]:
import numpy as np
import pandas as pd
from skglm import GroupLasso
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import pearsonr

# One residual model per base learner. Each GroupLasso is trained on valid2 to
# predict that learner's residual (z-scored prediction minus z-scored label)
# from its own z-scored prediction plus its auxiliary feature columns.
# Coefficients fall into two groups: [prediction] and [all auxiliary features].

# Regularisation strengths explored by every grid search in this cell
param_grid = {'alpha': np.logspace(-4, 1, 10)}

# ---------------------------- ChemBERTa-2 ----------------------------
chemberta_y_residual = lipo_chemberta2_valid2['pred_z'] - lipo_y_ensemble_valid2
# NOTE(review): the first two feature-file columns are skipped here but only
# the first one for the other two models -- confirm against the CSV layouts.
chemberta_X = pd.concat(
    [lipo_chemberta2_valid2['pred_z'], lipo_chemberta2_features_valid2.iloc[:, 2:]],
    axis=1,
)
scaler_chemberta = StandardScaler()
chemberta_X_scaled = scaler_chemberta.fit_transform(chemberta_X)
n_features_chemberta = chemberta_X_scaled.shape[1]
chemberta_groups = [[0], list(range(1, n_features_chemberta))]
group_lasso_chemberta = GroupLasso(groups=chemberta_groups, alpha=1.0, tol=0.0001)
cv_chemberta = GridSearchCV(
    estimator=group_lasso_chemberta,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
cv_chemberta.fit(chemberta_X_scaled, chemberta_y_residual)
best_model_chemberta = cv_chemberta.best_estimator_

# ----------------------------- MolFormer -----------------------------
molformer_y_residual = lipo_molformer_valid2['pred_z'] - lipo_y_ensemble_valid2
molformer_X = pd.concat(
    [lipo_molformer_valid2['pred_z'], lipo_molformer_features_valid2.iloc[:, 1:]],
    axis=1,
)
scaler_molformer = StandardScaler()
molformer_X_scaled = scaler_molformer.fit_transform(molformer_X)
n_features_molformer = molformer_X_scaled.shape[1]
molformer_groups = [[0], list(range(1, n_features_molformer))]
group_lasso_molformer = GroupLasso(groups=molformer_groups, alpha=1.0, tol=0.0001)
cv_molformer = GridSearchCV(
    estimator=group_lasso_molformer,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
cv_molformer.fit(molformer_X_scaled, molformer_y_residual)
best_model_molformer = cv_molformer.best_estimator_

# ------------------------------ MolBERT ------------------------------
molbert_y_residual = lipo_molbert_valid2['pred_z'] - lipo_y_ensemble_valid2
molbert_X = pd.concat(
    [lipo_molbert_valid2['pred_z'], lipo_molbert_features_valid2.iloc[:, 1:]],
    axis=1,
)
scaler_molbert = StandardScaler()
molbert_X_scaled = scaler_molbert.fit_transform(molbert_X)
n_features_molbert = molbert_X_scaled.shape[1]
molbert_groups = [[0], list(range(1, n_features_molbert))]
group_lasso_molbert = GroupLasso(groups=molbert_groups, alpha=1.0, tol=0.0001)
cv_molbert = GridSearchCV(
    estimator=group_lasso_molbert,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
cv_molbert.fit(molbert_X_scaled, molbert_y_residual)
best_model_molbert = cv_molbert.best_estimator_

In [6]:
# Build the test-time meta-features. For each base model: assemble the same
# [pred_z + auxiliary features] matrix as on valid2, scale it with the scaler
# fitted on valid2, and let the tuned GroupLasso predict the residual.

# ChemBERTa-2
chemberta_X_test = pd.concat(
    [lipo_chemberta2_test['pred_z'], lipo_chemberta2_features_test.iloc[:, 2:]],
    axis=1,
)
chemberta_X_test_scaled = scaler_chemberta.transform(chemberta_X_test)
y_pred_residuals_chemberta_test = best_model_chemberta.predict(chemberta_X_test_scaled)

# MolFormer
molformer_X_test = pd.concat(
    [lipo_molformer_test['pred_z'], lipo_molformer_features_test.iloc[:, 1:]],
    axis=1,
)
molformer_X_test_scaled = scaler_molformer.transform(molformer_X_test)
y_pred_residuals_molformer_test = best_model_molformer.predict(molformer_X_test_scaled)

# MolBERT
molbert_X_test = pd.concat(
    [lipo_molbert_test['pred_z'], lipo_molbert_features_test.iloc[:, 1:]],
    axis=1,
)
molbert_X_test_scaled = scaler_molbert.transform(molbert_X_test)
y_pred_residuals_molbert_test = best_model_molbert.predict(molbert_X_test_scaled)

# Predicted residuals of the three models, one column each
predicted_residuals_test = pd.DataFrame({
    'Residuals_Chemberta': y_pred_residuals_chemberta_test,
    'Residuals_Molformer': y_pred_residuals_molformer_test,
    'Residuals_Molbert': y_pred_residuals_molbert_test,
})

# Test meta-feature frame: predicted residuals first, then the z-scored base
# predictions, with the same column names/order as the valid2 training frame.
lipo_X_ensemble_test = pd.concat(
    [
        predicted_residuals_test,
        lipo_chemberta2_test['pred_z'],
        lipo_molformer_test['pred_z'],
        lipo_molbert_test['pred_z'],
    ],
    axis=1,
)
lipo_X_ensemble_test.columns = [
    'residuals_chemberta', 'residuals_molformer', 'residuals_molbert',
    'chemberta', 'molformer', 'molbert',
]

# Test labels, taken from the ChemBERTa-2 prediction file. They are not z-scored
# here; meta-model predictions are mapped back with train_sd / train_mean
# before being compared against them.
lipo_y_ensemble_test = lipo_chemberta2_test['target']

In [7]:
# Standardise the meta-features: statistics are estimated on valid2 only and
# then applied unchanged to the test frame.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(lipo_X_ensemble_valid2)

# Wrap the scaled arrays back into DataFrames so the column names survive.
lipo_X_ensemble_valid2_scaled = pd.DataFrame(
    scaler.transform(lipo_X_ensemble_valid2),
    columns=lipo_X_ensemble_valid2.columns,
)
lipo_X_ensemble_test_scaled = pd.DataFrame(
    scaler.transform(lipo_X_ensemble_test),
    columns=lipo_X_ensemble_test.columns,
)

In [8]:
# # export lipo_X_ensemble_valid2 and lipo_y_ensemble_valid2 to csv
# lipo_X_ensemble_valid2_scaled.to_csv('./processed_data/lipo_X_ensemble_valid2_scaled_aux_alt.csv', index=False)
# lipo_y_ensemble_valid2.to_csv('./processed_data/lipo_y_ensemble_valid2.csv', index=False)

# lipo_X_ensemble_test_scaled.to_csv('./processed_data/lipo_X_ensemble_test_scaled_aux_alt.csv', index=False)
# lipo_y_ensemble_test.to_csv('./processed_data/lipo_y_ensemble_test.csv', index=False)

In [9]:
# # lasso model
# from sklearn.linear_model import LassoCV
# from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# # Initialize the LassoCV model
# lasso_cv = LassoCV(cv=5, max_iter = 5000, random_state=0)

# # Fit the model
# lasso_cv.fit(lipo_X_ensemble_valid2_scaled, lipo_y_ensemble_valid2)

# # Predict the test set
# lipo_lasso_pred = lasso_cv.predict(lipo_X_ensemble_test_scaled) * train_sd + train_mean

# # Calculate the metrics
# lipo_lasso_metrics = {
#     "MAE": mean_absolute_error(lipo_y_ensemble_test, lipo_lasso_pred),
#     "RMSE": np.sqrt(mean_squared_error(lipo_y_ensemble_test, lipo_lasso_pred)),
#     "R2 Score": r2_score(lipo_y_ensemble_test, lipo_lasso_pred),
#     "Correlation": pearsonr(lipo_y_ensemble_test, lipo_lasso_pred)[0]
# }

# lipo_lasso_metrics

In [10]:
import numpy as np
from skglm import GroupLasso
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import pearsonr

# Group-lasso meta-model with two coefficient groups:
#   group 0 -> the three residual columns (indices 0, 1, 2)
#   group 1 -> every remaining column (the z-scored base predictions)
n_features = lipo_X_ensemble_valid2_scaled.shape[1]
groups = [[0, 1, 2], list(range(3, n_features))]

# Base estimator; its alpha is overridden by the grid search below.
group_lasso_model = GroupLasso(
    groups=groups, alpha=1.0, p0=10, verbose=0, tol=0.0001,
    positive=False, fit_intercept=True, warm_start=False,
)

# 5-fold grid search over alpha, scored by (negated) mean squared error
param_grid = {'alpha': np.logspace(-4, 1, 10)}
cv = GridSearchCV(
    estimator=group_lasso_model, param_grid=param_grid,
    scoring='neg_mean_squared_error', cv=5,
)
cv.fit(lipo_X_ensemble_valid2_scaled, lipo_y_ensemble_valid2)

best_model = cv.best_estimator_
print("Best alpha:", cv.best_params_['alpha'])

# The model predicts z-scored labels; undo the z-scoring before scoring
# against the raw-scale test labels.
lipo_pred = best_model.predict(lipo_X_ensemble_test_scaled) * train_sd + train_mean

lipo_two_groups_lasso_best_metrics = {
    "MAE": mean_absolute_error(lipo_y_ensemble_test, lipo_pred),
    "RMSE": np.sqrt(mean_squared_error(lipo_y_ensemble_test, lipo_pred)),
    "R2 Score": r2_score(lipo_y_ensemble_test, lipo_pred),
    # pearsonr returns (coefficient, p-value); keep the coefficient only
    "Correlation": pearsonr(lipo_y_ensemble_test, lipo_pred)[0],
}

lipo_two_groups_lasso_best_metrics

Best alpha: 0.0001


{'MAE': 0.4217230913485837,
 'RMSE': 0.5507903493858632,
 'R2 Score': 0.782644245400022,
 'Correlation': 0.8860127564024803}

In [11]:
# Elastic-net meta-model: 5-fold grid search over (alpha, l1_ratio), then a
# fresh fit on all of valid2 with the winning pair.
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV

# Deliberately small grid: four penalty strengths, three L1/L2 mixes
alphas = [0.01, 0.1, 1, 3]
l1_ratios = [0.1, 0.5, 0.9]
params = {'alpha': alphas, 'l1_ratio': l1_ratios}

elastic_net_model = ElasticNet(random_state=0, max_iter=5000)
grid_search = GridSearchCV(
    elastic_net_model, param_grid=params, cv=5, scoring='neg_mean_squared_error',
)
grid_search.fit(lipo_X_ensemble_valid2_scaled, lipo_y_ensemble_valid2)

lipo_best_elastic_params = grid_search.best_params_
print(lipo_best_elastic_params)

# Refit with the selected hyperparameters (the dict holds exactly alpha and l1_ratio)
lipo_best_elastic_model = ElasticNet(random_state=0, max_iter=5000, **lipo_best_elastic_params)
lipo_best_elastic_model.fit(lipo_X_ensemble_valid2_scaled, lipo_y_ensemble_valid2)

# Predictions are z-scored; map them back to the raw label scale
lipo_elastic_pred = lipo_best_elastic_model.predict(lipo_X_ensemble_test_scaled) * train_sd + train_mean

lipo_elastic_metrics = {
    "MAE": mean_absolute_error(lipo_y_ensemble_test, lipo_elastic_pred),
    "RMSE": np.sqrt(mean_squared_error(lipo_y_ensemble_test, lipo_elastic_pred)),
    "R2 Score": r2_score(lipo_y_ensemble_test, lipo_elastic_pred),
    "Correlation": pearsonr(lipo_y_ensemble_test, lipo_elastic_pred)[0],
}

lipo_elastic_metrics

{'alpha': 0.01, 'l1_ratio': 0.1}


{'MAE': 0.41634767999691596,
 'RMSE': 0.5468182920612624,
 'R2 Score': 0.7857678893414852,
 'Correlation': 0.8873293059936234}

In [12]:
# No feature selection is performed: the "_selected" names used by the models
# below are simply aliases of the scaled frames.
lipo_X_ensemble_valid2_selected, lipo_X_ensemble_test_selected = (
    lipo_X_ensemble_valid2_scaled,
    lipo_X_ensemble_test_scaled,
)

# Sanity check: one shape per line, valid2 first and test second
print(lipo_X_ensemble_valid2_selected.shape, lipo_X_ensemble_test_selected.shape, sep='\n')

(840, 6)
(420, 6)


In [13]:
# Support-vector regression meta-model with scikit-learn's default settings
from sklearn.svm import SVR

lipo_svr_model = SVR()
lipo_svr_model.fit(lipo_X_ensemble_valid2_selected, lipo_y_ensemble_valid2)

# Predictions are z-scored; map them back to the raw label scale
lipo_svr_pred = lipo_svr_model.predict(lipo_X_ensemble_test_selected) * train_sd + train_mean

lipo_svr_metrics = {
    "MAE": mean_absolute_error(lipo_y_ensemble_test, lipo_svr_pred),
    "RMSE": np.sqrt(mean_squared_error(lipo_y_ensemble_test, lipo_svr_pred)),
    "R2 Score": r2_score(lipo_y_ensemble_test, lipo_svr_pred),
    # pearsonr returns (coefficient, p-value); keep the coefficient only
    "Correlation": pearsonr(lipo_y_ensemble_test, lipo_svr_pred)[0],
}

lipo_svr_metrics

{'MAE': 0.42379238089263815,
 'RMSE': 0.5520445753646301,
 'R2 Score': 0.7816532199638626,
 'Correlation': 0.8867653060641965}

In [14]:
# Random-forest meta-model. Apart from the fixed seed it uses scikit-learn's
# default hyperparameters; no cross-validated tuning happens in this cell.
from sklearn.ensemble import RandomForestRegressor

lipo_rf_model = RandomForestRegressor(random_state=0)
lipo_rf_model.fit(lipo_X_ensemble_valid2_selected, lipo_y_ensemble_valid2)

# Predictions are z-scored; map them back to the raw label scale
lipo_rf_best_pred = lipo_rf_model.predict(lipo_X_ensemble_test_selected) * train_sd + train_mean

lipo_rf_best_metrics = {
    "MAE": mean_absolute_error(lipo_y_ensemble_test, lipo_rf_best_pred),
    "RMSE": np.sqrt(mean_squared_error(lipo_y_ensemble_test, lipo_rf_best_pred)),
    "R2 Score": r2_score(lipo_y_ensemble_test, lipo_rf_best_pred),
    # pearsonr returns (coefficient, p-value); keep the coefficient only
    "Correlation": pearsonr(lipo_y_ensemble_test, lipo_rf_best_pred)[0],
}

lipo_rf_best_metrics

{'MAE': 0.42175642857142864,
 'RMSE': 0.5538008231648769,
 'R2 Score': 0.7802617343941833,
 'Correlation': 0.884398409696019}

In [15]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer
import numpy as np
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.early_stop import no_progress_loss
import random

# Set seeds for reproducibility
# (NumPy's legacy global RNG and Python's `random` module; hyperopt itself is
# seeded separately through the `rstate` argument of fmin further down.)
np.random.seed(0)
random.seed(0)

# Define the hyperparameter space using continuous distributions
# The two quniform entries are sampled as floats; the objective casts them to int.
# NOTE(review): per the hyperopt docs, hp.quniform(label, low, high, q) returns
# round(uniform(low, high) / q) * q, so 'max_depth' with (3, 7, 2) yields even
# values (4 or 6) rather than 3 / 5 / 7 -- confirm this is the intended grid.
lipo_xgb_hyperopt_space = {
    'n_estimators': hp.quniform('n_estimators', 50, 200, 50),
    'max_depth': hp.quniform('max_depth', 3, 7, 2),
    'learning_rate': hp.uniform('learning_rate', 0.001, 0.3),
    'subsample': hp.uniform('subsample', 0.5, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0)
}

def rmse_scorer(y_true, y_pred):
    """Return the root-mean-squared error between `y_true` and `y_pred`.

    Intended to be wrapped with ``make_scorer(..., greater_is_better=False)``
    so that scikit-learn reports it as a negated score.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def objective(params):
    """Hyperopt objective for the XGBoost meta-model.

    Args:
        params: hyperparameter dict sampled from ``lipo_xgb_hyperopt_space``.
            ``n_estimators`` and ``max_depth`` arrive as floats (hp.quniform)
            and are cast to int in place.

    Returns:
        dict with ``'loss'`` (mean 5-fold cross-validated RMSE on the valid2
        meta-features) and ``'status'`` set to ``STATUS_OK``.
    """
    for int_key in ('n_estimators', 'max_depth'):
        params[int_key] = int(params[int_key])

    regressor = xgb.XGBRegressor(random_state=0, **params)

    # The scorer negates the RMSE (greater_is_better=False) ...
    negated_rmse = make_scorer(rmse_scorer, greater_is_better=False)
    fold_scores = cross_val_score(
        regressor,
        lipo_X_ensemble_valid2_selected,
        lipo_y_ensemble_valid2,
        scoring=negated_rmse,
        cv=5,
    )

    # ... so flip the sign back to obtain a positive RMSE to minimise.
    return {'loss': -fold_scores.mean(), 'status': STATUS_OK}

# TPE search over the XGBoost space: at most 50 evaluations, stopped early
# via no_progress_loss(10); rstate seeds hyperopt's own sampling.
trials = Trials()
lipo_xgb_best_params = fmin(
    fn=objective,
    space=lipo_xgb_hyperopt_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=np.random.default_rng(0),
    early_stop_fn=no_progress_loss(10),
)

print("Best hyperparameters:", lipo_xgb_best_params)


 50%|█████     | 25/50 [00:08<00:08,  3.02trial/s, best loss: 0.09354631611815283]
Best hyperparameters: {'colsample_bytree': 0.9558186054871323, 'learning_rate': 0.09634469734464615, 'max_depth': 4.0, 'n_estimators': 200.0, 'subsample': 0.8082925817075997}


In [16]:
# Final XGBoost meta-model, refit on all of valid2 with the best hyperparameters.
# fmin reports the quniform parameters as floats, so cast them to int in place.
lipo_xgb_best_params.update(
    n_estimators=int(lipo_xgb_best_params['n_estimators']),
    max_depth=int(lipo_xgb_best_params['max_depth']),
)

lipo_xgb_model = xgb.XGBRegressor(random_state=0, **lipo_xgb_best_params)
lipo_xgb_model.fit(lipo_X_ensemble_valid2_selected, lipo_y_ensemble_valid2)

# Predictions are z-scored; map them back to the raw label scale
lipo_xgb_best_pred = lipo_xgb_model.predict(lipo_X_ensemble_test_selected) * train_sd + train_mean

lipo_xgb_best_metrics = {
    "MAE": mean_absolute_error(lipo_y_ensemble_test, lipo_xgb_best_pred),
    "RMSE": np.sqrt(mean_squared_error(lipo_y_ensemble_test, lipo_xgb_best_pred)),
    "R2 Score": r2_score(lipo_y_ensemble_test, lipo_xgb_best_pred),
    # pearsonr returns (coefficient, p-value); keep the coefficient only
    "Correlation": pearsonr(lipo_y_ensemble_test, lipo_xgb_best_pred)[0],
}

lipo_xgb_best_metrics

{'MAE': 0.430040680374418,
 'RMSE': 0.5535197115013099,
 'R2 Score': 0.7804847579405618,
 'Correlation': 0.883861404237824}

In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import KFold
import numpy as np
from sklearn.metrics import mean_squared_error
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.early_stop import no_progress_loss

# Set seeds for reproducibility
# Seeds NumPy's legacy global RNG, Python's `random` module and torch's global RNG.
# NOTE: the import lines of this cell do not include `random`; it is imported by the
# earlier XGBoost tuning cell, so that cell has to run before this one.
np.random.seed(0)
random.seed(0)
torch.manual_seed(0)

class RMSELoss(nn.Module):
    """Root-mean-squared-error criterion: the square root of ``nn.MSELoss``."""

    def __init__(self):
        super().__init__()
        # Mean-reduced squared error; forward() takes its square root.
        self.mse = nn.MSELoss()

    def forward(self, y_pred, y_true):
        """Return sqrt(MSE(y_pred, y_true)) as a scalar tensor."""
        mean_squared = self.mse(y_pred, y_true)
        return torch.sqrt(mean_squared)

class SimpleNN(nn.Module):
    """Fully connected regression network with a single scalar output.

    The network is a stack of ``Linear -> ReLU -> Dropout`` blocks followed by
    a final ``Linear(num_neurons, 1)`` layer. At least one hidden block is
    always created, even when ``num_layers`` is smaller than 1.

    Args:
        input_size: number of input features.
        num_layers: number of hidden blocks (int).
        num_neurons: width of every hidden layer.
        dropout_rate: dropout probability applied after each ReLU.
    """

    def __init__(self, input_size, num_layers, num_neurons, dropout_rate):
        super().__init__()

        modules = []
        in_features = input_size
        # Linear layers are created in input-to-output order, which fixes the
        # order in which their weights are initialised.
        for _ in range(max(num_layers, 1)):
            modules.extend([
                nn.Linear(in_features, num_neurons),
                nn.ReLU(),
                nn.Dropout(dropout_rate),
            ])
            in_features = num_neurons
        modules.append(nn.Linear(in_features, 1))

        self.layers = nn.Sequential(*modules)

    def forward(self, x):
        """Map a (batch, input_size) tensor to a (batch, 1) tensor."""
        return self.layers(x)

# Hyperparameter space with hp.quniform for integer distribution
# 'num_layers' and 'num_neurons' are sampled with step q=1 but arrive as floats;
# the objective below casts them to int. The learning rate is sampled
# log-uniformly between 1e-4 and 1e-2, the dropout rate uniformly in [0, 0.5].
space = {
    'num_layers': hp.quniform('num_layers', 1, 5, 1),
    'num_neurons': hp.quniform('num_neurons', 16, 256, 1),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.0001), np.log(0.01)),
    'dropout_rate': hp.uniform('dropout_rate', 0.0, 0.5)
}

# Global dataset variables assumed to be defined externally
# The objective below reads these module-level names directly: the scaled valid2
# meta-features (X) and the z-scored valid2 labels (y).
X = lipo_X_ensemble_valid2_selected
y = lipo_y_ensemble_valid2

def objective(params):
    """Hyperopt objective for the neural-network meta-model.

    Replaces the XGBoost objective of the same name defined in an earlier cell.
    Reads the module-level ``X`` (DataFrame) and ``y`` (Series).

    Args:
        params: hyperparameter dict sampled from ``space``. ``num_layers`` and
            ``num_neurons`` arrive as floats and are cast to int in place.

    Returns:
        dict with ``'loss'`` (validation RMSE averaged over 5 unshuffled
        KFold splits, each trained for 100 epochs) and ``'status'`` set to
        ``STATUS_OK``.
    """
    for int_key in ('num_layers', 'num_neurons'):
        params[int_key] = int(params[int_key])

    fold_rmses = []
    for train_index, val_index in KFold(n_splits=5).split(X):
        X_fit, X_holdout = X.iloc[train_index], X.iloc[val_index]
        y_fit, y_holdout = y.iloc[train_index], y.iloc[val_index]

        # pandas -> float32 numpy -> tensors; targets get a trailing axis so
        # they match the (batch, 1) network output
        fit_features = torch.tensor(X_fit.values.astype(np.float32))
        fit_targets = torch.tensor(y_fit.values.astype(np.float32)).unsqueeze(1)
        batches = DataLoader(TensorDataset(fit_features, fit_targets),
                             batch_size=32, shuffle=True)

        net = SimpleNN(
            input_size=X_fit.shape[1],
            num_layers=params['num_layers'],
            num_neurons=params['num_neurons'],
            dropout_rate=params['dropout_rate'],
        )
        loss_fn = RMSELoss()
        adam = optim.Adam(net.parameters(), lr=params['learning_rate'])

        net.train()
        for _epoch in range(100):
            for batch_x, batch_y in batches:
                adam.zero_grad()
                batch_loss = loss_fn(net(batch_x), batch_y)
                batch_loss.backward()
                adam.step()

        # Score the held-out fold with dropout disabled and no gradient tracking
        net.eval()
        with torch.no_grad():
            holdout_pred = net(torch.tensor(X_holdout.values.astype(np.float32))).squeeze(1)
            holdout_true = torch.tensor(y_holdout.values.astype(np.float32))
            fold_rmses.append(
                np.sqrt(mean_squared_error(holdout_true.numpy(), holdout_pred.numpy()))
            )

    # Mean RMSE across folds is the quantity hyperopt minimises
    return {'loss': np.mean(fold_rmses), 'status': STATUS_OK}

# TPE search over the network space: at most 50 evaluations, stopped early
# via no_progress_loss(10); rstate seeds hyperopt's own sampling.
trials = Trials()
lipo_nn_best_params = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=np.random.default_rng(0),
    early_stop_fn=no_progress_loss(10),
)

print("Best hyperparameters:", lipo_nn_best_params)


 62%|██████▏   | 31/50 [05:01<03:04,  9.74s/trial, best loss: 0.023755010217428207]
Best hyperparameters: {'dropout_rate': 0.004317426856301848, 'learning_rate': 0.0015656160965808995, 'num_layers': 4.0, 'num_neurons': 207.0}


In [18]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

# Re-seed torch's global RNG before the final model is built and trained in this cell.
torch.manual_seed(0)

# Same architecture as the SimpleNN used during the hyperparameter search,
# declared again in this cell.
class SimpleNN(nn.Module):
    """Fully connected regression network with a single scalar output.

    Hidden part: ``max(num_layers, 1)`` blocks of ``Linear -> ReLU -> Dropout``,
    all of width ``num_neurons``; head: ``Linear(num_neurons, 1)``.

    Args:
        input_size: number of input features.
        num_layers: number of hidden blocks (int).
        num_neurons: width of every hidden layer.
        dropout_rate: dropout probability applied after each ReLU.
    """

    def __init__(self, input_size, num_layers, num_neurons, dropout_rate):
        super().__init__()

        def hidden_block(fan_in):
            # One Linear -> ReLU -> Dropout unit of the hidden stack
            return [nn.Linear(fan_in, num_neurons), nn.ReLU(), nn.Dropout(dropout_rate)]

        # The first block maps the inputs to the hidden width; further blocks
        # (if any) keep that width. Creation order is input-to-output.
        stack = hidden_block(input_size)
        for _ in range(1, num_layers):
            stack.extend(hidden_block(num_neurons))
        stack.append(nn.Linear(num_neurons, 1))

        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Map a (batch, input_size) tensor to a (batch, 1) tensor."""
        return self.layers(x)

def compute_rmse(y_true, y_pred):
    """Return the root-mean-squared error between `y_true` and `y_pred`."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# fmin reports the quniform parameters as floats: cast the two size parameters
# to int and carry the two continuous ones over unchanged.
lipo_nn_best_params = {
    **{size_key: int(lipo_nn_best_params[size_key])
       for size_key in ('num_layers', 'num_neurons')},
    'dropout_rate': lipo_nn_best_params['dropout_rate'],
    'learning_rate': lipo_nn_best_params['learning_rate'],
}

# Training data: all of valid2 as float32 tensors, mini-batched and shuffled.
# Targets get a trailing axis so they match the (batch, 1) network output.
X_train_tensor = torch.tensor(lipo_X_ensemble_valid2_selected.values.astype(np.float32))
y_train_tensor = torch.tensor(lipo_y_ensemble_valid2.values.astype(np.float32)).unsqueeze(1)
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Test data (the labels here are on the raw, un-standardised scale)
X_test_tensor = torch.tensor(lipo_X_ensemble_test_selected.values.astype(np.float32))
y_test_tensor = torch.tensor(lipo_y_ensemble_test.values.astype(np.float32)).unsqueeze(1)

# Network, loss and optimiser configured from the tuned hyperparameters
model = SimpleNN(
    input_size=lipo_X_ensemble_valid2_selected.shape[1],
    num_layers=lipo_nn_best_params['num_layers'],
    num_neurons=lipo_nn_best_params['num_neurons'],
    dropout_rate=lipo_nn_best_params['dropout_rate'],
)
criterion = RMSELoss()
optimizer = optim.Adam(model.parameters(), lr=lipo_nn_best_params['learning_rate'])

# Train for a fixed 100 epochs (same budget as during the hyperparameter search)
model.train()
for epoch in range(100):
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        loss = criterion(model(inputs), targets)
        loss.backward()
        optimizer.step()

# Evaluate on the test set with dropout disabled and no gradient tracking
model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)

# The network predicts z-scored labels; map them back to the raw label scale
predictions = outputs.squeeze(1).numpy() * train_sd + train_mean
y_test_np = y_test_tensor.numpy()

mae = mean_absolute_error(y_test_np, predictions)
rmse = compute_rmse(y_test_np, predictions)
r2 = r2_score(y_test_np, predictions)
# pearsonr needs 1-D inputs, hence the squeeze of the (n, 1) label array
correlation, _ = pearsonr(y_test_np.squeeze(1), predictions)

lipo_nn_metrics = {
    'MAE': mae,
    'RMSE': rmse,
    'R2 Score': r2,
    'Correlation': correlation,
}

lipo_nn_metrics

{'MAE': 0.4214834,
 'RMSE': 0.55144894,
 'R2 Score': 0.7821241617202759,
 'Correlation': 0.8860915823682385}

In [19]:
# Gather the test metrics of every meta-model next to the base-model metrics
# already stored in lipo_metrics_results.
#
# The plain-LASSO cell is commented out in this notebook, so `lipo_lasso_metrics`
# does not exist on a fresh "Restart & Run All"; referencing it unconditionally
# raised a NameError here. Its row is added only when that cell has been
# re-enabled and executed.
if "lipo_lasso_metrics" in globals():
    lipo_metrics_results["LASSO"] = lipo_lasso_metrics
lipo_metrics_results["Group Lasso (2 groups)"] = lipo_two_groups_lasso_best_metrics
lipo_metrics_results["Elastic Net"] = lipo_elastic_metrics
lipo_metrics_results["SVR"] = lipo_svr_metrics
lipo_metrics_results["Random Forest"] = lipo_rf_best_metrics
lipo_metrics_results["XGBoost"] = lipo_xgb_best_metrics
lipo_metrics_results["Neural Network"] = lipo_nn_metrics

# One row per model, one column per metric, rounded to 3 decimal places
lipo_metrics_df = pd.DataFrame(lipo_metrics_results).T
lipo_metrics_df = lipo_metrics_df.round(3)

# Export the table to CSV (the model names are written as the index column)
lipo_metrics_df.to_csv('./split2_lipo_metrics_aux_alt.csv')