In [1]:
import pandas as pd
import os
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, average_precision_score, mean_squared_error, r2_score, mean_absolute_error
from scipy.stats import pearsonr

import numpy as np

# Inputs for the FreeSolv meta-model, grouped by base model.
# "valid2" is the split the meta-model is trained on; "test" is the split each
# model is scored on. Every base model contributes a predictions file and a
# features file for both splits.

# ChemBERTa-2
freesolv_chemberta2_valid2 = pd.read_csv('./chemberta2/results/freesolv/chemberta2_valid2_freesolv_2_predictions.csv')
freesolv_chemberta2_test = pd.read_csv('./chemberta2/results/freesolv/chemberta2_test_freesolv_2_predictions.csv')
freesolv_chemberta2_features_valid2 = pd.read_csv('./chemberta2/features/freesolv/chemberta2_valid2_freesolv_2_features.csv')
freesolv_chemberta2_features_test = pd.read_csv('./chemberta2/features/freesolv/chemberta2_test_freesolv_2_features.csv')

# MoLFormer
freesolv_molformer_valid2 = pd.read_csv('./molformer/results/freesolv/molformer_valid2_freesolv_2_99.csv')
freesolv_molformer_test = pd.read_csv('./molformer/results/freesolv/molformer_test_freesolv_2_99.csv')
freesolv_molformer_features_valid2 = pd.read_csv('./molformer/features/freesolv/molformer_valid2_freesolv_2_features.csv')
freesolv_molformer_features_test = pd.read_csv('./molformer/features/freesolv/molformer_test_freesolv_2_features.csv')

# MolBERT
freesolv_molbert_valid2 = pd.read_csv('./molbert/results/freesolv/molbert_valid2_freesolv_2.csv')
freesolv_molbert_test = pd.read_csv('./molbert/results/freesolv/molbert_test_freesolv_2.csv')
freesolv_molbert_features_valid2 = pd.read_csv('./molbert/features/freesolv/molbert_valid2_freesolv_2_features.csv')
freesolv_molbert_features_test = pd.read_csv('./molbert/features/freesolv/molbert_test_freesolv_2_features.csv')

# Constants used below to standardise targets and to map standardised
# predictions back to raw units.
# NOTE(review): presumably the mean / standard deviation of the base-model
# training targets for this split -- confirm where these values come from.
train_mean = -3.9326753246753245
train_sd = 3.9386618472414696

For freesolv (regression)

In [2]:
# Pull the observed value and the raw-scale prediction out of each base model's
# test-set frame.

# ChemBERTa-2
freesolv_chemberta_actual, freesolv_chemberta_pred = (
    freesolv_chemberta2_test[column] for column in ('target', 'pred_raw')
)

# MoLFormer
freesolv_molformer_actual, freesolv_molformer_pred = (
    freesolv_molformer_test[column] for column in ('target', 'pred_raw')
)

# MolBERT -- this frame is read through 'target_raw' rather than 'target'
freesolv_molbert_actual, freesolv_molbert_pred = (
    freesolv_molbert_test[column] for column in ('target_raw', 'pred_raw')
)

In [3]:
# Score every base model on the test split; results are keyed by model name and
# later extended with the ensemble models.
freesolv_metrics_results = {}

for model_name, actual, pred in zip(
    ("Chemberta2", "Molformer", "Molbert"),
    (freesolv_chemberta_actual, freesolv_molformer_actual, freesolv_molbert_actual),
    (freesolv_chemberta_pred, freesolv_molformer_pred, freesolv_molbert_pred),
):
    freesolv_metrics_results[model_name] = {
        "MAE": mean_absolute_error(actual, pred),
        "RMSE": np.sqrt(mean_squared_error(actual, pred)),
        "R2 Score": r2_score(actual, pred),
        # pearsonr returns (coefficient, p-value); only the coefficient is kept
        "Correlation": pearsonr(actual, pred)[0],
    }

freesolv_metrics_results

{'Chemberta2': {'MAE': 0.8000800537213537,
  'RMSE': 1.2734474000180689,
  'R2 Score': 0.8797259973876177,
  'Correlation': 0.9453058246807778},
 'Molformer': {'MAE': 0.7382831341538462,
  'RMSE': 1.3460187487220627,
  'R2 Score': 0.8656270163381711,
  'Correlation': 0.9314852770560673},
 'Molbert': {'MAE': 0.854951475116923,
  'RMSE': 1.3825074018587171,
  'R2 Score': 0.8582429482003655,
  'Correlation': 0.9297793139573085}}

In [4]:
# Build the meta-model design matrices (valid2 = training split, test = scoring
# split) and the matching targets.

# pd.concat(axis=1) aligns on the index and silently pads with NaN when its
# inputs have different lengths. A matching row count is a necessary (not
# sufficient) condition for the files to describe the same molecules, so fail
# loudly if it does not hold.
assert (
    len(freesolv_chemberta2_valid2) == len(freesolv_molformer_valid2)
    == len(freesolv_molbert_valid2) == len(freesolv_chemberta2_features_valid2)
    == len(freesolv_molformer_features_valid2) == len(freesolv_molbert_features_valid2)
), "valid2 prediction/feature files have different numbers of rows"
assert (
    len(freesolv_chemberta2_test) == len(freesolv_molformer_test)
    == len(freesolv_molbert_test) == len(freesolv_chemberta2_features_test)
    == len(freesolv_molformer_features_test) == len(freesolv_molbert_features_test)
), "test prediction/feature files have different numbers of rows"

# standardized valid2 labels (targets the meta-models are fit on)
freesolv_y_ensemble_valid2 = (freesolv_chemberta2_valid2['target'] - train_mean)/train_sd

# Meta-model inputs: the three base models' 'pred_z' predictions followed by
# each model's feature columns. The leading columns of the feature files are
# skipped (two for ChemBERTa-2, one for the others).
# NOTE(review): presumably those skipped columns are index/identifier columns -- confirm.
freesolv_X_ensemble_valid2 = pd.concat([
    freesolv_chemberta2_valid2['pred_z'],
    freesolv_molformer_valid2['pred_z'],
    freesolv_molbert_valid2['pred_z'],
    # add features from the valid2 split
    freesolv_chemberta2_features_valid2.iloc[:, 2:],
    freesolv_molformer_features_valid2.iloc[:, 1:],
    freesolv_molbert_features_valid2.iloc[:, 1:]
], axis=1)

# rename the three 'pred_z' columns so that every column name is unique
freesolv_X_ensemble_valid2.columns = ['chemberta', 'molformer', 'molbert'] + list(freesolv_chemberta2_features_valid2.columns[2:]) + list(freesolv_molformer_features_valid2.columns[1:]) + list(freesolv_molbert_features_valid2.columns[1:])

# Later cells select columns by name, so duplicated names would silently pull in
# extra columns.
assert freesolv_X_ensemble_valid2.columns.is_unique, "ensemble feature names are not unique"

# standardized test labels
freesolv_y_ensemble_test_std = (freesolv_chemberta2_test['target']  - train_mean)/train_sd

freesolv_X_ensemble_test = pd.concat([
    freesolv_chemberta2_test['pred_z'],
    freesolv_molformer_test['pred_z'],
    freesolv_molbert_test['pred_z'],
    # add features from test set
    freesolv_chemberta2_features_test.iloc[:, 2:],
    freesolv_molformer_features_test.iloc[:, 1:],
    freesolv_molbert_features_test.iloc[:, 1:]
], axis=1)

# rename the three 'pred_z' columns so that every column name is unique
freesolv_X_ensemble_test.columns = ['chemberta', 'molformer', 'molbert'] + list(freesolv_chemberta2_features_test.columns[2:]) + list(freesolv_molformer_features_test.columns[1:]) + list(freesolv_molbert_features_test.columns[1:])

# The test matrix must expose exactly the columns the meta-model is trained on.
assert list(freesolv_X_ensemble_test.columns) == list(freesolv_X_ensemble_valid2.columns), \
    "valid2 and test design matrices have different columns"

# true test labels (raw units; all test metrics are computed against these)
freesolv_y_ensemble_test = freesolv_chemberta2_test['target']

In [5]:
# scale the features
from sklearn.preprocessing import StandardScaler

# Learn the per-column scaling on valid2 only, then apply that same fitted
# transformation to both splits.
scaler = StandardScaler().fit(freesolv_X_ensemble_valid2)

# StandardScaler returns bare arrays; wrap them back into DataFrames so the
# column names survive for the name-based feature selection further down.
freesolv_X_ensemble_valid2_scaled = pd.DataFrame(
    scaler.transform(freesolv_X_ensemble_valid2),
    columns=freesolv_X_ensemble_valid2.columns,
)
freesolv_X_ensemble_test_scaled = pd.DataFrame(
    scaler.transform(freesolv_X_ensemble_test),
    columns=freesolv_X_ensemble_test.columns,
)

In [6]:
# DataFrame.to_csv does not create missing directories, so make sure the output
# folder exists before writing.
os.makedirs('./processed_data', exist_ok=True)

# export the scaled design matrices (valid2 and test) to csv
freesolv_X_ensemble_valid2_scaled.to_csv('./processed_data/freesolv_X_ensemble_valid2_scaled_rawpreds.csv', index=False)
freesolv_X_ensemble_test_scaled.to_csv('./processed_data/freesolv_X_ensemble_test_scaled_rawpreds.csv', index=False)

# export the matching targets to csv
# NOTE: the two files are on different scales -- the valid2 targets are
# standardized ((target - train_mean) / train_sd) while the test targets are
# the raw 'target' values.
freesolv_y_ensemble_valid2.to_csv('./processed_data/freesolv_y_ensemble_valid2.csv', index=False)
freesolv_y_ensemble_test.to_csv('./processed_data/freesolv_y_ensemble_test.csv', index=False)

In [7]:
# lasso model
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# LASSO meta-model; LassoCV chooses the regularisation strength by 5-fold CV
lasso_cv = LassoCV(cv=5, max_iter = 5000, random_state=0)
lasso_cv.fit(freesolv_X_ensemble_valid2_scaled, freesolv_y_ensemble_valid2)

# The model is fit on standardized targets, so map its test predictions back to
# raw units before scoring.
freesolv_lasso_pred = train_mean + train_sd * lasso_cv.predict(freesolv_X_ensemble_test_scaled)

# Test-set metrics in raw units
freesolv_lasso_metrics = {
    label: score(freesolv_y_ensemble_test, freesolv_lasso_pred)
    for label, score in (
        ("MAE", mean_absolute_error),
        ("RMSE", lambda truth, guess: np.sqrt(mean_squared_error(truth, guess))),
        ("R2 Score", r2_score),
        ("Correlation", lambda truth, guess: pearsonr(truth, guess)[0]),
    )
}

freesolv_lasso_metrics

{'MAE': 0.6947811987679593,
 'RMSE': 1.168512854086777,
 'R2 Score': 0.898730951293949,
 'Correlation': 0.9480980838733333}

In [8]:
# Feature selection via LASSO: keep every column whose fitted coefficient is
# not exactly zero.
coefs = pd.Series(lasso_cv.coef_, index=freesolv_X_ensemble_valid2.columns)

selected_features = [name for name, weight in coefs.items() if weight != 0]

print("Selected Features:", selected_features)

Selected Features: ['chemberta', 'molformer', 'molbert', 'chemberta2_feature_87', 'chemberta2_feature_92', 'chemberta2_feature_179', 'molformer_feature_499', 'molformer_feature_633', 'molformer_feature_655', 'molformer_feature_694', 'molformer_feature_734', 'molbert_features_18', 'molbert_features_58', 'molbert_features_87', 'molbert_features_93', 'molbert_features_147', 'molbert_features_173', 'molbert_features_267', 'molbert_features_268', 'molbert_features_288', 'molbert_features_581', 'molbert_features_642', 'molbert_features_689', 'molbert_features_740']


In [9]:
import numpy as np
from skglm import GroupLasso
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import pearsonr

# Group Lasso with two groups: the three base-model predictions (columns 0-2)
# and all embedding features (every remaining column).

# Define the groups for each feature
n_features = freesolv_X_ensemble_valid2_scaled.shape[1]

groups = [
    list(range(0, 3)),  # Group 0 with feature indices 0, 1, 2
    list(range(3, n_features))  # Group 1 with all remaining features
]

# Initialize the GroupLasso model. The alpha given here is only a placeholder:
# GridSearchCV below overrides it with each value of the grid.
group_lasso_model = GroupLasso(
    groups=groups,
    alpha=1e-10,
    p0=10,
    verbose=0,
    tol=0.0001,
    positive=False,
    fit_intercept=True,
    warm_start=False,
)

# Setup cross-validation to find the best alpha
param_grid = {'alpha': np.logspace(-10, 1, 5)}

cv = GridSearchCV(
    estimator=group_lasso_model,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5
)

# Fit GridSearchCV
cv.fit(freesolv_X_ensemble_valid2_scaled, freesolv_y_ensemble_valid2)

# Best model and parameters
best_model = cv.best_estimator_
print("Best alpha:", cv.best_params_['alpha'])

# A winner on the edge of the grid means the search range did not bracket the
# optimum (and the grid is coarse: 5 points over 11 decades), so the selected
# model should not be trusted without widening or refining the grid.
if cv.best_params_['alpha'] in (param_grid['alpha'].min(), param_grid['alpha'].max()):
    print("Warning: best alpha lies on the boundary of the search grid; "
          "widen or refine param_grid['alpha'] before relying on this model.")

# Predict using the best model (standardized output mapped back to raw units)
freesolv_pred = best_model.predict(freesolv_X_ensemble_test_scaled) * train_sd + train_mean

# Calculate the evaluation metrics
freesolv_two_groups_lasso_best_metrics = {
    "MAE": mean_absolute_error(freesolv_y_ensemble_test, freesolv_pred),
    "RMSE": np.sqrt(mean_squared_error(freesolv_y_ensemble_test, freesolv_pred)),
    "R2 Score": r2_score(freesolv_y_ensemble_test, freesolv_pred),
    "Correlation": pearsonr(freesolv_y_ensemble_test, freesolv_pred)[0]  # Only record the correlation coefficient
}

# Print the calculated metrics
freesolv_two_groups_lasso_best_metrics

Best alpha: 1e-10


{'MAE': 3.8547403050141966,
 'RMSE': 13.52597442727167,
 'R2 Score': -12.568969492431195,
 'Correlation': 0.5208269199264985}

In [10]:
# Observed test targets next to freesolv_pred (as left by the preceding
# Group Lasso cell), sharing the targets' index.
freesolv_pred_true = freesolv_y_ensemble_test.to_frame("true").assign(pred=freesolv_pred)

freesolv_pred_true

Unnamed: 0,true,pred
0,-2.28,-2.541142
1,-1.99,-2.456767
2,-4.84,-4.727217
3,-6.44,-7.297855
4,-3.88,-5.721687
...,...,...
60,2.30,1.251319
61,-0.48,0.045307
62,0.18,0.321837
63,-9.28,-5.835861


In [11]:
# four groups: predictions, features from chemberta, features from molformer, features from molbert

# Derive the group boundaries from the frames the design matrix was assembled
# from instead of hard-coding the embedding widths (previously 384 and 768).
# The offsets mirror the column slicing used when the matrix was built:
# .iloc[:, 2:] for ChemBERTa-2 and .iloc[:, 1:] for MoLFormer / MolBERT.
n_features = freesolv_X_ensemble_valid2_scaled.shape[1]
n_pred_columns = 3
chemberta_end = n_pred_columns + (freesolv_chemberta2_features_valid2.shape[1] - 2)
molformer_end = chemberta_end + (freesolv_molformer_features_valid2.shape[1] - 1)

groups = [
    list(range(0, n_pred_columns)),  # Group 0: the three base-model predictions
    list(range(n_pred_columns, chemberta_end)),  # Group 1: ChemBERTa-2 features
    list(range(chemberta_end, molformer_end)),  # Group 2: MoLFormer features
    list(range(molformer_end, n_features))  # Group 3: all remaining (MolBERT) features
]

# Initialize the GroupLasso model. The alpha given here is only a placeholder:
# GridSearchCV below overrides it with each value of the grid.
group_lasso_model = GroupLasso(
    groups=groups,
    alpha=1.0,
    p0=10,
    verbose=0,
    tol=0.0001,
    positive=False,
    fit_intercept=True,
    warm_start=False,
)


# Setup cross-validation to find the best alpha
param_grid = {'alpha': np.logspace(-10, 1, 5)}

cv = GridSearchCV(
    estimator=group_lasso_model,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5
)

# Fit GridSearchCV
cv.fit(freesolv_X_ensemble_valid2_scaled, freesolv_y_ensemble_valid2)

# Best model and parameters
best_model = cv.best_estimator_
print("Best alpha:", cv.best_params_['alpha'])

# A winner on the edge of the grid means the search range did not bracket the
# optimum (and the grid is coarse: 5 points over 11 decades), so the selected
# model should not be trusted without widening or refining the grid.
if cv.best_params_['alpha'] in (param_grid['alpha'].min(), param_grid['alpha'].max()):
    print("Warning: best alpha lies on the boundary of the search grid; "
          "widen or refine param_grid['alpha'] before relying on this model.")

# Predict using the best model (standardized output mapped back to raw units)
freesolv_pred = best_model.predict(freesolv_X_ensemble_test_scaled) * train_sd + train_mean

# Calculate the evaluation metrics
freesolv_four_groups_lasso_best_metrics = {
    "MAE": mean_absolute_error(freesolv_y_ensemble_test, freesolv_pred),
    "RMSE": np.sqrt(mean_squared_error(freesolv_y_ensemble_test, freesolv_pred)),
    "R2 Score": r2_score(freesolv_y_ensemble_test, freesolv_pred),
    "Correlation": pearsonr(freesolv_y_ensemble_test, freesolv_pred)[0]  # Only record the correlation coefficient
}

# Print the calculated metrics
freesolv_four_groups_lasso_best_metrics

Best alpha: 1e-10


{'MAE': 3.634845223370996,
 'RMSE': 12.294548734603968,
 'R2 Score': -10.210756704784641,
 'Correlation': 0.5566933579795108}

In [12]:
# Display the raw (un-standardized) test targets that the ensemble metrics are computed against.
freesolv_y_ensemble_test

0    -2.28
1    -1.99
2    -4.84
3    -6.44
4    -3.88
      ... 
60    2.30
61   -0.48
62    0.18
63   -9.28
64   -1.29
Name: target, Length: 65, dtype: float64

In [13]:
# Observed test targets next to the current freesolv_pred (as left by the most
# recently run Group Lasso cell), sharing the targets' index.
freesolv_ensemble_results = freesolv_y_ensemble_test.to_frame("target").assign(pred=freesolv_pred)
freesolv_ensemble_results

Unnamed: 0,target,pred
0,-2.28,-2.541980
1,-1.99,-2.509169
2,-4.84,-4.693454
3,-6.44,-7.343145
4,-3.88,-5.700561
...,...,...
60,2.30,1.193500
61,-0.48,0.039399
62,0.18,0.294528
63,-9.28,-5.876054


In [14]:
# elastic net
# Define the model with elasticnet penalty for regression
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV

# Base estimator whose alpha / l1_ratio are tuned by the grid search below
elastic_net_model = ElasticNet(random_state=0, max_iter=5000)

# Define the hyperparameter grid (a small number of discrete values)
alphas = [0.01, 0.1, 1, 3]  # overall regularisation strength
l1_ratios = [0.1, 0.5, 0.9]  # mix between L2 (towards 0) and L1 (towards 1)

params = {
    'alpha': alphas,
    'l1_ratio': l1_ratios
}

# 5-fold CV on the valid2 split, selecting by (negated) mean squared error
grid_search = GridSearchCV(elastic_net_model, param_grid=params, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(freesolv_X_ensemble_valid2_scaled, freesolv_y_ensemble_valid2)

# Get the best hyperparameters
freesolv_best_elastic_params = grid_search.best_params_
print(freesolv_best_elastic_params)

# GridSearchCV refits by default (refit=True): best_estimator_ is a clone of
# elastic_net_model with the winning parameters, already fitted on all of
# valid2. Reuse it instead of constructing and fitting the same model again.
freesolv_best_elastic_model = grid_search.best_estimator_

# Predict the test set (standardized output mapped back to raw units)
freesolv_elastic_pred = freesolv_best_elastic_model.predict(freesolv_X_ensemble_test_scaled) * train_sd + train_mean

# Calculate the metrics
freesolv_elastic_metrics = {
    "MAE": mean_absolute_error(freesolv_y_ensemble_test, freesolv_elastic_pred),
    "RMSE": np.sqrt(mean_squared_error(freesolv_y_ensemble_test, freesolv_elastic_pred)),
    "R2 Score": r2_score(freesolv_y_ensemble_test, freesolv_elastic_pred),
    "Correlation": pearsonr(freesolv_y_ensemble_test, freesolv_elastic_pred)[0]
}

freesolv_elastic_metrics

{'alpha': 0.1, 'l1_ratio': 0.5}


{'MAE': 0.717930920761282,
 'RMSE': 1.1850912637748987,
 'R2 Score': 0.8958370346003615,
 'Correlation': 0.947207323824447}

In [15]:
# Feature selection via the tuned elastic net: keep every column whose fitted
# coefficient is not exactly zero. This overwrites any earlier selection and is
# the list the downstream models use.
coefs = pd.Series(freesolv_best_elastic_model.coef_, index=freesolv_X_ensemble_valid2.columns)

selected_features = [name for name, weight in coefs.items() if weight != 0]

print("Selected Features:", selected_features)

Selected Features: ['chemberta', 'molformer', 'molbert', 'chemberta2_feature_82', 'chemberta2_feature_87', 'chemberta2_feature_92', 'chemberta2_feature_179', 'chemberta2_feature_183', 'molformer_feature_499', 'molformer_feature_633', 'molformer_feature_655', 'molformer_feature_694', 'molbert_features_18', 'molbert_features_58', 'molbert_features_93', 'molbert_features_147', 'molbert_features_267', 'molbert_features_268', 'molbert_features_288', 'molbert_features_559', 'molbert_features_581', 'molbert_features_642', 'molbert_features_689', 'molbert_features_708']


In [16]:
# Restrict both scaled design matrices to the selected columns
freesolv_X_ensemble_valid2_selected = freesolv_X_ensemble_valid2_scaled.loc[:, selected_features]
freesolv_X_ensemble_test_selected = freesolv_X_ensemble_test_scaled.loc[:, selected_features]

# check shapes (one per line: valid2 first, then test)
print(
    freesolv_X_ensemble_valid2_selected.shape,
    freesolv_X_ensemble_test_selected.shape,
    sep="\n",
)

(129, 24)
(65, 24)


In [17]:
# Initialize and train the SVR model
from sklearn.svm import SVR

# Support-vector regressor with default settings, fit on the selected features
freesolv_svr_model = SVR().fit(freesolv_X_ensemble_valid2_selected, freesolv_y_ensemble_valid2)

# Standardized test predictions mapped back to raw units
freesolv_svr_pred = train_mean + train_sd * freesolv_svr_model.predict(freesolv_X_ensemble_test_selected)

# Test-set metrics in raw units
freesolv_svr_metrics = {
    label: score(freesolv_y_ensemble_test, freesolv_svr_pred)
    for label, score in (
        ("MAE", mean_absolute_error),
        ("RMSE", lambda truth, guess: np.sqrt(mean_squared_error(truth, guess))),
        ("R2 Score", r2_score),
        ("Correlation", lambda truth, guess: pearsonr(truth, guess)[0]),
    )
}

freesolv_svr_metrics

{'MAE': 1.1966621320289677,
 'RMSE': 2.1577800359446266,
 'R2 Score': 0.6546781305992417,
 'Correlation': 0.8102804684953412}

In [18]:
# Initialize and fit a random forest regressor with default hyperparameters (no cross-validated tuning is performed in this cell)
from sklearn.ensemble import RandomForestRegressor

# Random forest with default hyperparameters and a fixed seed, fit on the
# selected features
freesolv_rf_model = RandomForestRegressor(random_state=0).fit(
    freesolv_X_ensemble_valid2_selected, freesolv_y_ensemble_valid2
)

# Standardized test predictions mapped back to raw units
freesolv_rf_best_pred = train_mean + train_sd * freesolv_rf_model.predict(freesolv_X_ensemble_test_selected)

# Test-set metrics in raw units
freesolv_rf_best_metrics = {
    label: score(freesolv_y_ensemble_test, freesolv_rf_best_pred)
    for label, score in (
        ("MAE", mean_absolute_error),
        ("RMSE", lambda truth, guess: np.sqrt(mean_squared_error(truth, guess))),
        ("R2 Score", r2_score),
        ("Correlation", lambda truth, guess: pearsonr(truth, guess)[0]),
    )
}

freesolv_rf_best_metrics

{'MAE': 0.8387938461538459,
 'RMSE': 1.5149272854852442,
 'R2 Score': 0.8297867682221338,
 'Correlation': 0.9116677109792434}

In [19]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer
import numpy as np
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.early_stop import no_progress_loss

# Hyperopt search space for the XGBoost meta-model.
# The two hp.quniform entries produce floats and are cast to int before being
# passed to XGBoost; the remaining entries are continuous uniform ranges.
# NOTE(review): hyperopt documents hp.quniform(label, low, high, q) as
# round(uniform(low, high) / q) * q, so ('max_depth', 3, 7, 2) lands on even
# depths (4 or 6) rather than 3/5/7 -- confirm that this is the intended set.
freesolv_xgb_hyperopt_space = {
    'n_estimators': hp.quniform('n_estimators', 50, 200, 50),
    'max_depth': hp.quniform('max_depth', 3, 7, 2),
    'learning_rate': hp.uniform('learning_rate', 0.001, 0.3),
    'subsample': hp.uniform('subsample', 0.5, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0)
}

# RMSE in the (y_true, y_pred) form that make_scorer wraps
def rmse_scorer(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Hyperopt objective for the XGBoost meta-model
def objective(params):
    """Return the 5-fold cross-validated RMSE of an XGBoost model built from ``params``.

    ``params`` is one sample from the hyperopt search space; its
    ``n_estimators`` and ``max_depth`` entries are cast to int in place because
    hp.quniform yields floats. The result is the dict hyperopt expects, with the
    (positive) mean RMSE as ``'loss'``.
    """
    params.update(
        n_estimators=int(params['n_estimators']),
        max_depth=int(params['max_depth']),
    )

    # greater_is_better=False makes the scorer return negated RMSE values
    negated_rmse = make_scorer(rmse_scorer, greater_is_better=False)
    candidate = xgb.XGBRegressor(random_state=0, **params)
    fold_scores = cross_val_score(
        candidate,
        freesolv_X_ensemble_valid2_selected,
        freesolv_y_ensemble_valid2,
        scoring=negated_rmse,
        cv=5,
    )

    # Flip the sign back so hyperopt minimises a positive RMSE
    return {'loss': -fold_scores.mean(), 'status': STATUS_OK}

# Run the Bayesian optimization (TPE), stopping early once the best loss has
# not improved for 10 consecutive trials.
# rstate seeds hyperopt's sampler: without it every run explores different
# candidates, so the selected hyperparameters (and every downstream metric)
# change from run to run.
trials = Trials()
freesolv_xgb_best_params = fmin(fn=objective,
                            space=freesolv_xgb_hyperopt_space,
                            algo=tpe.suggest,
                            max_evals=100,
                            trials=trials,
                            early_stop_fn=no_progress_loss(10),
                            rstate=np.random.default_rng(0))

print("Best hyperparameters:", freesolv_xgb_best_params)


 21%|██        | 21/100 [00:03<00:14,  5.32trial/s, best loss: 0.3357787933768178] 
Best hyperparameters: {'colsample_bytree': 0.5603690896798423, 'learning_rate': 0.17035028068241181, 'max_depth': 4.0, 'n_estimators': 50.0, 'subsample': 0.5818529989625509}


In [20]:
# Refit XGBoost on all of valid2 with the hyperparameters found by hyperopt.
# fmin reports the hp.quniform parameters as floats, so cast them to int first
# (in place, as the dict is reused as keyword arguments).
freesolv_xgb_best_params.update(
    n_estimators=int(freesolv_xgb_best_params['n_estimators']),
    max_depth=int(freesolv_xgb_best_params['max_depth']),
)

freesolv_xgb_model = xgb.XGBRegressor(random_state=0, **freesolv_xgb_best_params)
freesolv_xgb_model.fit(freesolv_X_ensemble_valid2_selected, freesolv_y_ensemble_valid2)

# Standardized test predictions mapped back to raw units
freesolv_xgb_best_pred = train_mean + train_sd * freesolv_xgb_model.predict(freesolv_X_ensemble_test_selected)

# Test-set metrics in raw units
freesolv_xgb_best_metrics = {
    label: score(freesolv_y_ensemble_test, freesolv_xgb_best_pred)
    for label, score in (
        ("MAE", mean_absolute_error),
        ("RMSE", lambda truth, guess: np.sqrt(mean_squared_error(truth, guess))),
        ("R2 Score", r2_score),
        ("Correlation", lambda truth, guess: pearsonr(truth, guess)[0]),
    )
}

freesolv_xgb_best_metrics

{'MAE': 0.7015726737976075,
 'RMSE': 1.0880306336860968,
 'R2 Score': 0.9122005109822555,
 'Correlation': 0.9562234362621357}

In [21]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import KFold
import numpy as np
from sklearn.metrics import mean_squared_error
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.early_stop import no_progress_loss

# Seed PyTorch's global RNG once, before the hyperparameter search below builds and trains any network.
torch.manual_seed(0)

# Define RMSE loss
class RMSELoss(nn.Module):
    """Root-mean-squared-error loss: ``sqrt(MSE(y_pred, y_true))``.

    The derivative of ``sqrt`` is infinite at zero, so a batch that is fitted
    exactly (MSE == 0) would back-propagate NaN gradients. The MSE is therefore
    clamped from below at ``eps`` before the square root is taken. For any
    MSE >= ``eps`` the value and gradient equal those of the plain
    ``sqrt(MSE)``; below it the loss is ``sqrt(eps)`` with zero gradient.

    Args:
        eps: lower bound applied to the MSE before the square root.
    """

    def __init__(self, eps=1e-12):
        super(RMSELoss, self).__init__()
        self.mse = nn.MSELoss()
        self.eps = eps

    def forward(self, y_pred, y_true):
        """Return the scalar RMSE between ``y_pred`` and ``y_true``."""
        return torch.sqrt(self.mse(y_pred, y_true).clamp_min(self.eps))

# Fully connected regression network used as the neural meta-model
class SimpleNN(nn.Module):
    """Feed-forward regressor: hidden blocks of Linear -> ReLU -> Dropout, then a 1-unit output.

    Args:
        input_size: number of input features.
        num_layers: number of hidden blocks (at least one block is always built).
        num_neurons: width of every hidden layer.
        dropout_rate: dropout probability applied after each hidden activation.
    """

    def __init__(self, input_size, num_layers, num_neurons, dropout_rate):
        super().__init__()
        # Layer widths: input -> hidden -> ... -> hidden
        widths = [input_size] + [num_neurons] * max(num_layers, 1)

        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.extend([nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(dropout_rate)])
        stack.append(nn.Linear(num_neurons, 1))

        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the stack and return one output per input row."""
        return self.layers(x)

# Hyperopt search space for the neural meta-model.
# The two hp.quniform entries use a step of 1 and are cast to int by the objective.
space = {
    'num_layers': hp.quniform('num_layers', 1, 10, 1),  # number of hidden blocks, 1 to 10
    'num_neurons': hp.quniform('num_neurons', 8, 256, 1),  # hidden-layer width, 8 to 256
    'learning_rate': hp.loguniform('learning_rate', np.log(0.0001), np.log(0.05)),  # Adam learning rate, log-uniform between 1e-4 and 0.05
    'dropout_rate': hp.uniform('dropout_rate', 0.1, 0.5)  # dropout probability, uniform between 0.1 and 0.5
}

# Short module-level aliases read by the objective function: the selected,
# scaled valid2 features and the standardized valid2 targets.
X, y = freesolv_X_ensemble_valid2_selected, freesolv_y_ensemble_valid2

# Objective function for Bayesian optimization
def objective(params):
    """Return the mean 5-fold validation RMSE of a SimpleNN built from ``params``.

    ``params`` is one sample from the hyperopt search space; ``num_layers`` and
    ``num_neurons`` are cast to int in place. The module-level ``X`` / ``y`` are
    split with ``KFold(n_splits=5)`` (no shuffling is requested), and for each
    fold a fresh network is trained for 100 epochs with Adam and ``RMSELoss`` on
    mini-batches of 32, then scored on the held-out fold in eval mode.

    Returns the dict hyperopt expects: ``'loss'`` is the RMSE averaged over the
    folds (in the units of ``y``) and ``'status'`` is ``STATUS_OK``.
    """
    params['num_layers'] = int(params['num_layers'])  # Ensure num_layers is an integer
    params['num_neurons'] = int(params['num_neurons'])  # Ensure num_neurons is an integer
    kf = KFold(n_splits=5)
    rmse_scores = []

    for train_index, val_index in kf.split(X):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        # Convert DataFrame to numpy arrays before making them PyTorch tensors;
        # targets get a trailing dimension so they match the network's (batch, 1) output
        train_dataset = TensorDataset(torch.tensor(X_train.values.astype(np.float32)), 
                                      torch.tensor(y_train.values.astype(np.float32)).unsqueeze(1))
        train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

        # A new model and optimizer per fold, so no weights carry over between folds
        model = SimpleNN(input_size=X_train.shape[1], num_layers=params['num_layers'],
                         num_neurons=params['num_neurons'], dropout_rate=params['dropout_rate'])
        criterion = RMSELoss()
        optimizer = optim.Adam(model.parameters(), lr=params['learning_rate'])

        # Train mode keeps dropout active during optimisation
        model.train()
        for epoch in range(100):
            for inputs, targets in train_loader:
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()

        # Eval mode disables dropout; no_grad skips autograd bookkeeping while scoring
        model.eval()
        with torch.no_grad():
            val_preds = model(torch.tensor(X_val.values.astype(np.float32))).squeeze(1)
            val_targets = torch.tensor(y_val.values.astype(np.float32))
            rmse = np.sqrt(mean_squared_error(val_targets.numpy(), val_preds.numpy()))
            rmse_scores.append(rmse)

    avg_rmse = np.mean(rmse_scores)
    return {'loss': avg_rmse, 'status': STATUS_OK} # Minimize RMSE

# Run Bayesian optimization (TPE), stopping early once the best loss has not
# improved for 10 consecutive trials.
# rstate seeds hyperopt's sampler: torch is seeded above, but without rstate the
# sampled hyperparameters still differ between runs, so the selected network
# (and its reported metrics) would not be reproducible.
trials = Trials()
freesolv_nn_best_params = fmin(fn=objective,
                           space=space,
                           algo=tpe.suggest,
                           max_evals=50,
                           trials=trials,
                           early_stop_fn=no_progress_loss(10),
                           rstate=np.random.default_rng(0))

print("Best hyperparameters:", freesolv_nn_best_params)


 26%|██▌       | 13/50 [00:27<01:18,  2.13s/trial, best loss: 0.2447439730167389]
Best hyperparameters: {'dropout_rate': 0.32382367245512, 'learning_rate': 0.00039109115900674336, 'num_layers': 2.0, 'num_neurons': 102.0}


In [22]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

# Re-seed PyTorch's global RNG so the final model's initialisation and batch shuffling start from seed 0 in this cell.
torch.manual_seed(0)

# Neural meta-model architecture (same layout as the definition used during the
# hyperparameter search)
class SimpleNN(nn.Module):
    """Feed-forward regressor built from Linear -> ReLU -> Dropout blocks plus a 1-unit output layer.

    Args:
        input_size: number of input features.
        num_layers: number of hidden blocks (at least one block is always built).
        num_neurons: width of every hidden layer.
        dropout_rate: dropout probability applied after each hidden activation.
    """

    def __init__(self, input_size, num_layers, num_neurons, dropout_rate):
        super().__init__()

        def hidden_block(in_width):
            # One hidden block mapping in_width -> num_neurons
            return [nn.Linear(in_width, num_neurons), nn.ReLU(), nn.Dropout(dropout_rate)]

        modules = hidden_block(input_size)
        extra_blocks = num_layers - 1
        while extra_blocks > 0:
            modules.extend(hidden_block(num_neurons))
            extra_blocks -= 1
        modules.append(nn.Linear(num_neurons, 1))

        self.layers = nn.Sequential(*modules)

    def forward(self, x):
        """Run ``x`` through the stack and return one output per input row."""
        return self.layers(x)

# RMSE helper used when scoring the final network
def compute_rmse(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Rebuild the best-parameter dict with the two hp.quniform values cast to int
# (fmin reports them as floats); the continuous values are copied unchanged.
freesolv_nn_best_params = {
    'num_layers':  int(freesolv_nn_best_params['num_layers']),  # Extracted from Bayesian optimization results
    'num_neurons':  int(freesolv_nn_best_params['num_neurons']),  # Extracted from Bayesian optimization results
    'dropout_rate': freesolv_nn_best_params['dropout_rate'],  # Extracted from Bayesian optimization results
    'learning_rate': freesolv_nn_best_params['learning_rate']  # Extracted from Bayesian optimization results
}

# Prepare datasets: float32 tensors; targets get a trailing dimension so they
# match the network's (batch, 1) output. Training targets are the standardized
# valid2 labels.
X_train_tensor = torch.tensor(freesolv_X_ensemble_valid2_selected.values.astype(np.float32))
y_train_tensor = torch.tensor(freesolv_y_ensemble_valid2.values.astype(np.float32)).unsqueeze(1)
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Test targets are the raw (un-standardized) labels
X_test_tensor = torch.tensor(freesolv_X_ensemble_test_selected.values.astype(np.float32))
y_test_tensor = torch.tensor(freesolv_y_ensemble_test.values.astype(np.float32)).unsqueeze(1)

# Initialize the model with the tuned architecture and optimizer settings
model = SimpleNN(input_size=freesolv_X_ensemble_valid2_selected.shape[1], num_layers=freesolv_nn_best_params['num_layers'],
                         num_neurons=freesolv_nn_best_params['num_neurons'], dropout_rate=freesolv_nn_best_params['dropout_rate'])
criterion = RMSELoss()
optimizer = optim.Adam(model.parameters(), lr=freesolv_nn_best_params['learning_rate'])

# Training loop on the full valid2 split (train mode keeps dropout active)
model.train()
for epoch in range(100):  # Number of epochs can be adjusted
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()


# Eval mode disables dropout for both evaluations below
model.eval()

# Evaluation on training set.
# NOTE: these metrics are on the standardized scale -- both the training
# targets and the network outputs are standardized here -- so they are not
# directly comparable with the raw-unit test metrics further down.
with torch.no_grad():
    train_outputs = model(X_train_tensor)
    train_predictions = train_outputs.squeeze(1).numpy()

    # Calculate metrics
    train_mae = mean_absolute_error(y_train_tensor.numpy(), train_predictions)
    train_rmse = compute_rmse(y_train_tensor.numpy(), train_predictions)
    train_r2 = r2_score(y_train_tensor.numpy(), train_predictions)
    # Index the result instead of unpacking into `_`, which IPython uses for
    # the last cell output
    train_correlation = pearsonr(y_train_tensor.numpy().squeeze(1), train_predictions)[0]

    freesolv_nn_train_metrics = {
        'MAE': train_mae,
        'RMSE': train_rmse,
        'R2 Score': train_r2,
        'Correlation': train_correlation
    }

# Only the last expression of a cell is displayed, so print the training
# metrics explicitly.
print("Train metrics (standardized scale):", freesolv_nn_train_metrics)

# Evaluation on test set: outputs are mapped back to raw units and compared
# with the raw test targets
with torch.no_grad():
    outputs = model(X_test_tensor)
    predictions = outputs.squeeze(1).numpy() * train_sd + train_mean

    # Calculate metrics
    mae = mean_absolute_error(y_test_tensor.numpy(), predictions)
    rmse = compute_rmse(y_test_tensor.numpy(), predictions)
    r2 = r2_score(y_test_tensor.numpy(), predictions)
    correlation = pearsonr(y_test_tensor.numpy().squeeze(1), predictions)[0]

    freesolv_nn_metrics = {
        'MAE': mae,
        'RMSE': rmse,
        'R2 Score': r2,
        'Correlation': correlation
    }

freesolv_nn_metrics

{'MAE': 0.96470976,
 'RMSE': 1.3577014,
 'R2 Score': 0.8632842898368835,
 'Correlation': 0.9294056366660468}

In [23]:
# Observed test targets next to the neural network's raw-unit predictions,
# sharing the targets' index.
freesolv_nn_results = freesolv_y_ensemble_test.to_frame("target").assign(pred=predictions)
freesolv_nn_results

Unnamed: 0,target,pred
0,-2.28,-2.736681
1,-1.99,-1.447645
2,-4.84,-3.733321
3,-6.44,-7.074145
4,-3.88,-4.612207
...,...,...
60,2.30,1.094055
61,-0.48,0.185301
62,0.18,0.660280
63,-9.28,-5.762722


In [24]:
# Add the ensemble models' test metrics to the base-model results, in the order
# they should appear in the final table.
freesolv_metrics_results.update({
    "LASSO": freesolv_lasso_metrics,
    "Group Lasso (2 groups)": freesolv_two_groups_lasso_best_metrics,
    "Group Lasso (4 groups)": freesolv_four_groups_lasso_best_metrics,
    "Elastic Net": freesolv_elastic_metrics,
    "SVR": freesolv_svr_metrics,
    "Random Forest": freesolv_rf_best_metrics,
    "XGBoost": freesolv_xgb_best_metrics,
    "Neural Network": freesolv_nn_metrics,
})

# One row per model, one column per metric, rounded to 3 decimal places
freesolv_metrics_df = pd.DataFrame(freesolv_metrics_results).T.round(3)

# export table to csv
# NOTE(review): the file name says "rawrpreds" while the processed-data exports
# use "rawpreds" -- confirm whether the extra "r" is intentional before renaming.
freesolv_metrics_df.to_csv('./split2_freesolv_metrics_rawrpreds.csv')