In [1]:
import pandas as pd
import os
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, average_precision_score, mean_squared_error, r2_score, mean_absolute_error
from scipy.stats import pearsonr

import numpy as np

# preds

# Load the training set of meta-model
bbbp_chemberta2_valid2 = pd.read_csv('./chemberta2/results/bbbp/chemberta2_valid2_bbbp_2_predictions.csv')
bbbp_molformer_valid2 = pd.read_csv('./molformer/results/bbbp/molformer_valid2_bbbp_2_epoch29.csv')
bbbp_molbert_valid2 = pd.read_csv('./molbert/results/bbbp/molbert_valid2_bbbp_2.csv')

# Load the test data for each model
bbbp_chemberta2_test = pd.read_csv('./chemberta2/results/bbbp/chemberta2_test_bbbp_2_predictions.csv')
bbbp_molformer_test = pd.read_csv('./molformer/results/bbbp/molformer_test_bbbp_2_epoch29.csv')
bbbp_molbert_test = pd.read_csv('./molbert/results/bbbp/molbert_test_bbbp_2.csv')

# features

# Load the features from chemberta
bbbp_chemberta2_features_valid2 = pd.read_csv('./chemberta2/features/bbbp/chemberta2_valid2_bbbp_2_features.csv')
bbbp_chemberta2_features_test = pd.read_csv('./chemberta2/features/bbbp/chemberta2_test_bbbp_2_features.csv')

# Load the features from molformer
bbbp_molformer_features_valid2 = pd.read_csv('./molformer/features/bbbp/molformer_valid2_bbbp_2_features.csv')
bbbp_molformer_features_test = pd.read_csv('./molformer/features/bbbp/molformer_test_bbbp_2_features.csv')

# Load the features from molbert
bbbp_molbert_features_valid2 = pd.read_csv('./molbert/features/bbbp/molbert_valid2_bbbp_2_features.csv')
bbbp_molbert_features_test = pd.read_csv('./molbert/features/bbbp/molbert_test_bbbp_2_features.csv')

For BBBP (Classification) \
Valid 2 \
No missing in chemberta2 and molformer \
3 missing in molbert \
Test \
no missing in chemberta2, 1 missing in molformer, 2 missing in molbert





In [2]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score

# Preparing the actual and predicted values
# Chemberta2
bbbp_chemberta_actual = bbbp_chemberta2_test['p_np']
bbbp_chemberta_pred = bbbp_chemberta2_test['y_pred']
bbbp_chemberta_probs = bbbp_chemberta2_test[['softmax_class_0_prob', 'softmax_class_1_prob']]

# Molformer
bbbp_molformer_actual = bbbp_molformer_test['Actual']
bbbp_molformer_pred = (bbbp_molformer_test['Prob_Class_1'] > 0.5).astype(int)
bbbp_molformer_probs = bbbp_molformer_test[['Prob_Class_0', 'Prob_Class_1']]

# Molbert
bbbp_molbert_actual = bbbp_molbert_test['target']
bbbp_molbert_pred = bbbp_molbert_test['pred']
bbbp_molbert_probs = bbbp_molbert_test['prob']

# Calculating metrics
bbbp_metrics_results = {}

for model_name, actual, pred, probs in [("Chemberta2", bbbp_chemberta_actual, bbbp_chemberta_pred, bbbp_chemberta_probs['softmax_class_1_prob']),
                                         ("Molformer", bbbp_molformer_actual, bbbp_molformer_pred, bbbp_molformer_probs['Prob_Class_1']),
                                         ("Molbert", bbbp_molbert_actual, bbbp_molbert_pred, bbbp_molbert_probs)]:
    bbbp_metrics_results[model_name] = {
        "Accuracy": accuracy_score(actual, pred),
        "F1 Score": f1_score(actual, pred),
        "ROC-AUC": roc_auc_score(actual, probs),
        "PR-AUC": average_precision_score(actual, probs)
    }

bbbp_metrics_results

{'Chemberta2': {'Accuracy': 0.8634146341463415,
  'F1 Score': 0.9072847682119205,
  'ROC-AUC': 0.9566584632212196,
  'PR-AUC': 0.9881838964094798},
 'Molformer': {'Accuracy': 0.9264705882352942,
  'F1 Score': 0.9538461538461539,
  'ROC-AUC': 0.9208172812328014,
  'PR-AUC': 0.9642320592298499},
 'Molbert': {'Accuracy': 0.9411764705882353,
  'F1 Score': 0.9625,
  'ROC-AUC': 0.9640891579526691,
  'PR-AUC': 0.9894812187118154}}

In [3]:
# Identify the 'smiles' values in chemberta2_valid2 that are not in molbert_valid2
missing_smiles_molformer_valid2 = set(bbbp_chemberta2_valid2['smiles']) - set(bbbp_molformer_valid2['smiles'])
print(f"Missing smiles in molformer_valid2: {missing_smiles_molformer_valid2}")

# Identify the 'smiles' values in chemberta2_valid2 that are not in molbert_valid2
missing_smiles_molbert_valid2 = set(bbbp_chemberta2_valid2['smiles']) - set(bbbp_molbert_valid2['smiles'])
print(f"Missing smiles in molbert_valid2: {missing_smiles_molbert_valid2}")

# Combine the invalid indices from molformer_valid2 with the missing indices from molbert_valid2
combined_invalid_smiles_valid2 = list(set(missing_smiles_molformer_valid2).union(set(missing_smiles_molbert_valid2)))

print(f"These indices will be removed from the valid2 set: {combined_invalid_smiles_valid2}")

Missing smiles in molformer_valid2: {'c1c(c(ncc1)CSCCN\\C(=[NH]\\C#N)NCC)Br', 'c1(nc(NC(N)=[NH2])sc1)CSCCNC(=[NH]C#N)NC', 's1cc(CSCCN\\C(NC)=[NH]\\C#N)nc1\\[NH]=C(\\N)N'}
Missing smiles in molbert_valid2: {'[Na+].[O-][S](=O)(=O)CCS', '[Na].CO[C@]1(NC(=O)CSC(F)(F)F)[C@H]2OCC(=C(N2C1=O)C(O)=O)CSc3nnnn3CCO', '[Na+].CCOc1ccc2ccccc2c1C(=O)N[C@H]3[C@H]4SC(C)(C)[C@@H](N4C3=O)C([O-])=O', '[Na+].CC(=O)Nc1c(I)c(NC(C)=O)c(I)c(C([O-])=O)c1I', 'c1(nc(NC(N)=[NH2])sc1)CSCCNC(=[NH]C#N)NC', '[Na+].[Na+].CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](N[S]([O-])(=O)=O)c3ccccc3)C(=O)N2[C@H]1C([O-])=O', '[Na+].[Na+].[Na+].[O-]C(=O)[P]([O-])([O-])=O', 's1cc(CSCCN\\C(NC)=[NH]\\C#N)nc1\\[NH]=C(\\N)N', '[Na+].COCC1=C(N2[C@H](SC1)[C@H](NC(=O)C(=N/OC)\\c3csc(N)n3)C2=O)C([O-])=O', 'c1c(c(ncc1)CSCCN\\C(=[NH]\\C#N)NCC)Br'}
These indices will be removed from the valid2 set: ['[Na+].[O-][S](=O)(=O)CCS', '[Na].CO[C@]1(NC(=O)CSC(F)(F)F)[C@H]2OCC(=C(N2C1=O)C(O)=O)CSc3nnnn3CCO', '[Na+].CCOc1ccc2ccccc2c1C(=O)N[C@H]3[C@H]4SC(C)(C)[C@@H](

In [4]:
# Function to remove invalid SMILES
def remove_invalid_smiles(df, invalid_smiles_list):
    return df[~df['smiles'].isin(invalid_smiles_list)]

# Remove invalid SMILES from each dataframe
bbbp_chemberta2_valid2 = remove_invalid_smiles(bbbp_chemberta2_valid2, combined_invalid_smiles_valid2)
bbbp_molformer_valid2 = remove_invalid_smiles(bbbp_molformer_valid2, combined_invalid_smiles_valid2)
bbbp_molbert_valid2 = remove_invalid_smiles(bbbp_molbert_valid2, combined_invalid_smiles_valid2)
bbbp_chemberta2_features_valid2 = remove_invalid_smiles(bbbp_chemberta2_features_valid2, combined_invalid_smiles_valid2)
bbbp_molformer_features_valid2 = remove_invalid_smiles(bbbp_molformer_features_valid2, combined_invalid_smiles_valid2)
bbbp_molbert_features_valid2 = remove_invalid_smiles(bbbp_molbert_features_valid2, combined_invalid_smiles_valid2)

# Print the shapes of the dataframes after removal
print(bbbp_chemberta2_valid2.shape)
print(bbbp_molformer_valid2.shape)
print(bbbp_molbert_valid2.shape)
print(bbbp_chemberta2_features_valid2.shape)
print(bbbp_molformer_features_valid2.shape)
print(bbbp_molbert_features_valid2.shape)

(400, 8)
(400, 6)
(400, 4)
(400, 386)
(400, 769)
(400, 769)


In [5]:
bbbp_chemberta2_valid2.reset_index(drop=True, inplace=True)
bbbp_y_ensemble_valid2 = bbbp_chemberta2_valid2['p_np']

# create a new dataframe with one column of bbbp_chemberta2_valid2['softmax_class_1_prob']
bbbp_chemberta2_prob = pd.DataFrame({'chemberta2': bbbp_chemberta2_valid2['softmax_class_1_prob']})
bbbp_chemberta2_prob.reset_index(drop=True, inplace=True)

# create a new dataframe with one column of bbbp_molformer_valid2['Prob_Class_1']
bbbp_molformer_prob = pd.DataFrame({'molformer': bbbp_molformer_valid2['Prob_Class_1']})
bbbp_molformer_prob.reset_index(drop=True, inplace=True)

# create a new dataframe with one column of bbbp_molbert_valid2['Probabilities']
bbbp_molbert_prob = pd.DataFrame({'molbert': bbbp_molbert_valid2['prob']})
bbbp_molbert_prob.reset_index(drop=True, inplace=True)

# concatenate the three dataframes
bbbp_prob = pd.concat([bbbp_chemberta2_prob, bbbp_molformer_prob, bbbp_molbert_prob], axis=1)

# do the same for features bbbp_chemberta2_features_valid2.iloc[:, 2:]
bbbp_chemberta2_features = pd.DataFrame(bbbp_chemberta2_features_valid2.iloc[:, 2:])
bbbp_chemberta2_features.reset_index(drop=True, inplace=True)
bbbp_molformer_features = pd.DataFrame(bbbp_molformer_features_valid2.iloc[:, 1:])
bbbp_molformer_features.reset_index(drop=True, inplace=True)
bbbp_molbert_features = pd.DataFrame(bbbp_molbert_features_valid2.iloc[:, 1:])
bbbp_molbert_features.reset_index(drop=True, inplace=True)

bbbp_features = pd.concat([bbbp_chemberta2_features, bbbp_molformer_features, bbbp_molbert_features], axis=1)

# combine the features and probabilities
bbbp_X_ensemble_valid2 = pd.concat([bbbp_prob, bbbp_features], axis=1)

In [6]:
# identify missing in test
missing_smiles_molformer_test = set(bbbp_chemberta2_test['smiles']) - set(bbbp_molformer_test['smiles'])
print(f"Missing smiles in molformer_test: {missing_smiles_molformer_test}")

missing_smiles_molbert_test = set(bbbp_chemberta2_test['smiles']) - set(bbbp_molbert_test['smiles'])
print(f"Missing smiles in molbert_test: {missing_smiles_molbert_test}")

# Combine the invalid smiles from molformer_test with the missing smiles from molbert_test
combined_invalid_smiles_test = list(set(missing_smiles_molformer_test).union(set(missing_smiles_molbert_test)))
print(f"These indices will be removed from the test set: {combined_invalid_smiles_test}")

Missing smiles in molformer_test: {'Cc1nc(sc1)\\[NH]=C(\\N)N'}
Missing smiles in molbert_test: {'Cc1nc(sc1)\\[NH]=C(\\N)N'}
These indices will be removed from the test set: ['Cc1nc(sc1)\\[NH]=C(\\N)N']


In [7]:
# Remove invalid SMILES from each dataframe
bbbp_chemberta2_test = remove_invalid_smiles(bbbp_chemberta2_test, combined_invalid_smiles_test)
bbbp_molformer_test = remove_invalid_smiles(bbbp_molformer_test, combined_invalid_smiles_test)
bbbp_molbert_test = remove_invalid_smiles(bbbp_molbert_test, combined_invalid_smiles_test)
bbbp_chemberta2_features_test = remove_invalid_smiles(bbbp_chemberta2_features_test, combined_invalid_smiles_test)
bbbp_molformer_features_test = remove_invalid_smiles(bbbp_molformer_features_test, combined_invalid_smiles_test)
bbbp_molbert_features_test = remove_invalid_smiles(bbbp_molbert_features_test, combined_invalid_smiles_test)

# Print the shapes of the dataframes after removal
print(bbbp_chemberta2_test.shape)
print(bbbp_molformer_test.shape)
print(bbbp_molbert_test.shape)
print(bbbp_chemberta2_features_test.shape)
print(bbbp_molformer_features_test.shape)
print(bbbp_molbert_features_test.shape)

(204, 8)
(204, 6)
(204, 4)
(204, 386)
(204, 769)
(204, 769)


In [8]:
bbbp_chemberta2_test.reset_index(drop=True, inplace=True)
bbbp_y_ensemble_test = bbbp_chemberta2_test['p_np']

# do the same for test probs and features
bbbp_chemberta2_prob = pd.DataFrame({'chemberta2': bbbp_chemberta2_test['softmax_class_1_prob']})
bbbp_chemberta2_prob.reset_index(drop=True, inplace=True)
bbbp_molformer_prob = pd.DataFrame({'molformer': bbbp_molformer_test['Prob_Class_1']})
bbbp_molformer_prob.reset_index(drop=True, inplace=True)
bbbp_molbert_prob = pd.DataFrame({'molbert': bbbp_molbert_test['prob']})
bbbp_molbert_prob.reset_index(drop=True, inplace=True)

bbbp_prob = pd.concat([bbbp_chemberta2_prob, bbbp_molformer_prob, bbbp_molbert_prob], axis=1)

bbbp_chemberta2_features = pd.DataFrame(bbbp_chemberta2_features_test.iloc[:, 2:])
bbbp_chemberta2_features.reset_index(drop=True, inplace=True)
bbbp_molformer_features = pd.DataFrame(bbbp_molformer_features_test.iloc[:, 1:])
bbbp_molformer_features.reset_index(drop=True, inplace=True)
bbbp_molbert_features = pd.DataFrame(bbbp_molbert_features_test.iloc[:, 1:])
bbbp_molbert_features.reset_index(drop=True, inplace=True)
bbbp_features = pd.concat([bbbp_chemberta2_features, bbbp_molformer_features, bbbp_molbert_features], axis=1)

bbbp_X_ensemble_test = pd.concat([bbbp_prob, bbbp_features], axis=1)

In [9]:
# use standard scaler
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
bbbp_X_ensemble_valid2_scaled = scaler.fit_transform(bbbp_X_ensemble_valid2)
bbbp_X_ensemble_test_scaled = scaler.transform(bbbp_X_ensemble_test)

# transform back to dataframe
bbbp_X_ensemble_valid2_scaled = pd.DataFrame(bbbp_X_ensemble_valid2_scaled, columns=bbbp_X_ensemble_valid2.columns)
bbbp_X_ensemble_test_scaled = pd.DataFrame(bbbp_X_ensemble_test_scaled, columns=bbbp_X_ensemble_test.columns)


In [10]:
# save the scaled data to csv
bbbp_X_ensemble_valid2_scaled.to_csv('./processed_data/bbbp_X_ensemble_valid2_scaled_rawpreds.csv', index=False)
bbbp_X_ensemble_test_scaled.to_csv('./processed_data/bbbp_X_ensemble_test_scaled_rawpreds.csv', index=False)

# save the target values to csv
bbbp_y_ensemble_valid2.to_csv('./processed_data/bbbp_y_ensemble_valid2.csv', index=False)
bbbp_y_ensemble_test.to_csv('./processed_data/bbbp_y_ensemble_test.csv', index=False)

In [11]:
# use lasso regression to train the ensemble model
from sklearn.linear_model import LogisticRegressionCV

# cross validation for strength of regularization
lasso_cv = LogisticRegressionCV(cv=5, penalty='l1', solver='liblinear', max_iter=5000, random_state=0, scoring='roc_auc')

# Fit the model
lasso_cv.fit(bbbp_X_ensemble_valid2_scaled, bbbp_y_ensemble_valid2)

# Predict the test set
bbbp_lasso_pred = lasso_cv.predict(bbbp_X_ensemble_test_scaled)
bbbp_lasso_probs = lasso_cv.predict_proba(bbbp_X_ensemble_test_scaled)[:, 1]

# Calculate the metrics
bbbp_lasso_metrics = {
    "Accuracy": accuracy_score(bbbp_y_ensemble_test, bbbp_lasso_pred),
    "F1 Score": f1_score(bbbp_y_ensemble_test, bbbp_lasso_pred),
    "ROC-AUC": roc_auc_score(bbbp_y_ensemble_test, bbbp_lasso_probs),
    "PR-AUC": average_precision_score(bbbp_y_ensemble_test, bbbp_lasso_probs)
}

bbbp_lasso_metrics

{'Accuracy': 0.9264705882352942,
 'F1 Score': 0.9517684887459807,
 'ROC-AUC': 0.9649146945514585,
 'PR-AUC': 0.9908427330876456}

In [12]:
coefs = pd.Series(lasso_cv.coef_[0], index=bbbp_X_ensemble_valid2.columns)

# Filter to get the selected features
selected_features = coefs[coefs != 0].index.tolist()

# Check if 'chemberta2', 'molformer', 'molbert' are in the selected features, if not, add them
for model in ['chemberta2', 'molformer', 'molbert']:
    if model not in selected_features:
        selected_features.append(model)

print("Selected Features:", selected_features)
# check how many features are selected
print(len(selected_features))

Selected Features: ['chemberta2', 'molformer', 'molbert', 'chemberta2_feature_19', 'chemberta2_feature_44', 'chemberta2_feature_99', 'chemberta2_feature_106', 'chemberta2_feature_113', 'chemberta2_feature_136', 'chemberta2_feature_171', 'chemberta2_feature_289', 'chemberta2_feature_364', 'molformer_feature_17', 'molformer_feature_24', 'molformer_feature_33', 'molformer_feature_68', 'molformer_feature_212', 'molformer_feature_215', 'molformer_feature_218', 'molformer_feature_246', 'molformer_feature_258', 'molformer_feature_303', 'molformer_feature_304', 'molformer_feature_361', 'molformer_feature_363', 'molformer_feature_375', 'molformer_feature_515', 'molformer_feature_520', 'molformer_feature_581', 'molformer_feature_599', 'molformer_feature_635', 'molformer_feature_641', 'molformer_feature_702', 'molformer_feature_717', 'molformer_feature_740', 'molformer_feature_747', 'molbert_features_5', 'molbert_features_39', 'molbert_features_122', 'molbert_features_166', 'molbert_features_188'

In [13]:
from group_lasso import LogisticGroupLasso
import numpy as np

# Create an array that specifies the group for each feature
# Assuming the number of features in your dataset:
n_features = bbbp_X_ensemble_valid2_scaled.shape[1]
groups = np.zeros(n_features, dtype=int)
groups[:3] = 1  # First three features as one group
groups[3:] = 2  # Rest of the features as another group

# Initialize the Logistic Group Lasso model
group_lasso = LogisticGroupLasso(
    groups=groups,
    group_reg=0.05,  # Regularization strength for the groups
    l1_reg=0,        # No L1 regularization
    scale_reg='none', # Do not automatically scale the regularization
    supress_warning=True,
    tol=1e-2,
    random_state=0
)

# Fit the model
group_lasso.fit(bbbp_X_ensemble_valid2_scaled, bbbp_y_ensemble_valid2)

# Predict the test set
bbbp_group_lasso_pred = group_lasso.predict(bbbp_X_ensemble_test_scaled)
bbbp_group_lasso_probs = group_lasso.predict_proba(bbbp_X_ensemble_test_scaled)[:, 1]

bbbp_two_groups_lasso_metrics = {
    "Accuracy": accuracy_score(bbbp_y_ensemble_test, bbbp_group_lasso_pred),
    "F1 Score": f1_score(bbbp_y_ensemble_test, bbbp_group_lasso_pred),
    "ROC-AUC": roc_auc_score(bbbp_y_ensemble_test, bbbp_group_lasso_probs),
    "PR-AUC": average_precision_score(bbbp_y_ensemble_test, bbbp_group_lasso_probs)
}

bbbp_two_groups_lasso_metrics

{'Accuracy': 0.8676470588235294,
 'F1 Score': 0.9120521172638436,
 'ROC-AUC': 0.9263896532746285,
 'PR-AUC': 0.9786166462114604}

In [14]:
from group_lasso import LogisticGroupLasso
import numpy as np

# Create an array that specifies the group for each feature
# Assuming the number of features in your dataset:
n_features = bbbp_X_ensemble_valid2_scaled.shape[1]
groups = np.zeros(n_features, dtype=int)

# Setting the groups according to the new specification:
groups[:3] = 1      # First three features as one group
groups[3:3+384] = 2 # Next 384 features as second group
groups[3+384:3+384+768] = 3 # Next 768 features as third group
groups[3+384+768:] = 4 # Remaining 768 features as fourth group

# Initialize the Logistic Group Lasso model
group_lasso = LogisticGroupLasso(
    groups=groups,
    group_reg=0.05,  # Regularization strength for the groups
    l1_reg=0,        # No L1 regularization
    scale_reg='none', # Do not automatically scale the regularization
    supress_warning=True,
    tol=1e-2,
    random_state=0
)

# Fit the model
group_lasso.fit(bbbp_X_ensemble_valid2_scaled, bbbp_y_ensemble_valid2)

# Predict the test set
bbbp_group_lasso_pred = group_lasso.predict(bbbp_X_ensemble_test_scaled)
bbbp_group_lasso_probs = group_lasso.predict_proba(bbbp_X_ensemble_test_scaled)[:, 1]

# Calculate the metrics
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score

bbbp_four_groups_lasso_metrics = {
    "Accuracy": accuracy_score(bbbp_y_ensemble_test, bbbp_group_lasso_pred),
    "F1 Score": f1_score(bbbp_y_ensemble_test, bbbp_group_lasso_pred),
    "ROC-AUC": roc_auc_score(bbbp_y_ensemble_test, bbbp_group_lasso_probs),
    "PR-AUC": average_precision_score(bbbp_y_ensemble_test, bbbp_group_lasso_probs)
}

bbbp_four_groups_lasso_metrics

{'Accuracy': 0.8823529411764706,
 'F1 Score': 0.922077922077922,
 'ROC-AUC': 0.9269400110071546,
 'PR-AUC': 0.9787418922716785}

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score

# Define the model with elasticnet penalty
elastic_net_model = LogisticRegression(penalty='elasticnet', solver='saga', max_iter=5000, random_state=0)

# Use fewer discrete values for alpha and l1_ratio
alphas = [0.01, 0.1, 1, 3]  # Reduced number of points focusing on lower and mid-range
l1_ratios = [0.1, 0.5, 0.9]  # Reduced to three points, emphasizing edges and midpoint

# Convert alphas to Cs for the parameter grid (since C is the inverse of alpha)
Cs = [1/alpha for alpha in alphas]

# Create a more concise grid search using 5-fold cross-validation
params = {
    'C': Cs,
    'l1_ratio': l1_ratios
}

grid_search = GridSearchCV(elastic_net_model, param_grid=params, cv=5, scoring='roc_auc')

# Fit the grid search to the data
grid_search.fit(bbbp_X_ensemble_valid2_scaled, bbbp_y_ensemble_valid2)

# Best model after grid search
bbbp_best_elastic_model = grid_search.best_estimator_
print(grid_search.best_params_)

# Predict the test set
bbbp_elastic_pred = bbbp_best_elastic_model.predict(bbbp_X_ensemble_test_scaled)
bbbp_elastic_probs = bbbp_best_elastic_model.predict_proba(bbbp_X_ensemble_test_scaled)[:, 1]

# Calculate the metrics
bbbp_elastic_metrics = {
    "Accuracy": accuracy_score(bbbp_y_ensemble_test, bbbp_elastic_pred),
    "F1 Score": f1_score(bbbp_y_ensemble_test, bbbp_elastic_pred),
    "ROC-AUC": roc_auc_score(bbbp_y_ensemble_test, bbbp_elastic_probs),
    "PR-AUC": average_precision_score(bbbp_y_ensemble_test, bbbp_elastic_probs)
}

bbbp_elastic_metrics

{'C': 0.3333333333333333, 'l1_ratio': 0.5}


{'Accuracy': 0.9019607843137255,
 'F1 Score': 0.9358974358974359,
 'ROC-AUC': 0.9620253164556962,
 'PR-AUC': 0.9901440512499865}

In [16]:
# Access the coefficients from elastic net
coefs = pd.Series(bbbp_best_elastic_model.coef_[0], index=bbbp_X_ensemble_valid2.columns)

# Filter to get the selected features
selected_features = coefs[coefs != 0].index.tolist()

# Check if 'chemberta2', 'molformer', 'molbert' are in the selected features, if not, add them
for model in ['chemberta2', 'molformer', 'molbert']:
    if model not in selected_features:
        selected_features.append(model)

print("Selected Features:", selected_features)
print(len(selected_features))

Selected Features: ['chemberta2', 'molformer', 'molbert', 'chemberta2_feature_19', 'chemberta2_feature_33', 'chemberta2_feature_44', 'chemberta2_feature_73', 'chemberta2_feature_91', 'chemberta2_feature_99', 'chemberta2_feature_106', 'chemberta2_feature_113', 'chemberta2_feature_136', 'chemberta2_feature_157', 'chemberta2_feature_160', 'chemberta2_feature_171', 'chemberta2_feature_173', 'chemberta2_feature_178', 'chemberta2_feature_218', 'chemberta2_feature_243', 'chemberta2_feature_252', 'chemberta2_feature_253', 'chemberta2_feature_256', 'chemberta2_feature_267', 'chemberta2_feature_289', 'chemberta2_feature_307', 'chemberta2_feature_313', 'chemberta2_feature_320', 'chemberta2_feature_353', 'chemberta2_feature_357', 'chemberta2_feature_364', 'chemberta2_feature_379', 'molformer_feature_17', 'molformer_feature_24', 'molformer_feature_32', 'molformer_feature_33', 'molformer_feature_52', 'molformer_feature_68', 'molformer_feature_74', 'molformer_feature_95', 'molformer_feature_100', 'mo

In [17]:
bbbp_X_ensemble_valid2_selected = bbbp_X_ensemble_valid2_scaled
bbbp_X_ensemble_test_selected = bbbp_X_ensemble_test_scaled

# check shapes
print(bbbp_X_ensemble_valid2_selected.shape)
print(bbbp_X_ensemble_test_selected.shape)

(400, 1923)
(204, 1923)


In [18]:
# Initialize and train the SVM model
from sklearn.svm import SVC
import random

# Set seeds for reproducibility
np.random.seed(0)
random.seed(0)

bbbp_svm_model = SVC(probability=True, random_state=0)
bbbp_svm_model.fit(bbbp_X_ensemble_valid2_selected, bbbp_y_ensemble_valid2)

# Predict the test set
bbbp_svm_pred = bbbp_svm_model.predict(bbbp_X_ensemble_test_selected)
bbbp_svm_probs = bbbp_svm_model.predict_proba(bbbp_X_ensemble_test_selected)

# Calculate the metrics
bbbp_svm_metrics = {
    'Accuracy': accuracy_score(bbbp_y_ensemble_test, bbbp_svm_pred),
    'F1 Score': f1_score(bbbp_y_ensemble_test, bbbp_svm_pred),
    'ROC-AUC': roc_auc_score(bbbp_y_ensemble_test, bbbp_svm_probs[:, 1]),
    "PR-AUC": average_precision_score(bbbp_y_ensemble_test, bbbp_svm_probs[:, 1])
}

bbbp_svm_metrics

{'Accuracy': 0.8921568627450981,
 'F1 Score': 0.9329268292682927,
 'ROC-AUC': 0.9544578976334617,
 'PR-AUC': 0.9871527253215588}

In [19]:
# initailize and use a 5-fold cross-validation to tune the hyperparameters of a random forest model
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

bbbp_rf_model = RandomForestClassifier(random_state=0)

bbbp_rf_model.fit(bbbp_X_ensemble_valid2_selected, bbbp_y_ensemble_valid2)

# Predict the test set
bbbp_rf_best_pred = bbbp_rf_model.predict(bbbp_X_ensemble_test_selected)
bbbp_rf_best_probs = bbbp_rf_model.predict_proba(bbbp_X_ensemble_test_selected)

# Calculate the metrics
bbbp_rf_best_metrics = {
    "Accuracy": accuracy_score(bbbp_y_ensemble_test, bbbp_rf_best_pred),
    "F1 Score": f1_score(bbbp_y_ensemble_test, bbbp_rf_best_pred),
    "ROC-AUC": roc_auc_score(bbbp_y_ensemble_test, bbbp_rf_best_probs[:, 1]),
    "PR-AUC": average_precision_score(bbbp_y_ensemble_test, bbbp_rf_best_probs[:, 1])
}

bbbp_rf_best_metrics

{'Accuracy': 0.8970588235294118,
 'F1 Score': 0.9361702127659575,
 'ROC-AUC': 0.9681480462300496,
 'PR-AUC': 0.9910645899853345}

In [20]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score, make_scorer
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.early_stop import no_progress_loss

# Set seeds for reproducibility
np.random.seed(0)
random.seed(0)

# Define the hyperparameter space using continuous distributions
bbbp_xgb_hyperopt_space = {
    'n_estimators': hp.quniform('n_estimators', 50, 200, 50),
    'max_depth': hp.quniform('max_depth', 3, 7, 2),
    'learning_rate': hp.uniform('learning_rate', 0.001, 0.3),
    'subsample': hp.uniform('subsample', 0.5, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0)
}

# Convert float outputs of hp.quniform to int for certain parameters
def objective(params):
    params['n_estimators'] = int(params['n_estimators'])
    params['max_depth'] = int(params['max_depth'])
    
    model = xgb.XGBClassifier(**params, random_state=0)
    
    # Cross-validated AUC score as the objective
    roc_auc = make_scorer(roc_auc_score, response_method=None)
    score = cross_val_score(model, bbbp_X_ensemble_valid2_selected, bbbp_y_ensemble_valid2, scoring=roc_auc, cv=5)
    
    # Minimize the negative ROC AUC score
    return {'loss': -score.mean(), 'status': STATUS_OK}

# Run the Bayesian optimization
trials = Trials()
bbbp_xgb_best_params = fmin(fn=objective, 
                          space=bbbp_xgb_hyperopt_space, 
                          algo=tpe.suggest, 
                          max_evals=50, 
                          trials=trials,
                          rstate=np.random.default_rng(0),  # Seed for hyperopt
                          early_stop_fn=no_progress_loss(10))

print("Best hyperparameters:", bbbp_xgb_best_params)

 28%|██▊       | 14/50 [01:15<03:15,  5.42s/trial, best loss: -0.9123353465631292]
Best hyperparameters: {'colsample_bytree': 0.5026652166920494, 'learning_rate': 0.13674564131809464, 'max_depth': 6.0, 'n_estimators': 100.0, 'subsample': 0.954663135591284}


In [21]:
# Convert parameters obtained from Hyperopt to the correct data type
bbbp_xgb_best_params['n_estimators'] = int(bbbp_xgb_best_params['n_estimators'])
bbbp_xgb_best_params['max_depth'] = int(bbbp_xgb_best_params['max_depth'])

# Initialize and train the XGBoost model with the best parameters
bbbp_xgb_model = xgb.XGBClassifier(**bbbp_xgb_best_params, random_state=0)
bbbp_xgb_model.fit(bbbp_X_ensemble_valid2_selected, bbbp_y_ensemble_valid2)

# Predict the test set
bbbp_xgb_best_pred = bbbp_xgb_model.predict(bbbp_X_ensemble_test_selected)
bbbp_xgb_best_probs = bbbp_xgb_model.predict_proba(bbbp_X_ensemble_test_selected)

# Calculate the metrics
bbbp_xgb_best_metrics = {
    "Accuracy": accuracy_score(bbbp_y_ensemble_test, bbbp_xgb_best_pred),
    "F1 Score": f1_score(bbbp_y_ensemble_test, bbbp_xgb_best_pred),
    "ROC-AUC": roc_auc_score(bbbp_y_ensemble_test, bbbp_xgb_best_probs[:, 1]),
    "PR-AUC": average_precision_score(bbbp_y_ensemble_test, bbbp_xgb_best_probs[:, 1])
}

bbbp_xgb_best_metrics

{'Accuracy': 0.8921568627450981,
 'F1 Score': 0.93125,
 'ROC-AUC': 0.9657402311502477,
 'PR-AUC': 0.9904942753207473}

In [22]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import KFold
import numpy as np
from sklearn.metrics import roc_auc_score
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.early_stop import no_progress_loss

# Set seeds for reproducibility
np.random.seed(0)
random.seed(0)
torch.manual_seed(0)

# Define the neural network model
class SimpleNN(nn.Module):
    def __init__(self, input_size, num_layers, num_neurons, dropout_rate):
        super(SimpleNN, self).__init__()
        layers = [nn.Linear(input_size, num_neurons), nn.ReLU(), nn.Dropout(dropout_rate)]
        
        for _ in range(num_layers - 1):
            layers += [nn.Linear(num_neurons, num_neurons), nn.ReLU(), nn.Dropout(dropout_rate)]
        
        layers += [nn.Linear(num_neurons, 1), nn.Sigmoid()]
        
        self.layers = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.layers(x)

# Objective function for Bayesian optimization
def objective(params):
    kf = KFold(n_splits=5)
    roc_aucs = []

    for train_index, val_index in kf.split(X):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        train_dataset = TensorDataset(torch.tensor(X_train.values.astype(np.float32)), 
                                      torch.tensor(y_train.values.astype(np.float32)).unsqueeze(1))
        train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

        model = SimpleNN(input_size=X_train.shape[1], num_layers=int(params['num_layers']), 
                         num_neurons=int(params['num_neurons']), dropout_rate=params['dropout_rate'])
        criterion = nn.BCELoss()
        optimizer = optim.Adam(model.parameters(), lr=params['learning_rate'])

        model.train()
        for epoch in range(100):
            for inputs, targets in train_loader:
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()

        model.eval()
        with torch.no_grad():
            X_val_tensor = torch.tensor(X_val.values.astype(np.float32))
            y_val_tensor = torch.tensor(y_val.values.astype(np.float32)).unsqueeze(-1)
            outputs = model(X_val_tensor)
            roc_auc = roc_auc_score(y_val_tensor.numpy(), outputs.numpy())
            roc_aucs.append(roc_auc)

    avg_roc_auc = np.mean(roc_aucs)
    return {'loss': -avg_roc_auc, 'status': STATUS_OK}  # Maximize ROC AUC by minimizing the negative ROC AUC

# Hyperparameter space
space = {
    'num_layers': hp.quniform('num_layers', 1, 5, 1),
    'num_neurons': hp.quniform('num_neurons', 16, 256, 1),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.0001), np.log(0.01)),
    'dropout_rate': hp.uniform('dropout_rate', 0.0, 0.5)
}

X = bbbp_X_ensemble_valid2_selected
y = bbbp_y_ensemble_valid2

# Run Bayesian optimization
trials = Trials()
bbbp_nn_best_params = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=50,
            trials=trials,
            rstate=np.random.default_rng(0),  # Seed for hyperopt
            early_stop_fn=no_progress_loss(10))

print("Best hyperparameters:", bbbp_nn_best_params)


 24%|██▍       | 12/50 [01:22<04:21,  6.88s/trial, best loss: -0.9808923131360695]
Best hyperparameters: {'dropout_rate': 0.2844084112121386, 'learning_rate': 0.0002374377408888569, 'num_layers': 2.0, 'num_neurons': 215.0}


In [23]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.metrics import roc_auc_score

torch.manual_seed(0)

# Define the neural network model again
class SimpleNN(nn.Module):
    def __init__(self, input_size, num_layers, num_neurons, dropout_rate):
        super(SimpleNN, self).__init__()
        layers = [nn.Linear(input_size, num_neurons), nn.ReLU(), nn.Dropout(dropout_rate)]
        
        for _ in range(num_layers - 1):
            layers += [nn.Linear(num_neurons, num_neurons), nn.ReLU(), nn.Dropout(dropout_rate)]
        
        layers += [nn.Linear(num_neurons, 1), nn.Sigmoid()]
        
        self.layers = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.layers(x)

# Convert parameters to the correct format if necessary
bbbp_nn_best_params = {
    'num_layers': int(bbbp_nn_best_params['num_layers']),  # Extracted from Bayesian optimization results
    'num_neurons': int(bbbp_nn_best_params['num_neurons']),  # Extracted from Bayesian optimization results
    'dropout_rate': bbbp_nn_best_params['dropout_rate'],  # Extracted from Bayesian optimization results
    'learning_rate': bbbp_nn_best_params['learning_rate']  # Extracted from Bayesian optimization results
}

# Prepare datasets
X_train_tensor = torch.tensor(bbbp_X_ensemble_valid2_selected.values.astype(np.float32))
y_train_tensor = torch.tensor(bbbp_y_ensemble_valid2.values.astype(np.float32)).unsqueeze(1)
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

X_test_tensor = torch.tensor(bbbp_X_ensemble_test_selected.values.astype(np.float32))
y_test_tensor = torch.tensor(bbbp_y_ensemble_test.values.astype(np.float32)).unsqueeze(1)

# Initialize the model
model = SimpleNN(input_size=bbbp_X_ensemble_valid2_selected.shape[1], num_layers=bbbp_nn_best_params['num_layers'], 
                 num_neurons=bbbp_nn_best_params['num_neurons'], dropout_rate=bbbp_nn_best_params['dropout_rate'])

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=bbbp_nn_best_params['learning_rate'])

# Training loop
model.train()
for epoch in range(100):  # Number of epochs can be adjusted
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

# Evaluation on test set
model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    predictions = (outputs > 0.5).float()

    # Calculate metrics
    accuracy = accuracy_score(y_test_tensor.numpy(), predictions.numpy())
    f1 = f1_score(y_test_tensor.numpy(), predictions.numpy())
    roc_auc = roc_auc_score(y_test_tensor.numpy(), outputs.numpy())
    pr_auc = average_precision_score(y_test_tensor.numpy(), outputs.numpy())

    bbbp_nn_metrics = {
        'Accuracy': accuracy,
        'F1 Score': f1,
        'ROC-AUC': roc_auc,
        'PR-AUC': pr_auc
    }

bbbp_nn_metrics

{'Accuracy': 0.8970588235294118,
 'F1 Score': 0.932475884244373,
 'ROC-AUC': 0.9327875619152449,
 'PR-AUC': 0.9790374272587836}

In [24]:
# report all the metrics for bbbp
bbbp_metrics_results["LASSO"] = bbbp_lasso_metrics
bbbp_metrics_results["Group Lasso (Two Groups)"] = bbbp_two_groups_lasso_metrics
bbbp_metrics_results["Group Lasso (Four Groups)"] = bbbp_four_groups_lasso_metrics
bbbp_metrics_results["Elastic Net"] = bbbp_elastic_metrics
bbbp_metrics_results["SVM"] = bbbp_svm_metrics
bbbp_metrics_results["Random Forest"] = bbbp_rf_best_metrics
bbbp_metrics_results["XGBoost"] = bbbp_xgb_best_metrics
bbbp_metrics_results["Neural Network"] = bbbp_nn_metrics

bbbp_metrics_df = pd.DataFrame(bbbp_metrics_results).T

# keep 3 digits after the decimal point
bbbp_metrics_df = bbbp_metrics_df.round(3)

# export as csv
bbbp_metrics_df.to_csv('./split2_bbbp_metrics_rawpreds.csv')