In [8]:
import pandas as pd
import os
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, average_precision_score, mean_squared_error, r2_score, mean_absolute_error
from scipy.stats import pearsonr

import numpy as np

# preds

# Load the training set of meta-model
bace_chemberta2_valid2 = pd.read_csv('./chemberta2/results/bace/chemberta2_valid2_bace_1_predictions.csv')
bace_molformer_valid2 = pd.read_csv('./molformer/results/bace/molformer_valid2_bace_1_epoch49.csv')
bace_molbert_valid2 = pd.read_csv('./molbert/results/bace/molbert_valid2_bace_1.csv')

# Load the test data for each model
bace_chemberta2_test = pd.read_csv('./chemberta2/results/bace/chemberta2_test_bace_1_predictions.csv')
bace_molformer_test = pd.read_csv('./molformer/results/bace/molformer_test_bace_1_epoch49.csv')
bace_molbert_test = pd.read_csv('./molbert/results/bace/molbert_test_bace_1.csv')

# features

# Load the features from chemberta
bace_chemberta2_features_valid2 = pd.read_csv('./chemberta2/features/bace/chemberta2_valid2_bace_1_features.csv')
bace_chemberta2_features_test = pd.read_csv('./chemberta2/features/bace/chemberta2_test_bace_1_features.csv')

# Load the features from molformer
bace_molformer_features_valid2 = pd.read_csv('./molformer/features/bace/molformer_valid2_bace_1_features.csv')
bace_molformer_features_test = pd.read_csv('./molformer/features/bace/molformer_test_bace_1_features.csv')

# Load the features from molbert
bace_molbert_features_valid2 = pd.read_csv('./molbert/features/bace/molbert_valid2_bace_1_features.csv')
bace_molbert_features_test = pd.read_csv('./molbert/features/bace/molbert_test_bace_1_features.csv')

For BACE (Classification)

In [9]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score

# Preparing the actual and predicted values
# Chemberta2
bace_chemberta_actual = bace_chemberta2_test['Class']
bace_chemberta_pred = bace_chemberta2_test['y_pred']
bace_chemberta_probs = bace_chemberta2_test[['softmax_class_0_prob', 'softmax_class_1_prob']]

# Molformer
bace_molformer_actual = bace_molformer_test['Actual']
bace_molformer_pred = (bace_molformer_test['Prob_Class_1'] > 0.5).astype(int)
bace_molformer_probs = bace_molformer_test[['Prob_Class_0', 'Prob_Class_1']]

# Molbert
bace_molbert_actual = bace_molbert_test['target']
bace_molbert_pred = bace_molbert_test['pred']
bace_molbert_probs = bace_molbert_test['prob']

# Calculating metrics
bace_metrics_results = {}

for model_name, actual, pred, probs in [("Chemberta2", bace_chemberta_actual, bace_chemberta_pred, bace_chemberta_probs['softmax_class_1_prob']),
                                         ("Molformer", bace_molformer_actual, bace_molformer_pred, bace_molformer_probs['Prob_Class_1']),
                                         ("Molbert", bace_molbert_actual, bace_molbert_pred, bace_molbert_probs)]:
    bace_metrics_results[model_name] = {
        "Accuracy": accuracy_score(actual, pred),
        "F1 Score": f1_score(actual, pred),
        "ROC-AUC": roc_auc_score(actual, probs),
        "PR-AUC": average_precision_score(actual, probs)
    }

bace_metrics_results

{'Chemberta2': {'Accuracy': 0.6535947712418301,
  'F1 Score': 0.6442953020134228,
  'ROC-AUC': 0.7465949820788532,
  'PR-AUC': 0.8058944338111258},
 'Molformer': {'Accuracy': 0.6797385620915033,
  'F1 Score': 0.6797385620915032,
  'ROC-AUC': 0.8521505376344086,
  'PR-AUC': 0.8774798970672688},
 'Molbert': {'Accuracy': 0.7189542483660131,
  'F1 Score': 0.7361963190184049,
  'ROC-AUC': 0.775089605734767,
  'PR-AUC': 0.8548501462313669}}

In [10]:
# check shapes
print(bace_chemberta2_valid2.shape)
print(bace_molformer_valid2.shape)
print(bace_molbert_valid2.shape)
print(bace_chemberta2_features_valid2.shape)
print(bace_molformer_features_valid2.shape)
print(bace_molbert_features_valid2.shape)

(305, 8)
(305, 5)
(305, 4)
(305, 386)
(305, 769)
(305, 769)


In [11]:
# create a new dataframe with one column of bace_chemberta2_valid2['softmax_class_1_prob']
bace_chemberta2_prob = pd.DataFrame({'chemberta2': bace_chemberta2_valid2['softmax_class_1_prob']})
bace_chemberta2_prob.reset_index(drop=True, inplace=True)

# create a new dataframe with one column of bace_molformer_valid2['Prob_Class_1']
bace_molformer_prob = pd.DataFrame({'molformer': bace_molformer_valid2['Prob_Class_1']})
bace_molformer_prob.reset_index(drop=True, inplace=True)

# create a new dataframe with one column of bace_molbert_valid2['Probabilities']
bace_molbert_prob = pd.DataFrame({'molbert': bace_molbert_valid2['prob']})
bace_molbert_prob.reset_index(drop=True, inplace=True)

# concatenate the three dataframes
bace_prob = pd.concat([bace_chemberta2_prob, bace_molformer_prob, bace_molbert_prob], axis=1)

# do the same for features bace_chemberta2_features_valid2.iloc[:, 2:]
bace_chemberta2_features = pd.DataFrame(bace_chemberta2_features_valid2.iloc[:, 2:])
bace_chemberta2_features.reset_index(drop=True, inplace=True)
bace_molformer_features = pd.DataFrame(bace_molformer_features_valid2.iloc[:, 1:])
bace_molformer_features.reset_index(drop=True, inplace=True)
bace_molbert_features = pd.DataFrame(bace_molbert_features_valid2.iloc[:, 1:])
bace_molbert_features.reset_index(drop=True, inplace=True)

bace_features = pd.concat([bace_chemberta2_features, bace_molformer_features, bace_molbert_features], axis=1)

# combine the features and probabilities
bace_X_ensemble_valid2 = pd.concat([bace_prob, bace_features], axis=1)

bace_y_ensemble_valid2 = bace_chemberta2_valid2['Class']

In [12]:
import pandas as pd
from sklearn.cross_decomposition import CCA
from itertools import combinations
from sklearn.preprocessing import StandardScaler

# Assuming bace_chemberta2_features, bace_molformer_features, bace_molbert_features are loaded

# Creating a dictionary of feature sets for easy access
feature_sets = {
    'Chemberta': bace_chemberta2_features_valid2.iloc[:, 2:],
    'Molformer': bace_molformer_features_valid2.iloc[:, 1:],
    'Molbert': bace_molbert_features_valid2.iloc[:, 1:]
}

# Prepare to store results
correlation_results = []

# Initialize a scaler
scaler = StandardScaler()

# Loop through each pair of feature sets
for model1, model2 in combinations(feature_sets.keys(), 2):
    # Get the data for each model and scale it
    
    X = scaler.fit_transform(feature_sets[model1])
    Y = scaler.fit_transform(feature_sets[model2])

    # Print dimensions for debugging
    print(f"Dimensions for {model1}: {X.shape}, Dimensions for {model2}: {Y.shape}")

    # Determine the maximum number of components possible
    # n_comp = min(X.shape[0], X.shape[1], Y.shape[1])

    n_comp = 2

    # Initialize and fit CCA
    cca = CCA(n_components=n_comp)
    cca.fit(X, Y)
    
    X_c, Y_c = cca.transform(X, Y) #transform our datasests to obtain canonical variates
    
    # Calculate the canonical correlations
    # score = cca.score(X, Y)
    
    # Calculate the canonical correlations manually
    correlations = [np.corrcoef(X_c[:, i], Y_c[:, i])[0, 1] for i in range(n_comp)]

    correlation_results.append([model1, model2, np.mean(correlations)])

# Optional: Convert results to DataFrame for better visualization or further analysis
results_df = pd.DataFrame(correlation_results, columns=['Model1', 'Model2', 'Mean Corr'])
print(results_df)


Dimensions for Chemberta: (305, 384), Dimensions for Molformer: (305, 768)
Dimensions for Chemberta: (305, 384), Dimensions for Molbert: (305, 768)
Dimensions for Molformer: (305, 768), Dimensions for Molbert: (305, 768)
      Model1     Model2  Mean Corr
0  Chemberta  Molformer        1.0
1  Chemberta    Molbert        1.0
2  Molformer    Molbert        1.0


In [13]:
feature_sets

{'Chemberta':      chemberta2_feature_1  chemberta2_feature_2  chemberta2_feature_3  \
 0               -0.227219              0.826866             -0.200282   
 1               -0.066664              0.494209             -0.130207   
 2               -0.018035              0.337232             -0.193086   
 3               -0.023992              0.008189             -0.006162   
 4               -0.190709              0.139954             -0.157010   
 ..                    ...                   ...                   ...   
 300             -0.179503              0.655813             -0.223353   
 301              0.124386              0.370901             -0.127989   
 302             -0.101810              0.503486             -0.151391   
 303              0.008240              0.430484             -0.127580   
 304             -0.001410              0.326606             -0.099900   
 
      chemberta2_feature_4  chemberta2_feature_5  chemberta2_feature_6  \
 0               -0.065

In [6]:
# do the same for test probs and features
bace_chemberta2_prob = pd.DataFrame({'chemberta2': bace_chemberta2_test['softmax_class_1_prob']})
bace_chemberta2_prob.reset_index(drop=True, inplace=True)
bace_molformer_prob = pd.DataFrame({'molformer': bace_molformer_test['Prob_Class_1']})
bace_molformer_prob.reset_index(drop=True, inplace=True)
bace_molbert_prob = pd.DataFrame({'molbert': bace_molbert_test['prob']})
bace_molbert_prob.reset_index(drop=True, inplace=True)
bace_prob = pd.concat([bace_chemberta2_prob, bace_molformer_prob, bace_molbert_prob], axis=1)

bace_chemberta2_features = pd.DataFrame(bace_chemberta2_features_test.iloc[:, 2:])
bace_chemberta2_features.reset_index(drop=True, inplace=True)
bace_molformer_features = pd.DataFrame(bace_molformer_features_test.iloc[:, 1:])
bace_molformer_features.reset_index(drop=True, inplace=True)
bace_molbert_features = pd.DataFrame(bace_molbert_features_test.iloc[:, 1:])
bace_molbert_features.reset_index(drop=True, inplace=True)
bace_features = pd.concat([bace_chemberta2_features, bace_molformer_features, bace_molbert_features], axis=1)

bace_X_ensemble_test = pd.concat([bace_prob, bace_features], axis=1)

bace_y_ensemble_test = bace_chemberta2_test['Class']

In [7]:
# export bace_X_ensemble_valid2 and bace_y_ensemble_valid2 to csv
bace_X_ensemble_valid2.to_csv('./processed_data/bace_X_ensemble_valid2.csv', index=False)
bace_y_ensemble_valid2.to_csv('./processed_data/bace_y_ensemble_valid2.csv', index=False)

# export bace_X_ensemble_test and bace_y_ensemble_test to csv
bace_X_ensemble_test.to_csv('./processed_data/bace_X_ensemble_test.csv', index=False)
bace_y_ensemble_test.to_csv('./processed_data/bace_y_ensemble_test.csv', index=False)

In [7]:
import pandas as pd
from sklearn.cross_decomposition import CCA
from itertools import combinations
from sklearn.preprocessing import StandardScaler

# Assuming bace_chemberta2_features, bace_molformer_features, bace_molbert_features are loaded

# Creating a dictionary of feature sets for easy access
feature_sets = {
    'Chemberta': bace_chemberta2_features_valid2.iloc[:, 2:],
    'Molformer': bace_molformer_features_valid2.iloc[:, 1:],
    'Molbert': bace_molbert_features_valid2.iloc[:, 1:]
}

# Prepare to store results
correlation_results = []

# Initialize a scaler
scaler = StandardScaler()

# Loop through each pair of feature sets
for model1, model2 in combinations(feature_sets.keys(), 2):
    # Get the data for each model and scale it
    
    X = scaler.fit_transform(feature_sets[model1])
    Y = scaler.fit_transform(feature_sets[model2])

    # Print dimensions for debugging
    print(f"Dimensions for {model1}: {X.shape}, Dimensions for {model2}: {Y.shape}")

    # Determine the maximum number of components possible
    # n_comp = min(X.shape[0], X.shape[1], Y.shape[1])

    n_comp = 2

    # Initialize and fit CCA
    cca = CCA(n_components=n_comp)
    cca.fit(X, Y)
    
    X_c, Y_c = cca.transform(X, Y) #transform our datasests to obtain canonical variates
    
    # Calculate the canonical correlations
    # score = cca.score(X, Y)
    
    # Calculate the canonical correlations manually
    correlations = [np.corrcoef(X_c[:, i], Y_c[:, i])[0, 1] for i in range(n_comp)]

    correlation_results.append([model1, model2, np.mean(correlations)])

# Optional: Convert results to DataFrame for better visualization or further analysis
results_df = pd.DataFrame(correlation_results, columns=['Model1', 'Model2', 'Mean Corr'])
print(results_df)


Dimensions for Chemberta: (305, 384), Dimensions for Molformer: (305, 768)
Dimensions for Chemberta: (305, 384), Dimensions for Molbert: (305, 768)
Dimensions for Molformer: (305, 768), Dimensions for Molbert: (305, 768)
      Model1     Model2  Mean Corr
0  Chemberta  Molformer        1.0
1  Chemberta    Molbert        1.0
2  Molformer    Molbert        1.0


In [9]:
# use standard scaler
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
bace_X_ensemble_valid2_scaled = scaler.fit_transform(bace_X_ensemble_valid2)
bace_X_ensemble_test_scaled = scaler.transform(bace_X_ensemble_test)

# transform back to dataframe
bace_X_ensemble_valid2_scaled = pd.DataFrame(bace_X_ensemble_valid2_scaled, columns=bace_X_ensemble_valid2.columns)
bace_X_ensemble_test_scaled = pd.DataFrame(bace_X_ensemble_test_scaled, columns=bace_X_ensemble_test.columns)


In [10]:
# export bace_X_ensemble_valid2 and bace_y_ensemble_valid2 to csv
bace_X_ensemble_valid2_scaled.to_csv('./processed_data/bace_X_ensemble_valid2_scaled.csv', index=False)
bace_X_ensemble_test_scaled.to_csv('./processed_data/bace_X_ensemble_test_scaled.csv', index=False)

In [16]:
# # use min-max scaling
# from sklearn.preprocessing import MinMaxScaler

# scaler = MinMaxScaler()
# bace_X_ensemble_valid2_scaled = scaler.fit_transform(bace_X_ensemble_valid2)
# bace_X_ensemble_valid2_scaled = pd.DataFrame(bace_X_ensemble_valid2_scaled, columns=bace_X_ensemble_valid2.columns)

# bace_X_ensemble_test_scaled = scaler.transform(bace_X_ensemble_test)
# bace_X_ensemble_test_scaled = pd.DataFrame(bace_X_ensemble_test_scaled, columns=bace_X_ensemble_test.columns)

In [17]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score

# # Define the model with ridge penalty (l2)
# ridge_model = LogisticRegression(penalty='l2', solver='lbfgs', max_iter=1000, random_state=0)

# # Prepare a range of alpha values to test (or C values, which are the inverse of alpha)
# alphas = [0.0001, 0.001, 0.01, 0.1, 1, 10]  # Fewer points, covering a broad range

# # Convert alphas to Cs for the parameter grid (since C is the inverse of alpha)
# Cs = [1/alpha for alpha in alphas]
# params = {'C': Cs}
# grid_search = GridSearchCV(ridge_model, param_grid=params, cv=5, scoring='roc_auc')

# # Fit the grid search to the data
# grid_search.fit(bace_X_ensemble_valid2, bace_y_ensemble_valid2)

# # Best model after grid search
# bace_best_ridge_model = grid_search.best_estimator_

# # Predict the test set
# bace_ridge_pred = bace_best_ridge_model.predict(bace_X_ensemble_test)
# bace_ridge_probs = bace_best_ridge_model.predict_proba(bace_X_ensemble_test)[:, 1]

# # Calculate the metrics
# bace_ridge_metrics = {
#     "Accuracy": accuracy_score(bace_y_ensemble_test, bace_ridge_pred),
#     "F1 Score": f1_score(bace_y_ensemble_test, bace_ridge_pred),
#     "ROC-AUC": roc_auc_score(bace_y_ensemble_test, bace_ridge_probs),
#     "PR-AUC": average_precision_score(bace_y_ensemble_test, bace_ridge_probs)
# }

# bace_ridge_metrics

In [18]:
# use lasso regression to train the ensemble model
from sklearn.linear_model import LogisticRegressionCV

# cross validation for strength of regularization
lasso_cv = LogisticRegressionCV(cv=5, penalty='l1', solver='liblinear', max_iter=5000, random_state=0, scoring='roc_auc')

# Fit the model
lasso_cv.fit(bace_X_ensemble_valid2_scaled, bace_y_ensemble_valid2)

# Predict the test set
bace_lasso_pred = lasso_cv.predict(bace_X_ensemble_test_scaled)
bace_lasso_probs = lasso_cv.predict_proba(bace_X_ensemble_test_scaled)[:, 1]

# Calculate the metrics
bace_lasso_metrics = {
    "Accuracy": accuracy_score(bace_y_ensemble_test, bace_lasso_pred),
    "F1 Score": f1_score(bace_y_ensemble_test, bace_lasso_pred),
    "ROC-AUC": roc_auc_score(bace_y_ensemble_test, bace_lasso_probs),
    "PR-AUC": average_precision_score(bace_y_ensemble_test, bace_lasso_probs)
}

bace_lasso_metrics

{'Accuracy': 0.7516339869281046,
 'F1 Score': 0.7710843373493976,
 'ROC-AUC': 0.8150537634408602,
 'PR-AUC': 0.8250748784113499}

In [19]:
coefs = pd.Series(lasso_cv.coef_[0], index=bace_X_ensemble_valid2.columns)

# Filter to get the selected features
selected_features = coefs[coefs != 0].index.tolist()

# Check if 'chemberta2', 'molformer', 'molbert' are in the selected features, if not, add them
for model in ['chemberta2', 'molformer', 'molbert']:
    if model not in selected_features:
        selected_features.append(model)

print("Selected Features:", selected_features)
# check how many features are selected
print(len(selected_features))

Selected Features: ['chemberta2', 'molbert', 'chemberta2_feature_36', 'chemberta2_feature_56', 'chemberta2_feature_133', 'chemberta2_feature_167', 'chemberta2_feature_199', 'chemberta2_feature_230', 'chemberta2_feature_256', 'chemberta2_feature_296', 'chemberta2_feature_302', 'chemberta2_feature_367', 'molformer_feature_12', 'molformer_feature_33', 'molformer_feature_37', 'molformer_feature_76', 'molformer_feature_77', 'molformer_feature_94', 'molformer_feature_98', 'molformer_feature_110', 'molformer_feature_112', 'molformer_feature_133', 'molformer_feature_134', 'molformer_feature_149', 'molformer_feature_174', 'molformer_feature_218', 'molformer_feature_222', 'molformer_feature_226', 'molformer_feature_265', 'molformer_feature_338', 'molformer_feature_350', 'molformer_feature_380', 'molformer_feature_407', 'molformer_feature_440', 'molformer_feature_441', 'molformer_feature_570', 'molformer_feature_574', 'molformer_feature_587', 'molformer_feature_588', 'molformer_feature_601', 'mol

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score

# Define the model with elasticnet penalty
elastic_net_model = LogisticRegression(penalty='elasticnet', solver='saga', max_iter=5000, random_state=0)

# Use fewer discrete values for alpha and l1_ratio
alphas = [0.01, 0.1, 1, 3]  # Reduced number of points focusing on lower and mid-range
l1_ratios = [0.1, 0.5, 0.9]  # Reduced to three points, emphasizing edges and midpoint

# Convert alphas to Cs for the parameter grid (since C is the inverse of alpha)
Cs = [1/alpha for alpha in alphas]

# Create a more concise grid search using 5-fold cross-validation
params = {
    'C': Cs,
    'l1_ratio': l1_ratios
}

grid_search = GridSearchCV(elastic_net_model, param_grid=params, cv=5, scoring='roc_auc')

# Fit the grid search to the data
grid_search.fit(bace_X_ensemble_valid2_scaled, bace_y_ensemble_valid2)

# Best model after grid search
bace_best_elastic_model = grid_search.best_estimator_
print(grid_search.best_params_)

# Predict the test set
bace_elastic_pred = bace_best_elastic_model.predict(bace_X_ensemble_test_scaled)
bace_elastic_probs = bace_best_elastic_model.predict_proba(bace_X_ensemble_test_scaled)[:, 1]

# Calculate the metrics
bace_elastic_metrics = {
    "Accuracy": accuracy_score(bace_y_ensemble_test, bace_elastic_pred),
    "F1 Score": f1_score(bace_y_ensemble_test, bace_elastic_pred),
    "ROC-AUC": roc_auc_score(bace_y_ensemble_test, bace_elastic_probs),
    "PR-AUC": average_precision_score(bace_y_ensemble_test, bace_elastic_probs)
}

bace_elastic_metrics

{'C': 0.3333333333333333, 'l1_ratio': 0.9}


{'Accuracy': 0.7254901960784313,
 'F1 Score': 0.7407407407407407,
 'ROC-AUC': 0.8159498207885305,
 'PR-AUC': 0.8222281229361339}

In [32]:
# Access the coefficients from elastic net
coefs = pd.Series(bace_best_elastic_model.coef_[0], index=bace_X_ensemble_valid2.columns)

# Filter to get the selected features
selected_features = coefs[coefs != 0].index.tolist()

# Check if 'chemberta2', 'molformer', 'molbert' are in the selected features, if not, add them
for model in ['chemberta2', 'molformer', 'molbert']:
    if model not in selected_features:
        selected_features.append(model)

print("Selected Features:", selected_features)

Selected Features: ['chemberta2', 'molformer', 'molbert', 'chemberta2_feature_31', 'chemberta2_feature_36', 'chemberta2_feature_56', 'chemberta2_feature_133', 'chemberta2_feature_163', 'chemberta2_feature_167', 'chemberta2_feature_187', 'chemberta2_feature_199', 'chemberta2_feature_230', 'chemberta2_feature_256', 'chemberta2_feature_296', 'chemberta2_feature_302', 'chemberta2_feature_367', 'molformer_feature_12', 'molformer_feature_13', 'molformer_feature_33', 'molformer_feature_76', 'molformer_feature_77', 'molformer_feature_81', 'molformer_feature_94', 'molformer_feature_98', 'molformer_feature_110', 'molformer_feature_112', 'molformer_feature_133', 'molformer_feature_149', 'molformer_feature_174', 'molformer_feature_218', 'molformer_feature_222', 'molformer_feature_226', 'molformer_feature_240', 'molformer_feature_242', 'molformer_feature_252', 'molformer_feature_264', 'molformer_feature_274', 'molformer_feature_303', 'molformer_feature_350', 'molformer_feature_352', 'molformer_feat

In [19]:
from group_lasso import LogisticGroupLasso
import numpy as np

# Create an array that specifies the group for each feature
# Assuming the number of features in your dataset:
n_features = bace_X_ensemble_valid2_scaled.shape[1]
groups = np.zeros(n_features, dtype=int)
groups[:3] = 1  # First three features as one group
groups[3:] = 2  # Rest of the features as another group

# Initialize the Logistic Group Lasso model
group_lasso = LogisticGroupLasso(
    groups=groups,
    group_reg=0.05,  # Regularization strength for the groups
    l1_reg=0,        # No L1 regularization
    scale_reg='none', # Do not automatically scale the regularization
    supress_warning=True,
    tol=1e-2,
    random_state=0
)

# Fit the model
group_lasso.fit(bace_X_ensemble_valid2_scaled, bace_y_ensemble_valid2)

# Predict the test set
bace_group_lasso_pred = group_lasso.predict(bace_X_ensemble_test_scaled)
bace_group_lasso_probs = group_lasso.predict_proba(bace_X_ensemble_test_scaled)[:, 1]

bace_two_groups_lasso_metrics = {
    "Accuracy": accuracy_score(bace_y_ensemble_test, bace_group_lasso_pred),
    "F1 Score": f1_score(bace_y_ensemble_test, bace_group_lasso_pred),
    "ROC-AUC": roc_auc_score(bace_y_ensemble_test, bace_group_lasso_probs),
    "PR-AUC": average_precision_score(bace_y_ensemble_test, bace_group_lasso_probs)
}

bace_two_groups_lasso_metrics

# Access the coefficients from group lasso
print(group_lasso.coef_)

[[ 0.00041278 -0.00041278]
 [ 0.0007098  -0.0007098 ]
 [ 0.00147986 -0.00147986]
 ...
 [ 0.01648494 -0.01648494]
 [ 0.04236152 -0.04236152]
 [-0.01457062  0.01457062]]


In [23]:
from group_lasso import LogisticGroupLasso
import numpy as np

# Create an array that specifies the group for each feature
# Assuming the number of features in your dataset:
n_features = bace_X_ensemble_valid2_scaled.shape[1]
groups = np.zeros(n_features, dtype=int)

# Setting the groups according to the new specification:
groups[:3] = 1      # First three features as one group
groups[3:3+384] = 2 # Next 384 features as second group
groups[3+384:3+384+768] = 3 # Next 768 features as third group
groups[3+384+768:] = 4 # Remaining 768 features as fourth group

# Initialize the Logistic Group Lasso model
group_lasso = LogisticGroupLasso(
    groups=groups,
    group_reg=0.05,  # Regularization strength for the groups
    l1_reg=0,        # No L1 regularization
    scale_reg='none', # Do not automatically scale the regularization
    supress_warning=True,
    tol=1e-2,
    random_state=0
)

# Fit the model
group_lasso.fit(bace_X_ensemble_valid2_scaled, bace_y_ensemble_valid2)

# Predict the test set
bace_group_lasso_pred = group_lasso.predict(bace_X_ensemble_test_scaled)
bace_group_lasso_probs = group_lasso.predict_proba(bace_X_ensemble_test_scaled)[:, 1]

# Calculate the metrics
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score

bace_four_groups_lasso_metrics = {
    "Accuracy": accuracy_score(bace_y_ensemble_test, bace_group_lasso_pred),
    "F1 Score": f1_score(bace_y_ensemble_test, bace_group_lasso_pred),
    "ROC-AUC": roc_auc_score(bace_y_ensemble_test, bace_group_lasso_probs),
    "PR-AUC": average_precision_score(bace_y_ensemble_test, bace_group_lasso_probs)
}

bace_four_groups_lasso_metrics

# After fitting the model, you can access the coefficients:
coefficients = group_lasso.coef_

# If you want to see which features are included in the model (non-zero coefficients):
active_features = np.where(coefficients != 0)[0]  # Indices of non-zero coefficients

# Display coefficients grouped by their groups for better interpretation
for i in range(1, 5):  # Assuming there are four groups
    group_coefs = coefficients[groups == i]
    print(f"Group {i} Coefficients:")
    print(group_coefs)


Group 1 Coefficients:
[[-0.00608233  0.00608233]
 [-0.01020019  0.01020019]
 [-0.01778863  0.01778863]]
Group 2 Coefficients:
[[-4.27824448e-03  4.27824448e-03]
 [ 2.00043982e-03 -2.00043982e-03]
 [ 5.65593454e-03 -5.65593454e-03]
 [-2.84756119e-03  2.84756119e-03]
 [-3.46941372e-03  3.46941372e-03]
 [ 8.44188358e-03 -8.44188358e-03]
 [ 6.50015391e-04 -6.50015391e-04]
 [ 5.54850350e-03 -5.54850350e-03]
 [-1.10078938e-02  1.10078938e-02]
 [ 4.60776432e-03 -4.60776432e-03]
 [-1.48495439e-02  1.48495439e-02]
 [-1.14089702e-02  1.14089702e-02]
 [-8.85027365e-04  8.85027365e-04]
 [-2.93247340e-03  2.93247340e-03]
 [ 3.25816073e-03 -3.25816073e-03]
 [-8.66599589e-03  8.66599589e-03]
 [-3.77094531e-03  3.77094531e-03]
 [ 2.35850009e-03 -2.35850009e-03]
 [-1.80257683e-03  1.80257683e-03]
 [-2.13851167e-03  2.13851167e-03]
 [-1.00563169e-03  1.00563169e-03]
 [-3.49407436e-03  3.49407436e-03]
 [ 6.48371781e-03 -6.48371781e-03]
 [ 4.69556978e-03 -4.69556978e-03]
 [-7.80284833e-03  7.80284833e-03]

In [33]:
# # Access the coefficients from lasso
# coefs = pd.Series(lasso_cv.coef_[0], index=bace_X_ensemble_valid2.columns)

# # Filter to get the selected features
# selected_features = coefs[coefs != 0].index.tolist()

# # Check if 'chemberta2', 'molformer', 'molbert' are in the selected features, if not, add them
# for model in ['chemberta2', 'molformer', 'molbert']:
#     if model not in selected_features:
#         selected_features.append(model)

# # Filter the original DataFrame to keep only selected features
# bace_X_ensemble_valid2_selected = bace_X_ensemble_valid2_scaled[selected_features]
# bace_X_ensemble_test_selected = bace_X_ensemble_test_scaled[selected_features]

# print("Selected Features:", selected_features)
# print("Filtered Dataset Shape:", bace_X_ensemble_valid2_selected.shape)


In [34]:
# Access the coefficients from elastic net
coefs = pd.Series(bace_best_elastic_model.coef_[0], index=bace_X_ensemble_valid2.columns)

# Filter to get the selected features
selected_features = coefs[coefs != 0].index.tolist()

# Check if 'chemberta2', 'molformer', 'molbert' are in the selected features, if not, add them
for model in ['chemberta2', 'molformer', 'molbert']:
    if model not in selected_features:
        selected_features.append(model)

# Filter the original DataFrame to keep only selected features
bace_X_ensemble_valid2_selected = bace_X_ensemble_valid2_scaled[selected_features]
bace_X_ensemble_test_selected = bace_X_ensemble_test_scaled[selected_features]

print("Selected Features:", selected_features)
print("Filtered Dataset Shape:", bace_X_ensemble_valid2_selected.shape)

# Now bace_X_ensemble_valid2_selected contains only the features selected by LassoCV

Selected Features: ['chemberta2', 'molformer', 'molbert', 'chemberta2_feature_31', 'chemberta2_feature_36', 'chemberta2_feature_56', 'chemberta2_feature_133', 'chemberta2_feature_163', 'chemberta2_feature_167', 'chemberta2_feature_187', 'chemberta2_feature_199', 'chemberta2_feature_230', 'chemberta2_feature_256', 'chemberta2_feature_296', 'chemberta2_feature_302', 'chemberta2_feature_367', 'molformer_feature_12', 'molformer_feature_13', 'molformer_feature_33', 'molformer_feature_76', 'molformer_feature_77', 'molformer_feature_81', 'molformer_feature_94', 'molformer_feature_98', 'molformer_feature_110', 'molformer_feature_112', 'molformer_feature_133', 'molformer_feature_149', 'molformer_feature_174', 'molformer_feature_218', 'molformer_feature_222', 'molformer_feature_226', 'molformer_feature_240', 'molformer_feature_242', 'molformer_feature_252', 'molformer_feature_264', 'molformer_feature_274', 'molformer_feature_303', 'molformer_feature_350', 'molformer_feature_352', 'molformer_feat

In [35]:
# bace_X_ensemble_valid2_selected = bace_X_ensemble_valid2
# bace_X_ensemble_test_selected = bace_X_ensemble_test
# check shapes
print(bace_X_ensemble_valid2_selected.shape)
print(bace_X_ensemble_test_selected.shape)

(305, 102)
(153, 102)


In [36]:
# Initialize and train the SVM model
from sklearn.svm import SVC

bace_svm_model = SVC(probability=True)
bace_svm_model.fit(bace_X_ensemble_valid2_selected, bace_y_ensemble_valid2)

# Predict the test set
bace_svm_pred = bace_svm_model.predict(bace_X_ensemble_test_selected)
bace_svm_probs = bace_svm_model.predict_proba(bace_X_ensemble_test_selected)

# Calculate the metrics
bace_svm_metrics = {
    'Accuracy': accuracy_score(bace_y_ensemble_test, bace_svm_pred),
    'F1 Score': f1_score(bace_y_ensemble_test, bace_svm_pred),
    'ROC-AUC': roc_auc_score(bace_y_ensemble_test, bace_svm_probs[:, 1]),
    "PR-AUC": average_precision_score(bace_y_ensemble_test, bace_svm_probs[:, 1])
}

bace_svm_metrics

{'Accuracy': 0.6666666666666666,
 'F1 Score': 0.6530612244897959,
 'ROC-AUC': 0.839605734767025,
 'PR-AUC': 0.8576578286336599}

In [37]:
# initailize and use a 5-fold cross-validation to tune the hyperparameters of a random forest model
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

bace_rf_model = RandomForestClassifier(random_state=0)

bace_rf_model.fit(bace_X_ensemble_valid2_selected, bace_y_ensemble_valid2)

# Predict the test set
bace_rf_best_pred = bace_rf_model.predict(bace_X_ensemble_test_selected)
bace_rf_best_probs = bace_rf_model.predict_proba(bace_X_ensemble_test_selected)

# Calculate the metrics
bace_rf_best_metrics = {
    "Accuracy": accuracy_score(bace_y_ensemble_test, bace_rf_best_pred),
    "F1 Score": f1_score(bace_y_ensemble_test, bace_rf_best_pred),
    "ROC-AUC": roc_auc_score(bace_y_ensemble_test, bace_rf_best_probs[:, 1]),
    "PR-AUC": average_precision_score(bace_y_ensemble_test, bace_rf_best_probs[:, 1])
}

bace_rf_best_metrics

{'Accuracy': 0.7124183006535948,
 'F1 Score': 0.7142857142857143,
 'ROC-AUC': 0.8442652329749104,
 'PR-AUC': 0.8569037274563239}

In [38]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score, make_scorer
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.early_stop import no_progress_loss

# Define the hyperparameter space using continuous distributions
bace_xgb_hyperopt_space = {
    'n_estimators': hp.quniform('n_estimators', 50, 200, 50),
    'max_depth': hp.quniform('max_depth', 3, 7, 2),
    'learning_rate': hp.uniform('learning_rate', 0.001, 0.3),
    'subsample': hp.uniform('subsample', 0.5, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0)
}

# Convert float outputs of hp.quniform to int for certain parameters
def objective(params):
    params['n_estimators'] = int(params['n_estimators'])
    params['max_depth'] = int(params['max_depth'])
    
    model = xgb.XGBClassifier(**params, random_state=0)
    
    # Cross-validated AUC score as the objective
    roc_auc = make_scorer(roc_auc_score, response_method='predict_proba')
    score = cross_val_score(model, bace_X_ensemble_valid2_selected, bace_y_ensemble_valid2, scoring=roc_auc, cv=5)
    
    # Minimize the negative ROC AUC score
    return {'loss': -score.mean(), 'status': STATUS_OK}

# Run the Bayesian optimization
trials = Trials()
bace_xgb_best_params = fmin(fn=objective, 
                          space=bace_xgb_hyperopt_space, 
                          algo=tpe.suggest, 
                          max_evals=50, 
                          trials=trials,
                          early_stop_fn=no_progress_loss(10))

print("Best hyperparameters:", bace_xgb_best_params)

 24%|██▍       | 12/50 [00:06<00:21,  1.73trial/s, best loss: -0.9633675213675215]
Best hyperparameters: {'colsample_bytree': 0.5914276458163767, 'learning_rate': 0.24898392374364797, 'max_depth': 4.0, 'n_estimators': 200.0, 'subsample': 0.5111786015355686}


In [39]:
# Convert parameters obtained from Hyperopt to the correct data type
bace_xgb_best_params['n_estimators'] = int(bace_xgb_best_params['n_estimators'])
bace_xgb_best_params['max_depth'] = int(bace_xgb_best_params['max_depth'])

# Initialize and train the XGBoost model with the best parameters
bace_xgb_model = xgb.XGBClassifier(**bace_xgb_best_params, random_state=0)
bace_xgb_model.fit(bace_X_ensemble_valid2_selected, bace_y_ensemble_valid2)

# Predict the test set
bace_xgb_best_pred = bace_xgb_model.predict(bace_X_ensemble_test_selected)
bace_xgb_best_probs = bace_xgb_model.predict_proba(bace_X_ensemble_test_selected)

# Calculate the metrics
bace_xgb_best_metrics = {
    "Accuracy": accuracy_score(bace_y_ensemble_test, bace_xgb_best_pred),
    "F1 Score": f1_score(bace_y_ensemble_test, bace_xgb_best_pred),
    "ROC-AUC": roc_auc_score(bace_y_ensemble_test, bace_xgb_best_probs[:, 1]),
    "PR-AUC": average_precision_score(bace_y_ensemble_test, bace_xgb_best_probs[:, 1])
}

bace_xgb_best_metrics

{'Accuracy': 0.7189542483660131,
 'F1 Score': 0.7361963190184049,
 'ROC-AUC': 0.8243727598566308,
 'PR-AUC': 0.8347558626526247}

In [40]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import KFold
import numpy as np
from sklearn.metrics import roc_auc_score
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.early_stop import no_progress_loss

torch.manual_seed(0)

# Define the neural network model
class SimpleNN(nn.Module):
    def __init__(self, input_size, num_layers, num_neurons, dropout_rate):
        super(SimpleNN, self).__init__()
        layers = [nn.Linear(input_size, num_neurons), nn.ReLU(), nn.Dropout(dropout_rate)]
        
        for _ in range(num_layers - 1):
            layers += [nn.Linear(num_neurons, num_neurons), nn.ReLU(), nn.Dropout(dropout_rate)]
        
        layers += [nn.Linear(num_neurons, 1), nn.Sigmoid()]
        
        self.layers = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.layers(x)

# Objective function for Bayesian optimization
def objective(params):
    kf = KFold(n_splits=5)
    roc_aucs = []

    for train_index, val_index in kf.split(X):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        train_dataset = TensorDataset(torch.tensor(X_train.values.astype(np.float32)), 
                                      torch.tensor(y_train.values.astype(np.float32)).unsqueeze(1))
        train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

        model = SimpleNN(input_size=X_train.shape[1], num_layers=int(params['num_layers']), 
                         num_neurons=int(params['num_neurons']), dropout_rate=params['dropout_rate'])
        criterion = nn.BCELoss()
        optimizer = optim.Adam(model.parameters(), lr=params['learning_rate'])

        model.train()
        for epoch in range(100):
            for inputs, targets in train_loader:
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()

        model.eval()
        with torch.no_grad():
            X_val_tensor = torch.tensor(X_val.values.astype(np.float32))
            y_val_tensor = torch.tensor(y_val.values.astype(np.float32)).unsqueeze(-1)
            outputs = model(X_val_tensor)
            roc_auc = roc_auc_score(y_val_tensor.numpy(), outputs.numpy())
            roc_aucs.append(roc_auc)

    avg_roc_auc = np.mean(roc_aucs)
    return {'loss': -avg_roc_auc, 'status': STATUS_OK}  # Maximize ROC AUC by minimizing the negative ROC AUC

# Hyperparameter space
space = {
    'num_layers': hp.quniform('num_layers', 1, 5, 1),
    'num_neurons': hp.quniform('num_neurons', 16, 256, 1),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.0001), np.log(0.01)),
    'dropout_rate': hp.uniform('dropout_rate', 0.0, 0.5)
}

X = bace_X_ensemble_valid2_selected
y = bace_y_ensemble_valid2

# Run Bayesian optimization
trials = Trials()
bace_nn_best_params = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=50,
            trials=trials,
            early_stop_fn=no_progress_loss(10))

print("Best hyperparameters:", bace_nn_best_params)


 30%|███       | 15/50 [01:59<04:39,  7.97s/trial, best loss: -0.9830746960238793]
Best hyperparameters: {'dropout_rate': 0.07163936610985994, 'learning_rate': 0.00023770875576635344, 'num_layers': 1.0, 'num_neurons': 121.0}


In [41]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.metrics import roc_auc_score

torch.manual_seed(0)

# Define the neural network model again
class SimpleNN(nn.Module):
    def __init__(self, input_size, num_layers, num_neurons, dropout_rate):
        super(SimpleNN, self).__init__()
        layers = [nn.Linear(input_size, num_neurons), nn.ReLU(), nn.Dropout(dropout_rate)]
        
        for _ in range(num_layers - 1):
            layers += [nn.Linear(num_neurons, num_neurons), nn.ReLU(), nn.Dropout(dropout_rate)]
        
        layers += [nn.Linear(num_neurons, 1), nn.Sigmoid()]
        
        self.layers = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.layers(x)

# Convert parameters to the correct format if necessary
bace_nn_best_params = {
    'num_layers': int(bace_nn_best_params['num_layers']),  # Extracted from Bayesian optimization results
    'num_neurons': int(bace_nn_best_params['num_neurons']),  # Extracted from Bayesian optimization results
    'dropout_rate': bace_nn_best_params['dropout_rate'],  # Extracted from Bayesian optimization results
    'learning_rate': bace_nn_best_params['learning_rate']  # Extracted from Bayesian optimization results
}

# Prepare datasets
X_train_tensor = torch.tensor(bace_X_ensemble_valid2_selected.values.astype(np.float32))
y_train_tensor = torch.tensor(bace_y_ensemble_valid2.values.astype(np.float32)).unsqueeze(1)
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

X_test_tensor = torch.tensor(bace_X_ensemble_test_selected.values.astype(np.float32))
y_test_tensor = torch.tensor(bace_y_ensemble_test.values.astype(np.float32)).unsqueeze(1)

# Initialize the model
model = SimpleNN(input_size=bace_X_ensemble_valid2_selected.shape[1], num_layers=bace_nn_best_params['num_layers'], 
                 num_neurons=bace_nn_best_params['num_neurons'], dropout_rate=bace_nn_best_params['dropout_rate'])

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=bace_nn_best_params['learning_rate'])

# Training loop
model.train()
for epoch in range(100):  # Number of epochs can be adjusted
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

# Evaluation on test set
model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    predictions = (outputs > 0.5).float()

    # Calculate metrics
    accuracy = accuracy_score(y_test_tensor.numpy(), predictions.numpy())
    f1 = f1_score(y_test_tensor.numpy(), predictions.numpy())
    roc_auc = roc_auc_score(y_test_tensor.numpy(), outputs.numpy())
    pr_auc = average_precision_score(y_test_tensor.numpy(), outputs.numpy())

    bace_nn_metrics = {
        'Accuracy': accuracy,
        'F1 Score': f1,
        'ROC-AUC': roc_auc,
        'PR-AUC': pr_auc
    }

bace_nn_metrics

{'Accuracy': 0.6928104575163399,
 'F1 Score': 0.7006369426751592,
 'ROC-AUC': 0.8236559139784947,
 'PR-AUC': 0.8364694542830201}

In [42]:
# report all the metrics for ct
bace_metrics_results["Elastic Net"] = bace_elastic_metrics
bace_metrics_results["LASSO"] = bace_lasso_metrics
# bace_metrics_results["Ridge"] = bace_ridge_metrics
bace_metrics_results["SVM"] = bace_svm_metrics
bace_metrics_results["Random Forest"] = bace_rf_best_metrics
bace_metrics_results["XGBoost"] = bace_xgb_best_metrics
bace_metrics_results["Neural Network"] = bace_nn_metrics

bace_metrics_df = pd.DataFrame(bace_metrics_results).T

# keep 3 digits after the decimal point
bace_metrics_df = bace_metrics_df.round(3)

# export as csv
bace_metrics_df.to_csv('./split1_bace_metrics_elasticFeatures.csv')