In [5]:
%pip install hyperopt

Collecting hyperopt
  Downloading hyperopt-0.2.7-py2.py3-none-any.whl (1.6 MB)
                                              0.0/1.6 MB ? eta -:--:--
     ------                                   0.2/1.6 MB 7.4 MB/s eta 0:00:01
     -------------                            0.5/1.6 MB 6.6 MB/s eta 0:00:01
     --------------------                     0.8/1.6 MB 6.3 MB/s eta 0:00:01
     --------------------------               1.0/1.6 MB 6.0 MB/s eta 0:00:01
     ---------------------------------        1.3/1.6 MB 6.1 MB/s eta 0:00:01
     ---------------------------------------  1.6/1.6 MB 5.9 MB/s eta 0:00:01
     ---------------------------------------- 1.6/1.6 MB 5.6 MB/s eta 0:00:00
Collecting py4j (from hyperopt)
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
                                              0.0/200.5 kB ? eta -:--:--
     -------------------------------------- 200.5/200.5 kB 6.1 MB/s eta 0:00:00
Installing collected packages: py4j, hyperopt
Successfully in

In [7]:
%pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.1-py3-none-win_amd64.whl (124.9 MB)
                                              0.0/124.9 MB ? eta -:--:--
                                              0.2/124.9 MB 4.1 MB/s eta 0:00:31
                                              0.4/124.9 MB 4.9 MB/s eta 0:00:26
                                              0.4/124.9 MB 4.9 MB/s eta 0:00:26
                                              0.5/124.9 MB 2.8 MB/s eta 0:00:45
                                              0.8/124.9 MB 3.7 MB/s eta 0:00:34
                                              1.1/124.9 MB 4.0 MB/s eta 0:00:31
                                              1.4/124.9 MB 4.3 MB/s eta 0:00:29
                                              1.6/124.9 MB 4.5 MB/s eta 0:00:28
                                              1.9/124.9 MB 4.6 MB/s eta 0:00:27
                                              2.2/124.9 MB 4.7 MB/s eta 0:00:26
                                              

In [25]:
import numpy as np
import pandas as pd
import xgboost as xgb
from hyperopt import STATUS_OK, Trials, fmin, hp, space_eval, tpe
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

In [24]:
def prepare_data(input_file_path, sol):
    """Load the per-position sequence features and FRET target for one solution.

    Reads the CSV at ``input_file_path`` and selects the 20 columns named
    ``{sol}_seq_{base}{n}`` (base in A, C, G, T; n in 2..6) as features and
    ``{sol}_FRET`` as the target.

    Rows with a missing value in any feature *or* in the target are dropped
    from both outputs together, so ``X`` and ``y`` always have the same length
    and the same index. (Filtering only ``X`` leaves ``y`` longer than ``X``
    whenever a feature is missing.)

    Parameters
    ----------
    input_file_path : str or path-like
        CSV file to read.
    sol : str
        Solution prefix used in the column names, e.g. ``'N5'``.

    Returns
    -------
    X : pandas.DataFrame
        Feature matrix with columns renamed to ``A1..A5, C1..C5, G1..G5, T1..T5``.
    y : pandas.Series
        ``{sol}_FRET`` values for the same rows as ``X``.
    """
    df = pd.read_csv(input_file_path, low_memory=False)

    bases = 'ACGT'
    # Source columns are numbered 2..6; they are renamed to 1..5 below, in the same order.
    feature_columns = [f'{sol}_seq_{base}{n}' for base in bases for n in range(2, 7)]
    target_column = f'{sol}_FRET'

    # Drop incomplete rows once, across features and target, to keep them aligned.
    complete_rows = df[feature_columns + [target_column]].dropna(axis=0, how='any')

    # .copy() so that renaming the columns never touches a view of `df`.
    X = complete_rows[feature_columns].copy()
    X.columns = [f'{base}{n}' for base in bases for n in range(1, 6)]

    y = complete_rows[target_column]

    return X, y



def prepare_data_3dots(input_file_path, sol):
    """Load the pattern features and FRET target for one solution.

    Reads the CSV at ``input_file_path``. Every column except the ten
    ``*_seq`` / ``*_FRET`` columns listed below is treated as a feature and
    cast to ``int32``; ``{sol}_FRET`` is the target.

    A row is kept only if all of its features and its target are present.
    The same row mask is applied to ``X`` and ``y``, so they stay aligned.
    (Filtering them independently can yield equal-length but positionally
    mismatched outputs.)

    Parameters
    ----------
    input_file_path : str or path-like
        CSV file to read. It must contain all ten columns in ``columns_to_drop``.
    sol : str
        Solution prefix of the target column, e.g. ``'N5'``.

    Returns
    -------
    X : pandas.DataFrame
        ``int32`` feature matrix.
    y : pandas.Series
        ``{sol}_FRET`` values for the same rows as ``X``.
    """
    df = pd.read_csv(input_file_path, low_memory=False)
    columns_to_drop = ['N5_seq','N5_FRET','N50_seq','N50_FRET','N500_seq','N500_FRET','N5M10_seq','N5M10_FRET','N5M100_seq','N5M100_FRET']

    features = df.drop(columns=columns_to_drop)
    target = df[f'{sol}_FRET']

    # One shared mask: a row survives only if it is complete in X and has a target.
    keep = features.notna().all(axis=1) & target.notna()

    X = features.loc[keep].astype(np.int32)
    y = target.loc[keep]

    return X, y



def baseline_error(y_train, y_test):
    """Score a constant predictor that always outputs the training mean.

    Parameters
    ----------
    y_train : array-like
        Training targets; their mean is the constant prediction.
    y_test : array-like
        Targets the constant prediction is scored against.

    Returns
    -------
    baseline_mae : float
        Mean absolute error of the constant prediction on ``y_test``.
    baseline_rmse : float
        Root mean squared error of the constant prediction on ``y_test``.
    """
    baseline_value = np.mean(y_train)

    # Build the predictions as floats explicitly. np.full_like would inherit
    # y_test's dtype and truncate the mean if the targets are integers.
    baseline_predictions = np.full(np.shape(y_test), baseline_value, dtype=float)

    baseline_mae = mean_absolute_error(y_test, baseline_predictions)
    baseline_rmse = np.sqrt(mean_squared_error(y_test, baseline_predictions))
    return baseline_mae, baseline_rmse


In [28]:
# Hyperopt objective: train one XGBoost model per trial and report its MAE
def objective(params):
    """Train a booster with ``params`` and return its MAE as the Hyperopt loss.

    Reads the module-level ``dtrain``, ``dtest`` and ``y_test``; the calling
    cell must define them before ``fmin`` is run.

    NOTE(review): the loss is computed on ``dtest``/``y_test``. If those are the
    final hold-out split, the reported "best MAE" is optimistically biased;
    consider a separate validation split.

    Parameters
    ----------
    params : dict
        XGBoost training parameters sampled by Hyperopt.

    Returns
    -------
    dict
        ``{'loss': <MAE>, 'status': STATUS_OK}``.
    """
    num_round = 100  # boosting rounds per trial

    # Fit on the training matrix, then score the predictions for dtest
    booster = xgb.train(params, dtrain, num_round)
    mae = mean_absolute_error(y_test, booster.predict(dtest))

    # Floats are shortened to 4 decimals for the log line; other values are shown as-is
    formatted_params = {}
    for key, value in params.items():
        formatted_params[key] = f"{value:.4f}" if isinstance(value, float) else value

    print(f"Trial with params: {formatted_params}, MAE: {mae:.4f}")

    # Hyperopt minimizes 'loss', which here is the MAE
    return {'loss': mae, 'status': STATUS_OK}


# Custom XGBoost objective for MAE
def mae_obj(preds, dtrain):
    """Return the (gradient, hessian) pair used as a custom MAE objective.

    Parameters
    ----------
    preds : numpy.ndarray
        Current predictions.
    dtrain : object exposing ``get_label()``
        Training data whose labels are compared against ``preds``.

    Returns
    -------
    grad : numpy.ndarray
        Sign of ``preds - labels``.
    hess : numpy.ndarray
        Array of ones with the same shape and dtype as ``grad``.
    """
    residual = preds - dtrain.get_label()
    gradient = np.sign(residual)
    # A constant 1 stands in for the second derivative
    hessian = np.ones_like(gradient)
    return gradient, hessian


In [29]:
solution = ['N5', 'N50', 'N500', 'N5M10', 'N5M100']

# Hyperparameter search space sampled by Hyperopt for every solution
space = {
    'max_depth': hp.choice('max_depth', range(5, 10)),
    'eta': hp.uniform('eta', 0.01, 0.3),
    'gamma': hp.uniform('gamma', 0, 1),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'objective': 'reg:squarederror',  # Objective function for regression
    'lambda': hp.uniform('lambda', 1e-3, 10),  # L2 regularization term on weights
    'alpha': hp.uniform('alpha', 1e-3, 10)    # L1 regularization term on weights
}

# Placeholder: replace with the path of the CSV holding the per-position features.
# The path does not depend on `sol`, so it is set once outside the loop.
input_file_path = 'YOUR_input_file_path_for_xgb_hyperopt'

for sol in solution:

    # prepare X, y
    X, y = prepare_data(input_file_path, sol)

    # Split into train and test dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # DMatrix format for XGBoost.
    # objective() reads dtrain, dtest and y_test as globals, so these names must not change.
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    baseline_mae, baseline_rmse = baseline_error(y_train, y_test)

    print(f'{sol}\nbaseline_mae:{baseline_mae:.4f}\nbaseline_rmse:{baseline_rmse:.4f}\n')

    # Run the Hyperopt optimization
    trials = Trials()

    best = fmin(
        fn=objective,
        space=space,
        algo=tpe.suggest,
        max_evals=50,
        trials=trials,
        rstate=np.random.default_rng(42)  # seed TPE so the search is reproducible
    )

    # fmin returns the *index* chosen for hp.choice parameters (0..4 for max_depth),
    # not the value. space_eval maps the result back to the actual parameter values.
    best_params = space_eval(space, best)
    print(f"Best hyperparameters of {sol}:", best_params)

    # Lowest loss (MAE) seen across all trials
    best_loss = min(result['loss'] for result in trials.results)
    print(f"Best MAE of {sol}: {best_loss:.4f}")
    
    

N5
baseline_mae:0.0384
baseline_rmse:0.0499

Trial with params: {'alpha': '2.5124', 'colsample_bytree': '0.6732', 'eta': '0.1635', 'gamma': '0.2705', 'lambda': '6.5767', 'max_depth': 8, 'objective': 'reg:squarederror', 'subsample': '0.7115'}, MAE: 0.0373
Trial with params: {'alpha': '0.3855', 'colsample_bytree': '0.6369', 'eta': '0.0925', 'gamma': '0.0088', 'lambda': '1.3619', 'max_depth': 8, 'objective': 'reg:squarederror', 'subsample': '0.7631'}, MAE: 0.0369
Trial with params: {'alpha': '6.7668', 'colsample_bytree': '0.7903', 'eta': '0.0909', 'gamma': '0.8612', 'lambda': '7.2031', 'max_depth': 5, 'objective': 'reg:squarederror', 'subsample': '0.9256'}, MAE: 0.0375
Trial with params: {'alpha': '8.8831', 'colsample_bytree': '0.9286', 'eta': '0.2764', 'gamma': '0.6758', 'lambda': '1.1753', 'max_depth': 9, 'objective': 'reg:squarederror', 'subsample': '0.9658'}, MAE: 0.0374
Trial with params: {'alpha': '4.3894', 'colsample_bytree': '0.5953', 'eta': '0.2620', 'gamma': '0.9173', 'lambda': 

Trial with params: {'alpha': '4.9423', 'colsample_bytree': '0.8149', 'eta': '0.0341', 'gamma': '0.1492', 'lambda': '0.3333', 'max_depth': 5, 'objective': 'reg:squarederror', 'subsample': '0.6533'}, MAE: 0.0372
Trial with params: {'alpha': '0.5908', 'colsample_bytree': '0.7600', 'eta': '0.0482', 'gamma': '0.7309', 'lambda': '1.7285', 'max_depth': 5, 'objective': 'reg:squarederror', 'subsample': '0.7732'}, MAE: 0.0375
Trial with params: {'alpha': '2.1879', 'colsample_bytree': '0.6820', 'eta': '0.0875', 'gamma': '0.2300', 'lambda': '2.1492', 'max_depth': 5, 'objective': 'reg:squarederror', 'subsample': '0.5576'}, MAE: 0.0373
Trial with params: {'alpha': '4.0398', 'colsample_bytree': '0.7831', 'eta': '0.1470', 'gamma': '0.2903', 'lambda': '0.8381', 'max_depth': 5, 'objective': 'reg:squarederror', 'subsample': '0.8537'}, MAE: 0.0372
Trial with params: {'alpha': '9.8923', 'colsample_bytree': '0.6565', 'eta': '0.1845', 'gamma': '0.4154', 'lambda': '0.0297', 'max_depth': 5, 'objective': 'reg:s

Trial with params: {'alpha': '5.7388', 'colsample_bytree': '0.9625', 'eta': '0.2019', 'gamma': '0.9612', 'lambda': '3.5516', 'max_depth': 6, 'objective': 'reg:squarederror', 'subsample': '0.6208'}, MAE: 0.0473
Trial with params: {'alpha': '6.9623', 'colsample_bytree': '0.7318', 'eta': '0.2132', 'gamma': '0.2118', 'lambda': '7.5219', 'max_depth': 7, 'objective': 'reg:squarederror', 'subsample': '0.9712'}, MAE: 0.0463
Trial with params: {'alpha': '3.8473', 'colsample_bytree': '0.6741', 'eta': '0.2042', 'gamma': '0.7933', 'lambda': '8.2153', 'max_depth': 8, 'objective': 'reg:squarederror', 'subsample': '0.8600'}, MAE: 0.0470
Trial with params: {'alpha': '7.7024', 'colsample_bytree': '0.8994', 'eta': '0.2386', 'gamma': '0.4517', 'lambda': '6.1656', 'max_depth': 7, 'objective': 'reg:squarederror', 'subsample': '0.5794'}, MAE: 0.0469
Trial with params: {'alpha': '8.6992', 'colsample_bytree': '0.8176', 'eta': '0.2708', 'gamma': '0.6032', 'lambda': '3.2245', 'max_depth': 6, 'objective': 'reg:s

Trial with params: {'alpha': '0.7282', 'colsample_bytree': '0.7195', 'eta': '0.1672', 'gamma': '0.6607', 'lambda': '2.0982', 'max_depth': 7, 'objective': 'reg:squarederror', 'subsample': '0.6510'}, MAE: 0.0555
Trial with params: {'alpha': '8.1654', 'colsample_bytree': '0.6563', 'eta': '0.2116', 'gamma': '0.1883', 'lambda': '8.6813', 'max_depth': 7, 'objective': 'reg:squarederror', 'subsample': '0.9384'}, MAE: 0.0543
Trial with params: {'alpha': '3.6933', 'colsample_bytree': '0.7878', 'eta': '0.2837', 'gamma': '0.4046', 'lambda': '5.2967', 'max_depth': 5, 'objective': 'reg:squarederror', 'subsample': '0.9115'}, MAE: 0.0547
Trial with params: {'alpha': '5.6518', 'colsample_bytree': '0.6560', 'eta': '0.0159', 'gamma': '0.0601', 'lambda': '5.2184', 'max_depth': 9, 'objective': 'reg:squarederror', 'subsample': '0.7354'}, MAE: 0.0547
Trial with params: {'alpha': '7.3231', 'colsample_bytree': '0.8071', 'eta': '0.2104', 'gamma': '0.6525', 'lambda': '9.5039', 'max_depth': 7, 'objective': 'reg:s

Trial with params: {'alpha': '9.6289', 'colsample_bytree': '0.8050', 'eta': '0.1867', 'gamma': '0.0636', 'lambda': '3.3676', 'max_depth': 5, 'objective': 'reg:squarederror', 'subsample': '0.5823'}, MAE: 0.0675
Trial with params: {'alpha': '3.6685', 'colsample_bytree': '0.7423', 'eta': '0.0582', 'gamma': '0.0092', 'lambda': '5.2947', 'max_depth': 6, 'objective': 'reg:squarederror', 'subsample': '0.6831'}, MAE: 0.0648
Trial with params: {'alpha': '3.4644', 'colsample_bytree': '0.7275', 'eta': '0.0500', 'gamma': '0.1887', 'lambda': '5.0709', 'max_depth': 9, 'objective': 'reg:squarederror', 'subsample': '0.7178'}, MAE: 0.0673
Trial with params: {'alpha': '6.7169', 'colsample_bytree': '0.5314', 'eta': '0.0760', 'gamma': '0.2388', 'lambda': '5.7204', 'max_depth': 8, 'objective': 'reg:squarederror', 'subsample': '0.6880'}, MAE: 0.0690
Trial with params: {'alpha': '8.8844', 'colsample_bytree': '0.6282', 'eta': '0.0374', 'gamma': '0.3874', 'lambda': '6.9531', 'max_depth': 6, 'objective': 'reg:s

In [30]:
solution = ['N5', 'N50', 'N500', 'N5M10', 'N5M100']

# Hyperparameter search space sampled by Hyperopt for every solution
space = {
    'max_depth': hp.choice('max_depth', range(5, 10)),
    'eta': hp.uniform('eta', 0.01, 0.3),
    'gamma': hp.uniform('gamma', 0, 1),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'objective': 'reg:squarederror',  # Objective function for regression
    'lambda': hp.uniform('lambda', 1e-3, 10),  # L2 regularization term on weights
    'alpha': hp.uniform('alpha', 1e-3, 10)    # L1 regularization term on weights
}

# Placeholder: replace with the path of the CSV holding the 3-dot pattern features.
# The path does not depend on `sol`, so it is set once outside the loop.
input_file_path_3dots = 'YOUR_input_file_path_for_3dots_patterns'

for sol in solution:

    # prepare X, y
    X, y = prepare_data_3dots(input_file_path_3dots, sol)

    # Split into train and test dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # DMatrix format for XGBoost.
    # objective() reads dtrain, dtest and y_test as globals, so these names must not change.
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    baseline_mae, baseline_rmse = baseline_error(y_train, y_test)

    print(f'{sol}\nbaseline_mae:{baseline_mae:.4f}\nbaseline_rmse:{baseline_rmse:.4f}\n')

    # Run the Hyperopt optimization
    trials = Trials()

    best = fmin(
        fn=objective,
        space=space,
        algo=tpe.suggest,
        max_evals=50,
        trials=trials,
        rstate=np.random.default_rng(42)  # seed TPE so the search is reproducible
    )

    # fmin returns the *index* chosen for hp.choice parameters (0..4 for max_depth),
    # not the value. space_eval maps the result back to the actual parameter values.
    best_params = space_eval(space, best)
    print(f"Best hyperparameters of {sol}:", best_params)

    # Lowest loss (MAE) seen across all trials
    best_loss = min(result['loss'] for result in trials.results)
    print(f"Best MAE of {sol}: {best_loss:.4f}\n")

N5
baseline_mae:0.0384
baseline_rmse:0.0499

Trial with params: {'alpha': '6.0336', 'colsample_bytree': '0.8091', 'eta': '0.2501', 'gamma': '0.1769', 'lambda': '3.8416', 'max_depth': 8, 'objective': 'reg:squarederror', 'subsample': '0.5302'}, MAE: 0.0372
Trial with params: {'alpha': '4.3644', 'colsample_bytree': '0.5031', 'eta': '0.0167', 'gamma': '0.4204', 'lambda': '4.1044', 'max_depth': 5, 'objective': 'reg:squarederror', 'subsample': '0.6815'}, MAE: 0.0375
Trial with params: {'alpha': '8.4182', 'colsample_bytree': '0.9300', 'eta': '0.1099', 'gamma': '0.3302', 'lambda': '6.9343', 'max_depth': 6, 'objective': 'reg:squarederror', 'subsample': '0.6901'}, MAE: 0.0374
Trial with params: {'alpha': '2.2606', 'colsample_bytree': '0.9913', 'eta': '0.2011', 'gamma': '0.1630', 'lambda': '9.5456', 'max_depth': 9, 'objective': 'reg:squarederror', 'subsample': '0.6589'}, MAE: 0.0371
Trial with params: {'alpha': '8.9450', 'colsample_bytree': '0.8784', 'eta': '0.1548', 'gamma': '0.9862', 'lambda': 

Trial with params: {'alpha': '9.3301', 'colsample_bytree': '0.7430', 'eta': '0.1015', 'gamma': '0.0935', 'lambda': '6.6531', 'max_depth': 5, 'objective': 'reg:squarederror', 'subsample': '0.9073'}, MAE: 0.0371
Trial with params: {'alpha': '4.1966', 'colsample_bytree': '0.6223', 'eta': '0.0579', 'gamma': '0.2090', 'lambda': '4.5378', 'max_depth': 6, 'objective': 'reg:squarederror', 'subsample': '0.9378'}, MAE: 0.0372
Trial with params: {'alpha': '6.8057', 'colsample_bytree': '0.5291', 'eta': '0.2103', 'gamma': '0.0028', 'lambda': '3.5509', 'max_depth': 8, 'objective': 'reg:squarederror', 'subsample': '0.7940'}, MAE: 0.0369
Trial with params: {'alpha': '3.6669', 'colsample_bytree': '0.5928', 'eta': '0.2429', 'gamma': '0.7362', 'lambda': '5.5060', 'max_depth': 5, 'objective': 'reg:squarederror', 'subsample': '0.8755'}, MAE: 0.0374
Trial with params: {'alpha': '1.1651', 'colsample_bytree': '0.6921', 'eta': '0.2790', 'gamma': '0.1370', 'lambda': '4.2912', 'max_depth': 9, 'objective': 'reg:s

Trial with params: {'alpha': '9.9651', 'colsample_bytree': '0.7599', 'eta': '0.0955', 'gamma': '0.1615', 'lambda': '1.1226', 'max_depth': 6, 'objective': 'reg:squarederror', 'subsample': '0.9098'}, MAE: 0.0408
Trial with params: {'alpha': '5.6791', 'colsample_bytree': '0.9022', 'eta': '0.1404', 'gamma': '0.0076', 'lambda': '2.2631', 'max_depth': 7, 'objective': 'reg:squarederror', 'subsample': '0.6656'}, MAE: 0.0405
Trial with params: {'alpha': '3.0411', 'colsample_bytree': '0.7075', 'eta': '0.1408', 'gamma': '0.3572', 'lambda': '2.4258', 'max_depth': 7, 'objective': 'reg:squarederror', 'subsample': '0.6685'}, MAE: 0.0410
Trial with params: {'alpha': '3.4712', 'colsample_bytree': '0.8017', 'eta': '0.0501', 'gamma': '0.1813', 'lambda': '2.8871', 'max_depth': 7, 'objective': 'reg:squarederror', 'subsample': '0.6539'}, MAE: 0.0408
Trial with params: {'alpha': '5.7044', 'colsample_bytree': '0.9066', 'eta': '0.0837', 'gamma': '0.2964', 'lambda': '2.9233', 'max_depth': 7, 'objective': 'reg:s

Trial with params: {'alpha': '5.1293', 'colsample_bytree': '0.5366', 'eta': '0.1617', 'gamma': '0.0815', 'lambda': '4.8168', 'max_depth': 6, 'objective': 'reg:squarederror', 'subsample': '0.5038'}, MAE: 0.0458
Trial with params: {'alpha': '9.2775', 'colsample_bytree': '0.7150', 'eta': '0.1827', 'gamma': '0.5512', 'lambda': '4.7112', 'max_depth': 5, 'objective': 'reg:squarederror', 'subsample': '0.8220'}, MAE: 0.0467
Trial with params: {'alpha': '0.6748', 'colsample_bytree': '0.9882', 'eta': '0.0948', 'gamma': '0.6324', 'lambda': '5.2551', 'max_depth': 6, 'objective': 'reg:squarederror', 'subsample': '0.9070'}, MAE: 0.0464
Trial with params: {'alpha': '3.1459', 'colsample_bytree': '0.6243', 'eta': '0.2664', 'gamma': '0.3707', 'lambda': '4.6032', 'max_depth': 9, 'objective': 'reg:squarederror', 'subsample': '0.6751'}, MAE: 0.0462
Trial with params: {'alpha': '2.5156', 'colsample_bytree': '0.5328', 'eta': '0.2389', 'gamma': '0.0621', 'lambda': '8.9486', 'max_depth': 5, 'objective': 'reg:s

Trial with params: {'alpha': '3.9499', 'colsample_bytree': '0.8041', 'eta': '0.1429', 'gamma': '0.3076', 'lambda': '5.1012', 'max_depth': 5, 'objective': 'reg:squarederror', 'subsample': '0.5148'}, MAE: 0.0544
Trial with params: {'alpha': '0.1553', 'colsample_bytree': '0.9895', 'eta': '0.0641', 'gamma': '0.9033', 'lambda': '8.8807', 'max_depth': 7, 'objective': 'reg:squarederror', 'subsample': '0.6882'}, MAE: 0.0552
Trial with params: {'alpha': '6.7976', 'colsample_bytree': '0.8049', 'eta': '0.1953', 'gamma': '0.1823', 'lambda': '7.9204', 'max_depth': 5, 'objective': 'reg:squarederror', 'subsample': '0.8876'}, MAE: 0.0535
Trial with params: {'alpha': '6.6066', 'colsample_bytree': '0.7984', 'eta': '0.1544', 'gamma': '0.8192', 'lambda': '5.2602', 'max_depth': 5, 'objective': 'reg:squarederror', 'subsample': '0.5405'}, MAE: 0.0559
Trial with params: {'alpha': '7.0033', 'colsample_bytree': '0.7127', 'eta': '0.2612', 'gamma': '0.3673', 'lambda': '0.6327', 'max_depth': 5, 'objective': 'reg:s

Trial with params: {'alpha': '3.2145', 'colsample_bytree': '0.9558', 'eta': '0.1953', 'gamma': '0.2187', 'lambda': '3.0605', 'max_depth': 6, 'objective': 'reg:squarederror', 'subsample': '0.9224'}, MAE: 0.0655
Trial with params: {'alpha': '6.4787', 'colsample_bytree': '0.7974', 'eta': '0.2229', 'gamma': '0.3329', 'lambda': '6.2318', 'max_depth': 6, 'objective': 'reg:squarederror', 'subsample': '0.6499'}, MAE: 0.0674
Trial with params: {'alpha': '1.1223', 'colsample_bytree': '0.8840', 'eta': '0.1801', 'gamma': '0.2102', 'lambda': '8.6353', 'max_depth': 6, 'objective': 'reg:squarederror', 'subsample': '0.6228'}, MAE: 0.0653
Trial with params: {'alpha': '6.8356', 'colsample_bytree': '0.8046', 'eta': '0.1685', 'gamma': '0.0041', 'lambda': '9.5899', 'max_depth': 7, 'objective': 'reg:squarederror', 'subsample': '0.7789'}, MAE: 0.0632
Trial with params: {'alpha': '2.2266', 'colsample_bytree': '0.9595', 'eta': '0.2493', 'gamma': '0.0614', 'lambda': '6.6188', 'max_depth': 8, 'objective': 'reg:s