In [1]:
import polars as pl
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
import optuna
from optuna.samplers import TPESampler


from sklearn.model_selection import train_test_split
from functools import partial


In [2]:
def objective(trial, data, target):
    """Optuna objective: train one XGBoost regressor and return its validation MSE.

    Every call draws a new random 75/25 train/validation split of
    ``data``/``target`` (no ``random_state`` is fixed), samples a set of
    hyper-parameters from ``trial``, trains a booster with ``xgb.train``
    (no explicit ``num_boost_round``), and scores it on the validation part.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        data: Feature table; must be accepted by ``train_test_split`` and
            ``xgb.DMatrix``.
        target: Regression target aligned row-wise with ``data``.

    Returns:
        Mean squared error of the raw (unrounded) predictions on the
        validation split; lower is better.
    """
    train_x, valid_x, train_y, valid_y = train_test_split(data, target, test_size=0.25)
    dtrain = xgb.DMatrix(train_x, label=train_y)
    dvalid = xgb.DMatrix(valid_x, label=valid_y)

    param = {
        "verbosity": 0,
        "objective": "reg:squarederror",
        # use exact for small dataset.
        "tree_method": "exact",
        # defines booster, gblinear for linear functions.
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        # L2 regularization weight.
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        # L1 regularization weight.
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        # sampling according to each tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
    }

    # Tree-specific parameters are only sampled for the tree-based boosters.
    if param["booster"] in ["gbtree", "dart"]:
        # maximum depth of the tree, signifies complexity of the tree.
        param["max_depth"] = trial.suggest_int("max_depth", 3, 9, step=2)
        # minimum child weight, larger the term more conservative the tree.
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        # learning rate (step size shrinkage).
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        # defines how selective algorithm is.
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

    # Dropout-related parameters only exist for the dart booster.
    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    bst = xgb.train(param, dtrain)
    preds = bst.predict(dvalid)
    # This is a regression objective, so score the raw predictions: rounding
    # them to integers (a leftover from classification-style code) would
    # distort the error being minimized.
    mse = mean_squared_error(valid_y, preds)
    return mse

In [3]:
# Data handling: lazily scan every parquet file matching the glob, then
# materialize only the first 100,000 rows as the working sample for tuning.
input_paths = 'inputs/train.parquet/*/*.parquet'
scan_df = pl.scan_parquet(input_paths)
df = scan_df.head(100_000).collect()



In [None]:
# Group the column names by role, based on a substring of the name.
columns = df.columns
features_cols = list(filter(lambda name: 'feature' in name, columns))
responder_cols = list(filter(lambda name: 'responder' in name, columns))
target_col = 'responder_6'

# Feature table and target column taken from the sampled frame.
X = df[features_cols]
y = df[target_col]
print(X.shape, y.shape)

In [None]:
# Bind the sampled data to the objective so Optuna only has to pass the trial.
_objective = partial(objective, data=X, target=y)

# Hyper-parameter search: minimize the objective with a TPE sampler, capped
# at 100 trials and a 600-second timeout.
study = optuna.create_study(
    sampler=TPESampler(),
    direction="minimize",
)
study.optimize(
    _objective,
    timeout=600,
    n_trials=100,
)

# Report the search outcome.
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))
print("  Params: ")
for name in trial.params:
    print("    {}: {}".format(name, trial.params[name]))

In [None]:
# Final training parameters: fixed settings first (verbose logging, squared
# error objective, histogram tree method), then the tuned values of the best
# trial, which would override a fixed key of the same name.
params = dict(verbosity=2, objective="reg:squarederror", tree_method="hist")
params.update(trial.params)
params


In [None]:
# Hard-coded parameter set; rebinding `params` here replaces whatever value
# it held before this cell ran.
# NOTE(review): presumably pasted from the best trial of an earlier study run
# so training can be repeated without re-tuning — confirm.
params = {
    "verbosity": 2,
    "objective": "reg:squarederror",
    "tree_method": "hist",
    "booster": "dart",
    "lambda": 1.3642360825668772e-06,
    "alpha": 0.05061696974935902,
    "subsample": 0.949976951187838,
    "colsample_bytree": 0.7390489200118765,
    "max_depth": 9,
    "min_child_weight": 8,
    "eta": 0.663661345919149,
    "gamma": 0.0003618456071895134,
    "grow_policy": "lossguide",
    "sample_type": "uniform",
    "normalize_type": "tree",
    "rate_drop": 0.00026112632903904653,
    "skip_drop": 0.0043081489165863246,
}

In [9]:
# Materialize the whole scanned dataset (no row limit this time) as NumPy
# arrays. This rebinds X and y, which previously held the sampled frame's
# features and target.
features_lf = scan_df.select(features_cols)
target_lf = scan_df.select(target_col)
X = features_lf.collect().to_numpy()
y = target_lf.collect().to_numpy()
print(X.shape)

: 

: 

In [None]:
# Positional hold-out split: the first 95% of the rows are used for training,
# the remaining 5% for testing (no shuffling).
n = X.shape[0]
n_train = int(n * 0.95)
X_train = X[:n_train]
y_train = y[:n_train]
X_test = X[n_train:]
# The test target must come from y, not from the feature matrix.
y_test = y[n_train:]

# Each DMatrix pairs a feature block with its own labels: training features
# with training targets, test features with test targets.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Train the final booster with the selected parameters.
bst = xgb.train(params, dtrain)


In [None]:
# Score the held-out rows. The model is a regressor, so R^2 is computed on the
# raw predictions; rounding them to integers first would distort the score.
preds = bst.predict(dtest)
r2_score(y_true=y_test, y_pred=preds)