In [6]:
import os
import pandas as pd
import lightgbm as lgb
import optuna
from optuna.integration.wandb import WeightsAndBiasesCallback
import wandb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, root_mean_squared_error, r2_score
from dotenv import load_dotenv
import matplotlib.pyplot as plt
from sklearn.model_selection import TimeSeriesSplit

In [2]:
load_dotenv()
os.environ["WANDB_API_KEY"] = os.getenv("WANDB_API_KEY")
os.environ["WANDB_DIR"] = "../data/wandb_logs"
wandb.login(key=os.environ["WANDB_API_KEY"])

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/brupesh/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mbrupeshmit[0m ([33mbrupeshmit-massachusetts-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
train = pd.read_parquet("../data/train_clean_v3.parquet")
test = pd.read_parquet("../data/test_clean_v3.parquet")
target_col = "label"
y = train[target_col]
X = train.drop(columns=[target_col])

test_ids = test.iloc[:, 0]
test = test.drop(columns=[test.columns[0]])

feature_df = pd.read_csv("../data/resources/lgbm_feature_importance.csv")
top_features = feature_df.sort_values(by="importance", ascending=False)["feature"].iloc[:400].tolist()
X = X[top_features]
test = test[top_features]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.5, random_state=42)
dtrain = lgb.Dataset(X_train, label=y_train)
dval = lgb.Dataset(X_val, label=y_val)

In [7]:
tscv = TimeSeriesSplit(n_splits=5)

In [8]:
def objective(trial):
    params = {
        "objective": "regression",
        "metric": "rmse",
        "boosting_type": trial.suggest_categorical("boosting_type", ["gbdt", "dart"]),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.2, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 16, 512),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "feature_pre_filter": False,
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "device": "gpu",
        "gpu_platform_id": 0,
        "gpu_device_id": 0,
        "verbosity": -1
    }

    rmse_scores = []
    train_rmse_scores = []
    r2_scores = []
    pearson_scores = []
    overfit_scores = []

    for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        dtrain = lgb.Dataset(X_train, label=y_train)
        dval = lgb.Dataset(X_val, label=y_val)

        model = lgb.train(
            params,
            dtrain,
            num_boost_round=500,
            valid_sets=[dval],
            callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)]
        )

        preds_val = model.predict(X_val)
        preds_train = model.predict(X_train)

        rmse_val = root_mean_squared_error(y_val, preds_val)
        rmse_train = root_mean_squared_error(y_train, preds_train)
        r2_val = r2_score(y_val, preds_val)
        pearson = pd.Series(preds_val).corr(pd.Series(y_val))
        overfit = rmse_val - rmse_train

        rmse_scores.append(rmse_val)
        train_rmse_scores.append(rmse_train)
        r2_scores.append(r2_val)
        pearson_scores.append(pearson)
        overfit_scores.append(overfit)

    wandb.log({
        "avg_rmse": np.mean(rmse_scores),
        "avg_train_rmse": np.mean(train_rmse_scores),
        "avg_r2": np.mean(r2_scores),
        "avg_pearson": np.mean(pearson_scores),
        "avg_overfit": np.mean(overfit_scores)
    }, step=trial.number)

    return np.mean(rmse_scores)


study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=30, callbacks=[WeightsAndBiasesCallback()])

[I 2025-08-14 23:01:05,701] A new study created in memory with name: no-name-e75d5dac-fe5c-4937-bd0c-dbfc1607d144
  study.optimize(objective, n_trials=30, callbacks=[WeightsAndBiasesCallback()])


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[16]	valid_0's rmse: 0.978829
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 1.00556
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 1.00975
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[6]	valid_0's rmse: 0.97596
Training until validation scores don't improve for 50 rounds


[I 2025-08-14 23:01:19,263] Trial 0 finished with value: 1.0067690801216946 and parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.007669544406706384, 'num_leaves': 334, 'max_depth': 10, 'min_child_samples': 72, 'subsample': 0.9551029921658695, 'colsample_bytree': 0.5362309999938235, 'reg_alpha': 0.0006823700742757026, 'reg_lambda': 0.0005581981230231926}. Best is trial 0 with value: 1.0067690801216946.


Early stopping, best iteration is:
[1]	valid_0's rmse: 1.06375




[100]	valid_0's rmse: 0.979003
[200]	valid_0's rmse: 0.978892
[300]	valid_0's rmse: 0.978972
[400]	valid_0's rmse: 0.979346
[500]	valid_0's rmse: 0.979787




[100]	valid_0's rmse: 1.00548
[200]	valid_0's rmse: 1.00541
[300]	valid_0's rmse: 1.00557
[400]	valid_0's rmse: 1.00598
[500]	valid_0's rmse: 1.00658




[100]	valid_0's rmse: 1.00969
[200]	valid_0's rmse: 1.00988
[300]	valid_0's rmse: 1.01003
[400]	valid_0's rmse: 1.01042
[500]	valid_0's rmse: 1.0107




[100]	valid_0's rmse: 0.976696
[200]	valid_0's rmse: 0.977029
[300]	valid_0's rmse: 0.977384
[400]	valid_0's rmse: 0.978056
[500]	valid_0's rmse: 0.978882




[100]	valid_0's rmse: 1.06412
[200]	valid_0's rmse: 1.06456
[300]	valid_0's rmse: 1.06456
[400]	valid_0's rmse: 1.06459
[500]	valid_0's rmse: 1.06464


[I 2025-08-14 23:02:07,918] Trial 1 finished with value: 1.008117773326926 and parameters: {'boosting_type': 'dart', 'learning_rate': 0.0010780528347317585, 'num_leaves': 268, 'max_depth': 6, 'min_child_samples': 59, 'subsample': 0.7499711942631283, 'colsample_bytree': 0.6565249995963556, 'reg_alpha': 0.9986471040593062, 'reg_lambda': 5.035824803727473e-08}. Best is trial 0 with value: 1.0067690801216946.


[100]	valid_0's rmse: 1.05693
[200]	valid_0's rmse: 1.06316
[300]	valid_0's rmse: 1.07026
[400]	valid_0's rmse: 1.07334
[500]	valid_0's rmse: 1.07561




[100]	valid_0's rmse: 1.07788
[200]	valid_0's rmse: 1.08438
[300]	valid_0's rmse: 1.09252
[400]	valid_0's rmse: 1.09335
[500]	valid_0's rmse: 1.09818




[100]	valid_0's rmse: 1.10664
[200]	valid_0's rmse: 1.13121
[300]	valid_0's rmse: 1.13627
[400]	valid_0's rmse: 1.14135
[500]	valid_0's rmse: 1.14676




[100]	valid_0's rmse: 1.02804
[200]	valid_0's rmse: 1.02378
[300]	valid_0's rmse: 1.03162
[400]	valid_0's rmse: 1.03615
[500]	valid_0's rmse: 1.04088




[100]	valid_0's rmse: 1.2042
[200]	valid_0's rmse: 1.25216
[300]	valid_0's rmse: 1.25597
[400]	valid_0's rmse: 1.28123
[500]	valid_0's rmse: 1.29067


[I 2025-08-14 23:03:58,080] Trial 2 finished with value: 1.1304223556920616 and parameters: {'boosting_type': 'dart', 'learning_rate': 0.13394740083605072, 'num_leaves': 100, 'max_depth': 10, 'min_child_samples': 54, 'subsample': 0.905887880025497, 'colsample_bytree': 0.7546219171017449, 'reg_alpha': 4.2175395576886244e-08, 'reg_lambda': 0.005737026285058065}. Best is trial 0 with value: 1.0067690801216946.


[100]	valid_0's rmse: 0.992058
[200]	valid_0's rmse: 0.999431
[300]	valid_0's rmse: 1.01678
[400]	valid_0's rmse: 1.03208
[500]	valid_0's rmse: 1.03951




[100]	valid_0's rmse: 1.01022
[200]	valid_0's rmse: 1.01594
[300]	valid_0's rmse: 1.02078
[400]	valid_0's rmse: 1.02544
[500]	valid_0's rmse: 1.03364




[100]	valid_0's rmse: 1.01633
[200]	valid_0's rmse: 1.02316
[300]	valid_0's rmse: 1.0376
[400]	valid_0's rmse: 1.04923
[500]	valid_0's rmse: 1.05613




[100]	valid_0's rmse: 0.98164
[200]	valid_0's rmse: 0.986242
[300]	valid_0's rmse: 0.990039
[400]	valid_0's rmse: 0.992166
[500]	valid_0's rmse: 0.994514




[100]	valid_0's rmse: 1.06741
[200]	valid_0's rmse: 1.0789
[300]	valid_0's rmse: 1.09191
[400]	valid_0's rmse: 1.10724
[500]	valid_0's rmse: 1.13035


[I 2025-08-14 23:05:59,179] Trial 3 finished with value: 1.0508274976751395 and parameters: {'boosting_type': 'dart', 'learning_rate': 0.01212868140856651, 'num_leaves': 361, 'max_depth': 11, 'min_child_samples': 94, 'subsample': 0.5421144525138745, 'colsample_bytree': 0.5383573976996119, 'reg_alpha': 2.611800019476048e-06, 'reg_lambda': 7.14389483798403e-05}. Best is trial 0 with value: 1.0067690801216946.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[5]	valid_0's rmse: 0.979418
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 1.00569
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 1.00946
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2]	valid_0's rmse: 0.975963
Training until validation scores don't improve for 50 rounds
[100]	valid_0's rmse: 1.06802


[I 2025-08-14 23:06:08,966] Trial 4 finished with value: 1.0059326070367782 and parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.008587957108151273, 'num_leaves': 177, 'max_depth': 6, 'min_child_samples': 40, 'subsample': 0.6430034916658506, 'colsample_bytree': 0.8841782373838412, 'reg_alpha': 0.00013476131207986218, 'reg_lambda': 8.112792825074774e-05}. Best is trial 4 with value: 1.0059326070367782.


Early stopping, best iteration is:
[75]	valid_0's rmse: 1.05913
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[32]	valid_0's rmse: 0.978948
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 1.00574
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 1.0095
Training until validation scores don't improve for 50 rounds
[100]	valid_0's rmse: 0.97653
Early stopping, best iteration is:
[59]	valid_0's rmse: 0.976003
Training until validation scores don't improve for 50 rounds


[I 2025-08-14 23:06:24,198] Trial 5 finished with value: 1.0067388161755737 and parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.001124181489293956, 'num_leaves': 367, 'max_depth': 10, 'min_child_samples': 56, 'subsample': 0.5403593986889732, 'colsample_bytree': 0.6423424261799229, 'reg_alpha': 1.07378954493895e-06, 'reg_lambda': 8.625500858617912e-07}. Best is trial 4 with value: 1.0059326070367782.


Early stopping, best iteration is:
[43]	valid_0's rmse: 1.0635
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[10]	valid_0's rmse: 0.979314
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 1.00575
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[23]	valid_0's rmse: 1.00946
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[5]	valid_0's rmse: 0.976121
Training until validation scores don't improve for 50 rounds
[100]	valid_0's rmse: 1.06326
[200]	valid_0's rmse: 1.06231
[300]	valid_0's rmse: 1.06211
Early stopping, best iteration is:
[271]	valid_0's rmse: 1.06204


[I 2025-08-14 23:06:34,726] Trial 6 finished with value: 1.0065352358123874 and parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.001386485094265418, 'num_leaves': 366, 'max_depth': 6, 'min_child_samples': 89, 'subsample': 0.5740172211113079, 'colsample_bytree': 0.7952315773843055, 'reg_alpha': 0.5450831687252766, 'reg_lambda': 1.1077247303377053e-07}. Best is trial 4 with value: 1.0059326070367782.


[100]	valid_0's rmse: 1.00095
[200]	valid_0's rmse: 1.01871
[300]	valid_0's rmse: 1.04205
[400]	valid_0's rmse: 1.05199
[500]	valid_0's rmse: 1.06102




[100]	valid_0's rmse: 1.01645
[200]	valid_0's rmse: 1.0196
[300]	valid_0's rmse: 1.02631
[400]	valid_0's rmse: 1.03368
[500]	valid_0's rmse: 1.03997




[100]	valid_0's rmse: 1.03242
[200]	valid_0's rmse: 1.04236
[300]	valid_0's rmse: 1.06708
[400]	valid_0's rmse: 1.07573
[500]	valid_0's rmse: 1.08151




[100]	valid_0's rmse: 0.984072
[200]	valid_0's rmse: 0.987838
[300]	valid_0's rmse: 0.993338
[400]	valid_0's rmse: 0.999663
[500]	valid_0's rmse: 1.00303




[100]	valid_0's rmse: 1.07334
[200]	valid_0's rmse: 1.08862
[300]	valid_0's rmse: 1.11636
[400]	valid_0's rmse: 1.13689
[500]	valid_0's rmse: 1.14906


[I 2025-08-14 23:09:13,598] Trial 7 finished with value: 1.066917904156321 and parameters: {'boosting_type': 'dart', 'learning_rate': 0.015973281414561367, 'num_leaves': 290, 'max_depth': 11, 'min_child_samples': 50, 'subsample': 0.7021387966600703, 'colsample_bytree': 0.7514820780352044, 'reg_alpha': 0.07963964340105449, 'reg_lambda': 1.991564951147465}. Best is trial 4 with value: 1.0059326070367782.


[100]	valid_0's rmse: 1.05046
[200]	valid_0's rmse: 1.06237
[300]	valid_0's rmse: 1.07239
[400]	valid_0's rmse: 1.08085
[500]	valid_0's rmse: 1.08421




[100]	valid_0's rmse: 1.05144
[200]	valid_0's rmse: 1.06246
[300]	valid_0's rmse: 1.07072
[400]	valid_0's rmse: 1.07571
[500]	valid_0's rmse: 1.07791




[100]	valid_0's rmse: 1.05703
[200]	valid_0's rmse: 1.08467
[300]	valid_0's rmse: 1.0919
[400]	valid_0's rmse: 1.09666
[500]	valid_0's rmse: 1.09999




[100]	valid_0's rmse: 1.00121
[200]	valid_0's rmse: 1.00378
[300]	valid_0's rmse: 1.01454
[400]	valid_0's rmse: 1.02298
[500]	valid_0's rmse: 1.02707




[100]	valid_0's rmse: 1.13759
[200]	valid_0's rmse: 1.16238
[300]	valid_0's rmse: 1.1755
[400]	valid_0's rmse: 1.19393
[500]	valid_0's rmse: 1.20318


[I 2025-08-14 23:12:28,917] Trial 8 finished with value: 1.098469918239628 and parameters: {'boosting_type': 'dart', 'learning_rate': 0.05397689618935362, 'num_leaves': 500, 'max_depth': 12, 'min_child_samples': 64, 'subsample': 0.6461885768031791, 'colsample_bytree': 0.556558243210183, 'reg_alpha': 0.0037441796882596106, 'reg_lambda': 0.3608585384196598}. Best is trial 4 with value: 1.0059326070367782.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[5]	valid_0's rmse: 0.978969
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 1.00569
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 1.00949
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[4]	valid_0's rmse: 0.97602
Training until validation scores don't improve for 50 rounds


[I 2025-08-14 23:12:41,618] Trial 9 finished with value: 1.0067280409479396 and parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.0030643592111061877, 'num_leaves': 124, 'max_depth': 11, 'min_child_samples': 66, 'subsample': 0.5516679471322317, 'colsample_bytree': 0.7078117143128917, 'reg_alpha': 5.997299689126069e-06, 'reg_lambda': 0.20637292439617824}. Best is trial 4 with value: 1.0059326070367782.


Early stopping, best iteration is:
[7]	valid_0's rmse: 1.06347
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[8]	valid_0's rmse: 0.977289
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 1.00612
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[10]	valid_0's rmse: 1.00818
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2]	valid_0's rmse: 0.976119
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[15]	valid_0's rmse: 1.06269


[I 2025-08-14 23:12:48,894] Trial 10 finished with value: 1.0060790425023007 and parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.03521321830322877, 'num_leaves': 20, 'max_depth': 3, 'min_child_samples': 18, 'subsample': 0.8044101587367347, 'colsample_bytree': 0.95259045123296, 'reg_alpha': 8.836833169294116e-05, 'reg_lambda': 1.1562762148854702e-05}. Best is trial 4 with value: 1.0059326070367782.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[8]	valid_0's rmse: 0.976793
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 1.00607
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[30]	valid_0's rmse: 1.00803
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2]	valid_0's rmse: 0.976114
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[13]	valid_0's rmse: 1.06295


[I 2025-08-14 23:12:56,220] Trial 11 finished with value: 1.005992187255805 and parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.030309178235529276, 'num_leaves': 19, 'max_depth': 3, 'min_child_samples': 19, 'subsample': 0.843034511004496, 'colsample_bytree': 0.9653895045446312, 'reg_alpha': 5.159274666642449e-05, 'reg_lambda': 1.719833590563841e-05}. Best is trial 4 with value: 1.0059326070367782.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[32]	valid_0's rmse: 0.977823
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 1.00583
Training until validation scores don't improve for 50 rounds
[100]	valid_0's rmse: 1.00861
Early stopping, best iteration is:
[55]	valid_0's rmse: 1.00817
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[10]	valid_0's rmse: 0.976118
Training until validation scores don't improve for 50 rounds


[I 2025-08-14 23:13:03,881] Trial 12 finished with value: 1.0062160037222267 and parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.005338802914196657, 'num_leaves': 150, 'max_depth': 3, 'min_child_samples': 21, 'subsample': 0.8469579453260537, 'colsample_bytree': 0.9855736286415341, 'reg_alpha': 6.227176550011169e-05, 'reg_lambda': 9.095473810852601e-06}. Best is trial 4 with value: 1.0059326070367782.


[100]	valid_0's rmse: 1.06361
Early stopping, best iteration is:
[73]	valid_0's rmse: 1.06314
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 0.97927
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 1.00551
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 1.00966
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2]	valid_0's rmse: 0.975964
Training until validation scores don't improve for 50 rounds


[I 2025-08-14 23:13:11,687] Trial 13 finished with value: 1.0067521110297808 and parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.030305060046851556, 'num_leaves': 17, 'max_depth': 5, 'min_child_samples': 36, 'subsample': 0.6425589029777542, 'colsample_bytree': 0.8810363809165741, 'reg_alpha': 0.00737700836615031, 'reg_lambda': 0.0029292129841951525}. Best is trial 4 with value: 1.0059326070367782.


Early stopping, best iteration is:
[1]	valid_0's rmse: 1.06336
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 0.981068
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 1.0063
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 1.01586
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 0.976835
Training until validation scores don't improve for 50 rounds


[I 2025-08-14 23:13:25,682] Trial 14 finished with value: 1.0083509142257703 and parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.0997567158417246, 'num_leaves': 189, 'max_depth': 8, 'min_child_samples': 9, 'subsample': 0.8233224547226778, 'colsample_bytree': 0.9038708445357226, 'reg_alpha': 2.3969020024822907e-08, 'reg_lambda': 1.91031748976869e-06}. Best is trial 4 with value: 1.0059326070367782.


Early stopping, best iteration is:
[4]	valid_0's rmse: 1.06169
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 0.979366
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 1.00549
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[23]	valid_0's rmse: 1.00828
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2]	valid_0's rmse: 0.976257
Training until validation scores don't improve for 50 rounds


[I 2025-08-14 23:13:33,425] Trial 15 finished with value: 1.0061781479148348 and parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.017803785781884816, 'num_leaves': 211, 'max_depth': 4, 'min_child_samples': 36, 'subsample': 0.9979521627326172, 'colsample_bytree': 0.8427447643704183, 'reg_alpha': 2.0489648195322276e-05, 'reg_lambda': 0.0001786419866102187}. Best is trial 4 with value: 1.0059326070367782.


Early stopping, best iteration is:
[35]	valid_0's rmse: 1.06149
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 0.979356
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 1.00574
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 1.00946
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[4]	valid_0's rmse: 0.975995
Training until validation scores don't improve for 50 rounds


[I 2025-08-14 23:13:43,858] Trial 16 finished with value: 1.0067620081231128 and parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.0035053646927855867, 'num_leaves': 88, 'max_depth': 8, 'min_child_samples': 37, 'subsample': 0.7367583546694273, 'colsample_bytree': 0.9203560660090001, 'reg_alpha': 4.3064406737162324e-07, 'reg_lambda': 0.014106967234415832}. Best is trial 4 with value: 1.0059326070367782.


Early stopping, best iteration is:
[7]	valid_0's rmse: 1.06326
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 0.979504
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 1.0048
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 1.01128
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 0.976029
Training until validation scores don't improve for 50 rounds


[I 2025-08-14 23:13:52,953] Trial 17 finished with value: 1.0062172726460958 and parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.05506979800493404, 'num_leaves': 71, 'max_depth': 6, 'min_child_samples': 22, 'subsample': 0.643719866578564, 'colsample_bytree': 0.997542039905978, 'reg_alpha': 0.0010701706914987036, 'reg_lambda': 3.373488082761299e-05}. Best is trial 4 with value: 1.0059326070367782.


Early stopping, best iteration is:
[12]	valid_0's rmse: 1.05947
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[3]	valid_0's rmse: 0.985059
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 1.00824
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 1.01001
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 0.983687
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 1.06565


[I 2025-08-14 23:14:00,631] Trial 18 finished with value: 1.0105289410176979 and parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.19899568553405883, 'num_leaves': 448, 'max_depth': 4, 'min_child_samples': 6, 'subsample': 0.8703362905877621, 'colsample_bytree': 0.8399034258115001, 'reg_alpha': 0.019783424211074915, 'reg_lambda': 8.679392081953745e-07}. Best is trial 4 with value: 1.0059326070367782.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 0.979411
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 1.00549
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[3]	valid_0's rmse: 1.00929
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2]	valid_0's rmse: 0.976008
Training until validation scores don't improve for 50 rounds
[100]	valid_0's rmse: 1.06711


[I 2025-08-14 23:14:10,833] Trial 19 finished with value: 1.006444970750891 and parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.007759636839820192, 'num_leaves': 198, 'max_depth': 7, 'min_child_samples': 43, 'subsample': 0.6994160967852774, 'colsample_bytree': 0.8375750723909046, 'reg_alpha': 0.00032406306721675205, 'reg_lambda': 0.0008657002095288463}. Best is trial 4 with value: 1.0059326070367782.


Early stopping, best iteration is:
[71]	valid_0's rmse: 1.06202
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[4]	valid_0's rmse: 0.979133
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[4]	valid_0's rmse: 1.00542
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2]	valid_0's rmse: 1.00958
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 0.976352
Training until validation scores don't improve for 50 rounds


[I 2025-08-14 23:14:18,454] Trial 20 finished with value: 1.0064516267933648 and parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.02523438180339603, 'num_leaves': 157, 'max_depth': 4, 'min_child_samples': 29, 'subsample': 0.7844899386855433, 'colsample_bytree': 0.9358257978798711, 'reg_alpha': 1.3545769644477343e-07, 'reg_lambda': 4.284491926172741e-06}. Best is trial 4 with value: 1.0059326070367782.


Early stopping, best iteration is:
[22]	valid_0's rmse: 1.06177
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[8]	valid_0's rmse: 0.9758
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 1.00624
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[8]	valid_0's rmse: 1.00839
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 0.976122
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[7]	valid_0's rmse: 1.06265


[I 2025-08-14 23:14:25,735] Trial 21 finished with value: 1.005841025504855 and parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.047048455375312145, 'num_leaves': 20, 'max_depth': 3, 'min_child_samples': 20, 'subsample': 0.8047105565648796, 'colsample_bytree': 0.9546627442865881, 'reg_alpha': 6.968843480322451e-05, 'reg_lambda': 3.5029966376515844e-05}. Best is trial 21 with value: 1.005841025504855.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[8]	valid_0's rmse: 0.975992
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 1.00626
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[8]	valid_0's rmse: 1.00837
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[24]	valid_0's rmse: 0.975726
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[20]	valid_0's rmse: 1.06205


[I 2025-08-14 23:14:33,142] Trial 22 finished with value: 1.0056800781801118 and parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.04895956308561634, 'num_leaves': 58, 'max_depth': 3, 'min_child_samples': 15, 'subsample': 0.8953432004915698, 'colsample_bytree': 0.9597250236055926, 'reg_alpha': 1.9186865042305896e-05, 'reg_lambda': 6.922591585909344e-05}. Best is trial 22 with value: 1.0056800781801118.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 0.981559
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 1.00481
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 1.0103
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 0.976066
Training until validation scores don't improve for 50 rounds


[I 2025-08-14 23:14:41,138] Trial 23 finished with value: 1.0071713216784723 and parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.06937224393703352, 'num_leaves': 61, 'max_depth': 5, 'min_child_samples': 28, 'subsample': 0.9138145140464777, 'colsample_bytree': 0.881014560400324, 'reg_alpha': 9.870102975561025e-06, 'reg_lambda': 0.00012435838888646942}. Best is trial 22 with value: 1.0056800781801118.


Early stopping, best iteration is:
[1]	valid_0's rmse: 1.06312
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 0.980843
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 1.00494
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 1.01064
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 0.976052
Training until validation scores don't improve for 50 rounds


[I 2025-08-14 23:14:49,225] Trial 24 finished with value: 1.0070980333268902 and parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.08944409285261586, 'num_leaves': 64, 'max_depth': 5, 'min_child_samples': 11, 'subsample': 0.894237173996822, 'colsample_bytree': 0.9320537835476113, 'reg_alpha': 8.984936441077647, 'reg_lambda': 0.04836749829424701}. Best is trial 22 with value: 1.0056800781801118.


Early stopping, best iteration is:
[1]	valid_0's rmse: 1.06301
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 0.979354
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 1.00563
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[10]	valid_0's rmse: 1.00911
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2]	valid_0's rmse: 0.976203
Training until validation scores don't improve for 50 rounds


[I 2025-08-14 23:14:56,997] Trial 25 finished with value: 1.0064042584782444 and parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.011706682905862393, 'num_leaves': 126, 'max_depth': 4, 'min_child_samples': 44, 'subsample': 0.7853768190358623, 'colsample_bytree': 0.8765751483235293, 'reg_alpha': 0.0003002331525542172, 'reg_lambda': 0.0008045607078841663}. Best is trial 22 with value: 1.0056800781801118.


[100]	valid_0's rmse: 1.06749
Early stopping, best iteration is:
[50]	valid_0's rmse: 1.06173




[100]	valid_0's rmse: 1.0271
[200]	valid_0's rmse: 1.04459
[300]	valid_0's rmse: 1.05348
[400]	valid_0's rmse: 1.06384
[500]	valid_0's rmse: 1.07498




[100]	valid_0's rmse: 1.0208
[200]	valid_0's rmse: 1.03944
[300]	valid_0's rmse: 1.0521
[400]	valid_0's rmse: 1.06586
[500]	valid_0's rmse: 1.0711




[100]	valid_0's rmse: 1.05537
[200]	valid_0's rmse: 1.07138
[300]	valid_0's rmse: 1.0874
[400]	valid_0's rmse: 1.10366
[500]	valid_0's rmse: 1.11009




[100]	valid_0's rmse: 1.00104
[200]	valid_0's rmse: 1.00579
[300]	valid_0's rmse: 1.0101
[400]	valid_0's rmse: 1.01618
[500]	valid_0's rmse: 1.02707




[100]	valid_0's rmse: 1.13041
[200]	valid_0's rmse: 1.17905
[300]	valid_0's rmse: 1.19765
[400]	valid_0's rmse: 1.2262
[500]	valid_0's rmse: 1.23369


[I 2025-08-14 23:16:01,813] Trial 26 finished with value: 1.1033880750372185 and parameters: {'boosting_type': 'dart', 'learning_rate': 0.047260576001713475, 'num_leaves': 233, 'max_depth': 7, 'min_child_samples': 29, 'subsample': 0.500546646483125, 'colsample_bytree': 0.8088983730025193, 'reg_alpha': 1.2722267461119129e-06, 'reg_lambda': 1.3863778851124457e-07}. Best is trial 22 with value: 1.0056800781801118.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[8]	valid_0's rmse: 0.977477
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 1.00599
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[15]	valid_0's rmse: 1.00815
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[8]	valid_0's rmse: 0.976103
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[20]	valid_0's rmse: 1.06309


[I 2025-08-14 23:16:09,322] Trial 27 finished with value: 1.00616101428522 and parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.021389144900453856, 'num_leaves': 52, 'max_depth': 3, 'min_child_samples': 14, 'subsample': 0.952168058262806, 'colsample_bytree': 0.9725428108628711, 'reg_alpha': 0.0016231847028064501, 'reg_lambda': 5.0781571310490656e-05}. Best is trial 22 with value: 1.0056800781801118.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 0.979435
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[14]	valid_0's rmse: 1.00415
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 1.00953
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[4]	valid_0's rmse: 0.976095
Training until validation scores don't improve for 50 rounds


[I 2025-08-14 23:16:17,699] Trial 28 finished with value: 1.0061479420361872 and parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.009049660698918067, 'num_leaves': 163, 'max_depth': 5, 'min_child_samples': 28, 'subsample': 0.6982100405540893, 'colsample_bytree': 0.9995693716450049, 'reg_alpha': 0.0001365869371658507, 'reg_lambda': 4.3643017456046687e-07}. Best is trial 22 with value: 1.0056800781801118.


[100]	valid_0's rmse: 1.07445
Early stopping, best iteration is:
[51]	valid_0's rmse: 1.06153
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 0.979317
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 1.00569
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 1.0095
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2]	valid_0's rmse: 0.976132
Training until validation scores don't improve for 50 rounds


[I 2025-08-14 23:16:28,229] Trial 29 finished with value: 1.0068639458931326 and parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.00578176009636947, 'num_leaves': 109, 'max_depth': 9, 'min_child_samples': 78, 'subsample': 0.6093566522343654, 'colsample_bytree': 0.9075907091169069, 'reg_alpha': 2.923655545640154e-05, 'reg_lambda': 0.00032012960707583995}. Best is trial 22 with value: 1.0056800781801118.


Early stopping, best iteration is:
[1]	valid_0's rmse: 1.06369


In [9]:
print(f"Best trial number: {study.best_trial.number}")
print(f"Best avg RMSE: {study.best_value}")
print("Best trial params:")
for k, v in study.best_trial.params.items():
    print(f"  {k}: {v}")

Best trial number: 22
Best avg RMSE: 1.0056800781801118
Best trial params:
  boosting_type: gbdt
  learning_rate: 0.04895956308561634
  num_leaves: 58
  max_depth: 3
  min_child_samples: 15
  subsample: 0.8953432004915698
  colsample_bytree: 0.9597250236055926
  reg_alpha: 1.9186865042305896e-05
  reg_lambda: 6.922591585909344e-05


In [10]:
best_params = study.best_trial.params
best_params.update({"objective": "regression", "metric": "rmse", "device": "gpu", "gpu_platform_id": 0, "gpu_device_id": 0})
model = lgb.train(
    best_params,
    dtrain,
    num_boost_round=1000,
    valid_sets=[dval],
    callbacks=[
        lgb.early_stopping(50),
        lgb.log_evaluation(100)
    ]
)

model.save_model("../data/models/lightgbm_opt_model_1.txt")

Training until validation scores don't improve for 50 rounds
[100]	valid_0's rmse: 0.93846
[200]	valid_0's rmse: 0.892449
[300]	valid_0's rmse: 0.857334
[400]	valid_0's rmse: 0.829413
[500]	valid_0's rmse: 0.80452
[600]	valid_0's rmse: 0.780467
[700]	valid_0's rmse: 0.759629
[800]	valid_0's rmse: 0.740964
[900]	valid_0's rmse: 0.723858
[1000]	valid_0's rmse: 0.707923
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 0.707923


<lightgbm.basic.Booster at 0x7fab6b21a1e0>

In [11]:
preds = model.predict(test)
pd.DataFrame({"id": test_ids, "prediction": preds}).to_csv("../data/preds/lightgbm_opt1_preds.csv", index=False)

wandb.finish()

0,1
avg_overfit,▁▁█▄▁▁▁▅█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▆▁▁▁
avg_r2,██▁▆███▅▃█████████████████▃███
avg_rmse,▁▁█▄▁▁▁▄▆▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▆▁▁▁
avg_train_rmse,██▁▄███▄▁█████████████████▃███
colsample_bytree,▁▃▄▁▆▃▅▄▁▄▇▇█▆▇▆▇█▆▆▇▇▇▆▇▆▅██▇
learning_rate,▁▁▆▁▁▁▁▂▃▁▂▂▁▂▄▂▁▃█▁▂▃▃▃▄▁▃▂▁▁
max_depth,▆▃▆▇▃▆▃▇█▇▁▁▁▃▅▂▅▃▂▄▂▁▁▃▃▂▄▁▃▆
min_child_samples,▆▅▅█▄▅█▅▆▆▂▂▂▃▁▃▃▂▁▄▃▂▂▃▁▄▃▂▃▇
num_leaves,▆▅▂▆▃▆▆▅█▃▁▁▃▁▃▄▂▂▇▄▃▁▂▂▂▃▄▂▃▂
reg_alpha,▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▁▁▁▁▁

0,1
avg_overfit,0.0021
avg_pearson,
avg_r2,-0.00255
avg_rmse,1.00686
avg_train_rmse,1.00477
boosting_type,gbdt
colsample_bytree,0.90759
learning_rate,0.00578
max_depth,9
min_child_samples,78
