In [1]:
import polars as pl
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
import xgboost as xgb
import optuna
from optuna.samplers import TPESampler


from sklearn.model_selection import train_test_split, cross_val_score
from functools import partial


In [2]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df
    
def objective(trial, x_train, x_test, y_train, y_test):
    dtrain = xgb.DMatrix(x_train, label=y_train)
    dvalid = xgb.DMatrix(x_test, label=y_test)

    param = {
        "verbosity": 0,
        "objective": "reg:squarederror",
        # use exact for small dataset.
        "tree_method": "exact",
        # defines booster, gblinear for linear functions.
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        # L2 regularization weight.
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        # L1 regularization weight.
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        # sampling according to each tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        # 
        "n_estimators": trial.suggest_int("n_estimators", 50, 200, step=50),
    }

    if param["booster"] in ["gbtree", "dart"]:
        # maximum depth of the tree, signifies complexity of the tree.
        param["max_depth"] = trial.suggest_int("max_depth", 3, 9, step=2)
        # minimum child weight, larger the term more conservative the tree.
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        # defines how selective algorithm is.
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    bst = xgb.XGBRegressor(**param)
    fmse = make_scorer(mean_squared_error)
    mean_cv = np.mean(cross_val_score(bst, x_train, y_train, cv=5, scoring=fmse))
    return mean_cv

def get_dataset(
    cols: list[str],
    input_paths: str = 'inputs/train.parquet/*/*.parquet',
    fraction: float = 0.1,
    head:int =None,
) -> pd.DataFrame:
    # data handling
    lf = pl.scan_parquet(input_paths)
    head = lf.select(pl.len()).collect()['len'][0] if head is None else head
    df = lf.head(head).select(cols).collect()
    df = df.sample(fraction=fraction).to_pandas()
    df = reduce_mem_usage(df)
    return df


def train_single_model(X: pl.DataFrame, y: pl.DataFrame, model_name: str, n_trials: int = 100, timeout: int = 600, cv_frac: float = .1):
    _, X_cv, _, y_cv = train_test_split(X, y, test_size=cv_frac)
    print('CV Set: ', X_cv.shape)
    X_train_cv, X_test_cv, y_train_cv, y_test_cv = train_test_split(X_cv, y_cv, test_size=0.3)
    _objective = partial(objective, x_train=X_train_cv, y_train=y_train_cv, x_test=X_test_cv, y_test=y_test_cv)
    # hyper tune
    study = optuna.create_study(direction="minimize", sampler=TPESampler())
    study.optimize(_objective, n_trials=n_trials, timeout=timeout)

    print("Number of finished trials: ", len(study.trials))
    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

    params = {
        "verbosity": 2,
        "objective": "reg:squarederror",
        "tree_method": "hist",
        **trial.params,
    }
    print(params)
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.05)
    model = xgb.XGBRegressor(**params)
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])
    model.save_model(f'{model_name}.json')
    del X_train
    del X_valid
    return model

In [3]:
input_path = 'inputs/train.parquet/*/*.parquet'
lf = pl.scan_parquet(input_path)
#
columns = lf.columns
features_cols = [x for x in columns if 'feature' in x]
responder_cols = [x for x in columns if 'responder' in x]
target_col = 'responder_6'

  columns = lf.columns


In [None]:
models = []
num_models = 7
for i in range(num_models):
    model_name = f'xgb_{i}'
    #
    df = get_dataset(cols=features_cols+[target_col],head=int(1e7),fraction=0.95)
    X = df[features_cols]
    y = df[target_col]
    print(X.shape, y.shape)
    model = train_single_model(X, y, model_name, n_trials=30, cv_frac=0.01)
    models.append(model)

Memory usage of dataframe is 2826.69 MB
Memory usage after optimization is: 1431.47 MB
Decreased by 49.4%
(9500000, 79) (9500000,)
CV Set:  (475000, 79)


[I 2024-12-24 22:43:48,516] A new study created in memory with name: no-name-74cfc51c-c8c6-439d-9516-fcba83cffb80
[I 2024-12-24 22:45:16,243] Trial 0 finished with value: 0.8698565483093261 and parameters: {'booster': 'gbtree', 'lambda': 0.0008727825246365712, 'alpha': 1.0154084534195503e-08, 'subsample': 0.714031018688315, 'colsample_bytree': 0.3889380806471218, 'n_estimators': 50, 'max_depth': 9, 'min_child_weight': 10, 'eta': 0.00037443855342101534, 'gamma': 8.993097600031346e-07, 'grow_policy': 'depthwise'}. Best is trial 0 with value: 0.8698565483093261.
[I 2024-12-24 22:45:32,279] Trial 1 finished with value: 0.863981568813324 and parameters: {'booster': 'gblinear', 'lambda': 0.0002894488237640431, 'alpha': 0.006133513618052148, 'subsample': 0.3186624182426906, 'colsample_bytree': 0.7827720275944052, 'n_estimators': 100}. Best is trial 1 with value: 0.863981568813324.
[I 2024-12-24 22:50:12,315] Trial 2 finished with value: 0.8700449466705322 and parameters: {'booster': 'dart', '

Number of finished trials:  4
Best trial:
  Value: 0.863981568813324
  Params: 
    booster: gblinear
    lambda: 0.0002894488237640431
    alpha: 0.006133513618052148
    subsample: 0.3186624182426906
    colsample_bytree: 0.7827720275944052
    n_estimators: 100
{'verbosity': 2, 'objective': 'reg:squarederror', 'tree_method': 'hist', 'booster': 'gblinear', 'lambda': 0.0002894488237640431, 'alpha': 0.006133513618052148, 'subsample': 0.3186624182426906, 'colsample_bytree': 0.7827720275944052, 'n_estimators': 100}


Parameters: { "colsample_bytree", "subsample", "tree_method" } are not used.



[0]	validation_0-rmse:0.93321
[1]	validation_0-rmse:0.93260
[2]	validation_0-rmse:0.93234
[3]	validation_0-rmse:0.93223
[4]	validation_0-rmse:0.93215
[5]	validation_0-rmse:0.93212
[6]	validation_0-rmse:0.93210
[7]	validation_0-rmse:0.93209
[8]	validation_0-rmse:0.93208
[9]	validation_0-rmse:0.93207
[10]	validation_0-rmse:0.93208
[11]	validation_0-rmse:0.93208
[12]	validation_0-rmse:0.93208
[13]	validation_0-rmse:0.93208
[14]	validation_0-rmse:0.93208
[15]	validation_0-rmse:0.93208
[16]	validation_0-rmse:0.93208
[17]	validation_0-rmse:0.93208
[18]	validation_0-rmse:0.93208
[19]	validation_0-rmse:0.93208
[20]	validation_0-rmse:0.93208
[21]	validation_0-rmse:0.93208
[22]	validation_0-rmse:0.93209
[23]	validation_0-rmse:0.93209
[24]	validation_0-rmse:0.93209
[25]	validation_0-rmse:0.93209
[26]	validation_0-rmse:0.93209
[27]	validation_0-rmse:0.93209
[28]	validation_0-rmse:0.93209
[29]	validation_0-rmse:0.93209
[30]	validation_0-rmse:0.93209
[31]	validation_0-rmse:0.93209
[32]	validation_0-

[I 2024-12-24 23:06:25,212] A new study created in memory with name: no-name-d4a5b252-9a3f-4385-8ccc-65fa3b543e74
[I 2024-12-24 23:10:08,269] Trial 0 finished with value: 0.8625175476074218 and parameters: {'booster': 'dart', 'lambda': 2.61108327657626e-08, 'alpha': 0.0013106545668249163, 'subsample': 0.708874382314868, 'colsample_bytree': 0.9322400609044399, 'n_estimators': 50, 'max_depth': 9, 'min_child_weight': 9, 'eta': 0.024014460733164534, 'gamma': 1.6915760467569681e-06, 'grow_policy': 'lossguide', 'sample_type': 'uniform', 'normalize_type': 'forest', 'rate_drop': 0.00010864184646829078, 'skip_drop': 1.1804067067678755e-06}. Best is trial 0 with value: 0.8625175476074218.
[I 2024-12-24 23:10:49,123] Trial 1 finished with value: 0.8647468447685241 and parameters: {'booster': 'gblinear', 'lambda': 1.138380759227298e-08, 'alpha': 0.004856823470772233, 'subsample': 0.9041055227804782, 'colsample_bytree': 0.9775505131900493, 'n_estimators': 200}. Best is trial 0 with value: 0.8625175

Number of finished trials:  7
Best trial:
  Value: 0.8625175476074218
  Params: 
    booster: dart
    lambda: 2.61108327657626e-08
    alpha: 0.0013106545668249163
    subsample: 0.708874382314868
    colsample_bytree: 0.9322400609044399
    n_estimators: 50
    max_depth: 9
    min_child_weight: 9
    eta: 0.024014460733164534
    gamma: 1.6915760467569681e-06
    grow_policy: lossguide
    sample_type: uniform
    normalize_type: forest
    rate_drop: 0.00010864184646829078
    skip_drop: 1.1804067067678755e-06
{'verbosity': 2, 'objective': 'reg:squarederror', 'tree_method': 'hist', 'booster': 'dart', 'lambda': 2.61108327657626e-08, 'alpha': 0.0013106545668249163, 'subsample': 0.708874382314868, 'colsample_bytree': 0.9322400609044399, 'n_estimators': 50, 'max_depth': 9, 'min_child_weight': 9, 'eta': 0.024014460733164534, 'gamma': 1.6915760467569681e-06, 'grow_policy': 'lossguide', 'sample_type': 'uniform', 'normalize_type': 'forest', 'rate_drop': 0.00010864184646829078, 'skip_drop':

In [None]:
del df
del X
del y

In [None]:
num_models = 5
models = [xgb.XGBRegressor() for i in range(num_models)]
for i, model in enumerate(models):
    model.load_model(f'xgb_{i}.json')
    print(model)

In [None]:
# batch predicting
# n = 47127338 
# batch_size = 1e7
n = 47127338
batch_size = int(1e7)
num_batch = n // batch_size
lf = pl.scan_parquet(input_path)
preds = []
y_trues = []
for i in range(num_batch+1):
    print(f'Batch: {i}/{num_batch}')
    rows = list(range(i*batch_size, min((i+1)*batch_size, n), 1))
    df = lf.select(pl.all().gather(rows)).collect()
    df = reduce_mem_usage(df.to_pandas())
    _X_train = df[features_cols]
    y_train = df[target_col]
    X_train = [model.predict(_X_train) for model in models]
    y_trues.append(y_train)
    X_train = np.vstack(X_train).T
    preds.append(X_train)
    # booster = train_single_model(X_train, y_train, 'booster', n_trials=20)
X_train_booster = np.row_stack(preds)
pl.DataFrame(X_train_booster).write_parquet('X_train_booster.parquet')
_y_trues = [x.to_numpy().reshape(-1,1) for x in y_trues]
y_train_booster = np.row_stack(_y_trues)
pl.DataFrame(y_train_booster).write_parquet('y_train_booster.parquet')


In [None]:
X_train_booster = pl.read_parquet('X_train_booster.parquet').to_numpy()
y_train_booster = pl.read_parquet('y_train_booster.parquet').to_numpy()
print(X_train_booster.shape)
print(y_train_booster.shape)
# train booster
booster = xgb.XGBRegressor( 
    n_estimators=200,
    learning_rate=0.1,
    tree_method='hist',
    max_depth=6,
    random_state=42
)
booster.fit(X_train_booster, y_train_booster)
booster.save_model('booster.json')

In [None]:
# scores
# preds = []
# n = 47127338
# test_df = lf.select(pl.all().gather(list(range(n-10000,n,1)))).collect()
X_test = test_df[features_cols].to_pandas()
y_test = test_df[target_col].to_numpy()
print(X_test.shape)
print(y_test.shape)
preds = []
for i, model in enumerate(models):
    y_val_preds = model.predict(X_test)
    preds.append(y_val_preds)
    score = r2_score(y_true=y_test, y_pred=y_val_preds)
    print(f'Model {i}: score: {score}')
_preds = np.row_stack(preds).T
_preds.shape

In [None]:
booster_preds = booster.predict(_preds)
r2_score(y_true=y_test, y_pred=booster_preds)