In [1]:
import polars as pl
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
import optuna
from optuna.samplers import TPESampler


from sklearn.model_selection import train_test_split
from functools import partial


In [2]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df
    
def objective(trial, data, target):
    train_x, valid_x, train_y, valid_y = train_test_split(data, target, test_size=0.25)
    dtrain = xgb.DMatrix(train_x, label=train_y)
    dvalid = xgb.DMatrix(valid_x, label=valid_y)

    param = {
        "verbosity": 0,
        "objective": "reg:squarederror",
        # use exact for small dataset.
        "tree_method": "exact",
        # defines booster, gblinear for linear functions.
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        # L2 regularization weight.
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        # L1 regularization weight.
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        # sampling according to each tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
    }

    if param["booster"] in ["gbtree", "dart"]:
        # maximum depth of the tree, signifies complexity of the tree.
        param["max_depth"] = trial.suggest_int("max_depth", 3, 9, step=2)
        # minimum child weight, larger the term more conservative the tree.
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        # defines how selective algorithm is.
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    bst = xgb.train(param, dtrain)
    preds = bst.predict(dvalid)
    pred_labels = np.rint(preds)
    accuracy = mean_squared_error(valid_y, pred_labels)
    return accuracy

In [3]:
# data handling
input_paths = 'inputs/train.parquet/*/*.parquet'
scan_df = pl.scan_parquet(input_paths)

df = scan_df.head(int(1e5)).collect()

In [4]:
#
columns = df.columns
features_cols = [x for x in columns if 'feature' in x]
responder_cols = [x for x in columns if 'responder' in x]
target_col = 'responder_6'

#
X = df[features_cols]
y = df[target_col]
print(X.shape, y.shape)

(100000, 79) (100000,)


In [5]:
# _objective = partial(objective, data=X, target=y)
# # hyper tune
# study = optuna.create_study(direction="minimize", sampler=TPESampler())
# study.optimize(_objective, n_trials=100, timeout=600)

# print("Number of finished trials: ", len(study.trials))
# print("Best trial:")
# trial = study.best_trial

# print("  Value: {}".format(trial.value))
# print("  Params: ")
# for key, value in trial.params.items():
#     print("    {}: {}".format(key, value))

# params = {
#     "verbosity": 2,
#     "objective": "reg:squarederror",
#     # use exact for small dataset.
#     "tree_method": "hist",
#     **trial.params,
# }
# params


In [6]:
params = {'verbosity': 3,
 'objective': 'reg:squarederror',
 'tree_method': 'hist',
 'early_stopping_rounds': 5,
 'booster': 'dart',
 'lambda': 1.3642360825668772e-06,
 'alpha': 0.05061696974935902,
 'subsample': 0.949976951187838,
 'colsample_bytree': 0.7390489200118765,
 'max_depth': 9,
 'min_child_weight': 8,
 'eta': 0.663661345919149,
 'gamma': 0.0003618456071895134,
 'grow_policy': 'lossguide',
 'sample_type': 'uniform',
 'normalize_type': 'tree',
 'rate_drop': 0.00026112632903904653,
 'skip_drop': 0.0043081489165863246}

In [7]:
X = scan_df.select(features_cols).collect().to_pandas()
X = reduce_mem_usage(X)
X = X.to_numpy()
# #


Memory usage of dataframe is 13842.79 MB
Memory usage after optimization is: 7011.28 MB
Decreased by 49.4%


In [8]:
y = scan_df.select(target_col).collect().to_pandas()
y = reduce_mem_usage(y)
y = y.to_numpy()
print(y.shape)

Memory usage of dataframe is 179.78 MB
Memory usage after optimization is: 89.89 MB
Decreased by 50.0%
(47127338, 1)


In [9]:
n = X.shape[0]
n_train = int(n * 0.1)
X_train = X[:n_train]
y_train = y[:n_train]
X_test = X[n_train:]
y_test = y[n_train:]
print(X_train.shape, y_train.shape)
np.all(np.isfinite(X_train))
np.all(np.isfinite(X_test))
np.all(np.isfinite(y_train))
np.all(np.isfinite(y_test))

(4712733, 79) (4712733, 1)


True

In [10]:
model = xgb.XGBRegressor(**params)
model.fit(X_train, y_train, eval_set=[(X_test, y_test)])
model.save_model('regressor.json')

[21:21:27] AllReduce: 0.020193s, 1 calls @ 20193us

[21:21:27] MakeCuts: 0.027011s, 1 calls @ 27011us

[21:45:51] DEBUG: /Users/runner/work/xgboost/xgboost/src/gbm/gbtree.cc:130: Using tree method: 3
[21:46:00] INFO: /Users/runner/work/xgboost/xgboost/src/gbm/gbtree.cc:918: drop 0 trees, weight = 1
[0]	validation_0-rmse:0.88910
[21:46:54] INFO: /Users/runner/work/xgboost/xgboost/src/gbm/gbtree.cc:918: drop 0 trees, weight = 1
[1]	validation_0-rmse:0.89133
[21:47:16] INFO: /Users/runner/work/xgboost/xgboost/src/gbm/gbtree.cc:918: drop 0 trees, weight = 1
[2]	validation_0-rmse:0.89280
[21:47:54] INFO: /Users/runner/work/xgboost/xgboost/src/gbm/gbtree.cc:918: drop 0 trees, weight = 1
[3]	validation_0-rmse:0.89826
[21:48:38] INFO: /Users/runner/work/xgboost/xgboost/src/gbm/gbtree.cc:918: drop 0 trees, weight = 1
[4]	validation_0-rmse:0.90197
[21:49:28] INFO: /Users/runner/work/xgboost/xgboost/src/gbm/gbtree.cc:918: drop 0 trees, weight = 1
[5]	validation_0-rmse:0.90498
[21:50:22] Configure

In [11]:
preds = model.predict(X_test)
r2_score(y_true=y_test, y_pred=preds)

0.0024698376655578613

: 