## My first Ray project w XGBoost

Trying out some getting started, configuring the environment and so on :)
To try it out, run env-setting notebook before. 

In [1]:
import ray

# Load data.
dataset = ray.data.read_csv("s3://anonymous@air-example-data/breast_cancer.csv")

# Split data into train and validation.
train_dataset, valid_dataset = dataset.train_test_split(test_size=0.3)

# Create a test dataset by dropping the target column.
test_dataset = valid_dataset.drop_columns(cols=["target"])

2023-03-19 08:27:43,341	INFO worker.py:1553 -- Started a local Ray instance.
Read progress: 100%|██████████| 1/1 [00:01<00:00,  1.99s/it]
Read progress: 100%|██████████| 1/1 [00:00<00:00, 578.76it/s]
Read progress: 100%|██████████| 1/1 [00:00<00:00, 645.87it/s]


In [2]:
# Create a preprocessor to scale some columns.
from ray.data.preprocessors import StandardScaler

preprocessor = StandardScaler(columns=["mean radius", "mean texture"])

## train a model with XGBoostTrainer

In [4]:
from ray.air.config import ScalingConfig
from ray.train.xgboost import XGBoostTrainer

trainer = XGBoostTrainer(
    scaling_config=ScalingConfig(
        # Number of workers to use for data parallelism.
        num_workers=2,
        # Whether to use GPU acceleration.
        use_gpu=False,
        # Make sure to leave some CPUs free for Ray Data operations.
        _max_cpu_fraction_per_node=0.9,
    ),
    label_column="target",
    num_boost_round=20,
    params={
        # XGBoost specific params
        "objective": "binary:logistic",
        # "tree_method": "gpu_hist",  # uncomment this to use GPUs.
        "eval_metric": ["logloss", "error"],
    },
    datasets={"train": train_dataset, "valid": valid_dataset},
    preprocessor=preprocessor,
)
result = trainer.fit()
print(result.metrics)

2023-03-19 08:29:20,033	INFO tensorboardx.py:170 -- pip install "ray[tune]" to see TensorBoard files.


0,1
Current time:,2023-03-19 08:29:28
Running for:,00:00:08.83
Memory:,5.4/15.6 GiB

Trial name,status,loc,iter,total time (s),train-logloss,train-error,valid-logloss
XGBoostTrainer_25070_00000,TERMINATED,172.17.0.2:1120,21,6.78411,0.0184957,0,0.0897979


[2m[36m(XGBoostTrainer pid=1120)[0m 2023-03-19 08:29:22,056	INFO bulk_executor.py:39 -- Executing DAG InputDataBuffer[Input] -> AllToAllOperator[aggregate]
[2m[36m(XGBoostTrainer pid=1120)[0m 2023-03-19 08:29:23,356	INFO bulk_executor.py:39 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[StandardScaler]
[2m[36m(XGBoostTrainer pid=1120)[0m 2023-03-19 08:29:23,412	INFO bulk_executor.py:39 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[StandardScaler]
[2m[36m(XGBoostTrainer pid=1120)[0m 2023-03-19 08:29:23,481	INFO bulk_executor.py:39 -- Executing DAG InputDataBuffer[Input] -> AllToAllOperator[repartition]
[2m[36m(XGBoostTrainer pid=1120)[0m 2023-03-19 08:29:23,557	INFO bulk_executor.py:39 -- Executing DAG InputDataBuffer[Input] -> AllToAllOperator[repartition]
[2m[36m(XGBoostTrainer pid=1120)[0m 2023-03-19 08:29:26,177	INFO tracker.py:218 -- start listen on 172.17.0.2:38593
[2m[36m(XGBoostTrainer pid=1120)[0m 2023-03-19 08:29:26,230	INF

Trial name,date,done,episodes_total,experiment_id,experiment_tag,hostname,iterations_since_restore,node_ip,pid,should_checkpoint,time_since_restore,time_this_iter_s,time_total_s,timestamp,timesteps_since_restore,timesteps_total,train-error,train-logloss,training_iteration,trial_id,valid-error,valid-logloss,warmup_time
XGBoostTrainer_25070_00000,2023-03-19_08-29-28,True,,fab94c57893144a28929554696a4ff72,0,b10e8c16a849,21,172.17.0.2,1120,True,6.78411,0.072351,6.78411,1679214568,0,,0,0.0184957,21,25070_00000,0.0411765,0.0897979,0.00931907


2023-03-19 08:29:28,883	INFO tune.py:798 -- Total run time: 8.86 seconds (8.81 seconds for the tuning loop).


{'train-logloss': 0.01849572784173766, 'train-error': 0.0, 'valid-logloss': 0.08979789356372374, 'valid-error': 0.04117647058823529, 'time_this_iter_s': 0.07235097885131836, 'should_checkpoint': True, 'done': True, 'timesteps_total': None, 'episodes_total': None, 'training_iteration': 21, 'trial_id': '25070_00000', 'experiment_id': 'fab94c57893144a28929554696a4ff72', 'date': '2023-03-19_08-29-28', 'timestamp': 1679214568, 'time_total_s': 6.784107208251953, 'pid': 1120, 'hostname': 'b10e8c16a849', 'node_ip': '172.17.0.2', 'config': {}, 'time_since_restore': 6.784107208251953, 'timesteps_since_restore': 0, 'iterations_since_restore': 21, 'warmup_time': 0.009319067001342773, 'experiment_tag': '0'}


## Tune hyperparameters and find the best model with Ray Tune 

In [6]:
from ray import tune

param_space = {"params": {"max_depth": tune.randint(1, 9)}}
metric = "train-logloss"

In [7]:
from ray.tune.tuner import Tuner, TuneConfig

tuner = Tuner(
    trainer,
    param_space=param_space,
    tune_config=TuneConfig(num_samples=5, metric=metric, mode="min"),
)
result_grid = tuner.fit()
best_result = result_grid.get_best_result()
print("Best result:", best_result)



0,1
Current time:,2023-03-19 08:36:55
Running for:,00:00:21.97
Memory:,5.2/15.6 GiB

Trial name,status,loc,params/max_depth,iter,total time (s),train-logloss,train-error,valid-logloss
XGBoostTrainer_27907_00000,TERMINATED,172.17.0.2:1715,7,21,7.03373,0.0184957,0.0,0.0897979
XGBoostTrainer_27907_00001,TERMINATED,172.17.0.2:1769,8,21,6.82536,0.0184957,0.0,0.0897979
XGBoostTrainer_27907_00002,TERMINATED,172.17.0.2:1771,4,21,7.55822,0.0183889,0.0,0.101024
XGBoostTrainer_27907_00003,TERMINATED,172.17.0.2:2529,1,21,7.95038,0.0955215,0.0175879,0.112144
XGBoostTrainer_27907_00004,TERMINATED,172.17.0.2:2621,8,21,6.04832,0.0184957,0.0,0.0897979


[2m[36m(XGBoostTrainer pid=1715)[0m 2023-03-19 08:36:36,191	INFO bulk_executor.py:39 -- Executing DAG InputDataBuffer[Input] -> AllToAllOperator[aggregate]
[2m[36m(XGBoostTrainer pid=1715)[0m 2023-03-19 08:36:36,717	INFO bulk_executor.py:39 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[StandardScaler]
[2m[36m(XGBoostTrainer pid=1715)[0m 2023-03-19 08:36:36,893	INFO bulk_executor.py:39 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[StandardScaler]
[2m[36m(XGBoostTrainer pid=1715)[0m 2023-03-19 08:36:36,969	INFO bulk_executor.py:39 -- Executing DAG InputDataBuffer[Input] -> AllToAllOperator[repartition]
[2m[36m(XGBoostTrainer pid=1715)[0m 2023-03-19 08:36:38,100	INFO bulk_executor.py:39 -- Executing DAG InputDataBuffer[Input] -> AllToAllOperator[repartition]
[2m[36m(XGBoostTrainer pid=1769)[0m 2023-03-19 08:36:38,842	INFO bulk_executor.py:39 -- Executing DAG InputDataBuffer[Input] -> AllToAllOperator[aggregate]
[2m[36m(XGBoostTrainer p

Trial name,date,done,episodes_total,experiment_id,experiment_tag,hostname,iterations_since_restore,node_ip,pid,should_checkpoint,time_since_restore,time_this_iter_s,time_total_s,timestamp,timesteps_since_restore,timesteps_total,train-error,train-logloss,training_iteration,trial_id,valid-error,valid-logloss,warmup_time
XGBoostTrainer_27907_00000,2023-03-19_08-36-43,True,,5d631b388306456b93e7b3c5ac0b871d,0_max_depth=7,b10e8c16a849,21,172.17.0.2,1715,True,7.03373,0.175378,7.03373,1679215003,0,,0.0,0.0184957,21,27907_00000,0.0411765,0.0897979,0.0110359
XGBoostTrainer_27907_00001,2023-03-19_08-36-45,True,,75eada908e9b48ccb7d6d55bc19e4d59,1_max_depth=8,b10e8c16a849,21,172.17.0.2,1769,True,6.82536,0.79186,6.82536,1679215005,0,,0.0,0.0184957,21,27907_00001,0.0411765,0.0897979,0.0125062
XGBoostTrainer_27907_00002,2023-03-19_08-36-46,True,,15c817e68ebe4160bb70760ab928af0b,2_max_depth=4,b10e8c16a849,21,172.17.0.2,1771,True,7.55822,0.34967,7.55822,1679215006,0,,0.0,0.0183889,21,27907_00002,0.0470588,0.101024,0.0109482
XGBoostTrainer_27907_00003,2023-03-19_08-36-54,True,,2c8b438a0e224dcda910589a73952d9e,3_max_depth=1,b10e8c16a849,21,172.17.0.2,2529,True,7.95038,0.15171,7.95038,1679215014,0,,0.0175879,0.0955215,21,27907_00003,0.0294118,0.112144,0.010232
XGBoostTrainer_27907_00004,2023-03-19_08-36-55,True,,ac7882ab8bbd4b428137c31b1cad868a,4_max_depth=8,b10e8c16a849,21,172.17.0.2,2621,True,6.04832,1.02244,6.04832,1679215015,0,,0.0,0.0184957,21,27907_00004,0.0411765,0.0897979,0.00957203


[2m[36m(XGBoostTrainer pid=1769)[0m 2023-03-19 08:36:42,798	INFO tracker.py:218 -- start listen on 172.17.0.2:33999
[2m[36m(XGBoostTrainer pid=1769)[0m 2023-03-19 08:36:42,845	INFO tracker.py:382 -- @tracker All of 2 nodes getting started
[2m[36m(_RemoteRayXGBoostActor pid=2065)[0m [08:36:42] task [xgboost.ray]:139705094182272 got new rank 1
[2m[36m(_RemoteRayXGBoostActor pid=2066)[0m [08:36:42] task [xgboost.ray]:139889656390656 got new rank 0
[2m[36m(XGBoostTrainer pid=1715)[0m 2023-03-19 08:36:43,013	INFO tracker.py:388 -- @tracker All nodes finishes job
[2m[36m(XGBoostTrainer pid=1771)[0m 2023-03-19 08:36:43,063	INFO tracker.py:218 -- start listen on 172.17.0.2:33733
[2m[36m(XGBoostTrainer pid=1771)[0m 2023-03-19 08:36:43,097	INFO tracker.py:382 -- @tracker All of 2 nodes getting started
[2m[36m(_RemoteRayXGBoostActor pid=2107)[0m [08:36:43] task [xgboost.ray]:140605256799376 got new rank 0
[2m[36m(_RemoteRayXGBoostActor pid=2108)[0m [08:36:43] task [xgbo

Best result: Result(metrics={'train-logloss': 0.01838890816981263, 'train-error': 0.0, 'valid-logloss': 0.10102374425212689, 'valid-error': 0.04705882352941176, 'should_checkpoint': True, 'done': True, 'trial_id': '27907_00002', 'experiment_tag': '2_max_depth=4'}, error=None, log_dir=PosixPath('/home/jovyan/ray_results/XGBoostTrainer_2023-03-19_08-36-33/XGBoostTrainer_27907_00002_2_max_depth=4_2023-03-19_08-36-36'))


## use the train model for batch prediction 

In [8]:
from ray.train.batch_predictor import BatchPredictor
from ray.train.xgboost import XGBoostPredictor

# You can also create a checkpoint from a trained model using
# `XGBoostCheckpoint.from_model`.
checkpoint = best_result.checkpoint

batch_predictor = BatchPredictor.from_checkpoint(checkpoint, XGBoostPredictor)

predicted_probabilities = batch_predictor.predict(test_dataset)
predicted_probabilities.show()

2023-03-19 08:37:07,938	INFO bulk_executor.py:39 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(<lambda>)]
MapBatches(<lambda>): 100%|██████████| 1/1 [00:00<00:00,  1.22it/s]
2023-03-19 08:37:08,792	INFO bulk_executor.py:39 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[StandardScaler]
StandardScaler: 100%|██████████| 1/1 [00:00<00:00, 39.61it/s]
2023-03-19 08:37:08,839	INFO bulk_executor.py:39 -- Executing DAG InputDataBuffer[Input] -> ActorPoolMapOperator[MapBatches(ScoringWrapper)]
MapBatches(ScoringWrapper), 0 actors: 100%|██████████| 1/1 [00:01<00:00,  1.35s/it]

{'predictions': 0.9966920614242554}
{'predictions': 0.9931760430335999}
{'predictions': 0.0034648359287530184}
{'predictions': 0.9966920614242554}
{'predictions': 0.9965646862983704}
{'predictions': 0.9956005811691284}
{'predictions': 0.9950228929519653}
{'predictions': 0.9943311214447021}
{'predictions': 0.4793323874473572}
{'predictions': 0.9818810820579529}
{'predictions': 0.0034648359287530184}
{'predictions': 0.996193528175354}
{'predictions': 0.9557499885559082}
{'predictions': 0.993036687374115}
{'predictions': 0.9940920472145081}
{'predictions': 0.22775237262248993}
{'predictions': 0.4834454357624054}
{'predictions': 0.9949895739555359}
{'predictions': 0.9798774123191833}
{'predictions': 0.0034648359287530184}



