In [1]:
import ray

ray.init()

# Load data.
dataset = ray.data.read_csv("s3://anonymous@air-example-data/breast_cancer.csv")

# Split data into train and validation.
train_dataset, valid_dataset = dataset.train_test_split(test_size=0.3)

# Create a test dataset by dropping the target column.
test_dataset = valid_dataset.drop_columns(cols=["target"])

2023-01-03 13:03:07,938	INFO worker.py:1529 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


[dataset]: Run `pip install tqdm` to enable progress reporting.


In [2]:
print(test_dataset)

Dataset(num_blocks=1, num_rows=171, schema={mean radius: float64, mean texture: float64, mean perimeter: float64, mean area: float64, mean smoothness: float64, mean compactness: float64, mean concavity: float64, mean concave points: float64, mean symmetry: float64, mean fractal dimension: float64, radius error: float64, texture error: float64, perimeter error: float64, area error: float64, smoothness error: float64, compactness error: float64, concavity error: float64, concave points error: float64, symmetry error: float64, fractal dimension error: float64, worst radius: float64, worst texture: float64, worst perimeter: float64, worst area: float64, worst smoothness: float64, worst compactness: float64, worst concavity: float64, worst concave points: float64, worst symmetry: float64, worst fractal dimension: float64})


In [4]:
# Create a preprocessor to scale some columns.
from ray.data.preprocessors import StandardScaler

preprocessor = StandardScaler(columns=["mean radius", "mean texture"])

In [5]:
from ray.air.config import ScalingConfig
from ray.train.xgboost import XGBoostTrainer

trainer = XGBoostTrainer(
    scaling_config=ScalingConfig(
        # Number of workers to use for data parallelism.
        num_workers=2,
        # Whether to use GPU acceleration.
        use_gpu=False,
    ),
    label_column="target",
    num_boost_round=20,
    params={
        # XGBoost specific params
        "objective": "binary:logistic",
        # "tree_method": "gpu_hist",  # uncomment this to use GPUs.
        "eval_metric": ["logloss", "error"],
    },
    datasets={"train": train_dataset, "valid": valid_dataset},
    preprocessor=preprocessor,
)
result = trainer.fit()
print(result.metrics)

0,1
Current time:,2023-01-03 13:10:21
Running for:,00:00:07.38
Memory:,15.3/16.0 GiB

Trial name,status,loc,iter,total time (s),train-logloss,train-error,valid-logloss
XGBoostTrainer_e7e45_00000,TERMINATED,127.0.0.1:80533,21,5.35517,0.0184957,0,0.0893879


[2m[36m(_RemoteRayXGBoostActor pid=80552)[0m [13:10:19] task [xgboost.ray]:140532417285808 got new rank 0
[2m[36m(_RemoteRayXGBoostActor pid=80553)[0m [13:10:19] task [xgboost.ray]:140362095091136 got new rank 1


Trial name,date,done,episodes_total,experiment_id,experiment_tag,hostname,iterations_since_restore,node_ip,pid,should_checkpoint,time_since_restore,time_this_iter_s,time_total_s,timestamp,timesteps_since_restore,timesteps_total,train-error,train-logloss,training_iteration,trial_id,valid-error,valid-logloss,warmup_time
XGBoostTrainer_e7e45_00000,2023-01-03_13-10-21,True,,4daf58c4478c4b158cdf23c87e18d159,0,Huaizhengs-MBP,21,127.0.0.1,80533,True,5.35517,0.0223079,5.35517,1672722621,0,,0,0.0184957,21,e7e45_00000,0.0409357,0.0893879,0.0125902


2023-01-03 13:10:21,901	INFO tune.py:762 -- Total run time: 7.51 seconds (7.38 seconds for the tuning loop).


{'train-logloss': 0.01849572773292735, 'train-error': 0.0, 'valid-logloss': 0.08938791319913073, 'valid-error': 0.04093567251461988, 'time_this_iter_s': 0.022307872772216797, 'should_checkpoint': True, 'done': True, 'timesteps_total': None, 'episodes_total': None, 'training_iteration': 21, 'trial_id': 'e7e45_00000', 'experiment_id': '4daf58c4478c4b158cdf23c87e18d159', 'date': '2023-01-03_13-10-21', 'timestamp': 1672722621, 'time_total_s': 5.355168104171753, 'pid': 80533, 'hostname': 'Huaizhengs-MBP', 'node_ip': '127.0.0.1', 'config': {}, 'time_since_restore': 5.355168104171753, 'timesteps_since_restore': 0, 'iterations_since_restore': 21, 'warmup_time': 0.012590169906616211, 'experiment_tag': '0'}


In [6]:
from ray import tune

param_space = {"params": {"max_depth": tune.randint(1, 9)}}
metric = "train-logloss"

In [7]:
from ray.tune.tuner import Tuner, TuneConfig

tuner = Tuner(
    trainer,
    param_space=param_space,
    tune_config=TuneConfig(num_samples=5, metric=metric, mode="min"),
)
result_grid = tuner.fit()
best_result = result_grid.get_best_result()
print("Best result:", best_result)

  tuner = Tuner(


0,1
Current time:,2023-01-03 15:11:12
Running for:,01:57:50.75
Memory:,15.2/16.0 GiB

Trial name,status,loc,params/max_depth
XGBoostTrainer_57ab1_00000,RUNNING,127.0.0.1:80740,1
XGBoostTrainer_57ab1_00001,RUNNING,127.0.0.1:80747,5
XGBoostTrainer_57ab1_00002,RUNNING,127.0.0.1:80748,8
XGBoostTrainer_57ab1_00003,RUNNING,127.0.0.1:80749,8
XGBoostTrainer_57ab1_00004,RUNNING,127.0.0.1:80750,8


[2m[1m[36m(scheduler +10m23s)[0m Tip: use `ray status` to view detailed cluster status. To disable these messages, set RAY_SCHEDULER_EVENTS=0.


2023-01-03 15:11:12,916	ERROR tune.py:758 -- Trials did not complete: [XGBoostTrainer_57ab1_00000, XGBoostTrainer_57ab1_00001, XGBoostTrainer_57ab1_00002, XGBoostTrainer_57ab1_00003, XGBoostTrainer_57ab1_00004]
2023-01-03 15:11:12,916	INFO tune.py:762 -- Total run time: 7071.00 seconds (7070.75 seconds for the tuning loop).


RuntimeError: No best trial found for the given metric: train-logloss. This means that no trial has reported this metric, or all values reported for this metric are NaN. To not ignore NaN values, you can set the `filter_nan_and_inf` arg to False.