In [1]:
import pandas as pd
#import dataclasses

In [4]:
# Load the pre-processed training frame from disk. read_pickle unpickles the
# file, which can execute arbitrary code, so only point this at a file from a
# trusted source.
TRAIN_PICKLE_PATH = "inputs/train_processed.pkl"
train_processed_df = pd.read_pickle(TRAIN_PICKLE_PATH)

In [50]:
import os

import lightgbm as lgb
import numpy as np
import sklearn.datasets
import sklearn.metrics
from ray import tune
from ray.tune.schedulers import ASHAScheduler, PopulationBasedTraining
from sklearn.model_selection import train_test_split


def LightGBMCallback(env):
    """Report LightGBM's first evaluation result to Ray Tune.

    Only entry 0 of ``env.evaluation_result_list`` is used, so the score to
    track is assumed to be the first one listed (the `valid_0` validation
    set). The entry must be a 4-tuple; its second field is used as the
    metric name and its third field as the value.
    """
    first_entry = env.evaluation_result_list[0]
    _dataset, metric_name, metric_value, _flag = first_entry
    payload = {metric_name: metric_value}
    tune.report(**payload)


def train_diabetes(config):
    """Train one LightGBM model for a Ray Tune trial and report its ROC AUC.

    The module-level frame ``train_processed_df`` is split 80/20 into a
    training part and a hold-out part, with ``diabetes_mellitus`` as the
    label column. ``config`` is handed to ``lgb.train`` unchanged as the
    parameter dict. ``LightGBMCallback`` reports the first validation metric
    after every boosting iteration; once training finishes, the hold-out
    ROC AUC is reported under the key ``auc`` together with ``done=True``.

    Args:
        config: LightGBM parameter dict resolved by Ray Tune for this trial.
            Its ``metric`` entry should be ``"auc"`` so the per-iteration
            reports and the final report share one key and one direction.
    """
    data = train_processed_df.drop('diabetes_mellitus', axis=1)
    target = train_processed_df.diabetes_mellitus
    # NOTE(review): the split is unseeded, so each trial is scored on a
    # different hold-out set; consider a fixed random_state if trials should
    # be compared on identical data.
    train_x, test_x, train_y, test_y = train_test_split(
        data, target, test_size=0.20)
    train_set = lgb.Dataset(train_x, label=train_y)
    test_set = lgb.Dataset(test_x, label=test_y)
    gbm = lgb.train(
        config,
        train_set,
        valid_sets=[test_set],
        verbose_eval=False,
        callbacks=[LightGBMCallback])
    # ROC AUC is a ranking metric: feed it the raw predicted scores. Rounding
    # them to 0/1 first would collapse the ROC curve to a single threshold.
    preds = gbm.predict(test_x)
    tune.report(
        auc=sklearn.metrics.roc_auc_score(test_y, preds),
        done=True)


if __name__ == "__main__":
    config = {
        "objective": "binary",
        # Evaluate with AUC so the value reported every iteration and the
        # final value are the same quantity (higher is better), matching
        # `metric="auc", mode="max"` in tune.run below.
        "metric": "auc",
        "verbose": -1,
        "boosting_type": "gbdt",
        "num_leaves": tune.randint(10, 40),
        'min_child_samples': tune.randint(100, 400),
        # The four grid_search entries span 7 * 7 * 6 * 3 = 882 combinations;
        # with num_samples=1 each combination is run once, and the random
        # parameters are drawn anew per trial.
        'min_child_weight': tune.grid_search([1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]),
        'subsample': tune.uniform(0.4, 0.6),
        'colsample_bytree': tune.uniform(0.2, 0.6),
        'reg_alpha': tune.grid_search([0, 1, 2, 5, 7, 10, 50]),
        'reg_lambda': tune.grid_search([0, 1, 5, 10, 20, 50]),
        'scale_pos_weight': tune.grid_search([1, 2, 3]),
        "learning_rate": 0.01
    }

    # Keep trial logs next to the notebook instead of in a user-specific
    # absolute path, so the notebook runs on any machine.
    results_dir = os.path.abspath("logging")

    analysis = tune.run(
        train_diabetes,
        metric="auc",
        mode="max",
        resources_per_trial={"cpu": 7, "gpu": 0},
        config=config,
        num_samples=1,
        # NOTE(review): PopulationBasedTraining is created without
        # `hyperparam_mutations` and the trainable saves no checkpoints;
        # confirm this works on the installed Ray version (ASHAScheduler is
        # the already-imported alternative).
        scheduler=PopulationBasedTraining(),
        local_dir=results_dir)

    print("Best hyperparameters found were: ", analysis.best_config)



Trial name,status,loc,colsample_bytree,min_child_samples,min_child_weight,num_leaves,reg_alpha,reg_lambda,scale_pos_weight,subsample
train_diabetes_344c8_00000,RUNNING,,0.283789,122,1e-07,36,0,0,1,0.46226




Trial name,status,loc,colsample_bytree,min_child_samples,min_child_weight,num_leaves,reg_alpha,reg_lambda,scale_pos_weight,subsample
train_diabetes_344c8_00000,RUNNING,,0.283789,122,1e-07,36,0,0,1,0.46226
train_diabetes_344c8_00001,RUNNING,,0.430756,288,1e-06,16,0,0,1,0.435398
train_diabetes_344c8_00002,RUNNING,,0.318188,279,1e-05,38,0,0,1,0.482496
train_diabetes_344c8_00003,RUNNING,,0.512273,229,0.0001,16,0,0,1,0.589058
train_diabetes_344c8_00004,RUNNING,,0.347268,126,0.001,27,0,0,1,0.567391
train_diabetes_344c8_00005,RUNNING,,0.247092,291,0.01,29,0,0,1,0.427536


. In total there are 0 pending tasks and 8 pending actors on this node. This is likely due to all cluster resources being claimed by actors. To resolve the issue, consider creating fewer actors or increase the resources available to this Ray cluster. You can ignore this message if this Ray cluster is expected to auto-scale.


KeyboardInterrupt: 