Modified from _v6

# Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from hyperopt import STATUS_OK, hp, tpe, Trials, fmin
from random import sample

import pickle
import xgboost as xgb
from xgboost import plot_importance
import lightgbm as lgb

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import warnings
warnings.filterwarnings('ignore')

  import pandas.util.testing as tm


# User-Defined Parameters

In [2]:
DATAPATH = ''
MODELPATH = ''

DATAFILE = 'newcohort_features_addtmp_moddailycohort_priordailydose.csv'
MODELFILE = 'model.pkl'

TARGET_VAR = 'daily accept inj level_mod'
SEP_VAR = 'Patient ID (Fact)'
RANDOM_STATE = 1234
EVAL_METRIC = 'mae'
EARLYSTOP = 200

In [3]:
SPACE = {
    'colsample_bytree': hp.uniform('colsample_bytree', 0, 1),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.1)),
    'n_estimators': hp.choice('n_estimators', np.arange(200, 1000, dtype=int)),
    'colsample_bylevel': hp.uniform('colsample_bylevel', 0.3, 1),
    'max_depth': hp.choice('max_depth', np.arange(3, 10, dtype=int)),
    'subsample': hp.uniform('subsample', 0.5, 1), 
    'reg_alpha': hp.uniform('reg_alpha', 0, 1),
    'reg_lambda': hp.uniform('reg_lambda', 0, 1)
}

In [4]:
SELECTVARS = [
       'Patient ID (Fact)', 'Race',
       'Female', 'Age', 'Height_cm', 'Weight_kg',
       'Actual Last Vanco Test', 'Duration Since Actual Last Vanco Test',
       'Second Actual Prior Vanco Test',
       'Duration Since Second Actual Prior Vanco Test',
       'Freq', 'Actual Second Prior Daily Dose', 'Actual Prior Daily Dose',
       'daily accept inj level_mod', 
       'Prior Vancomycin Lab Level', 'Prior Daily Dose',
       'Second Prior Vancomycin Lab Level', 'Second Prior Daily Dose',
       'SC_lasttime', 'SC_secondlasttime',
       'BILIRUBIN_lasttime', 'ALP_lasttime',
       'ALT_lasttime', 'AST_lasttime', 'ALBUMIN_lasttime', 'WBC_lasttime',
       'CRP_lasttime', 'PROCALCITONIN_lasttime', 'haemodialysis',
       'peritoneal', 'Culture_lasttime', 
       'NUM_CCI', 'CCI_S_LIVER',
       'CCI_DIABETES_LONG', 'CCI_RENAL', 'CCI_PULMONARY', 'CCI_P_ULCER',
       'CCI_CVA', 'CCI_MI', 'CCI_CHF', 'CCI_M_CANCER', 'CCI_TISSUE',
       'CCI_M_LIVER', 'CCI_PVA', 'CCI_HEMI_PARA', 'CCI_CANCER',
       'NUM_CDMS', 'CDMS_COPD',
       'CDMS_SCHIZOPHRENIA', 'CDMS_DEMENTIA', 'CDMS_S_STROKE', 'CDMS_CHD',
       'CDMS_DM', 'CDMS_ASTHMA', 'CDMS_HF', 'CDMS_HYPERTENSION',
       'CDMS_RENAL', 'CDMS_HIP_FRACTURE', 'CDMS_WEIGHT',
       'CDMS_OSTEOPOROSIS', 'CDMS_ANXIETY', 'CDMS_STROKE',
       'CDMS_SPINE_FRACTURE', 'CDMS_DYSLIPIDAEMIA', 'GRPD', 'GRPN',
       'GRPB', 'GRPA', 'GRPC', 'GRPJ', 'GRPR', 'GRPV', 'GRPM', 'GRPA10A',
       'GRPS', 'GRPG', 'GRPL', 'GRPH', 'GRPP', 'Elderly_Flag', 'Vd_vanc',
       'TBW_Crcl_max150', 'AdjBW_Crcl_max150', 'IBW_Crcl_max150',
       'TBW_RoundSCr_Crcl_max150', 'Clvanc_TBW', 'Clvanc_Adj',
       'Clvanc_IBW', 'Clvanc_TBW_round', 't12_TBW', 't12_Adj', 't12_IBW',
       't12_TBW_round', 
       'egfr_lasttime', 'egfr_secondlasttime',
       'Duration Since Last Injection_hrs',
       'Duration Since Last Vanco Test',
       'Duration Since Second Last Injection_hrs',
       'Duration Since Second Last Vanco Test',
       'Duration btw Last Injection and Last Vanco Test_hrs',
       'Duration btw Second Last Injection and Second Last Vanco Test_hrs',
       'Duration Since Actual Last Injection_hrs',
       'Duration Since Actual Second Last Injection_hrs',
        'Third Actual Prior Vanco Test',
        'Duration Since Third Actual Prior Vanco Test',
        'Actual Third Prior Daily Dose',
        'Duration Since Actual Third Last Injection_hrs'
]

In [5]:
SELECTVARS_INIT_NOREMOVE = [
    'GRPC',
    'Actual Prior Daily Dose',
    'TBW_RoundSCr_Crcl_max150',
    'GRPB',
    'Age',
    'Weight_kg',
    'Duration Since Actual Last Injection_hrs',
    'egfr_lasttime',
    't12_TBW_round',
    'ALBUMIN_lasttime',
    'GRPA',
    'Clvanc_TBW_round',
    'SC_lasttime',
    'Freq',
    'haemodialysis'
]

SELECTVARS_SUB = [
    'Prior Vancomycin Lab Level',
    'Prior Daily Dose',
    'Actual Prior Daily Dose',
    'Actual Last Vanco Test',
    'egfr_lasttime',
    'Duration Since Actual Third Last Injection_hrs',
    'Duration Since Actual Second Last Injection_hrs',
    'Actual Third Prior Daily Dose',
    'Duration Since Actual Last Injection_hrs',
    'Second Actual Prior Vanco Test',
    'Duration Since Actual Last Vanco Test',
    'Duration Since Second Actual Prior Vanco Test',
    'Duration Since Last Injection_hrs',
    'Actual Second Prior Daily Dose',
    'Duration Since Last Vanco Test',
    'haemodialysis'
]

# Self-Defined Functions

In [6]:
def process_categorical(orgdata):
    data = orgdata.copy()
    data['Female_x'] = orgdata['Female'] == 1
    data['Race_CN'] = orgdata['Race'] == 'Chinese'
    data['Race_MY'] = orgdata['Race'] == 'Malay'
    data['Race_IN'] = orgdata['Race'] == 'Indian'
    data.drop(columns=['Race', 'Female'], inplace=True)
    data.rename(columns={'Female_x': 'Female'}, inplace=True)
    
    return data

In [7]:
def split_data(data, TARGET_VAR, SEP_VAR, RANDOM_STATE):
    # Random Selection on Patient ID
    pat = pd.DataFrame(data[SEP_VAR].unique())
    samplelist = pat.sample(frac=1, random_state=RANDOM_STATE, replace=False)
    samplelist = samplelist[0].values

    # Split train, dev, test
    X_train = data[data[SEP_VAR].isin(
        samplelist[:round(len(samplelist) * 0.64)])].drop(
            columns=[TARGET_VAR, SEP_VAR])
    X_dev = data[data[SEP_VAR].isin(
        samplelist[round(len(samplelist) *
                         0.64):round(len(samplelist) * 0.80)])].drop(
                             columns=[TARGET_VAR, SEP_VAR])
    X_test = data[data[SEP_VAR].isin(
        samplelist[round(len(samplelist) * 0.80):])].drop(
            columns=[TARGET_VAR, SEP_VAR])

    y_train = data[data[SEP_VAR].isin(
        samplelist[:round(len(samplelist) * 0.64)])][TARGET_VAR]
    y_dev = data[data[SEP_VAR].isin(
        samplelist[round(len(samplelist) * 0.64):round(len(samplelist) *
                                                       0.80)])][TARGET_VAR]
    y_test = data[data[SEP_VAR].isin(samplelist[round(len(samplelist) *
                                                      0.80):])][TARGET_VAR]

    return X_train, y_train, X_dev, y_dev, X_test, y_test

In [8]:
def print_metric(model, X, y, identity_name=""):
    pred = pd.DataFrame({
        'y': y,
        'pred_y': model.predict(X)
    })
    pred['tmp_min'] = pred['y'].apply(lambda x: min(x * 0.85, x - 250))
    pred['tmp_max'] = pred['y'].apply(lambda x: max(x * 1.15, x + 250))
    pred['in range'] = False
    pred.loc[(pred['pred_y'] < pred['tmp_max']) &
             (pred['pred_y'] > pred['tmp_min']), 'in range'] = True

    print(identity_name)
    print('in range: ' +
        str(round(pred[pred['in range'] == True].shape[0] /
                  pred.shape[0], 3)) + ', rmse: ' +
        str(round(
                np.sqrt(metrics.mean_squared_error(pred['y'], pred['pred_y'])),
                        3)) + ', med abs err: ' +
        str(round(metrics.mean_absolute_error(pred['y'], pred['pred_y']), 3))
    )

    return pred

# Load Data

In [9]:
def load_data(DATAPATH, DATAFILE):
    orgdata = pd.read_csv(DATAPATH + DATAFILE)
    return orgdata[SELECTVARS]

In [10]:
orgdata = load_data(DATAPATH, DATAFILE)
orgdata.shape

(7912, 107)

# Process Data

In [11]:
def process_data(orgdata):
    data = process_categorical(orgdata)
    return data

In [12]:
data = process_data(orgdata)

In [13]:
X_train, y_train, X_dev, y_dev, X_test, y_test = split_data(data, TARGET_VAR, SEP_VAR, RANDOM_STATE)

In [14]:
X_train_init = X_train[(X_train['Actual Last Vanco Test'].isna())|(X_train['Actual Prior Daily Dose'].isna())]
y_train_init = y_train[y_train.index.isin(X_train_init.index)]
X_dev_init = X_dev[(X_dev['Actual Last Vanco Test'].isna())|(X_dev['Actual Prior Daily Dose'].isna())]
y_dev_init = y_dev[y_dev.index.isin(X_dev_init.index)]
X_test_init = X_test[(X_test['Actual Last Vanco Test'].isna())|(X_test['Actual Prior Daily Dose'].isna())]
y_test_init = y_test[y_test.index.isin(X_test_init.index)]

X_train_init_noremove, X_dev_init_noremove, X_test_init_noremove = X_train_init[
    SELECTVARS_INIT_NOREMOVE], X_dev_init[
        SELECTVARS_INIT_NOREMOVE], X_test_init[SELECTVARS_INIT_NOREMOVE]

In [15]:
X_train_sub = X_train[~((X_train['Actual Last Vanco Test'].isna())|(X_train['Actual Prior Daily Dose'].isna()))]
y_train_sub = y_train[y_train.index.isin(X_train_sub.index)]
X_dev_sub = X_dev[~((X_dev['Actual Last Vanco Test'].isna())|(X_dev['Actual Prior Daily Dose'].isna()))]
y_dev_sub = y_dev[y_dev.index.isin(X_dev_sub.index)]
X_test_sub = X_test[~((X_test['Actual Last Vanco Test'].isna())|(X_test['Actual Prior Daily Dose'].isna()))]
y_test_sub = y_test[y_test.index.isin(X_test_sub.index)]

X_train_sub, X_dev_sub, X_test_sub = X_train_sub[SELECTVARS_SUB].copy(
), X_dev_sub[SELECTVARS_SUB].copy(), X_test_sub[SELECTVARS_SUB].copy()

In [16]:
X_train_init_noremove.shape

(1278, 15)

In [17]:
X_train_sub.shape

(3827, 16)

## Current Practices

In [18]:
org_cohort = pd.read_csv(DATAPATH + DATAFILE)
org_cohort.shape

(7912, 119)

In [19]:
# Initial Dose
train = org_cohort[org_cohort.index.isin(X_train_init_noremove.index)]
print(train.shape)

dev = org_cohort[org_cohort.index.isin(X_dev_init_noremove.index)]
print(dev.shape)

test = org_cohort[org_cohort.index.isin(X_test_init_noremove.index)]
print(test.shape)

(1278, 119)
(312, 119)
(389, 119)


In [20]:
# Initial Dose
print('Train')
print(str(round(train[train['in accept range_mod'] == True].shape[0] /
            train.shape[0], 3)) + ', med abs err: ' +
    str(round(metrics.mean_absolute_error(train['daily accept inj level_mod'],
                                          train['Daily Dose']), 3)))

print('============================')
print('Dev')
print(str(round(dev[dev['in accept range_mod'] == True].shape[0] /
            dev.shape[0], 3)) + ', med abs err: ' +
    str(round(metrics.mean_absolute_error(dev['daily accept inj level_mod'],
                                          dev['Daily Dose']), 3)))

print('============================')
print('Test')
print(str(round(test[test['in accept range_mod'] == True].shape[0] /
            test.shape[0], 3)) + ', med abs err: ' +
    str(round(metrics.mean_absolute_error(test['daily accept inj level_mod'],
                                          test['Daily Dose']), 3)))


Train
0.568, med abs err: 433.333
Dev
0.599, med abs err: 416.186
Test
0.56, med abs err: 422.751


In [21]:
# Subsequent Dose
train = org_cohort[org_cohort.index.isin(X_train_sub.index)]
print(train.shape)

dev = org_cohort[org_cohort.index.isin(X_dev_sub.index)]
print(dev.shape)

test = org_cohort[org_cohort.index.isin(X_test_sub.index)]
print(test.shape)

(3827, 119)
(852, 119)
(1254, 119)


In [22]:
# Subsequent Dose
print('Train')
print(str(round(train[train['in accept range_mod'] == True].shape[0] /
            train.shape[0], 3)) + ', med abs err: ' +
    str(round(metrics.mean_absolute_error(train['daily accept inj level_mod'],
                                          train['Daily Dose']), 3)))

print('============================')
print('Dev')
print(str(round(dev[dev['in accept range_mod'] == True].shape[0] /
            dev.shape[0], 3)) + ', med abs err: ' +
    str(round(metrics.mean_absolute_error(dev['daily accept inj level_mod'],
                                          dev['Daily Dose']), 3)))

print('============================')
print('Test')
print(str(round(test[test['in accept range_mod'] == True].shape[0] /
            test.shape[0], 3)) + ', med abs err: ' +
    str(round(metrics.mean_absolute_error(test['daily accept inj level_mod'],
                                          test['Daily Dose']), 3)))


Train
0.707, med abs err: 220.806
Dev
0.684, med abs err: 237.148
Test
0.728, med abs err: 201.156


# LightGBM Hyperparameters

In [23]:
def bayes_fn_lgb_init_noremove(params):
    estimator = lgb.LGBMRegressor(**params, random_state=RANDOM_STATE,
                                  objectives='reg:squarederror')
    estimator.fit(X_train_init_noremove,
                y_train_init,
                eval_set=[(X_dev_init_noremove, y_dev_init)],
                eval_metric=EVAL_METRIC,
                early_stopping_rounds=EARLYSTOP,
                verbose=False)
    
    score = metrics.mean_absolute_error(y_dev_init, estimator.predict(X_dev_init_noremove))
    
    return {'loss': score, 'params': params, 'status': STATUS_OK}

In [24]:
def bayes_fn_lgb_sub(params):
    estimator = lgb.LGBMRegressor(**params, random_state=RANDOM_STATE,
                                  objectives='reg:squarederror')
    estimator.fit(X_train_sub,
                y_train_sub,
                eval_set=[(X_dev_sub, y_dev_sub)],
                eval_metric=EVAL_METRIC,
                early_stopping_rounds=EARLYSTOP,
                verbose=False)
    
    score = metrics.mean_absolute_error(y_dev_sub, estimator.predict(X_dev_sub))
    
    return {'loss': score, 'params': params, 'status': STATUS_OK}

In [25]:
%%time
bayes_trials1 = Trials()
best_params_lgb_init_noremove = fmin(fn=bayes_fn_lgb_init_noremove,
                                     space=SPACE,
                                     algo=tpe.suggest,
                                     max_evals=500,
                                     trials=bayes_trials1)

100%|███████████████████████████████████████████████| 500/500 [03:49<00:00,  2.31trial/s, best loss: 426.7375245609676]
Wall time: 3min 49s


In [26]:
%%time
bayes_trials3 = Trials()
best_params_lgb_sub = fmin(fn=bayes_fn_lgb_sub,
                           space=SPACE,
                           algo=tpe.suggest,
                           max_evals=500,
                           trials=bayes_trials3)

100%|██████████████████████████████████████████████| 500/500 [07:51<00:00,  1.35trial/s, best loss: 280.54445709146194]
Wall time: 7min 51s


# LightGBM and Performance

In [27]:
def fit_model_lgb(X_train, y_train, X_dev, y_dev, MODELPATH, MODELFILE,
                  best_params):
    model = lgb.LGBMRegressor(**best_params,
                              random_state=RANDOM_STATE,
                              objectives='reg:squarederror')
    model.fit(X_train,
              y_train,
              eval_set=[(X_dev, y_dev)],
              eval_metric=EVAL_METRIC,
              early_stopping_rounds=EARLYSTOP,
              verbose=False)

    with open(MODELPATH + MODELFILE, 'wb') as file:
        pickle.dump(model, file)

    return model

In [28]:
def model_performance(model, X_train, y_train, X_dev, y_dev, X_test, y_test):
    # Feature importance
    importance = model.feature_importances_
    feature_name = X_train.columns.values
    feature_importance = pd.DataFrame({
        'feature_name': feature_name,
        'importance': importance
    })
    feature_importance.sort_values(by=['importance'],
                                   ascending=False,
                                   inplace=True)

    # Metrics
    train_metrics = print_metric(model, X_train, y_train, 'train')
    dev_metrics = print_metric(model, X_dev, y_dev, 'dev')
    test_metrics = print_metric(model, X_test, y_test, 'test')

    if 'Actual Last Vanco Test' not in X_train.columns:
        print('Initial dose:')
        print('==================')

        pred = train_metrics.copy()
        print('Train in range: ' + str(
                round(pred[pred['in range'] == True].shape[0] /
                      pred.shape[0], 3)) + ', rmse: ' +
            str(round(np.sqrt(metrics.mean_squared_error(pred['y'], pred['pred_y'])),
                      3)) + ', med abs err: ' +
            str(round(metrics.mean_absolute_error(pred['y'], pred['pred_y']), 3)))
        pred = dev_metrics.copy()
        print('Dev in range: ' + str(
                round(pred[pred['in range'] == True].shape[0] /
                      pred.shape[0], 3)) + ', rmse: ' +
            str(round(np.sqrt(metrics.mean_squared_error(pred['y'], pred['pred_y'])),
                      3)) + ', med abs err: ' +
            str(round(metrics.mean_absolute_error(pred['y'], pred['pred_y']), 3)))
        pred = test_metrics.copy()
        print('Test in range: ' + str(
                round(pred[pred['in range'] == True].shape[0] /
                      pred.shape[0], 3)) + ', rmse: ' +
            str(round(np.sqrt(metrics.mean_squared_error(pred['y'], pred['pred_y'])),
                      3)) + ', med abs err: ' +
            str(round(metrics.mean_absolute_error(pred['y'], pred['pred_y']), 3)))

    if 'Actual Last Vanco Test' in X_train.columns:
        print('Subsequent dose:')
        print('==================')

        pred = train_metrics.copy()
        print('Train in range: ' + str(
                round(pred[pred['in range'] == True].shape[0] /
                      pred.shape[0], 3)) + ', rmse: ' +
            str(round(np.sqrt(metrics.mean_squared_error(pred['y'], pred['pred_y'])),
                      3)) + ', med abs err: ' +
            str(round(metrics.mean_absolute_error(pred['y'], pred['pred_y']), 3)))
        pred = dev_metrics.copy()
        print('Dev in range: ' + str(
                round(pred[pred['in range'] == True].shape[0] /
                      pred.shape[0], 3)) + ', rmse: ' +
            str(round(np.sqrt(metrics.mean_squared_error(pred['y'], pred['pred_y'])),
                      3)) + ', med abs err: ' +
            str(round(metrics.mean_absolute_error(pred['y'], pred['pred_y']), 3)))
        pred = test_metrics.copy()
        print('Test in range: ' + str(
                round(pred[pred['in range'] == True].shape[0] /
                      pred.shape[0], 3)) + ', rmse: ' +
            str(round(np.sqrt(metrics.mean_squared_error(pred['y'], pred['pred_y'])),
                      3)) + ', med abs err: ' +
            str(round(metrics.mean_absolute_error(pred['y'], pred['pred_y']), 3)))

    return feature_importance, train_metrics, dev_metrics, test_metrics

In [29]:
def combresult(X_train, y_train, X_dev, y_dev, X_test, y_test, MODELPATH, MODELFILE,
               best_params):
    model = fit_model_lgb(X_train, y_train, X_dev, y_dev, MODELPATH, MODELFILE,
                          best_params)
    feature_importance, train_metrics, dev_metrics, test_metrics = model_performance(
        model, X_train, y_train, X_dev, y_dev, X_test, y_test)

    return feature_importance, train_metrics, dev_metrics, test_metrics

## Initial Dose (No Removal)

In [30]:
best_params_lgb_init_noremove 

{'colsample_bylevel': 0.4508136916342451,
 'colsample_bytree': 0.3690802013052334,
 'learning_rate': 0.04331241818985845,
 'max_depth': 2,
 'n_estimators': 130,
 'reg_alpha': 0.6034598270441844,
 'reg_lambda': 0.18638175039110683,
 'subsample': 0.7902834085216726}

In [31]:
feature_importance, train_metrics, dev_metrics, test_metrics = combresult(
    X_train_init_noremove, y_train_init, X_dev_init_noremove, y_dev_init,
    X_test_init_noremove, y_test_init, MODELPATH, MODELFILE,
    best_params_lgb_init_noremove)

train
in range: 0.397, rmse: 711.286, med abs err: 523.969
dev
in range: 0.446, rmse: 690.47, med abs err: 494.778
test
in range: 0.398, rmse: 681.161, med abs err: 506.702
Initial dose:
Train in range: 0.397, rmse: 711.286, med abs err: 523.969
Dev in range: 0.446, rmse: 690.47, med abs err: 494.778
Test in range: 0.398, rmse: 681.161, med abs err: 506.702


### Grid Search

In [32]:
PARAM_GRID = {
     'colsample_bylevel': [0.4, 0.5],
     'colsample_bytree': [0.3, 0.4],
     'learning_rate': [0.02, 0.04, 0.06],
     'max_depth': [2, 3, 4],
     'n_estimators': [110, 130, 150],
     'reg_alpha': [0.6],
     'reg_lambda': [0.2],
     'subsample': [0.8],
}

In [33]:
estimator = lgb.LGBMRegressor(
    objectives='reg:squarederror', random_state=RANDOM_STATE
)
gsearch = GridSearchCV(estimator, PARAM_GRID, verbose=False)
gsearch.fit(
    X_train_init_noremove, y_train_init,
    eval_set=[(X_dev_init_noremove, y_dev_init)],
    eval_metric=EVAL_METRIC,
    early_stopping_rounds=EARLYSTOP,
    verbose=False,
)

GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=LGBMRegressor(boosting_type='gbdt', class_weight=None,
                                     colsample_bytree=1.0, learning_rate=0.1,
                                     max_depth=-1, min_child_samples=20,
                                     min_child_weight=0.001, min_split_gain=0.0,
                                     n_estimators=100, n_jobs=-1, num_leaves=31,
                                     objective=None,
                                     objectives='reg:squarederror',
                                     random_state=1234, reg_alpha=0.0...
                                     subsample_for_bin=200000,
                                     subsample_freq=1),
             iid='warn', n_jobs=None,
             param_grid={'colsample_bylevel': [0.4, 0.5],
                         'colsample_bytree': [0.3, 0.4],
                         'learning_rate': [0.02, 0.04, 0.06],
                        

In [34]:
best_params = gsearch.best_params_

feature_importance, train_metrics, dev_metrics, test_metrics = combresult(
    X_train_init_noremove, y_train_init, X_dev_init_noremove, y_dev_init,
    X_test_init_noremove, y_test_init, MODELPATH, MODELFILE, best_params)

train
in range: 0.564, rmse: 547.559, med abs err: 387.815
dev
in range: 0.522, rmse: 610.454, med abs err: 435.489
test
in range: 0.517, rmse: 628.483, med abs err: 450.159
Initial dose:
Train in range: 0.564, rmse: 547.559, med abs err: 387.815
Dev in range: 0.522, rmse: 610.454, med abs err: 435.489
Test in range: 0.517, rmse: 628.483, med abs err: 450.159


## Subsequent Dose

In [36]:
best_params_lgb_sub 

{'colsample_bylevel': 0.7459643802852338,
 'colsample_bytree': 0.9303798283844933,
 'learning_rate': 0.046999087971699,
 'max_depth': 4,
 'n_estimators': 264,
 'reg_alpha': 0.4475908546291876,
 'reg_lambda': 0.3416781863873622,
 'subsample': 0.8197380792148594}

In [37]:
feature_importance, train_metrics, dev_metrics, test_metrics = combresult(
    X_train_sub, y_train_sub, X_dev_sub, y_dev_sub, X_test_sub, y_test_sub,
    MODELPATH, MODELFILE, best_params_lgb_sub)

train
in range: 0.758, rmse: 365.411, med abs err: 250.585
dev
in range: 0.719, rmse: 467.376, med abs err: 292.086
test
in range: 0.729, rmse: 408.079, med abs err: 270.063
Subsequent dose:
Train in range: 0.758, rmse: 365.411, med abs err: 250.585
Dev in range: 0.719, rmse: 467.376, med abs err: 292.086
Test in range: 0.729, rmse: 408.079, med abs err: 270.063


### Grid Search

In [38]:
PARAM_GRID = {
     'colsample_bylevel': [0.7, 0.8],
     'colsample_bytree': [0.8, 0.9],
     'learning_rate': [0.04, 0.05],
     'max_depth': [4],
     'n_estimators': [260],
     'reg_alpha': [0.4, 0.5],
     'reg_lambda': [0.4, 0.5],
     'subsample': [0.7, 0.8],
}

In [39]:
estimator = lgb.LGBMRegressor(
    objectives='reg:squarederror', random_state=RANDOM_STATE
)
gsearch = GridSearchCV(estimator, PARAM_GRID, verbose=False)
gsearch.fit(
    X_train_sub, y_train_sub, 
    eval_set=[(X_dev_sub, y_dev_sub)],
    eval_metric=EVAL_METRIC,
    early_stopping_rounds=EARLYSTOP,
    verbose=False,
)

GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=LGBMRegressor(boosting_type='gbdt', class_weight=None,
                                     colsample_bytree=1.0, learning_rate=0.1,
                                     max_depth=-1, min_child_samples=20,
                                     min_child_weight=0.001, min_split_gain=0.0,
                                     n_estimators=100, n_jobs=-1, num_leaves=31,
                                     objective=None,
                                     objectives='reg:squarederror',
                                     random_state=1234, reg_alpha=0.0...
                                     subsample_for_bin=200000,
                                     subsample_freq=1),
             iid='warn', n_jobs=None,
             param_grid={'colsample_bylevel': [0.7, 0.8],
                         'colsample_bytree': [0.8, 0.9],
                         'learning_rate': [0.04, 0.05], 'max_depth': [4],
            

In [40]:
best_params = gsearch.best_params_
feature_importance, train_metrics, dev_metrics, test_metrics = combresult(
    X_train_sub, y_train_sub, X_dev_sub, y_dev_sub,
    X_test_sub, y_test_sub, MODELPATH, MODELFILE,
    best_params)

train
in range: 0.789, rmse: 334.24, med abs err: 229.409
dev
in range: 0.73, rmse: 470.786, med abs err: 289.866
test
in range: 0.734, rmse: 406.775, med abs err: 267.14
Subsequent dose:
Train in range: 0.789, rmse: 334.24, med abs err: 229.409
Dev in range: 0.73, rmse: 470.786, med abs err: 289.866
Test in range: 0.734, rmse: 406.775, med abs err: 267.14
