## LRBoost Default Usage
- LRBoost defaults to RidgeCV and HistGradientBoostingRegressor as the primary and secondary models respectively.

In [None]:
from lrboost import LRBoostRegressor
from sklearn.datasets import load_iris

# Load the iris feature matrix and target vector.
X, y = load_iris(return_X_y=True)

# Build the regressor with its default primary/secondary models, then fit it.
lrb = LRBoostRegressor()
lrb = lrb.fit(X, y)
preds = lrb.predict(X)
# BUG: need to fix ridge errors here

## Directly Provide Model Parameters

In [None]:
import numpy as np
from lrboost import LRBoostRegressor
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import RidgeCV

X, y = load_iris(return_X_y=True)

# Constructor keyword arguments for each underlying estimator.
# numpy is imported in this cell because np.logspace is used here; without
# the import the cell fails with NameError when the notebook is run in order.
ridge_args = {"alphas": np.logspace(-4, 3, 10, endpoint=True), "cv": 5}
rf_args = {"n_estimators": 50, "n_jobs": -1}

# Pass fully configured estimators as the primary and secondary models.
lrb = LRBoostRegressor(
    primary_model=RidgeCV(**ridge_args),
    secondary_model=RandomForestRegressor(**rf_args),
)
lrb = lrb.fit(X, y)
preds = lrb.predict(X)

## Tune Hyperparameters

- Note that when running a tuning search such as `RandomizedSearchCV()`, the primary model cannot itself be a cross-validated (CV) model. Therefore we replace `RidgeCV()` with `Ridge()`.
- When creating the parameter grids, be sure to follow the `primary_model__<param>` and `secondary_model__<param>` syntax (double underscore). The grid provided is particular to LightGBM and would need to be adjusted for XGBoost, CatBoost, etc.

In [None]:
import numpy as np
from sklearn.datasets import load_iris
from lightgbm import LGBMRegressor
from lrboost import LRBoostRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# Load the data in this cell so it does not depend on X and y left over from
# an earlier cell (load_iris is already imported above).
X, y = load_iris(return_X_y=True)

# Hold out the last 10 rows as the evaluation set passed to the secondary
# model's fit; the first 140 rows are used for the search.
X_train = X[0:140, ]
X_val = X[140:150, ]
y_train = y[0:140]
y_val = y[140:150]

# Extra fit-time arguments, passed to the search's fit() as
# `secondary_fit_params` below.
# NOTE(review): newer LightGBM releases configure early stopping and verbosity
# through callbacks rather than fit() keyword arguments -- confirm these keys
# against the installed LightGBM version.
fit_params = {
    "early_stopping_rounds": 3,
    "eval_metric": 'rmse',
    "eval_set": [(X_val, y_val)],
    "eval_names": ['validation'],
    "verbose": 100,
}

# Ridge (not RidgeCV) is the primary model because the search itself performs
# the cross-validation.
lrb = LRBoostRegressor(
    primary_model=Ridge(),
    secondary_model=LGBMRegressor(),
)

# Search space; keys use the `<component>__<parameter>` naming so each value is
# routed to the primary or secondary model.
param_grid = {
    'primary_model__alpha': np.logspace(-4, 3, 10, endpoint=True),
    'secondary_model__num_leaves': randint(6, 50),
    # NOTE(review): the training split has only 140 rows, so a minimum of
    # 100-499 samples per leaf likely prevents LightGBM from making any
    # split -- confirm the intended range for this dataset.
    'secondary_model__min_child_samples': randint(100, 500),
    'secondary_model__min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
    # 100 log-spaced learning rates between 0.005 and 0.5.
    'secondary_model__learning_rate': list(np.logspace(np.log10(0.005), np.log10(0.5), base=10, num=100)),
    'secondary_model__subsample': uniform(loc=0.2, scale=0.8),
    'secondary_model__colsample_bytree': uniform(loc=0.4, scale=0.6),
    'secondary_model__reg_alpha': [0, 1e-1, 1, 2, 10, 100],
    'secondary_model__reg_lambda': [0, 1e-1, 1, 2, 10, 100],
}

rand_search = RandomizedSearchCV(
    estimator=lrb,
    param_distributions=param_grid,
)

rand_search = rand_search.fit(X_train, y_train, secondary_fit_params=fit_params)

# Best parameter combination found by the search.
best_model = rand_search.best_estimator_

preds = best_model.predict(X)

## Probabilistic Secondary Models
- The use of models such as NGBoost or XGBoostDistribution results in probabilistic predictions with multiple parameters.
- Several methods for obtaining the predictions are provided below.
- *BE EXTREMELY CAUTIOUS WITH THE INTERPRETATION OF THE MODEL VARIANCE AS IT IS ONLY DIRECTLY APPLIED TO THE SECONDARY PREDICTION*

In [None]:
#TODO fill in after LRBoostRegressorDist is completed