In [3]:
# data
import hashlib
import pandas as pd
import numpy as np

# parameter tuning
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV

# model
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from lightgbm import LGBMRegressor
from joblib import dump, load

# sampling
from scipy.stats import uniform, randint

# plots
import matplotlib.pyplot as plt

# custom objects 
from production_demo import (CategoriesTransformer, 
                             CATEGORIES, 
                             NUMERICS, 
                             OUTPUT)

ImportError: cannot import name 'OUTPUT' from 'production_demo' (/Users/brandonshurick/Documents/ProdDemo/production-demo/lib/python3.9/site-packages/production_demo/__init__.py)

### Data preparation

In [None]:
# Load the training data and order it chronologically (sale year, then sale
# month) so the TimeSeriesSplit folds below always validate on rows that come
# after the rows they were trained on.
train = (
    pd.read_csv('../data/train.csv')
      .sort_values(by=['YrSold', 'MoSold'])
)

# hasher: transformer for the CATEGORIES columns (implementation lives in
# production_demo; presumably hashes the categorical values -- confirm there)
hct = CategoriesTransformer(CATEGORIES)

# prepare train/test splitting: 10 forward-chaining folds over the sorted rows
tss = TimeSeriesSplit(n_splits=10)

# parameter space
# Keys are prefixed with `LGBM__` so the search routes them to the pipeline
# step named 'LGBM'.
# scipy conventions: randint(low, high) draws integers in [low, high);
# uniform(loc, scale) draws floats in [loc, loc + scale].
param_distributions = dict(
    # LightGBM requires num_leaves > 1; a lower bound of 1 produced candidates
    # whose fits failed and were wasted.
    LGBM__num_leaves=randint(2, 5000),
    LGBM__max_depth=randint(1, 20),
    LGBM__learning_rate=uniform(0.01, 0.9),      # [0.01, 0.91]
    LGBM__n_estimators=randint(5, 1000),
    LGBM__min_split_gain=uniform(0.0, 0.1),      # [0.0, 0.1]
    LGBM__min_child_weight=uniform(0.0, 0.1),    # [0.0, 0.1]
    # NOTE(review): LightGBM only applies row subsampling when
    # subsample_freq > 0 (its default is 0), so this dimension may currently
    # have no effect -- confirm and add subsample_freq to the search if wanted.
    LGBM__subsample=uniform(0.1, 0.9),           # [0.1, 1.0]
    LGBM__colsample_bytree=uniform(0.1, 0.9),    # [0.1, 1.0]
    LGBM__reg_alpha=uniform(0.0, 5000.0),        # [0, 5000]
    LGBM__reg_lambda=uniform(0.0, 5000.0),       # [0, 5000]
)

### Hyperparameter tuning

In [None]:
# Pipeline under search: the categories transformer followed by a seeded
# LightGBM regressor. The step name 'LGBM' is what the `LGBM__` prefixes in
# param_distributions refer to.
model = Pipeline([
    ('hash', hct),
    ('LGBM', LGBMRegressor(random_state=22)),
])

# Randomized search over the parameter space, scored by negative RMSE on the
# time-ordered folds. random_state pins the sampled candidates so that a
# Restart & Run All reproduces the same search (previously only the estimator
# was seeded, so every run explored a different set of 1000 candidates).
rsv = RandomizedSearchCV(estimator=model,
                         param_distributions=param_distributions,
                         n_iter=1000,
                         cv=tss,
                         scoring='neg_root_mean_squared_error',
                         random_state=22)

# Assign the return value to `_` to keep the fitted-estimator repr out of the
# cell output.
_ = rsv.fit(train[NUMERICS + CATEGORIES], train[OUTPUT])

In [None]:
# Best hyperparameters found by the search. Because the search ran over a
# Pipeline, every key carries the step prefix (e.g. 'LGBM__num_leaves').
best_params_dict = rsv.best_params_

print(f'Best params:\n {best_params_dict}')
# Scoring is 'neg_root_mean_squared_error', so this is a negated RMSE
# (closer to zero is better).
print(f'\nBest score:\n {rsv.best_score_:.4f}')

# Strip the pipeline step prefix before handing the values to a bare
# LGBMRegressor. Passing the prefixed keys directly does not set the tuned
# hyperparameters: LGBMRegressor accepts arbitrary **kwargs, so names like
# 'LGBM__num_leaves' are swallowed as unknown parameters and the model falls
# back to its defaults.
step_prefix = 'LGBM__'
lgbm_params = {
    key[len(step_prefix):]: value
    for key, value in best_params_dict.items()
    if key.startswith(step_prefix)
}

# save
# Same seed as the estimator used during the search, so the final fit is
# reproducible and consistent with the tuned configuration.
model = LGBMRegressor(random_state=22, **lgbm_params)

## Train

In [None]:
# Fit the final model on the full training set. The column lists are the
# upper-case constants imported from production_demo; the lower-case names
# `numerics` / `output` are not defined in this notebook and raise NameError on
# a fresh kernel.
# NOTE(review): the search was run on NUMERICS + CATEGORIES through the 'hash'
# pipeline step, while this bare LGBMRegressor is fitted on NUMERICS only --
# confirm that dropping the categorical features here is intended.
model.fit(train[NUMERICS], train[OUTPUT])

In [None]:
# Persist the fitted model with joblib so it can be reloaded later via `load`.
dump(value=model, filename='../data/trained_model')

In [None]:
# Shell escape: list the contents of ../data (e.g. to check that the dumped
# model artifact is present).
!ls ../data