In [43]:
# data
import hashlib
import pandas as pd
import numpy as np

# parameter tuning
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV

# model
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from lightgbm import LGBMRegressor
from joblib import dump, load

# sampling
from scipy.stats import uniform, randint

# plots
import matplotlib.pyplot as plt


# custom transformer for categories
class CategoriesTransformer(BaseEstimator, TransformerMixin):
   
    @staticmethod
    def hash_col(x, n_buckets=100000):
        return int(hashlib.md5(x.encode('utf-8')).hexdigest(), 16) % n_buckets
    
    def __init__(self, category_cols: list):
        self.category_cols = category_cols
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        _X = X.copy()
        for c in self.category_cols:
            _X[c].fillna('', inplace=True)
            _X[c] = _X[c].apply(self.hash_col)
        return _X

### Dataprep

In [36]:
train = pd.read_csv('../data/train.csv')


# select model columns
id_col = 'Id'
output = 'SalePrice'
numerics = [col for col in train.select_dtypes(np.number).columns 
            if col not in [id_col, output]]
categories = [col for col in train.select_dtypes('object').columns 
              if col not in [id_col, output]]

# hasher 
hct = CategoriesTransformer(categories)


# prepare train/test splitting
train.sort_values(by=['YrSold', 'MoSold'], 
                  inplace=True)
tss = TimeSeriesSplit(n_splits=10)


# parameter space
param_distributions = dict(
    LGBM__num_leaves=randint(1, 5000),
    LGBM__max_depth=randint(1, 20),
    LGBM__learning_rate=uniform(0.01, 0.9),
    LGBM__n_estimators=randint(5, 1000),
    LGBM__min_split_gain=uniform(0.0, 0.1),
    LGBM__min_child_weight=uniform(0.0, 0.1),
    LGBM__subsample=uniform(0.1, 0.9),
    LGBM__colsample_bytree=uniform(0.1, 0.9),
    LGBM__reg_alpha=uniform(0.0, 5000.0),
    LGBM__reg_lambda=uniform(0.0, 5000.0),
)

### Param tuning

In [37]:
model = Pipeline([
    ('hash', hct),
    ('LGBM', LGBMRegressor(random_state=22)),
])
rsv = RandomizedSearchCV(estimator=model,
                         param_distributions=param_distributions,
                         n_iter=1000,
                         cv=tss,
                         scoring='neg_root_mean_squared_error')
_ = rsv.fit(train[numerics + categories], train[output])

In [42]:
best_params_dict = rsv.best_params_

print(f'Best params:\n {best_params_dict}')
print(f'\nBest score:\n {rsv.best_score_:.4f}')
model = LGBMRegressor(**best_params_dict)

Best params:
 {'LGBM__colsample_bytree': 0.8685719758524428, 'LGBM__learning_rate': 0.33499835398126054, 'LGBM__max_depth': 7, 'LGBM__min_child_weight': 0.07068004740299694, 'LGBM__min_split_gain': 0.005588798461702771, 'LGBM__n_estimators': 340, 'LGBM__num_leaves': 4347, 'LGBM__reg_alpha': 3005.024914825045, 'LGBM__reg_lambda': 630.1026891579231, 'LGBM__subsample': 0.679306453679}

Best score:
 -32469.1114


## Train

In [6]:
model.fit(train[numerics], train[output])

LGBMRegressor(colsample_bytree=0.43404383476519925,
              learning_rate=0.6002751670184332, max_depth=17,
              min_child_weight=0.09430695805945778,
              min_split_gain=0.03343934541272591, n_estimators=512,
              num_leaves=4041, reg_alpha=4509.109769176171,
              reg_lambda=2153.583334081084, subsample=0.5595154003163686)

In [44]:
#save model artifacts
dump(model, '../data/trained_model')

['../data/trained_model']

In [45]:
!ls ../data

data_description.txt  test.csv              trained_model
sample_submission.csv train.csv
