In [None]:
# Report the operating system and Python interpreter used for this run.
import platform
import sys

print(platform.platform())
print("Python", sys.version)

In [None]:
import numpy as np
import pandas as pd
import os 
import joblib

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error

import optuna
import lightgbm as lgb

In [None]:
class cfg:
    """Notebook-wide run configuration, read as class attributes (e.g. ``cfg.seed``)."""

    modelname = 'lgb'  # prefix of the Optuna study name and of the saved-params file
    debug = True       # read by the optimization cell to shorten the search to 2 trials
    debut = debug      # legacy misspelling of ``debug``, kept so older references still resolve
    optim = True       # the hyper-parameter search cell only runs when this is True
    seed = 42          # shared by the KFold splitter and the Optuna sampler
    nfolds = 5         # number of KFold splits
    njobs = 2          # not referenced by the visible cells
    device = 'cpu'     # forwarded as LightGBM's ``device`` parameter ('cpu' is LightGBM's default)

In [None]:
# read the final train / test tables from disk (train first, then test)
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/final/train.csv', '../data/final/test.csv')
)

# show the first training rows as the cell output
train.head()

In [None]:
# name of the column to predict; every other train column is used as a feature
# NOTE(review): an identifier column, if the CSV has one, would also end up in FEATURES - confirm
TARGET = 'MedHouseVal'
FEATURES = [col for col in train.columns if col != TARGET]

# model inputs and labels taken from the training table
x, y = train[FEATURES], train[TARGET]

# quick summary of what was loaded
print(f'Target: {TARGET}\nFeatures: {FEATURES}')
print('Train set shape:', train.shape)
print('Test set shape:', test.shape)

In [None]:
# directory prefix for saved study artifacts, and the CV splitter shared by all trials
studies_path = '../src/training_files/studies/'
cv = KFold(
    shuffle=True,
    n_splits=cfg.nfolds,
    random_state=cfg.seed,
)

In [None]:
# fixed params
# The trials are scored with RMSE (see objective below), so LightGBM is set up
# for regression; 'binary' / 'binary_logloss' only apply to two-class targets.
fixed_params = {
        'objective': 'regression',
        'metric': 'rmse',
        # fall back to LightGBM's default device when cfg does not declare one
        'device': getattr(cfg, 'device', 'cpu'),
        'verbosity': -1,
        'early_stopping_round': 15,
    }

# objective function for optimization
def objective(trial):
    """Return the mean cross-validated RMSE of one Optuna trial (lower is better).

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the tunable LightGBM hyper-parameters.

    Returns
    -------
    float
        RMSE averaged over the folds that the module-level ``cv`` splitter
        produces on the module-level ``x`` / ``y``.
    """

    # trial parameters
    tuning_params = {
        'n_estimators' : trial.suggest_int('n_estimators', 100, 3000),
        # these three ranges span several orders of magnitude, so sample them
        # on a log scale instead of uniformly
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 512),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
        'subsample': trial.suggest_float('subsample', 0.4, 1.0),
        'subsample_freq': trial.suggest_int('subsample_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }

    # sampled values are layered on top of the fixed ones
    params = {**fixed_params, **tuning_params}

    # train and score with cv
    scores = []
    for train_index, valid_index in cv.split(x, y):

        train_x, valid_x = x.iloc[train_index], x.iloc[valid_index]
        train_y, valid_y = y.iloc[train_index], y.iloc[valid_index]

        # regressor, not classifier: the score below is an RMSE on the raw target
        model = lgb.LGBMRegressor(**params)
        model.fit(
            train_x,
            train_y,
            # the held-out fold drives early stopping ('early_stopping_round' in params)
            eval_set=[(valid_x, valid_y)],
            callbacks=[lgb.log_evaluation(period=0, show_stdv=False)]
        )

        rmse = np.sqrt(mean_squared_error(valid_y, model.predict(valid_x)))
        scores.append(rmse)

    return np.mean(scores)

In [None]:
if cfg.optim:

    # create study
    sampler = optuna.samplers.TPESampler(seed=cfg.seed)
    # debug runs only need a couple of trials; the flag is read defensively and
    # the misspelled `debut` attribute is accepted as well
    debug_mode = getattr(cfg, 'debug', getattr(cfg, 'debut', False))
    max_trials = 2 if debug_mode else 50
    time_limit = 3600 * 0.5  # seconds, i.e. 30 minutes

    study = optuna.create_study(
        sampler=sampler,
        study_name= f'{cfg.modelname}_optimization',
        # objective() returns an RMSE, so the best trial is the smallest value
        direction='minimize')

    # perform optimization (stops at whichever limit is reached first)
    print(f'Starting {cfg.modelname} optimization...')
    study.optimize(
        objective,
        n_trials = max_trials,
        timeout = time_limit,
    )

    # optimization results
    print(f"\nNumber of finished trials: {len(study.trials)}")
    print(f"Best score: {study.best_value}")
    best_params = {**fixed_params, **study.best_trial.params}
    print("Best trial parameters:")
    for k, v in best_params.items():
        print(f"\t{k}: {v}")

    # save best params (create the target directory first so the dump cannot
    # fail on a fresh checkout)
    os.makedirs(studies_path, exist_ok=True)
    params_path = f'{studies_path}{cfg.modelname}_bestparams.joblib'
    joblib.dump(best_params, params_path)