In [1]:
# !pip install -q xgboost --upgrade

In [2]:
import sys
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import optuna
import xgboost as xgb
import lightgbm as lgbm
import statistics
from sklearn.linear_model import Ridge
from sklearn import model_selection
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer
from sklearn.metrics import root_mean_squared_log_error
from functools import partial
import warnings

warnings.filterwarnings('ignore')


ModuleNotFoundError: No module named 'optuna_integration'

In [None]:
# Make the directory holding the project-local helper modules importable.
# The location can be overridden through the ML_UTILS_DIR environment variable so the
# notebook is not tied to one machine; the original absolute path stays the fallback.
ML_UTILS_DIR = os.path.abspath(os.environ.get("ML_UTILS_DIR", "/home/bk_anupam/code/ML/ML_UTILS/"))
# Re-running the cell must not pile up duplicate sys.path entries.
if ML_UTILS_DIR not in sys.path:
    sys.path.append(ML_UTILS_DIR)

In [None]:
import train_tabular as tt
import cv_split_utils
import enums

In [None]:
class Config:
    """Notebook-wide experiment settings, read directly as class attributes."""

    # --- reproducibility / cross-validation ---
    RANDOM_SEED = 42
    NUM_FOLDS = 5

    # --- data ---
    TARGET_COL_NAME = "Rings"
    RESULTS_FILE = "model_execution_results.pkl"  # not referenced in the cells shown here

    # --- preprocessing switches ---
    REMOVE_OUTLIERS = True       # forwarded to process_outliers_iqr
    POWER_TRANSFORM = False      # gates the call to power_transform
    SKEW_THRESHOLD = 0.5         # forwarded to power_transform
    NORMALIZE_DATA = True        # gates tt.normalize_features
    SCALER = enums.Scaler.StandardScaler

    # --- model / metric ---
    MODEL_TYPE = enums.ModelName.LGBM
    METRIC = enums.Metrics.RMSLE
    EARLY_STOPPING = 500         # not referenced in the cells shown here

    # --- hyperparameter search ---
    NUM_TUNING_TRIALS = 25
    TUNE_ON_SINGLE_FOLD = True


# Folder that holds train.csv / test.csv, relative to the notebook.
DATA_PATH = "./data/"
# Columns that are not model features: the target and the fold-assignment column.
COLS_TO_LEAVE = ["Rings", "kfold"]

In [None]:
# Read the train and test CSVs from the local data folder and discard the
# "id" column from both frames in the same step.
df_train = pd.read_csv(DATA_PATH + "train.csv").drop(columns=["id"])
df_test = pd.read_csv(DATA_PATH + "test.csv").drop(columns=["id"])

In [None]:
# Run the training frame through the project fold-splitting helper using the
# configured target column, fold count and seed, then preview the result.
df_train = cv_split_utils.strat_kfold_dataframe(
    df=df_train,
    target_col_name=Config.TARGET_COL_NAME,
    num_folds=Config.NUM_FOLDS,
    random_state=Config.RANDOM_SEED,
)
df_train.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings,kfold
0,I,0.49,0.38,0.125,0.529,0.2165,0.1375,0.155,7,3
1,I,0.42,0.345,0.1,0.3705,0.1625,0.0795,0.1025,7,3
2,M,0.555,0.44,0.135,0.739,0.3515,0.1575,0.235,9,0
3,F,0.535,0.41,0.14,0.709,0.2505,0.17,0.19,9,4
4,F,0.605,0.455,0.15,1.059,0.4275,0.221,0.31,10,2


In [None]:
# Column names grouped by dtype: float, int64 and object (string) columns.
cols_float, cols_int, cols_str = (
    df_train.select_dtypes(include=[dtype_name]).columns.to_list()
    for dtype_name in ("float", "int64", "object")
)

In [None]:
def process_outliers_iqr(df, col_name, remove_outliers=True, iqr_multiplier=1.5):
    """Count, and optionally drop, rows whose `col_name` value lies outside the IQR fences.

    The fences are ``Q1 - iqr_multiplier * IQR`` and ``Q3 + iqr_multiplier * IQR``;
    values exactly on a fence are kept.

    Args:
        df: DataFrame containing `col_name`.
        col_name: Name of the numeric column to inspect.
        remove_outliers: When True, rows outside the fences are dropped from the
            returned frame; when False, `df` is returned as-is.
        iqr_multiplier: Width of the fences in IQR units (default 1.5).

    Returns:
        Tuple ``(df, result)``. ``df`` is a filtered copy when `remove_outliers`
        is True, otherwise the input object. ``result`` is a one-row DataFrame with
        columns col_name, Q1, Q3, IQR, min_val, max_val and outlier_count.

    Note:
        Rows with NaN in `col_name` are not counted as outliers, but they are dropped
        when `remove_outliers` is True because NaN fails both fence comparisons.
    """
    values = df[col_name]
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1
    min_val = Q1 - iqr_multiplier * IQR
    max_val = Q3 + iqr_multiplier * IQR
    outlier_count = int(((values < min_val) | (values > max_val)).sum())
    if remove_outliers:
        within_fences = (values >= min_val) & (values <= max_val)
        # Explicit copy so later in-place column writes on the result do not
        # operate on a slice of the caller's frame.
        df = df[within_fences].copy()
    # One-row summary of the statistics used for this column.
    result = pd.DataFrame({
        'col_name': [col_name],
        'Q1': [Q1],
        'Q3': [Q3],
        'IQR': [IQR],
        'min_val': [min_val],
        'max_val': [max_val],
        'outlier_count': [outlier_count]
    })
    return df, result

In [None]:
def power_transform(df, col_name, skew_threshold=0.5):
    """Apply a Yeo-Johnson power transform to `col_name` when it is skewed enough.

    The column's skewness is printed; if its absolute value exceeds
    `skew_threshold`, the column is overwritten in `df` with the standardized
    output of a freshly fitted PowerTransformer.

    Args:
        df: DataFrame holding the column; modified in place when a transform is applied.
        col_name: Name of the column to inspect.
        skew_threshold: Absolute skewness above which the transform is applied.

    Returns:
        Tuple ``(df, transformed)`` where `transformed` tells whether the column
        was rewritten.
    """
    skew = df[col_name].skew()
    print(f"{col_name} has skewness of {skew}")
    # Written as a negated ">" so a NaN skewness falls through untransformed.
    if not abs(skew) > skew_threshold:
        return df, False
    print("Will apply power transform.")
    transformer = PowerTransformer(method='yeo-johnson', standardize=True)
    df.loc[:, col_name] = transformer.fit_transform(df[[col_name]])
    return df, True

In [None]:
# Preprocess float features: for every float column, compute the IQR outlier summary
# (dropping the outlier rows when Config.REMOVE_OUTLIERS is set) and, if enabled,
# power-transform the column. The per-column summaries are collected in a list and
# stacked into one frame at the end.
float_outliers = []
for col_name in cols_float:
    df_train, df_col_outliers = process_outliers_iqr(df_train, col_name, Config.REMOVE_OUTLIERS)
    # list.append returns None, so its result must not be assigned to anything.
    float_outliers.append(df_col_outliers)
    if Config.POWER_TRANSFORM:
        # Only the frame is needed here; the "was transformed" flag is not used.
        df_train = power_transform(df_train, col_name, Config.SKEW_THRESHOLD)[0]
df_float_outliers = pd.concat(float_outliers, axis=0, ignore_index=True)
df_float_outliers

Unnamed: 0,col_name,Q1,Q3,IQR,min_val,max_val,outlier_count
0,Length,0.445,0.6,0.155,0.2125,0.8325,1460
1,Diameter,0.35,0.47,0.12,0.17,0.65,372
2,Height,0.11,0.16,0.05,0.035,0.235,73
3,Whole weight,0.4405,1.073,0.6325,-0.50825,2.02175,621
4,Whole weight.1,0.1865,0.4625,0.276,-0.2275,0.8765,600
5,Whole weight.2,0.0905,0.231,0.1405,-0.12025,0.44175,130
6,Shell weight,0.126,0.3005,0.1745,-0.13575,0.56225,593


In [None]:
# One-hot encode the string (object-dtype) columns.
df_train_onehot = pd.get_dummies(df_train, columns=cols_str)
# Every column except the target and the fold column is a model feature;
# reuse COLS_TO_LEAVE rather than repeating the literal names.
feature_cols = df_train_onehot.columns.drop(COLS_TO_LEAVE).to_list()
# Only the float columns are scaled; copy the list so the two names do not alias.
feature_cols_to_normalize = list(cols_float)

In [None]:
# Scale the selected feature columns with the configured scaler via the project
# helper; skipped entirely when Config.NORMALIZE_DATA is off.
if Config.NORMALIZE_DATA:
    df_train_onehot = tt.normalize_features(
        df_train_onehot,
        scaler=Config.SCALER,
        features_to_normalize=feature_cols_to_normalize,
    )

In [None]:
def get_lgbm_tuning_params(trial):
    """Build one LightGBM parameter set for an Optuna trial.

    Fixed settings are merged with values sampled from `trial`; sampled keys come
    after the fixed ones in the returned dict.

    Args:
        trial: Optuna trial (anything exposing `suggest_float` / `suggest_int`).

    Returns:
        dict of LightGBM parameters.
    """
    params_static = {
        "objective": "mean_squared_error",
        "metric": None,
        "verbosity": -1,    # <0: fatal, =0: error (warn), =1: info, >1: debug
        "boosting_type": "gbdt",
        # LightGBM only performs row subsampling when the bagging frequency is > 0;
        # without this the tuned `subsample` value would have no effect.
        "subsample_freq": 1
    }
    params_dynamic = {
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1.0, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=25),
        'max_depth': trial.suggest_int('max_depth', 4, 20),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1),
        'num_leaves': trial.suggest_int('num_leaves', 4, 128, step=4),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 1, 100)
    }
    return {**params_static, **params_dynamic}

In [None]:
def get_model_tuning_params(trial, model_name):
    """Return the hyperparameter search space sampled from `trial` for `model_name`.

    Args:
        trial: Optuna trial used to draw the parameter values.
        model_name: Member of `enums.ModelName` selecting the search space.

    Returns:
        dict of model parameters for Ridge, Lasso, RandomForest, XGBoost and LGBM;
        None for any other `model_name`.
    """
    if model_name == enums.ModelName.Ridge or model_name == enums.ModelName.Lasso:
        # Both linear models are searched over the same regularisation strength range.
        return {"alpha": trial.suggest_float("alpha", 1e-4, 1e4, log=True)}
    elif model_name == enums.ModelName.RandomForest:
        forest_params = {}
        forest_params["n_estimators"] = trial.suggest_int("n_estimators", 400, 3000, step=100)
        forest_params["max_depth"] = trial.suggest_int("max_depth", 10, 30)
        forest_params["min_samples_leaf"] = trial.suggest_int("min_samples_leaf", 2, 16)
        forest_params["min_samples_split"] = trial.suggest_int("min_samples_split", 2, 16)
        forest_params["max_features"] = trial.suggest_categorical("max_features", ["log2", "sqrt", None])
        return forest_params
    elif model_name == enums.ModelName.XGBoost:
        # Fixed settings first, then the sampled ones (same key order as before).
        xgb_params = {
            "objective": "reg:squarederror",
            "eval_metric": "rmsle",
            "seed": Config.RANDOM_SEED,
            "verbosity": 0,
        }
        xgb_params.update({
            'n_estimators': trial.suggest_int('n_estimators', 100, 2000, step=100),
            'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1.0, log=True),
            'max_depth': trial.suggest_int('max_depth', 4, 32),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'gamma': trial.suggest_float('gamma', 0, 1),
            'subsample': trial.suggest_float('subsample', 0.5, 1),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1),
            'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
            'reg_lambda': trial.suggest_float('reg_lambda', 1, 100),
            'early_stopping_rounds': trial.suggest_int('early_stopping_rounds', 100, 500, step=20),
        })
        return xgb_params
    elif model_name == enums.ModelName.LGBM:
        return get_lgbm_tuning_params(trial)
    # Unrecognised model names yield no parameters.
    return None

In [None]:
def hyperparams_tuning_objective(trial, model_name, df_train,  
                                 feature_cols, metric, target_col_name, single_fold=False,
                                 num_folds=5, val_preds_col="val_preds"):
    """Optuna objective: train with trial-sampled parameters and return the mean fold score.

    Parameters are drawn with `get_model_tuning_params`, training is delegated to
    `tt.run_training` (printing suppressed), and the first item of every per-fold
    result entry (presumably the fold's metric value — confirm against
    `tt.run_training`) is averaged.

    Returns:
        Arithmetic mean of the per-fold values.
    """
    training_kwargs = dict(
        model_name=model_name,
        df_train=df_train,
        target_col_name=target_col_name,
        feature_col_names=feature_cols,
        metric=metric,
        num_folds=num_folds,
        model_params=get_model_tuning_params(trial, model_name),
        val_preds_col=val_preds_col,
        single_fold=single_fold,
        suppress_print=True,
    )
    # The validation-prediction frame returned alongside the fold results is not needed here.
    fold_results, _ = tt.run_training(**training_kwargs)
    return statistics.mean(fold_result[0] for fold_result in fold_results)

In [None]:
def tune_model_params(study_name, study_direction, num_trials, model_name, 
                      df_train,  feature_cols, metric, target_col_name, 
                      single_fold=False, num_folds=5, val_preds_col="val_preds",
                      seed=None):
    """Run an Optuna study over the search space of `model_name` and return the best parameters.

    Args:
        study_name: Name given to the Optuna study.
        study_direction: Optimisation direction passed to `optuna.create_study`
            (e.g. "minimize").
        num_trials: Number of trials to run.
        model_name: Model identifier forwarded to the objective.
        df_train: Training frame forwarded to the objective.
        feature_cols: Feature column names forwarded to the objective.
        metric: Metric identifier forwarded to the objective.
        target_col_name: Target column name forwarded to the objective.
        single_fold: Forwarded to the objective.
        num_folds: Forwarded to the objective.
        val_preds_col: Forwarded to the objective.
        seed: Optional integer seed for the study's TPE sampler so that the search
            is repeatable. With the default None, Optuna's default sampler is used
            exactly as before.

    Returns:
        dict with the parameters of the best trial.
    """
    model_params_tuning_obj_partial = partial(
        hyperparams_tuning_objective,
        model_name=model_name,        
        df_train=df_train,
        feature_cols=feature_cols,
        metric=metric,
        target_col_name=target_col_name,
        single_fold=single_fold,
        num_folds=num_folds,
        val_preds_col=val_preds_col
    )
    # Only build an explicit sampler when a seed is requested; sampler=None keeps
    # Optuna's default behaviour.
    sampler = optuna.samplers.TPESampler(seed=seed) if seed is not None else None
    study = optuna.create_study(direction=study_direction, study_name=study_name, sampler=sampler)
    study.optimize(model_params_tuning_obj_partial, n_trials=num_trials)
    best_trial = study.best_trial
    print(f"Best trial: number = {best_trial.number}, value = {best_trial.value}, params = {best_trial.params}")
    return best_trial.params

In [None]:
# tuned_model_params = tune_model_params(
#                          study_name=Config.MODEL_TYPE + "_ModelTuning", 
#                          study_direction="minimize",
#                          num_trials=Config.NUM_TUNING_TRIALS,
#                          model_name=Config.MODEL_TYPE,
#                          df_train=df_train_onehot,
#                          feature_cols=feature_cols,
#                          metric=Config.METRIC,
#                          target_col_name=Config.TARGET_COL_NAME,
#                          single_fold=Config.TUNE_ON_SINGLE_FOLD,
#                          num_folds=Config.NUM_FOLDS                         
#                     )

[32m[I 2024-05-09 10:59:32,017][0m A new study created in memory with name: LightGBM_ModelTuning[0m
[32m[I 2024-05-09 10:59:35,159][0m Trial 0 finished with value: 0.15058018476653942 and parameters: {'learning_rate': 0.24694311142013126, 'n_estimators': 875, 'max_depth': 11, 'min_child_weight': 8, 'subsample': 0.6867207862176299, 'colsample_bytree': 0.6481004767345344, 'num_leaves': 44, 'reg_alpha': 0.6536881907646401, 'reg_lambda': 89.0795454915248}. Best is trial 0 with value: 0.15058018476653942.[0m
[32m[I 2024-05-09 10:59:39,507][0m Trial 1 finished with value: 0.14925369645315922 and parameters: {'learning_rate': 0.06324563649683515, 'n_estimators': 100, 'max_depth': 7, 'min_child_weight': 7, 'subsample': 0.7464870072702409, 'colsample_bytree': 0.5650683888692759, 'num_leaves': 120, 'reg_alpha': 0.8786633380791119, 'reg_lambda': 13.402749252736886}. Best is trial 1 with value: 0.14925369645315922.[0m
[32m[I 2024-05-09 10:59:46,289][0m Trial 2 finished with value: 0.183

Best trial: number = 14, value = 0.14813618680637217, params = {'learning_rate': 0.05140366955156392, 'n_estimators': 500, 'max_depth': 14, 'min_child_weight': 6, 'subsample': 0.879216742575739, 'colsample_bytree': 0.7633991653386956, 'num_leaves': 68, 'reg_alpha': 0.7590024804950147, 'reg_lambda': 73.85120505908986}


In [None]:
# custom evaluation function (root mean squared log error) to be used with lightGBM
from sklearn.metrics import mean_squared_log_error  # Import from scikit-learn

def custom_rmsle(y_true, y_pred):
    """
    Custom RMSLE evaluation metric (feval) for LightGBM.

    LightGBM invokes a custom metric positionally as ``feval(preds, eval_data)``, so
    despite the parameter names the first argument receives the model predictions
    and the second the ``Dataset`` carrying the true labels. When the second
    argument has no ``get_label`` method, the pair is treated as plain
    ``(y_true, y_pred)`` arrays instead.

    Returns:
        Tuple ``(name, value, is_higher_better)`` as LightGBM expects from a feval.
    """
    if hasattr(y_pred, "get_label"):
        # LightGBM calling convention: (predictions, Dataset).
        preds, labels = y_true, y_pred.get_label()
    else:
        labels, preds = y_true, y_pred
    labels = np.asarray(labels, dtype=float)
    # Only the predictions are clipped: a negative prediction would make the
    # logarithm undefined, while the labels must be left untouched.
    preds = np.clip(np.asarray(preds, dtype=float), 0, None)
    # RMSLE computed directly with numpy: sqrt(mean((log1p(pred) - log1p(label))^2)).
    rmsle = float(np.sqrt(np.mean(np.square(np.log1p(preds) - np.log1p(labels)))))
    # Third element is is_higher_better; RMSLE is an error, so lower is better.
    return 'rmsle', rmsle, False

In [None]:
def early_stopping_eval(y_true, y_pred, trial, step=0):
    """Compute RMSLE through `custom_rmsle` and report it to an Optuna trial.

    Args:
        y_true: Forwarded unchanged as the first argument of `custom_rmsle`.
        y_pred: Forwarded unchanged as the second argument of `custom_rmsle`
            (which extracts the labels from it itself, so they must not be
            extracted here as well).
        trial: Optuna trial that receives the intermediate value.
        step: Step index attached to the reported value; `Trial.report` requires one.

    Returns:
        The RMSLE value that was reported.
    """
    rmsle = custom_rmsle(y_true, y_pred)[1]
    # Trial.report takes (value, step); the optimisation direction belongs to the study.
    trial.report(rmsle, step)
    return rmsle

In [21]:
def tune_lgbm_params(train_df, train_y, feature_cols, metric, params=None):
    """Tune LightGBM hyperparameters with Optuna's cross-validated LightGBM tuner.

    Args:
        train_df: DataFrame containing the feature columns.
        train_y: Target values used as the dataset label.
        feature_cols: List of feature column names taken from `train_df`.
        metric: Custom evaluation function passed to the tuner as `feval`.
        params: Base LightGBM parameters handed to the tuner.

    Returns:
        The `LightGBMTunerCV` instance after `run()`; its best parameters and
        best score are also printed.
    """
    # Imported here because the notebook's import cell does not bring this name into
    # scope; it requires Optuna's LightGBM integration to be installed.
    from optuna.integration import LightGBMTunerCV

    train_data = lgbm.Dataset(data=train_df[feature_cols], label=train_y, feature_name=feature_cols)
    # Minimised because the metric used in this notebook is an error measure.
    study = optuna.create_study(direction="minimize")
    # NOTE(review): stratified=True is used with a regression target — confirm this is intended.
    lgbmtuner_cv = LightGBMTunerCV(
        params,
        train_set=train_data,
        feval=metric,
        stratified=True,
        shuffle=True,
        nfold=Config.NUM_FOLDS,
        study=study,
    )
    lgbmtuner_cv.run()
    print("Best Params: ", lgbmtuner_cv.best_params)
    print("Best score: ", lgbmtuner_cv.best_score)
    return lgbmtuner_cv

In [22]:
# LGBM parameter tuning
# Fixed LightGBM settings handed to the tuner as its base parameters.
# NOTE(review): "metric" is None while a custom feval is supplied — confirm the
# tuner accepts a non-string metric.
lgbm_params = {
    "objective": "mean_squared_error",
    "metric": None,
    "verbosity": -1,
    "boosting_type": "gbdt",
}

# Tune on the one-hot encoded training frame, scoring with the custom RMSLE function.
lgbm_tuned_model = tune_lgbm_params(
    train_df=df_train_onehot,
    train_y=df_train_onehot[Config.TARGET_COL_NAME],
    feature_cols=feature_cols,
    metric=custom_rmsle,
    params=lgbm_params,
)

In [23]:
# Ridge model (remove outliers, normalize data)
# Best trial: number = 18, value = 0.16229338860497053, params = {'alpha': 2120.468857440699}

# Lasso (remove outliers, normalize data)
# Best trial: number = 10, value = 0.1627903459839038, params = {'alpha': 0.033288635201287185}

# Random Forest (remove outliers, normalize data)
# Best trial: number = 11, value = 0.14948122240295672, params = {'n_estimators': 1200, 'max_depth': 22, 'min_samples_leaf': 7, 'min_samples_split': 2, 'max_features': 'sqrt'}

# XGB
# Best trial: number = 22, value = 0.14865409012650604, params = {
# 'n_estimators': 600, 'learning_rate': 0.015065276573848749, 'max_depth': 10, 'min_child_weight': 3, 
# 'gamma': 0.6023139295556132, 'subsample': 0.772199311472915, 'colsample_bytree': 0.7065564004210175, 
# 'reg_alpha': 0.15112588528335205, 'reg_lambda': 14.38817002024009, 'early_stopping_rounds': 350}

# LGBM
# Best trial: number = 12, value = 0.14806169103162473, params = {'learning_rate': 0.04248968464174889, 'n_estimators': 650, 
# 'max_depth': 12, 'min_child_weight': 4, 'subsample': 0.9894123754195663, 'colsample_bytree': 0.687112695399624, 
# 'num_leaves': 92, 'reg_alpha': 0.9626149380434225, 'reg_lambda': 69.88883953488258}

In [24]:
# # ridge model
# params_ridge = {"alpha": 1963.746}
# val_preds_col = "val_preds"
# model = tt.get_model(Config.MODEL_TYPE, params_ridge)        
# fold_metrics_model, df_val_preds = tt.run_training(
#             model=model,
#             df_train=df_train_onehot,
#             target_col_name=Config.TARGET_COL_NAME,
#             feature_col_names=feature_cols,
#             metric=enums.Metrics.RMSLE,            
#             num_folds=Config.NUM_FOLDS,
#             gb_params=None,
#             val_preds_col=val_preds_col,
#             single_fold=False
#         )       

In [25]:
# import optuna
# from optuna.integration import LightGBMTunerCV
# import lightgbm as lgb
# import numpy as np
# from sklearn.datasets import make_regression
# from sklearn.model_selection import train_test_split

# # Generate a sample dataset
# X, y = make_regression(n_samples=500, n_features=10, noise=0.1)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
# # Define the custom evaluation function
# def custom_rmsle(y_pred, data):
#     y_true = data.get_label()
#     y_pred = np.clip(y_pred, 0, None)  # Ensure non-negative predictions
#     rmsle = np.sqrt(mean_squared_log_error(y_true, y_pred))
#     return 'rmsle', rmsle, True

# # Create an Optuna study
# study = optuna.create_study(direction='minimize')  # Minimize the RMSLE metric

# # Configure the LightGBMTunerCV
# tuner = LightGBMTunerCV(
#     {
#         'objective': 'regression',  # You can adjust this as needed
#         'metric': 'rmse',  # Primary metric, but we're using a custom one
#     },
#     lgb.Dataset(X_train, label=y_train),
#     feval=custom_rmsle,  # Use the custom RMSLE function
#     optuna_study=study,
#     time_budget=600,  # Tune for 10 minutes
#     early_stopping_rounds=10,
# )

# # Run the tuner to optimize LightGBM's hyperparameters
# tuner.run()

# # Get the best parameters found by Optuna
# best_params = tuner.best_params
# print("Best parameters:", best_params)