In [74]:
# !pip install -q xgboost --upgrade

In [None]:
# !pip install -q openfe

In [75]:
import sys
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import optuna
import xgboost as xgb
import lightgbm as lgbm
import statistics
from sklearn.linear_model import Ridge
from sklearn import model_selection
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer
from sklearn.metrics import root_mean_squared_log_error
from functools import partial
from openfe import OpenFE, transform
import warnings

warnings.filterwarnings('ignore')


In [76]:
# Directory that is put on sys.path so the project-local helper modules imported
# right after this cell can be found. It can be overridden with the ML_UTILS_PATH
# environment variable so the notebook is not tied to one machine; the original
# hard-coded location remains the default.
ML_UTILS_DIR = os.path.abspath(
    os.environ.get("ML_UTILS_PATH", "/home/bk_anupam/code/ML/ML_UTILS/")
)
# Guard the append so re-running this cell does not pile up duplicate entries.
if ML_UTILS_DIR not in sys.path:
    sys.path.append(ML_UTILS_DIR)

In [77]:
import train_tabular as tt
import cv_split_utils
import enums

In [78]:
class Config:
    """Experiment-wide settings, grouped by the stage of the notebook that reads them."""

    # --- reproducibility / cross-validation ---
    RANDOM_SEED = 42
    NUM_FOLDS = 5

    # --- problem definition ---
    TARGET_COL_NAME = "Rings"
    METRIC = enums.Metrics.RMSLE
    MODEL_TYPE = enums.ModelName.XGBoost

    # --- feature engineering and preprocessing switches ---
    GENERATE_AUTO_FEATURES = True   # run OpenFE feature generation before preprocessing
    REMOVE_OUTLIERS = True          # drop rows outside the IQR fences of each float column
    POWER_TRANSFORM = False         # apply Yeo-Johnson to float columns whose skew is large
    SKEW_THRESHOLD = 0.5            # |skew| above this triggers the power transform
    NORMALIZE_DATA = True
    SCALER = enums.Scaler.StandardScaler

    # --- hyper-parameter tuning ---
    NUM_TUNING_TRIALS = 25
    TUNE_ON_SINGLE_FOLD = True

    # --- not referenced by the cells visible in this notebook section ---
    EARLY_STOPPING = 500
    RESULTS_FILE = "model_execution_results.pkl"

# Folder (relative to the notebook) that holds train.csv and test.csv.
DATA_PATH = "./data/"
# Columns that are never used as model inputs: the target and the fold id.
COLS_TO_LEAVE = ["Rings", "kfold"]
# os.cpu_count() returns None when the number of CPUs cannot be determined;
# fall back to one worker so n_jobs always receives a positive integer.
CPU_COUNT = os.cpu_count() or 1

In [79]:
# Read the train and test sets from the local data folder; the "id" column is
# dropped immediately after each read.
df_train = pd.read_csv(DATA_PATH + "train.csv").drop(columns="id")
df_test = pd.read_csv(DATA_PATH + "test.csv").drop(columns="id")
# Keep untouched copies of both frames for later use.
df_train_orig, df_test_orig = df_train.copy(), df_test.copy()

In [80]:
# Input columns handed to automated feature engineering (target / fold id excluded).
feature_cols_for_fe = [col for col in df_train.columns if col not in COLS_TO_LEAVE]

In [81]:
def generate_new_features(df_train, df_test, feature_cols, NUM_NEW_FEATURES=10):
    """Extend the train and test feature frames with OpenFE-generated features.

    Args:
        df_train: training frame; must contain ``feature_cols`` and the target
            column named by ``Config.TARGET_COL_NAME``.
        df_test: test frame; must contain ``feature_cols``.
        feature_cols: names of the input columns OpenFE is fitted on.
        NUM_NEW_FEATURES: how many entries from the front of OpenFE's
            recommended feature list are materialised.

    Returns:
        Tuple ``(train_X, test_X)`` as returned by ``openfe.transform``. Both
        are built from ``feature_cols`` only, so the target column is not part
        of the returned training frame.
    """
    train_X = df_train[feature_cols]
    test_X = df_test[feature_cols]
    train_y = df_train[Config.TARGET_COL_NAME]
    ofe = OpenFE()
    # The candidate features are read from ofe.new_features_list below, so the
    # return value of fit() is not needed.
    ofe.fit(data=train_X, label=train_y, n_jobs=CPU_COUNT, verbose=False)
    # OpenFE recommends a list of new features. Only the first NUM_NEW_FEATURES
    # are kept, to see how they influence the model performance.
    selected_features = ofe.new_features_list[:NUM_NEW_FEATURES]
    train_X, test_X = transform(train_X, test_X, selected_features, n_jobs=CPU_COUNT)
    return train_X, test_X

In [None]:
if Config.GENERATE_AUTO_FEATURES:
    # generate_new_features returns feature-only frames, so the target column is
    # re-attached afterwards from the untouched copy of the training data.
    df_train, df_test = generate_new_features(df_train, df_test, feature_cols_for_fe)
    df_train_labels = df_train_orig[[Config.TARGET_COL_NAME]]
    df_train = pd.concat([df_train, df_train_labels], axis="columns")

In [83]:
# Bucket the training columns by dtype (float / int64 / object).
cols_float, cols_int, cols_str = (
    df_train.select_dtypes(include=[dtype_name]).columns.to_list()
    for dtype_name in ("float", "int64", "object")
)
# The float columns are the ones that get normalized later on.
feature_cols_to_normalize = cols_float

In [84]:
# Preview the first rows of the training frame (rendered as the cell output).
df_train.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,autoFE_f_0,autoFE_f_1,autoFE_f_2,autoFE_f_3,autoFE_f_4,autoFE_f_5,autoFE_f_6,autoFE_f_7,autoFE_f_8,autoFE_f_9,Rings
0,F,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24,2.291667,1.36875,1.791667,2.348554,0.31,0.443,0.39,2637.0,0.24,0.19,11
1,F,0.63,0.49,0.145,1.13,0.458,0.2765,0.32,1.96875,1.43125,1.53125,2.467249,0.31,0.672,0.465,1173.0,0.32,0.17,11
2,I,0.16,0.11,0.025,0.021,0.0055,0.003,0.005,32.0,1.1,22.0,3.818182,0.155,0.0155,0.03,487.0,0.005,0.105,6
3,M,0.595,0.475,0.15,0.9145,0.3755,0.2055,0.25,2.38,1.502,1.9,2.435419,0.345,0.539,0.4,2088.0,0.25,0.225,10
4,I,0.555,0.425,0.13,0.782,0.3695,0.16,0.1975,2.810127,1.870886,2.151899,2.116373,0.3575,0.4125,0.3275,32.0,0.1975,0.2275,9


In [85]:
# Assign a cross-validation fold to every training row. Judging by the output
# below, the helper returns the frame with an added "kfold" column.
# NOTE(review): presumably stratified on the target - confirm in cv_split_utils.
df_train = cv_split_utils.strat_kfold_dataframe(
    df=df_train,
    target_col_name=Config.TARGET_COL_NAME,
    num_folds=Config.NUM_FOLDS,
    random_state=Config.RANDOM_SEED,
)
df_train.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,autoFE_f_0,autoFE_f_1,autoFE_f_2,autoFE_f_3,autoFE_f_4,autoFE_f_5,autoFE_f_6,autoFE_f_7,autoFE_f_8,autoFE_f_9,Rings,kfold
0,I,0.49,0.38,0.125,0.529,0.2165,0.1375,0.155,3.16129,1.396774,2.451613,2.443418,0.335,0.3125,0.28,774.0,0.155,0.225,7,3
1,I,0.42,0.345,0.1,0.3705,0.1625,0.0795,0.1025,4.097561,1.585366,3.365854,2.28,0.3175,0.208,0.2025,148.0,0.1025,0.2425,7,3
2,M,0.555,0.44,0.135,0.739,0.3515,0.1575,0.235,2.361702,1.495745,1.87234,2.102418,0.32,0.3875,0.37,1311.0,0.235,0.205,9,0
3,F,0.535,0.41,0.14,0.709,0.2505,0.17,0.19,2.815789,1.318421,2.157895,2.830339,0.345,0.4585,0.33,1017.0,0.19,0.22,9,4
4,F,0.605,0.455,0.15,1.059,0.4275,0.221,0.31,1.951613,1.379032,1.467742,2.477193,0.295,0.6315,0.46,1020.0,0.31,0.145,10,2


In [86]:
def process_outliers_iqr(df, col_name, remove_outliers=True):
    """Summarise, and optionally drop, IQR outliers of a single column.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``. Values strictly
    outside the fences are counted as outliers.

    Args:
        df: frame containing ``col_name``.
        col_name: numeric column to inspect.
        remove_outliers: when True, only rows whose value lies inside the
            fences (inclusive) are kept; rows where the value is missing fail
            that test and are dropped as well. When False, ``df`` is returned
            unchanged.

    Returns:
        ``(df, result)`` where ``result`` is a one-row DataFrame with the
        columns col_name, Q1, Q3, IQR, min_val, max_val and outlier_count.
    """
    column = df[col_name]
    lower_quartile = column.quantile(0.25)
    upper_quartile = column.quantile(0.75)
    spread = upper_quartile - lower_quartile
    lower_fence = lower_quartile - 1.5 * spread
    upper_fence = upper_quartile + 1.5 * spread

    # The count is always taken on the incoming frame, before any filtering.
    beyond_fences = column.lt(lower_fence) | column.gt(upper_fence)
    n_outliers = int(beyond_fences.sum())

    if remove_outliers:
        # between() is inclusive on both ends, matching ">= min and <= max".
        df = df[column.between(lower_fence, upper_fence)]

    summary = {
        'col_name': col_name,
        'Q1': lower_quartile,
        'Q3': upper_quartile,
        'IQR': spread,
        'min_val': lower_fence,
        'max_val': upper_fence,
        'outlier_count': n_outliers,
    }
    # Wrap every value in a list so the summary becomes a single-row frame.
    result = pd.DataFrame({key: [value] for key, value in summary.items()})
    return df, result

In [87]:
def power_transform(df, col_name, skew_threshold=0.5):
    """Apply a standardized Yeo-Johnson transform to one column if it is skewed.

    The column's skewness is printed. When its absolute value exceeds
    ``skew_threshold`` the column is overwritten in ``df`` with the transformed
    values; otherwise ``df`` is left as it is.

    Args:
        df: frame containing ``col_name``; modified in place when transformed.
        col_name: column to inspect and possibly transform.
        skew_threshold: absolute skewness above which the transform is applied.

    Returns:
        ``(df, transformed)`` where ``transformed`` tells whether the column
        was rewritten.
    """
    skew = df[col_name].skew()
    print(f"{col_name} has skewness of {skew}")
    # Written as a negated ">" so that a NaN skewness takes the no-op path.
    if not abs(skew) > skew_threshold:
        return df, False
    print("Will apply power transform.")
    yeo_johnson = PowerTransformer(method='yeo-johnson', standardize=True)
    df.loc[:, col_name] = yeo_johnson.fit_transform(df[[col_name]])
    return df, True

In [88]:
# Preprocess float features.
# Columns are handled one after another on the progressively filtered frame, so
# each column's IQR fences are computed on the rows that survived the columns
# processed before it.
float_outliers = []  # one single-row summary frame per float column
for col_name in cols_float:
    df_train, df_col_outliers = process_outliers_iqr(df_train, col_name, Config.REMOVE_OUTLIERS)
    # list.append returns None, so its result must not be bound to a name.
    float_outliers.append(df_col_outliers)
    if Config.POWER_TRANSFORM:
        df_train, transformed = power_transform(df_train, col_name, Config.SKEW_THRESHOLD)
# Stack the per-column summaries into one table with a clean 0..n-1 index.
df_float_outliers = pd.concat(float_outliers, axis=0).reset_index(drop=True)
df_float_outliers

Unnamed: 0,col_name,Q1,Q3,IQR,min_val,max_val,outlier_count
0,Length,0.445,0.6,0.155,0.2125,0.8325,1460
1,Diameter,0.35,0.47,0.12,0.17,0.65,372
2,Height,0.11,0.16,0.05,0.035,0.235,73
3,Whole weight,0.4405,1.073,0.6325,-0.50825,2.02175,621
4,Whole weight.1,0.1865,0.4625,0.276,-0.2275,0.8765,600
5,Whole weight.2,0.0905,0.231,0.1405,-0.12025,0.44175,130
6,Shell weight,0.126,0.3005,0.1745,-0.13575,0.56225,593
7,autoFE_f_0,1.968538,3.615385,1.646846,-0.501731,6.085654,7249
8,autoFE_f_1,1.314815,1.710811,0.395996,0.720821,2.304805,728
9,autoFE_f_2,1.515625,2.439024,0.923399,0.130526,3.824123,3334


In [89]:
# one hot encoding of categorical variables (train and test are encoded separately)
df_train_onehot, df_test_onehot = (
    pd.get_dummies(frame, columns=cols_str) for frame in (df_train, df_test)
)

if Config.NORMALIZE_DATA:
    # NOTE(review): only the training frame is normalized in this cell;
    # df_test_onehot keeps its original scale - confirm the test data is scaled
    # consistently before it is used for prediction.
    df_train_onehot = tt.normalize_features(
        df_train_onehot,
        scaler=Config.SCALER,
        features_to_normalize=feature_cols_to_normalize,
    )

In [101]:
# Every column except the target and the fold id is used as a model input.
feature_cols = [col for col in df_train_onehot.columns if col not in COLS_TO_LEAVE]
print(feature_cols)

['Length', 'Diameter', 'Height', 'Whole weight', 'Whole weight.1', 'Whole weight.2', 'Shell weight', 'autoFE_f_0', 'autoFE_f_1', 'autoFE_f_2', 'autoFE_f_3', 'autoFE_f_4', 'autoFE_f_5', 'autoFE_f_6', 'autoFE_f_7', 'autoFE_f_8', 'autoFE_f_9', 'Sex_F', 'Sex_I', 'Sex_M']


In [91]:
# XGBoost settings that stay fixed across all tuning trials.
xgb_params_static = dict(
    objective="reg:squarederror",
    eval_metric="rmsle",
    seed=Config.RANDOM_SEED,
    verbosity=0,
)
# LightGBM settings that stay fixed across all tuning trials.
lgbm_params_static = dict(
    objective="mean_squared_error",
    metric=None,
    verbosity=-1,    # <0: fatal, =0: error (warn), =1: info, >1: debug
    boosting_type="gbdt",
)

In [92]:
def get_lgbm_tuning_params(trial):
    """Build one LightGBM parameter set for an Optuna trial.

    Starts from a copy of ``lgbm_params_static`` and adds the values suggested
    by ``trial``; a suggested value replaces a static one on a key clash.

    Args:
        trial: object exposing ``suggest_float`` and ``suggest_int``
            (an Optuna trial).

    Returns:
        dict of LightGBM parameters.
    """
    params = dict(lgbm_params_static)
    # Suggestions are requested in a fixed order, one hyper-parameter per line.
    params['learning_rate'] = trial.suggest_float('learning_rate', 1e-3, 1.0, log=True)
    params['n_estimators'] = trial.suggest_int('n_estimators', 100, 1000, step=25)
    params['max_depth'] = trial.suggest_int('max_depth', 4, 20)
    params['min_child_weight'] = trial.suggest_int('min_child_weight', 1, 10)
    params['subsample'] = trial.suggest_float('subsample', 0.5, 1)
    params['colsample_bytree'] = trial.suggest_float('colsample_bytree', 0.5, 1)
    params['num_leaves'] = trial.suggest_int('num_leaves', 4, 128, step=4)
    params['reg_alpha'] = trial.suggest_float('reg_alpha', 0, 1)
    params['reg_lambda'] = trial.suggest_float('reg_lambda', 1, 100)
    return params

In [93]:
def get_model_tuning_params(trial, model_name):
    """Return the hyper-parameter set to evaluate in ``trial`` for ``model_name``.

    Args:
        trial: Optuna trial used to draw the suggested values.
        model_name: one of the ``enums.ModelName`` members handled below
            (Ridge, Lasso, RandomForest, XGBoost, LGBM).

    Returns:
        dict of model parameters, or None when ``model_name`` matches none of
        the handled models.
    """
    models = enums.ModelName

    # Ridge and Lasso share the same single-parameter search space.
    if model_name == models.Ridge or model_name == models.Lasso:
        return {"alpha": trial.suggest_float("alpha", 1e-4, 1e4, log=True)}

    if model_name == models.RandomForest:
        return {
            "n_estimators": trial.suggest_int("n_estimators", 400, 3000, step=100),
            "max_depth": trial.suggest_int("max_depth", 10, 30),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 2, 16),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 16),
            "max_features": trial.suggest_categorical("max_features", ["log2", "sqrt", None]),
        }

    if model_name == models.XGBoost:
        # Static settings first; suggested values replace them on a key clash.
        params = dict(xgb_params_static)
        params['n_estimators'] = trial.suggest_int('n_estimators', 100, 2000, step=100)
        params['learning_rate'] = trial.suggest_float('learning_rate', 1e-3, 1.0, log=True)
        params['max_depth'] = trial.suggest_int('max_depth', 4, 32)
        params['min_child_weight'] = trial.suggest_int('min_child_weight', 1, 10)
        params['gamma'] = trial.suggest_float('gamma', 0, 1)
        params['subsample'] = trial.suggest_float('subsample', 0.5, 1)
        params['colsample_bytree'] = trial.suggest_float('colsample_bytree', 0.5, 1)
        params['reg_alpha'] = trial.suggest_float('reg_alpha', 0, 1)
        params['reg_lambda'] = trial.suggest_float('reg_lambda', 1, 100)
        params['early_stopping_rounds'] = trial.suggest_int('early_stopping_rounds', 100, 500, step=20)
        return params

    if model_name == models.LGBM:
        return get_lgbm_tuning_params(trial)

    # No search space is defined for any other model.
    return None

In [94]:
def hyperparams_tuning_objective(trial, model_name, df_train,
                                 feature_cols, metric, target_col_name, single_fold=False,
                                 num_folds=5, val_preds_col="val_preds"):
    """Optuna objective: train with the trial's parameters and score the run.

    The parameters come from ``get_model_tuning_params`` and training is
    delegated to ``tt.run_training`` with printing suppressed.

    Args:
        trial: Optuna trial supplying the hyper-parameter suggestions.
        model_name: model identifier forwarded to the helpers.
        df_train: training frame forwarded to ``tt.run_training``.
        feature_cols: names of the model input columns.
        metric: evaluation metric forwarded to ``tt.run_training``.
        target_col_name: name of the target column.
        single_fold: forwarded to ``tt.run_training``.
        num_folds: forwarded to ``tt.run_training``.
        val_preds_col: forwarded to ``tt.run_training``.

    Returns:
        Mean of the first element of every entry in the per-fold results
        returned by ``tt.run_training``.
    """
    training_kwargs = dict(
        model_name=model_name,
        df_train=df_train,
        target_col_name=target_col_name,
        feature_col_names=feature_cols,
        metric=metric,
        num_folds=num_folds,
        model_params=get_model_tuning_params(trial, model_name),
        val_preds_col=val_preds_col,
        single_fold=single_fold,
        suppress_print=True,
    )
    # The validation predictions returned alongside the metrics are not needed here.
    fold_metrics_model, _ = tt.run_training(**training_kwargs)
    return statistics.mean(fold_result[0] for fold_result in fold_metrics_model)

In [95]:
def tune_model_params(study_name, study_direction, num_trials, model_name,
                      df_train,  feature_cols, metric, target_col_name,
                      single_fold=False, num_folds=5, val_preds_col="val_preds"):
    """Run an Optuna study over ``hyperparams_tuning_objective``.

    Args:
        study_name: name given to the Optuna study.
        study_direction: optimisation direction passed to ``optuna.create_study``.
        num_trials: number of trials to run.
        model_name, df_train, feature_cols, metric, target_col_name,
        single_fold, num_folds, val_preds_col: forwarded unchanged to
            ``hyperparams_tuning_objective`` on every trial.

    Returns:
        The parameter dict of the best trial. A one-line summary of that trial
        is printed as well.
    """
    def objective(trial):
        # Bind everything except the trial, which Optuna supplies per call.
        return hyperparams_tuning_objective(
            trial,
            model_name=model_name,
            df_train=df_train,
            feature_cols=feature_cols,
            metric=metric,
            target_col_name=target_col_name,
            single_fold=single_fold,
            num_folds=num_folds,
            val_preds_col=val_preds_col,
        )

    # NOTE(review): no sampler or seed is passed, so the study uses Optuna's defaults.
    study = optuna.create_study(direction=study_direction, study_name=study_name)
    study.optimize(objective, n_trials=num_trials)
    best_trial = study.best_trial
    print(f"Best trial: number = {best_trial.number}, value = {best_trial.value}, params = {best_trial.params}")
    return best_trial.params

In [96]:
# tuned_model_params = tune_model_params(
#                          study_name=Config.MODEL_TYPE + "_ModelTuning", 
#                          study_direction="minimize",
#                          num_trials=Config.NUM_TUNING_TRIALS,
#                          model_name=Config.MODEL_TYPE,
#                          df_train=df_train_onehot,
#                          feature_cols=feature_cols,
#                          metric=Config.METRIC,
#                          target_col_name=Config.TARGET_COL_NAME,
#                          single_fold=Config.TUNE_ON_SINGLE_FOLD,
#                          num_folds=Config.NUM_FOLDS                         
#                     )

In [97]:
# Hard-coded hyper-parameter values used for the final training run below, in
# place of the commented-out tuning call.
tuned_model_params = dict(
    n_estimators=1900,
    learning_rate=0.03422265825368418,
    max_depth=10,
    min_child_weight=7,
    gamma=0.7187948775966664,
    subsample=0.8310629949102631,
    colsample_bytree=0.6101944014573379,
    reg_alpha=0.5998633491655526,
    reg_lambda=34.64427633878371,
    early_stopping_rounds=440,
)

In [98]:
# Static XGBoost settings first, then the tuned values (tuned values win on a key clash).
model_params = dict(xgb_params_static)
model_params.update(tuned_model_params)

In [99]:
# Cross-validated training on all folds with the merged parameter set; the
# per-fold metrics and the CV score are printed by the helper (see output).
fold_metrics_model = tt.train_model(
    df=df_train_onehot,
    model_name=Config.MODEL_TYPE,
    model_params=model_params,
    feature_col_names=feature_cols,
    target_col_name=Config.TARGET_COL_NAME,
    metric=Config.METRIC,
    num_folds=Config.NUM_FOLDS,
    single_fold=False,
    persist_model=False,
)

training XGBoost
fold 0 metric = 0.1418485036019427
fold 1 metric = 0.14138579271724175
fold 2 metric = 0.1406047473226154
fold 3 metric = 0.14155552992340686
fold 4 metric = 0.14034447699485073
Saved validation data predictions to df_val_preds_XGBoost.csv
XGBoost CV score = 0.14114953000899755


In [100]:
# Ridge model (remove outliers, normalize data)
# Best trial: number = 18, value = 0.16229338860497053, params = {'alpha': 2120.468857440699}

# Lasso (remove outliers, normalize data)
# Best trial: number = 10, value = 0.1627903459839038, params = {'alpha': 0.033288635201287185}

# Random Forest (remove outliers, normalize data)
# Best trial: number = 11, value = 0.14948122240295672, params = {'n_estimators': 1200, 'max_depth': 22, 'min_samples_leaf': 7, 'min_samples_split': 2, 'max_features': 'sqrt'}

# XGB
# Best trial: number = 22, value = 0.14865409012650604, params = {
# 'n_estimators': 600, 'learning_rate': 0.015065276573848749, 'max_depth': 10, 'min_child_weight': 3, 
# 'gamma': 0.6023139295556132, 'subsample': 0.772199311472915, 'colsample_bytree': 0.7065564004210175, 
# 'reg_alpha': 0.15112588528335205, 'reg_lambda': 14.38817002024009, 'early_stopping_rounds': 350}

# LGBM
# Best trial: number = 12, value = 0.14806169103162473, params = {'learning_rate': 0.04248968464174889, 'n_estimators': 650, 
# 'max_depth': 12, 'min_child_weight': 4, 'subsample': 0.9894123754195663, 'colsample_bytree': 0.687112695399624, 
# 'num_leaves': 92, 'reg_alpha': 0.9626149380434225, 'reg_lambda': 69.88883953488258}